# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [3]:
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user"
df = pd.read_csv(url, delimiter = "|")

In [4]:
df.head(5)

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Step 4. Discover what is the mean age per occupation

In [9]:
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   gender      943 non-null    object
 3   occupation  943 non-null    object
 4   zip_code    943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB


(943, 5)

In [13]:
df.groupby("occupation").mean()[["age"]]

Unnamed: 0_level_0,age
occupation,Unnamed: 1_level_1
administrator,38.746835
artist,31.392857
doctor,43.571429
educator,42.010526
engineer,36.38806
entertainment,29.222222
executive,38.71875
healthcare,41.5625
homemaker,32.571429
lawyer,36.75


### Step 5. Discover the Male ratio per occupation and sort it from the most to the least

In [60]:
df["is_male"] = df["gender"].apply(lambda x: 1 if x=="M" else 0)
df_new = df.groupby("occupation").count()
df_new

Unnamed: 0_level_0,user_id,age,gender,zip_code,is_male
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
administrator,79,79,79,79,79
artist,28,28,28,28,28
doctor,7,7,7,7,7
educator,95,95,95,95,95
engineer,67,67,67,67,67
entertainment,18,18,18,18,18
executive,32,32,32,32,32
healthcare,16,16,16,16,16
homemaker,7,7,7,7,7
lawyer,12,12,12,12,12


In [61]:
df_new["total_male"] = df.groupby("occupation").sum()["is_male"]
df_new

Unnamed: 0_level_0,user_id,age,gender,zip_code,is_male,total_male
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
administrator,79,79,79,79,79,43
artist,28,28,28,28,28,15
doctor,7,7,7,7,7,7
educator,95,95,95,95,95,69
engineer,67,67,67,67,67,65
entertainment,18,18,18,18,18,16
executive,32,32,32,32,32,29
healthcare,16,16,16,16,16,5
homemaker,7,7,7,7,7,1
lawyer,12,12,12,12,12,10


In [62]:
df_new["male_ratio"] = 100*df_new["total_male"] / df_new["user_id"]
df_new = df_new[["male_ratio"]]
df_new

Unnamed: 0_level_0,male_ratio
occupation,Unnamed: 1_level_1
administrator,54.43038
artist,53.571429
doctor,100.0
educator,72.631579
engineer,97.014925
entertainment,88.888889
executive,90.625
healthcare,31.25
homemaker,14.285714
lawyer,83.333333


### Step 6. For each occupation, calculate the minimum and maximum ages

In [36]:
df.groupby("occupation").describe()["age"].loc[:,["min", "max"]].rename(columns = {"min":"min_age", "max":"max_age"})

Unnamed: 0_level_0,min_age,max_age
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,21.0,70.0
artist,19.0,48.0
doctor,28.0,64.0
educator,23.0,63.0
engineer,22.0,70.0
entertainment,15.0,50.0
executive,22.0,69.0
healthcare,22.0,62.0
homemaker,20.0,50.0
lawyer,21.0,53.0


### Step 7. For each combination of occupation and gender, calculate the mean age

In [64]:
df.groupby(["occupation", "gender"])["age"].describe().loc[:,["mean"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean
occupation,gender,Unnamed: 2_level_1
administrator,F,40.638889
administrator,M,37.162791
artist,F,30.307692
artist,M,32.333333
doctor,M,43.571429
educator,F,39.115385
educator,M,43.101449
engineer,F,29.5
engineer,M,36.6
entertainment,F,31.0


### Step 8.  For each occupation present the percentage of women and men

In [66]:
df_new = df.groupby("occupation").sum()[["is_male"]].rename(columns={"is_male":"male_count"})
df_new["count_all"] = df.groupby("occupation").count()["gender"]
df_new["female_count"] = df_new["count_all"] - df_new["male_count"]
df_new

Unnamed: 0_level_0,male_count,count_all,female_count
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
administrator,43,79,36
artist,15,28,13
doctor,7,7,0
educator,69,95,26
engineer,65,67,2
entertainment,16,18,2
executive,29,32,3
healthcare,5,16,11
homemaker,1,7,6
lawyer,10,12,2


In [67]:
df_new["male_percentage"] = 100*df_new["male_count"] / df_new["count_all"]
df_new["female_percentage"] = 100*df_new["female_count"] / df_new["count_all"]
df_new = df_new[["male_percentage", "female_percentage"]]
df_new

Unnamed: 0_level_0,male_percentage,female_percentage
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,54.43038,45.56962
artist,53.571429,46.428571
doctor,100.0,0.0
educator,72.631579,27.368421
engineer,97.014925,2.985075
entertainment,88.888889,11.111111
executive,90.625,9.375
healthcare,31.25,68.75
homemaker,14.285714,85.714286
lawyer,83.333333,16.666667
