# Adult
Predict whether the annual income of an individual exceeds $50K/yr based on census data. This dataset is also known as the "Census Income" dataset.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Show up to 100 columns when displaying wide DataFrames. The canonical option
# name is "display.max_columns"; the previous spelling "display.max.columns"
# only resolved because pandas matches option keys as regular expressions,
# where "." happens to match the underscore.
pd.set_option("display.max_columns", 100)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence all warnings for the rest of the session (no category filter is given).
warnings.filterwarnings("ignore")

In [3]:
# Base URL of the directory holding the course data files; file names are
# appended directly, so the trailing slash matters.
DATA_URL = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

In [5]:
# Download the Adult census CSV into a DataFrame and preview the first rows.
data = pd.read_csv(f"{DATA_URL}adult.data.csv")
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
# (number of rows, number of columns) of the loaded dataset.
data.shape

(32561, 15)

## 1. How many men and women (sex feature) are represented in this dataset?

In [11]:
# Number of rows for each distinct value of the sex column.
data["sex"].value_counts()

sex
Male      21790
Female    10771
Name: count, dtype: int64

## 2. What is the average age (age feature) of women?

In [45]:
# Mean age over the rows whose sex is "Female"; a single .loc call selects
# the rows and the column together instead of chaining two indexers.
data.loc[data["sex"] == "Female", "age"].mean()

36.85823043357163

## 3. What is the percentage of German citizens (native-country feature)?

In [67]:

# Share of rows whose native-country is "Germany". NOTE: this is a fraction
# of all rows (between 0 and 1); multiply by 100 to read it as a percentage.
int(data["native-country"].eq("Germany").sum()) / len(data)

0.004207487485028101

## 4-5. What are the mean and standard deviation of age for those who earn more than 50K per year (salary feature) and those who earn less than 50K per year?

In [83]:
# Mean and standard deviation of age within each salary bracket.
(
    data
    .groupby("salary")["age"]
    .agg(["mean", "std"])
)

Unnamed: 0_level_0,mean,std
salary,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,36.783738,14.020088
>50K,44.249841,10.519028


## 6. Is it true that people who earn more than 50K have at least high school education? (education feature: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters or Doctorate)

In [101]:
# Count each (education, education-num) pair to see which numeric code
# corresponds to which education label.
data.value_counts(subset=["education", "education-num"])

education     education-num
HS-grad       9                10501
Some-college  10                7291
Bachelors     13                5355
Masters       14                1723
Assoc-voc     11                1382
11th          7                 1175
Assoc-acdm    12                1067
10th          6                  933
7th-8th       4                  646
Prof-school   15                 576
9th           5                  514
12th          8                  433
Doctorate     16                 413
5th-6th       3                  333
1st-4th       2                  168
Preschool     1                   51
Name: count, dtype: int64

In [218]:
# education-num 11 (Assoc-voc) is the lowest code among the education levels
# named in the question, so `>= 11` flags exactly those levels.
# normalize=True expresses every cell as a share of all rows; margins=True
# adds the "All" totals.
pd.crosstab(
    index=data["salary"],
    columns=data["education-num"] >= 11,
    normalize=True,
    margins=True,
)

education-num,False,True,All
salary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<=50K,0.575504,0.183686,0.75919
>50K,0.101533,0.139277,0.24081
All,0.677037,0.322963,1.0


## 7. Display age statistics for each race (race feature) and each gender (sex feature). Use groupby() and describe(). Find the maximum age of men of Amer-Indian-Eskimo race.

In [115]:
# Summary statistics of age for every (race, sex) combination; the "max"
# column answers the maximum-age part of the question.
(
    data
    .groupby(["race", "sex"])["age"]
    .describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
race,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Amer-Indian-Eskimo,Female,119.0,37.117647,13.114991,17.0,27.0,36.0,46.0,80.0
Amer-Indian-Eskimo,Male,192.0,37.208333,12.049563,17.0,28.0,35.0,45.0,82.0
Asian-Pac-Islander,Female,346.0,35.089595,12.300845,17.0,25.0,33.0,43.75,75.0
Asian-Pac-Islander,Male,693.0,39.073593,12.883944,18.0,29.0,37.0,46.0,90.0
Black,Female,1555.0,37.854019,12.637197,17.0,28.0,37.0,46.0,90.0
Black,Male,1569.0,37.6826,12.882612,17.0,27.0,36.0,46.0,90.0
Other,Female,109.0,31.678899,11.631599,17.0,23.0,29.0,39.0,74.0
Other,Male,162.0,34.654321,11.355531,17.0,26.0,32.0,42.0,77.0
White,Female,8642.0,36.811618,14.329093,17.0,25.0,35.0,46.0,90.0
White,Male,19174.0,39.652498,13.436029,17.0,29.0,38.0,49.0,90.0


## 8. Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? Consider as married those who have a marital-status starting with Married (Married-civ-spouse, Married-spouse-absent or Married-AF-spouse), the rest are considered bachelors.

In [13]:
# Statuses that count as "married"; every other status is treated as single.
marital_status = [
    "Married-civ-spouse",
    "Married-spouse-absent",
    "Married-AF-spouse",
]

# Rows: whether a man is married; columns: salary bracket. Only the male rows
# are passed as the index Series while the salary Series covers all rows, so
# the result relies on crosstab aligning the two Series on their shared index.
# normalize="index" makes each row sum to 1.
pd.crosstab(
    index=data.loc[data["sex"] == "Male", "marital-status"].isin(marital_status),
    columns=data["salary"],
    normalize="index",
    margins=True,
)

salary,<=50K,>50K
marital-status,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.915505,0.084495
True,0.559486,0.440514
All,0.694263,0.305737


## 9. What is the maximum number of hours a person works per week (hours-per-week feature)? How many people work such a number of hours, and what is the percentage of those who earn a lot (>50K) among them?

In [17]:
# Maximum number of hours per week reported by any person in the dataset.
data["hours-per-week"].max()

99

In [29]:
# Select everyone who works the maximum number of hours per week. The maximum
# is computed from the data instead of being hard-coded, so this cell stays
# correct if the dataset changes and does not depend on reading the previous
# cell's output.
labor_exploited_people = data[data["hours-per-week"] == data["hours-per-week"].max()]
# count() reports the non-null count per column; the number of people is the
# value shared by every column.
labor_exploited_people.count()

age               85
workclass         85
fnlwgt            85
education         85
education-num     85
marital-status    85
occupation        85
relationship      85
race              85
sex               85
capital-gain      85
capital-loss      85
hours-per-week    85
native-country    85
salary            85
dtype: int64

In [37]:
# Percentage of the people working the maximum hours who earn more than 50K.
(
    len(labor_exploited_people.loc[labor_exploited_people["salary"] == ">50K"])
    / len(labor_exploited_people)
    * 100
)

29.411764705882355

## 10. Count the average time of work (hours-per-week) for those who earn a little and a lot (salary) for each country (native-country). What will these be for Japan?

In [77]:
# Average weekly working hours for every (native-country, salary) pair.
# Only the first 50 rows of the result are displayed.
(
    data
    .groupby(["native-country", "salary"])["hours-per-week"]
    .agg(["mean"])
    .head(50)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean
native-country,salary,Unnamed: 2_level_1
?,<=50K,40.16476
?,>50K,45.547945
Cambodia,<=50K,41.416667
Cambodia,>50K,40.0
Canada,<=50K,37.914634
Canada,>50K,45.641026
China,<=50K,37.381818
China,>50K,38.9
Columbia,<=50K,38.684211
Columbia,>50K,50.0
