In [6]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from scipy import stats

### Load data

In [18]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [47]:
for col in df.columns:
    print(col, end=', ')

age, workclass, fnlwgt, education, educational-num, marital-status, occupation, relationship, race, gender, capital-gain, capital-loss, hours-per-week, native-country, income, 

In [48]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

### 1. How many men and women (sex feature) are represented in this dataset?


In [49]:
df['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [50]:
df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [51]:
df['gender'].value_counts()

Male      21790
Female    10771
Name: gender, dtype: int64

There are 21790 males and 10771 females

### 2. What is the average age (age feature) of women?


In [53]:
df['age'][df['gender'] == 'Female'].mean()

36.85823043357163

The average age of women is ~36.86

### 3. What is the percentage of German citizens (native-country feature)?

In [69]:
# Option 1
(df['native-country'] == 'Germany').sum() / df['native-country'].count() * 100

0.42074874850281013

In [95]:
# Option 2, cleaner
val_counts = df['native-country'].value_counts()
german_cit_percentage = val_counts['Germany'] / val_counts.sum() * 100
german_cit_percentage

0.42074874850281013

Of all citizens, 0.4% are germans

### 4. What are the mean and standard deviation of age for those who earn more than 50K per year (salary feature) and those who earn less than 50K per year?

In [103]:
plus50k_mask = df['income'] == '>50K'

In [104]:
df_50Kplus = df['age'][plus50k_mask]

In [105]:
df_50Kplus.std()

10.519027719851826

In [106]:
df_50Kplus.mean()

44.24984058155847

The standard deviation has a value of ~10.52 and the mean is ~44.25

### 5. Is it true that people who earn more than 50K have at least a high school education? (education – Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters or Doctorate feature)


In [102]:
df['education'][plus50k_mask].value_counts()

Bachelors       2221
HS-grad         1675
Some-college    1387
Masters          959
Prof-school      423
Assoc-voc        361
Doctorate        306
Assoc-acdm       265
10th              62
11th              60
7th-8th           40
12th              33
9th               27
5th-6th           16
1st-4th            6
Name: education, dtype: int64

False, some completed only until grade 11th and some not even that

### 6. Display age statistics for each race (race feature) and each gender (sex feature). Use groupby() and describe(). Find the maximum age of men of Amer-Indian-Eskimo race.


In [112]:
df_2 = df[['age', 'race', 'gender']].groupby(by=['race', 'gender']).describe()

In [114]:
df_2

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,age,age,age,age,age,age
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
race,gender,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Amer-Indian-Eskimo,Female,119.0,37.117647,13.114991,17.0,27.0,36.0,46.0,80.0
Amer-Indian-Eskimo,Male,192.0,37.208333,12.049563,17.0,28.0,35.0,45.0,82.0
Asian-Pac-Islander,Female,346.0,35.089595,12.300845,17.0,25.0,33.0,43.75,75.0
Asian-Pac-Islander,Male,693.0,39.073593,12.883944,18.0,29.0,37.0,46.0,90.0
Black,Female,1555.0,37.854019,12.637197,17.0,28.0,37.0,46.0,90.0
Black,Male,1569.0,37.6826,12.882612,17.0,27.0,36.0,46.0,90.0
Other,Female,109.0,31.678899,11.631599,17.0,23.0,29.0,39.0,74.0
Other,Male,162.0,34.654321,11.355531,17.0,26.0,32.0,42.0,77.0
White,Female,8642.0,36.811618,14.329093,17.0,25.0,35.0,46.0,90.0
White,Male,19174.0,39.652498,13.436029,17.0,29.0,38.0,49.0,90.0


In [128]:
df_2['age'].loc['Amer-Indian-Eskimo', 'max']['Male']

82.0

The max age for the Amer-Indian-Eskimo race is 82

### 7. Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? Consider as married those who have a marital-status starting with Married (Married-civ-spouse, Married-spouse-absent or Married-AF-spouse), the rest are considered bachelors