In [1]:
import pandas as pd
pd.options.display.max_rows = 100

import warnings
warnings.filterwarnings('ignore')

import numpy as np
SEED = 800

import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('seaborn-white')
%matplotlib inline 

## Data set available at:
https://archive.ics.uci.edu/ml/datasets/adult

In [None]:
!rm adult.data
!rm adult.test

!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test

### The first row of adult.test contains the text `|1x3 Cross validator` instead of data, so we skip it

In [2]:
# Both files are read without a header row; column names are assigned
# in a later cell.
adult_data = pd.read_csv('adult.data', header=None)
# skiprows=1 drops the first line of adult.test, which is not a data row.
# NOTE(review): the one-hot column names shown later contain a leading
# space (e.g. '_ Husband'), so the string fields presumably keep the blank
# that follows each comma - confirm whether skipinitialspace is wanted.
adult_test = pd.read_csv('adult.test', header=None, skiprows=1)

## Merge the two data sets, since we will use cross-validation

In [3]:
# Stack the test rows under the training rows and renumber the index.
# NOTE(review): this rebinds adult_data to "itself + adult_test", so
# running the cell twice appends the test rows twice.
adult_data = pd.concat([adult_data, adult_test], ignore_index=True, axis=0)

## Column descriptions

* age: continuous.
* workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
* fnlwgt: continuous.
* education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
* education-num: continuous.
* marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
* occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
* relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 
* race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
* sex: Female, Male.
* capital-gain: continuous.
* capital-loss: continuous.
* hours-per-week: continuous.
* native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

In [4]:
# The CSV files carry no header, so name the 15 columns in file order.
# 'income' (last) is the prediction target used later in the notebook.
columns_name = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native-country',
    'income',
]

adult_data.columns = columns_name

### Check if dataset is OK

In [5]:
# Sanity check on the load: the merged frame is expected to hold 48842
# non-null income values; only a message is printed when it does not.
total_rows = adult_data['income'].count()
if not total_rows == 48842:
    print('We should have 48842 in Adult dataset. Please review data load')

### Describe numeric features

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
adult_data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


### Describe categorical features

In [7]:
# Summary of the object-dtype (string) columns: count, number of distinct
# values, most frequent value and its frequency.
adult_data.describe(include=['O'])

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
count,48842,48842,48842,48842,48842,48842,48842,48842,48842
unique,9,16,7,15,6,5,2,42,4
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,33906,15784,22379,6172,19716,41762,32650,43832,24720


In [8]:
# Preview the first five rows to eyeball the column assignment.
adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital-status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# TO-DO

* one-hot-encoding
* standardizing
* original / transformed
* missing strategy
* build a model
* test accuracy

## Rename columns with an `_original` suffix

In [9]:
# Tag every raw column with '_original' so it can be told apart from the
# '_transformed' columns created in the following cells.
adult_data.columns = [name + '_original' for name in adult_data.columns]

## One-hot-encoding

In [10]:
# One-hot encode every feature column; the target is dropped first so it
# is not expanded into indicator columns.
dummies_data = pd.get_dummies(adult_data.drop('income_original', axis=1))

In [11]:
# Swap the '_original' tag for '_transformed' in every column name of the
# encoded frame (indicator columns and pass-through numeric columns alike).
dummies_data.columns = [
    name.replace('_original', '_transformed') for name in dummies_data.columns
]

In [12]:
# Put the '_transformed' columns side by side with the '_original' ones.
adult_data = pd.concat([adult_data,dummies_data],axis=1)

In [13]:
# The raw categorical columns are now represented by their indicator
# columns, so remove them from the working frame.
columns_original = [
    'workclass_original',
    'education_original',
    'marital-status_original',
    'occupation_original',
    'relationship_original',
    'race_original',
    'sex_original',
    'native-country_original',
]

adult_data.drop(columns=columns_original, inplace=True)

## Standardizing

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Raw numeric columns that feed the scaler ...
columns_original = ['age_original', 'fnlwgt_original', 'education_num_original',
            'capital_gain_original', 'capital_loss_original','hours_per_week_original']

# ... and the columns that receive the standardized values, in the same order.
columns_scaled = ['age_transformed', 'fnlwgt_transformed', 'education_num_transformed',
            'capital_gain_transformed', 'capital_loss_transformed','hours_per_week_transformed']

# Reset the target columns to 0 before they are overwritten below.
adult_data[columns_scaled] = 0

# NOTE(review): the scaler is fit on every row of adult_data, i.e. before
# the train/test split and cross-validation further down, so held-out rows
# contribute to the mean/std used for scaling.
adult_data[columns_scaled] = scaler.fit_transform(adult_data[columns_original])

In [15]:
# The unscaled numeric columns are no longer needed once their
# standardized counterparts exist.
adult_data.drop(columns=columns_original, inplace=True)

## Format target

In [16]:
# Encode the target as 1 when the label contains '>50K' and 0 otherwise.
# A substring match is used because the label column holds more than two
# spellings of the two classes (describe() reports 4 unique values).
#
# This is a single vectorized assignment on the frame itself. The previous
# form, adult_data['income_transformed'][mask] = value, is chained
# indexing: pandas does not guarantee that it writes to adult_data, it
# does nothing under Copy-on-Write, and the warning it raises is hidden by
# the global warnings filter.
adult_data['income_transformed'] = (
    adult_data['income_original']
    .str.contains('>50K', regex=False)
    .astype('int64')
)

adult_data.drop(['income_original'], axis=1, inplace=True)

## Pearson Correlation
There are a lot of features here, which makes them hard to visualize with seaborn or another graphical tool,
so we list only each feature's absolute correlation with the target.

We can see that marital status, relationship and education level are the features most correlated with the target.

In [17]:
# Absolute pairwise correlations between all columns (cast to float first).
corr = adult_data.astype(float).corr().abs()
# Flatten the matrix into a Series keyed by a pair of column names ...
unstack = corr.unstack()
# ... and keep only the entries for the target, strongest first.
unstack['income_transformed'].sort_values(ascending=False)

income_transformed                                        1.000000
marital-status_transformed_ Married-civ-spouse            0.445853
relationship_transformed_ Husband                         0.403791
education_num_transformed                                 0.332613
marital-status_transformed_ Never-married                 0.318782
age_transformed                                           0.230369
hours_per_week_transformed                                0.227687
relationship_transformed_ Own-child                       0.225691
capital_gain_transformed                                  0.223013
sex_transformed_ Male                                     0.214628
sex_transformed_ Female                                   0.214628
occupation_transformed_ Exec-managerial                   0.210938
relationship_transformed_ Not-in-family                   0.190372
occupation_transformed_ Prof-specialty                    0.188793
education_transformed_ Bachelors                          0.18

## KNN model

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [19]:
# Remove the '_transformed' tag from every column name now that only
# transformed columns remain in the frame.
adult_data.columns = [
    name.replace('_transformed', '') for name in adult_data.columns
]

In [21]:
# Feature matrix: every column except the target.
X = adult_data.drop(columns=['income'])
# Target vector: the 0/1 income label.
y = adult_data['income']

## Split into train and test sets

In [22]:
# Hold out 30% of the rows for testing; SEED makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [23]:
# Fit a 5-nearest-neighbours classifier on the training split
# (fit() returns the estimator itself) and predict the held-out rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Overall accuracy, per-class precision/recall/F1 and raw confusion counts.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

# Same confusion matrix as a labelled table, then expressed as a share of
# all test rows.
df_confusion = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
df_conf_norm = df_confusion.div(df_confusion.values.sum())
df_conf_norm

0.836688732682727
              precision    recall  f1-score   support

           0       0.88      0.91      0.89     11209
           1       0.67      0.61      0.64      3444

   micro avg       0.84      0.84      0.84     14653
   macro avg       0.78      0.76      0.77     14653
weighted avg       0.83      0.84      0.83     14653

[[10167  1042]
 [ 1351  2093]]


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.693851,0.071112
1,0.0922,0.142838


## Cross Validation

In [25]:
# 5-fold cross-validated accuracy of the same KNN estimator on the full data set.
# NOTE(review): X was standardized on all rows in an earlier cell, so each
# validation fold has already influenced the scaling of its training folds.
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy',verbose=0)

In [27]:
# Report the mean accuracy across the cross-validation folds.
print('Accuracy: {}'.format(scores.mean()))

Accuracy: 0.8322755990992249
