In [1]:
import pandas as pd
pd.options.display.max_rows = 100

import warnings
warnings.filterwarnings('ignore')

import numpy as np
SEED = 800

import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('seaborn-white')
%matplotlib inline 

## Data set available at:
https://archive.ics.uci.edu/ml/datasets/adult

In [2]:
!rm adult.data
!rm adult.test

!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test

--2019-03-01 22:03:34--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolvendo archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Conectando-se a archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... conectado.
A requisição HTTP foi enviada, aguardando resposta... 200 OK
Tamanho: 3974305 (3,8M) [text/plain]
Salvando em: “adult.data”


2019-03-01 22:03:51 (258 KB/s) - “adult.data” salvo [3974305/3974305]

--2019-03-01 22:03:51--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
Resolvendo archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Conectando-se a archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:443... conectado.
A requisição HTTP foi enviada, aguardando resposta... 200 OK
Tamanho: 2003153 (1,9M) [text/plain]
Salvando em: “adult.test”


2019-03-01 22:04:05 (154 KB/s) - “adult.test” salvo [2003153/2003153]



### The first row of adult.test holds the non-data line `|1x3 Cross validator`, so we skip it

In [3]:
# Neither file ships a header row, so columns are numbered for now.
# Only adult.test starts with a non-data line that has to be skipped.
adult_data, adult_test = (
    pd.read_csv(path, header=None, skiprows=rows_to_skip)
    for path, rows_to_skip in (('adult.data', None), ('adult.test', 1))
)

In [4]:
# Peek at the first five rows; columns are still numbered because the
# files were read with header=None.
adult_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Column descriptions

* age: continuous.
* workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
* fnlwgt: continuous.
* education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
* education-num: continuous.
* marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
* occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
* relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 
* race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
* sex: Female, Male.
* capital-gain: continuous.
* capital-loss: continuous.
* hours-per-week: continuous.
* native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

In [5]:
# Human-readable labels for the 15 positional columns, in file order.
columns_name = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native-country',
    'income',
]

# Both frames receive the same labels.
adult_data.columns = adult_test.columns = columns_name

### Check if dataset is OK

In [6]:
# Sanity check: the two files together should hold 48842 non-null income labels.
total_rows = sum(frame.income.count() for frame in (adult_data, adult_test))

if not total_rows == 48842:
    print('We should have 48842 in Adult dataset. Please review data load')

### Describe numeric features

In [7]:
# With no arguments, describe() summarises only the numeric columns.
adult_data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


### Describe categorical features

In [8]:
# include=['O'] restricts the summary to object-dtype (string) columns.
adult_data.describe(include=['O'])

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
count,32561,32561,32561,32561,32561,32561,32561,32561,32561
unique,9,16,7,15,6,5,2,42,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22696,10501,14976,4140,13193,27816,21790,29170,24720


In [9]:
# Preview the first rows again, now with the descriptive column names applied.
adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital-status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# TO-DO

* one-hot-encoding
* standardizing
* original / transformed
* missing strategy
* build a model
* test accuracy

## Rename columns to original

In [10]:
# Tag every raw column with an '_original' suffix so it can be told apart from
# the '_transformed' columns produced by the following cells.
#
# Columns that already carry the suffix are left untouched, so re-running this
# cell is harmless (the previous loop appended '_original' again on each run).
# Each frame is relabelled from its own column list instead of reusing
# adult_data's labels for adult_test.
adult_data.columns = [
    name if name.endswith('_original') else name + '_original'
    for name in adult_data.columns
]
adult_test.columns = [
    name if name.endswith('_original') else name + '_original'
    for name in adult_test.columns
]

## One-hot-encoding

In [11]:
# One-hot encode the categorical features (numeric columns pass through
# unchanged). The target column is excluded from the encoding.
dummies_data = pd.get_dummies(adult_data.drop('income_original', axis=1))
dummies_test = pd.get_dummies(adult_test.drop('income_original', axis=1))

# Encoding the two frames independently only creates indicator columns for the
# categories each file happens to contain, so their layouts can differ.
# Align the test frame to the training layout: categories absent from the test
# file become all-zero columns, and categories never seen in training are
# dropped.
dummies_test = dummies_test.reindex(columns=dummies_data.columns, fill_value=0)

In [12]:
# Mark the encoded / pass-through columns as '_transformed'.
#
# Each frame is relabelled from its *own* columns: the previous loop walked only
# dummies_data.columns, so a column present solely in dummies_test would have
# kept its '_original' name. A single assignment per frame also avoids one
# in-place rename per column.
dummies_data.columns = [
    name.replace('_original', '_transformed') for name in dummies_data.columns
]
dummies_test.columns = [
    name.replace('_original', '_transformed') for name in dummies_test.columns
]

In [13]:
# Place the encoded columns side by side with the raw ones (aligned on the row index).
# NOTE(review): this rebinds adult_data, so running the cell twice appends the
# encoded columns a second time.
adult_data = pd.concat(objs=[adult_data, dummies_data], axis='columns')

In [14]:
# Raw categorical columns, now superseded by their one-hot encoded counterparts.
columns_original = [
    base + '_original'
    for base in ('workclass', 'education', 'marital-status', 'occupation',
                 'relationship', 'race', 'sex', 'native-country')
]

adult_data.drop(columns=columns_original, inplace=True)

## Standardizing

In [15]:
from sklearn.preprocessing import StandardScaler

# Raw numeric columns that feed the scaler ...
columns_original = [
    'age_original',
    'fnlwgt_original',
    'education_num_original',
    'capital_gain_original',
    'capital_loss_original',
    'hours_per_week_original',
]

# ... and the matching column names that receive the scaled values.
columns_scaled = [
    name.replace('_original', '_transformed') for name in columns_original
]

scaler = StandardScaler()

# Reset the destination columns, then overwrite them with the standardized
# (zero-mean, unit-variance) values.
# NOTE(review): the scaler is fitted on the whole frame, before the later
# train/test split.
adult_data[columns_scaled] = 0
adult_data[columns_scaled] = scaler.fit_transform(adult_data[columns_original])

In [16]:
# The unscaled numeric columns are no longer needed once their scaled copies exist.
adult_data.drop(columns=columns_original, inplace=True)

## Format target

In [17]:
# Binary target: 1 when the income label contains '>50K', otherwise 0
# (labels containing '<=50K' and missing labels therefore map to 0).
#
# This is a single vectorised assignment rather than chained indexing
# (adult_data[col][mask] = value). The chained form triggers
# SettingWithCopyWarning and, under pandas Copy-on-Write, writes to a temporary
# object, silently leaving the target all zeros.
adult_data['income_transformed'] = (
    adult_data['income_original']
    .str.contains('>50K', regex=False, na=False)
    .astype('int64')
)

# The textual label is no longer needed once the numeric target exists.
adult_data.drop(['income_original'], axis=1, inplace=True)

## Pearson Correlation
There are a lot of features here, which makes them hard to visualize with seaborn or another graphical tool.
We therefore show only each feature's correlation with the target.

We can see that marital-status, relationship and education level are the most correlated with the target.

In [18]:
# Absolute pairwise correlation between all columns (the sign is discarded).
corr = adult_data.astype('float64').corr().abs()

# Flatten the matrix into a Series keyed by (column, row), then keep the slice
# that belongs to the target, strongest correlation first.
unstack = corr.unstack()
unstack['income_transformed'].sort_values(ascending=False)

income_transformed                                        1.000000
marital-status_transformed_ Married-civ-spouse            0.444696
relationship_transformed_ Husband                         0.401035
education_num_transformed                                 0.335154
marital-status_transformed_ Never-married                 0.318440
age_transformed                                           0.234037
hours_per_week_transformed                                0.229689
relationship_transformed_ Own-child                       0.228532
capital_gain_transformed                                  0.223329
sex_transformed_ Female                                   0.215980
sex_transformed_ Male                                     0.215980
occupation_transformed_ Exec-managerial                   0.214861
relationship_transformed_ Not-in-family                   0.188497
occupation_transformed_ Prof-specialty                    0.185866
education_transformed_ Bachelors                          0.18

## KNN model

In [19]:
# Drop the '_transformed' marker from every column name in a single assignment.
adult_data.columns = [
    name.replace('_transformed', '') for name in adult_data.columns
]

In [20]:
# Preview the model-ready frame: scaled numeric features, one-hot indicator
# columns and the binary income target.
adult_data.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,income
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Separate the target vector from the feature matrix.
y = adult_data['income']
X = adult_data.drop(columns='income')

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; SEED makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=SEED,
    test_size=0.2,
)

In [29]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using 30 neighbours, fitted on the training
# split. fit() returns the estimator itself, so it can be bound directly.
knn = KNeighborsClassifier(n_neighbors=30).fit(X_train, y_train)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=30, p=2,
           weights='uniform')

## Report

In [30]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predictions for the held-out rows.
y_pred = knn.predict(X_test)

# Overall accuracy, the per-class precision/recall/F1 report, then the raw
# confusion counts, one after another.
print(
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

# The same confusion matrix as a labelled table, with each cell expressed as a
# fraction of all test rows.
df_confusion = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
df_conf_norm = df_confusion.div(df_confusion.values.sum())
df_conf_norm

0.8389375095961922
              precision    recall  f1-score   support

           0       0.87      0.92      0.90      4919
           1       0.70      0.59      0.64      1594

   micro avg       0.84      0.84      0.84      6513
   macro avg       0.79      0.76      0.77      6513
weighted avg       0.83      0.84      0.83      6513

[[4519  400]
 [ 649  945]]


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.693843,0.061416
1,0.099647,0.145094


In [26]:
# Display the estimator's parameters.
# NOTE(review): the saved output reports n_neighbors=3 and this cell's
# execution count (26) precedes the fit cell's (29), so the output appears to
# be stale relative to the n_neighbors=30 model fitted above. Re-run to refresh.
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')