In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import svm
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, cross_val_predict, validation_curve
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from pandas.tools.plotting import scatter_matrix
from sklearn import preprocessing

from numpy import nan
%matplotlib inline
from sklearn import metrics
plt.rcParams['figure.figsize'] = (30,30)

Start by importing the dataset and assigning column names

In [49]:
# Column names for the two data files, which are read with header=None.
# Declared once so the train and test frames cannot drift apart.
column_names = ["age", "wclass", "fnlwgt", "education", "education-num", "mstatus",
                "occ", "relationship", "race", "sex", "capgain", "caploss",
                "hperweek", "country", "class"]

df_train = pd.read_csv('adult.csv', header=None, names=column_names)
df_test = pd.read_csv('adult.test.csv', header=None, names=column_names)

# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(df_train.shape)
print(df_test.shape)

(32561, 15)
(16281, 15)


We need to check for missing values.

In [50]:
# Per-column count of NaN entries in the training frame (summed down the rows).
df_train.isnull().sum(axis=0)

age              0
wclass           0
fnlwgt           0
education        0
education-num    0
mstatus          0
occ              0
relationship     0
race             0
sex              0
capgain          0
caploss          0
hperweek         0
country          0
class            0
dtype: int64

Missing values can also be encoded in other ways; one of the most common is a '?' placeholder.

In [51]:
# Count the rows that contain at least one ' ?' placeholder, so the figures
# quoted in the text are reproduced on every run instead of depending on shell
# commands executed outside the notebook. The shell check originally used was:
#   cat adult.csv | grep '?' | wc -l
#   cat adult.test.csv | grep '?' | wc -l
# NOTE: this must run before the ' ?' values are replaced with NaN.
print(df_train.isin([' ?']).any(axis=1).sum())
print(df_test.isin([' ?']).any(axis=1).sum())

We get 2399 and 1221 as the output, which exactly match the numbers of records with missing values given at http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names. Hence these are the only missing values, and we replace them with NaN.

In [52]:
# Turn the ' ?' placeholder (the literal includes a leading space) into NaN so
# that pandas' missing-value tooling can see it.
df_train = df_train.replace({' ?': nan})
df_test = df_test.replace({' ?': nan})

# Per-column NaN counts: training set first, then a blank line, then test set.
# Single-argument print(...) emits the same text on Python 2 and Python 3.
print(df_train.isnull().sum())
print("\n" + str(df_test.isnull().sum()))

age                 0
wclass           1836
fnlwgt              0
education           0
education-num       0
mstatus             0
occ              1843
relationship        0
race                0
sex                 0
capgain             0
caploss             0
hperweek            0
country           583
class               0
dtype: int64

age                0
wclass           963
fnlwgt             0
education          0
education-num      0
mstatus            0
occ              966
relationship       0
race               0
sex                0
capgain            0
caploss            0
hperweek           0
country          274
class              0
dtype: int64


To figure out how to replace the missing values, let's look at the distribution of values in each column that contains missing values.

In [53]:
# Show the three most frequent values of every column that contains NaNs.
for position, column in enumerate(['wclass', 'occ', 'country']):
    top_three = str(df_train[column].value_counts().head(3))
    # Blank line between consecutive summaries, none before the first one.
    # Single-argument print(...) works the same on Python 2 and Python 3.
    print(top_three if position == 0 else "\n" + top_three)

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
Name: wclass, dtype: int64

 Prof-specialty     4140
 Craft-repair       4099
 Exec-managerial    4066
Name: occ, dtype: int64

 United-States    29170
 Mexico             643
 Philippines        198
Name: country, dtype: int64


For wclass and country, it seems OK to replace missing values with the mode, since the probability that a missing value actually equals the mode is high. (Ideally, if that probability is greater than our target accuracy, we should be good.) 'Private' accounts for about 70% of the training rows for wclass, and 'United-States' for about 90% of the training rows for country. We could also look at how much each variable contributes to the separability of the classes by plotting a scatter matrix. For now, to avoid premature optimization, we'll simply replace the missing values in all three columns with the mode.

In [54]:
cols = ['wclass', 'occ', 'country']

# Most frequent training value of each column that contains NaNs. Computed
# once, on just these columns, and reused for both frames: this avoids taking
# the mode of every column twice, and the test set is deliberately filled with
# the *training* modes so no test-set statistics are used.
train_modes = df_train[cols].mode().iloc[0]

# fillna with a Series fills each column with the value stored under its name.
df_train[cols] = df_train[cols].fillna(train_modes)
df_test[cols] = df_test[cols].fillna(train_modes)

# Sanity check: the imputed columns must be complete in both frames.
assert not df_train[cols].isnull().values.any()
assert not df_test[cols].isnull().values.any()

Now, we'll transform the categorical features to numerical. The features which need to be transformed are 'wclass', 'mstatus', 'occ', 'relationship', 'race', 'sex', 'country', 'education' and 'class'

In [55]:
categorical_columns = ['wclass', 'mstatus', 'occ', 'relationship', 'race', 'sex', 'country', 'education', 'class']
categorical_values_train = np.array(df_train[categorical_columns])
categorical_values_test = np.array(df_test[categorical_columns])

# Fit one LabelEncoder per column on the training values only, then apply it
# to both frames so train and test share the same integer codes.
# The fitted encoders are kept by column name so the codes can be mapped back
# to the original labels later (e.g. label_encoders['class'].classes_).
label_encoders = {}
encoded_train = []
encoded_test = []
for i, column in enumerate(categorical_columns):
    le = LabelEncoder()
    le.fit(categorical_values_train[:, i])
    label_encoders[column] = le
    encoded_train.append(le.transform(categorical_values_train[:, i]))
    # transform raises ValueError if the test set has a label unseen in training.
    encoded_test.append(le.transform(categorical_values_test[:, i]))
# After the loop `le` is the encoder of the last column ('class').

# Stack the encoded columns once instead of re-stacking on every iteration.
data_train = np.column_stack(encoded_train)
data_test = np.column_stack(encoded_test)

df_train_new = pd.DataFrame(data_train.astype(float), columns=categorical_columns)
df_test_new = pd.DataFrame(data_test.astype(float), columns=categorical_columns)

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print(df_train_new.head())
print(df_test_new.head())

   wclass  mstatus  occ  relationship  race  sex  country  education  class
0     6.0      4.0  0.0           1.0   4.0  1.0     38.0        9.0    0.0
1     5.0      2.0  3.0           0.0   4.0  1.0     38.0        9.0    0.0
2     3.0      0.0  5.0           1.0   4.0  1.0     38.0       11.0    0.0
3     3.0      2.0  5.0           0.0   2.0  1.0     38.0        1.0    0.0
4     3.0      2.0  9.0           5.0   2.0  0.0      4.0        9.0    0.0
   wclass  mstatus   occ  relationship  race  sex  country  education  class
0     3.0      4.0   6.0           3.0   2.0  1.0     38.0        1.0    0.0
1     3.0      2.0   4.0           0.0   4.0  1.0     38.0       11.0    0.0
2     1.0      2.0  10.0           0.0   4.0  1.0     38.0        7.0    1.0
3     3.0      2.0   6.0           0.0   2.0  1.0     38.0       15.0    1.0
4     3.0      4.0   9.0           3.0   4.0  0.0     38.0       15.0    0.0


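Of these, 'education' and 'class' are features which don't require further transformation (since it makes sense to preserve the ordinality of education, and class is binary valued). Note, however, that `LabelEncoder` assigns codes in sorted label order, so the encoded 'education' values follow alphabetical order rather than the level of education; the true ordering is carried by 'education-num'. For the rest, we will use OneHotEncoder to transform them to a non-ordinal form.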

non_ordinal_columns = ['wclass', 'mstatus', 'occ', 'relationship', 'race', 'sex', 'country']
non_ordinal_values_train = np.array(df_train_new[non_ordinal_columns])
non_ordinal_values_test = np.array(df_test_new[non_ordinal_columns])
ordinal_columns = ['education', 'class']

# Fit the one-hot encoder on the training codes only and apply it to both sets.
enc = OneHotEncoder()
enc.fit(non_ordinal_values_train)

data_train = enc.transform(non_ordinal_values_train)
data_test = enc.transform(non_ordinal_values_test)

# Number of one-hot columns per feature = highest training code + 1.
# This is the value the original code read from `enc.n_values_`, an attribute
# that was deprecated in scikit-learn 0.20 and removed in 0.22; deriving it
# from the data keeps the cell working across scikit-learn versions.
# Assumes every code 0..max occurs in the training column, which holds when the
# codes come from a LabelEncoder fitted on the same training data.
category_counts = non_ordinal_values_train.max(axis=0).astype(int) + 1
cols = ['%s_%d' % (name, code)
        for name, count in zip(non_ordinal_columns, category_counts)
        for code in range(count)]
assert len(cols) == data_train.shape[1], "one-hot column names do not match encoder output"

df_train_new2 = pd.DataFrame(data_train.toarray(), columns=cols)
df_test_new2 = pd.DataFrame(data_test.toarray(), columns=cols)

# Final feature tables: one-hot columns, then the label-encoded 'education'
# and 'class', then the untouched numerical columns.
numerical_cols = ['age', 'fnlwgt', 'education-num', 'capgain', 'caploss', 'hperweek']
df_train_cleaned = pd.concat([df_train_new2, df_train_new[ordinal_columns], df_train[numerical_cols]], axis=1)
df_test_cleaned = pd.concat([df_test_new2, df_test_new[ordinal_columns], df_test[numerical_cols]], axis=1)

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print(df_train_cleaned.shape)
print(df_test_cleaned.shape)
print(df_train_cleaned.columns)

In [None]:
Now, we normalize our dataset.

In [64]:
# Separate the features from the 'class' target. `axis` is passed by keyword:
# the positional form drop('class', 1) is deprecated and rejected by pandas 2.x.
x_train = df_train_cleaned.drop('class', axis=1)
y_train = df_train_cleaned['class']
x_test = df_test_cleaned.drop('class', axis=1)
y_test = df_test_cleaned['class']

# Fit the scaler on the training features only, then apply the same
# transformation to both sets so no test-set statistics are used.
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_norm = scaler.transform(x_train)
x_test_norm = scaler.transform(x_test)

(32561, 90)
