In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection
from datetime import datetime
from sklearn.impute import SimpleImputer

from sklearn import svm
from sklearn.tree import DecisionTreeClassifier 
from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [2]:
col_names = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation',
    'relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']

In [3]:
income_train_df = pd.read_csv('adult.data', sep=",\s", names=col_names, engine = 'python')
income_test_df = pd.read_csv('adult.test', sep=",\s", names=col_names, engine = 'python', skiprows=1)
income_df = pd.concat([income_train_df,income_test_df])
income_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
#First lets properly encode the target variable
income_df['income'] = income_df['income'].map({'<=50K': 0, '>50K': 1, '<=50K.': 0, '>50K.': 1})

In [5]:
income_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [6]:
#Checking any null values
income_df.replace(r'^\s*$', np.nan, regex=True, inplace = True)
income_df.replace('?', np.nan, inplace = True)
print(income_df.isnull().sum())

age                  0
workclass         2799
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     857
income               0
dtype: int64


In [7]:
#We have three columns with high number of missing values - (workclass, occupation, native-country)
#We will simply impute them using a simple imputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
missing_val_cols = ['workclass', 'occupation', 'native-country']
imputer.fit(income_df[["workclass"]])
income_df["workclass"]=imputer.transform(income_df[["workclass"]]).ravel()
imputer.fit(income_df[["occupation"]])
income_df["occupation"]=imputer.transform(income_df[["occupation"]]).ravel()
imputer.fit(income_df[["native-country"]])
income_df["native-country"]=imputer.transform(income_df[["native-country"]]).ravel()

In [8]:
#Checking type of available features
print(income_df.info())
#As we can see there is a lot of categorical data preset in the data set which we need to encode to numeric values.
#So we define a custom multi col encoder class to do that.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48842 entries, 0 to 16280
Data columns (total 15 columns):
age               48842 non-null int64
workclass         48842 non-null object
fnlwgt            48842 non-null int64
education         48842 non-null object
education-num     48842 non-null int64
marital-status    48842 non-null object
occupation        48842 non-null object
relationship      48842 non-null object
race              48842 non-null object
sex               48842 non-null object
capital-gain      48842 non-null int64
capital-loss      48842 non-null int64
hours-per-week    48842 non-null int64
native-country    48842 non-null object
income            48842 non-null int64
dtypes: int64(7), object(8)
memory usage: 6.0+ MB
None


In [9]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self 
    
    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = preprocessing.LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = preprocessing.LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [10]:
income_df = MultiColumnLabelEncoder(columns = list(set(income_df.columns) - set(income_df.describe().columns))).fit_transform(income_df)

In [11]:
#Preparing training and testing datasets
income_data = income_df.values
income_data = income_data.astype(np.float)
X_train, X_test, y_train, y_test = train_test_split(income_data[:,:14],income_data[:,14], test_size=0.33, random_state=0)

In [27]:
#k-Nearest neighbours classification
print("now ="+str(datetime.now()))
knn_model = KNeighborsClassifier(n_jobs=-1)
param_grid = {'n_neighbors':(np.arange(2,52,5))}
mdls = model_selection.GridSearchCV(knn_model, param_grid, verbose=1, cv=3, n_jobs=-1).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-06 15:45:31.907708
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   38.3s finished


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=22, p=2,
                     weights='uniform')
0.8040079414319394
now =2019-11-06 15:46:15.976270


In [28]:
#Logistic regression (for classification)
print("now ="+str(datetime.now()))
logistic_model = linear_model.LogisticRegression(n_jobs=-1,random_state=0)
param_grid = { "fit_intercept":[True], "solver":['newton-cg', 'lbfgs', 'saga'], 
             "max_iter":np.arange(100,400, 100)}
mdls = model_selection.GridSearchCV(logistic_model, param_grid, verbose=1,cv=3,n_jobs=-1).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-06 15:46:15.989265
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.4min finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='warn', n_jobs=-1, penalty='l2', random_state=0,
                   solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)
0.8253505397692021
now =2019-11-06 15:48:48.624306


In [29]:
#Decision tree classification
print("now ="+str(datetime.now()))
DTC_model = DecisionTreeClassifier(random_state=0)
Max_features = ['auto', 'sqrt', 'log2']
Min_samples_leafs = np.linspace(0.01, 0.05, 5, endpoint=True)
param_grid = {'max_features': Max_features, 'min_samples_leaf': Min_samples_leafs}
mdls = model_selection.GridSearchCV(DTC_model, param_grid, verbose=1,cv=3,n_jobs=-1).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-06 15:48:48.639296
Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    1.8s finished


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.01, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')
0.8308102742275717
now =2019-11-06 15:48:50.878012


In [30]:
#Random forest classification
print("now ="+str(datetime.now()))
RFC_model = ensemble.RandomForestClassifier(random_state=0)
Estimators = np.arange(100,105,5)
Min_samples_leafs = np.linspace(0.01, 0.05, 5, endpoint=True)
Max_features = ['auto', 'sqrt', 'log2']
param_grid = {'n_estimators': Estimators,'max_features': Max_features, 'min_samples_leaf': Min_samples_leafs}
mdls = model_selection.GridSearchCV(RFC_model, param_grid, verbose=1,cv=3,n_jobs=-1).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-06 15:48:50.970959
Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.01, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
0.8407370641518799
now =2019-11-06 15:50:17.108051


In [31]:
#AdaBoost classification
print("now ="+str(datetime.now()))
ABC_model = ensemble.AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=0),random_state=0)
Estimators = np.arange(50,110,10)
Learning_rates = [0.05,0.1,0.3,1]
param_grid = {'n_estimators': Estimators, 'learning_rate': Learning_rates}
mdls = model_selection.GridSearchCV(ABC_model, param_grid, verbose=1,cv=3,n_jobs=-1).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-06 15:50:17.187005
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   17.4s finished


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=0,
                             

In [32]:
#Gaussian naive Bayes classification
print("now ="+str(datetime.now()))
zero_prob = y_train[y_train == 0].shape[0]/y_train.shape[0]
one_prob = 1 - zero_prob
prob = np.array([zero_prob,one_prob])
GNB_model = GaussianNB(priors = prob)
GNB_model.fit(X_train, y_train)
# mdls = model_selection.GridSearchCV(GNB_model, param_grid, verbose=1,cv=5,, n_jobs=-1).fit(X_train, y_train)
# print(mdls.best_estimator_)
y_pred = GNB_model.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-06 15:50:35.938258
0.7998510981511354
now =2019-11-06 15:50:36.132148


In [34]:
#Neural network classification
print("now ="+str(datetime.now()))
NNC_model = MLPClassifier()
Hidden_Layer_Sizes = [1, 5, 10, (5,5), (10,5)]
Learning_rates = ['constant','adaptive']
Learning_rates_init = [0.001, 0.01, 0.1]
Activations = ['logistic', 'tanh', 'relu']
Alphas = [0.0001,0.002]
param_grid = {'learning_rate': Learning_rates, 'learning_rate_init': Learning_rates_init, 'hidden_layer_sizes': Hidden_Layer_Sizes, 'activation': Activations, 'alpha': Alphas}
mdls = model_selection.GridSearchCV(NNC_model, param_grid, verbose=1,cv=3,n_jobs=-1).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print("now ="+str(datetime.now()))

now =2019-11-06 15:51:37.139807
Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   53.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 23.3min finished


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=10, learning_rate='adaptive',
              learning_rate_init=0.01, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
0.8012160317657278
now =2019-11-06 16:24:25.708662


In [None]:
#SVM classifier
print("now ="+str(datetime.now()))
svm_model = svm.SVC()
Kernels = ['linear', 'poly', 'rbf']
Gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'kernel':Kernels, 'gamma' : Gammas}
mdls = model_selection.GridSearchCV(svm_model, param_grid, verbose=1, cv=3, n_jobs=-1).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
metrics.accuracy_score(y_test, y_pred)
print("now ="+str(datetime.now()))

now =2019-11-06 16:24:25.728656
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
