# Adult Data Set
## A Simple Multivariate Classification Task
### MLPClassifier
#### Dataset available at: http://archive.ics.uci.edu/ml/datasets/Adult

In [2]:
# Provide support for both versions of Python
# from __future__ import division, print_function, unicode_literals

# Do all the imports at once
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sys import executable

In [3]:
# Show the path of the Python interpreter that is running this kernel
# (`executable` is imported from `sys` in the imports cell).
print(executable)

/home/javidi/anaconda3/bin/python


In [4]:
# Column labels for the Adult data file, listed in the order they are applied
# to the parsed columns.
adult_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

In [5]:
# Load the raw Adult data. Column labels are supplied through `names=`, and
# index_col=False stops pandas from using the first column as the index.
# skipinitialspace=True drops blanks that directly follow a delimiter, so
# string fields are not stored with a leading space (e.g. ' Private').
# NOTE(review): assumes the data file separates fields with ', ' -- confirm
# against the file; the option is a no-op if there is no padding.
adult_df = pd.read_csv(
    '../ics-uci-dataset/adult/adult.data',
    names=adult_names,
    index_col=False,
    skipinitialspace=True,
)
# Summarize column dtypes and non-null counts of the loaded frame.
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# Preview the first rows of the loaded frame to sanity-check the parsing.
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [38]:
# List the column labels of the loaded frame as a plain Python list.
adult_df.columns.tolist()

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

In [39]:
# Columns that are cast to pandas' categorical dtype; this includes the
# 'income' column.
colnames = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'income',
]
# Cast all listed columns in a single astype call; other columns keep their dtype.
adult_df = adult_df.astype({label: 'category' for label in colnames})
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null category
fnlwgt            32561 non-null int64
education         32561 non-null category
education-num     32561 non-null int64
marital-status    32561 non-null category
occupation        32561 non-null category
relationship      32561 non-null category
race              32561 non-null category
sex               32561 non-null category
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null category
income            32561 non-null category
dtypes: category(9), int64(6)
memory usage: 1.8 MB


In [40]:
# Replace every categorical column with its integer category codes.
# NOTE(review): integer codes impose an arbitrary ordering on nominal values;
# consider whether one-hot encoding would suit the model better.
encoded_columns = {label: adult_df[label].cat.codes for label in colnames}
adult_df = adult_df.assign(**encoded_columns)
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null int8
fnlwgt            32561 non-null int64
education         32561 non-null int8
education-num     32561 non-null int64
marital-status    32561 non-null int8
occupation        32561 non-null int8
relationship      32561 non-null int8
race              32561 non-null int8
sex               32561 non-null int8
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null int8
income            32561 non-null int8
dtypes: int64(6), int8(9)
memory usage: 1.8 MB


In [51]:
# Columns used as model inputs.
feature_cols = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
]
x_df = adult_df[feature_cols]
# 'income' is the classification target.
y_df = adult_df['income']

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_df,
    y_df,
    test_size=0.33,
    random_state=42,
)

In [52]:
# Inspect the training split: row count, columns and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21815 entries, 27585 to 23654
Data columns (total 14 columns):
age               21815 non-null int64
workclass         21815 non-null int8
fnlwgt            21815 non-null int64
education         21815 non-null int8
education-num     21815 non-null int64
marital-status    21815 non-null int8
occupation        21815 non-null int8
relationship      21815 non-null int8
race              21815 non-null int8
sex               21815 non-null int8
capital-gain      21815 non-null int64
capital-loss      21815 non-null int64
hours-per-week    21815 non-null int64
native-country    21815 non-null int8
dtypes: int64(6), int8(8)
memory usage: 1.3 MB


In [53]:
# Number of rows held out for testing.
X_test.shape[0]

10746

In [54]:
from sklearn import preprocessing

In [55]:
# Standardize every feature column to zero mean and unit variance.
# The scaler is fitted on the training split only and that same fitted
# transformation is then applied to the test split. Scaling the test split
# with its own mean/std would put the two splits on different scales and
# would use test-set statistics during preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [66]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with three hidden layers of 20 units each, trained
# with the L-BFGS solver; random_state fixes the weight initialization.
mlp = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(20, 20, 20),
    random_state=1,
    max_iter=2000,
)

In [None]:
# Train the network on the training split.
mlp.fit(X_train, y_train)

In [None]:
# Predict a class label for every row of the test split.
prediction = mlp.predict(X_test)
# Print the predicted labels.
print(prediction)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label matches the true label.
n_correclty_classified_percent = accuracy_score(y_true=y_test, y_pred=prediction)
# Same comparison, reported as a count of matching rows instead of a fraction.
n_correclty_classified = accuracy_score(y_true=y_test, y_pred=prediction, normalize=False)

In [None]:
# Report the test-set size, the number of correct predictions and the accuracy
# as a percentage. %-formatting is kept so that %d accepts the count whether it
# arrives as an integer or as a float.
print('Out of %d test samples, %d were correctly classified, i.e. an accuracy of %0.1f percent'
      % (len(X_test), n_correclty_classified, n_correclty_classified_percent * 100))