# Adult Data Set
## A Simple Multivariate Classification Task
### Based on census data, does income exceed $50K/year? Yes/No
#### Dataset available at: http://archive.ics.uci.edu/ml/datasets/Adult

In [3]:
# Python 2/3 compatibility import, currently disabled (uncomment to run under Python 2)
#from __future__ import division, print_function, unicode_literals

# Do all the imports at once
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Column labels applied to the raw data file when it is read;
# the last entry, 'income', is the classification target.
adult_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

In [5]:
# Load the raw records, labelling the columns with `adult_names`.
# index_col=False stops pandas from using the first column as the index.
# skipinitialspace=True strips any blank that follows each comma so the string
# values do not carry a leading space (the UCI file is believed to use ', '
# as its separator -- the option is harmless if it does not).
adult_df = pd.read_csv('../ics-uci-dataset/adult/adult.data',
                       names=adult_names,
                       index_col=False,
                       skipinitialspace=True)

In [6]:
# Preview the first five records to check that the columns lined up
adult_df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
# Summarise each column's dtype and non-null count before any encoding
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [8]:
# String-valued columns (including the 'income' target) are cast to pandas'
# categorical dtype so that integer codes can be taken from them afterwards.
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation',
                       'relationship', 'race', 'sex', 'native-country', 'income']
for column in categorical_columns:
    adult_df[column] = adult_df[column].astype('category')

In [9]:
# Replace every categorical column with its integer category codes so the
# whole frame is numeric, then confirm the resulting dtypes.
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation',
                       'relationship', 'race', 'sex', 'native-country', 'income']
for column in categorical_columns:
    adult_df[column] = adult_df[column].cat.codes
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null int8
fnlwgt            32561 non-null int64
education         32561 non-null int8
education-num     32561 non-null int64
marital-status    32561 non-null int8
occupation        32561 non-null int8
relationship      32561 non-null int8
race              32561 non-null int8
sex               32561 non-null int8
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null int8
income            32561 non-null int8
dtypes: int64(6), int8(9)
memory usage: 1.8 MB


In [10]:
# Separate the target column from the predictor columns
y_df = adult_df['income']
x_df = adult_df.drop('income', axis=1)

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x_df, y_df, test_size=0.33, random_state=42
)

In [24]:
# Number of rows in the training split
X_train.shape[0]

21815

In [25]:
# Number of rows in the held-out test split
X_test.shape[0]

10746

In [11]:
# Depth-limited Gini tree with a fixed seed.
# Named `shallow_tree` rather than `tree` because a later cell runs
# `from sklearn import tree`, which rebinds `tree` to the scikit-learn module
# and would silently discard an estimator stored under that name.
# NOTE(review): no later cell appears to reference this estimator -- confirm
# whether it or the default tree built further down is the intended model.
shallow_tree = DecisionTreeClassifier(criterion='gini',
                                      max_depth=4,
                                      random_state=1)

In [12]:
# scikit-learn submodules for label encoding and tree models.
# Note that this import binds the name `tree` to the sklearn.tree module.
from sklearn import preprocessing, tree

# Encoder used to turn the target labels into integer class ids
label_encoder = preprocessing.LabelEncoder()


In [13]:
# Training predictors as a plain NumPy array (column labels are dropped)
features = np.asarray(X_train)


In [14]:
# Encode the training labels as integer class ids for the classifier.
# (A stray bare `features` expression used to precede this line; because it
# was not the last statement of the cell it displayed nothing, so it was removed.)
encoded_income = label_encoder.fit_transform(y_train)


In [15]:
# Decision tree with default hyper-parameters. A fixed random_state makes the
# fitted tree -- and therefore the accuracy reported below -- reproducible
# across runs; without it the result can vary from one run to the next.
tree_model = tree.DecisionTreeClassifier(random_state=1)

In [16]:
# Train the model
# Train the model on the training matrix and the encoded income labels
# (fit() returns the estimator itself, so the name keeps pointing at the same object)
tree_model = tree_model.fit(features, encoded_income)

In [17]:
# Write the fitted tree to 'tree.dot' in Graphviz format.
# The model was trained on a bare NumPy array, so the column names are passed
# explicitly; otherwise every split in the graph is labelled X[i].
tree.export_graphviz(tree_model,
                     out_file='tree.dot',
                     feature_names=list(X_train.columns))

In [20]:
# Predict income classes for the held-out rows.
# The model was fitted on a NumPy array (X_train.values), so the test data is
# passed in the same form; handing over the DataFrame makes newer scikit-learn
# versions warn that the estimator was fitted without feature names.
prediction = tree_model.predict(X_test.values)

In [22]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted class matches the true class
accuracy_score(y_test, prediction)

0.80913828401265586

In [55]:
# Accuracy on the held-out rows, first as a fraction and then as a raw count
# (normalize=False returns the number of correct predictions).
n_correclty_classified_percent = accuracy_score(y_true=y_test, y_pred=prediction)
n_correclty_classified = accuracy_score(y_true=y_test, y_pred=prediction, normalize=False)

In [60]:
# Report the hold-out accuracy as a raw count and as a percentage.
# The variable names keep the spelling used where they are defined in the
# previous cell; only the printed message's spelling is corrected.
print('Out of %d, total of %d were correctly classified, which means accuracy percent of %0.1f percent'
      % (len(X_test), n_correclty_classified, n_correclty_classified_percent * 100))

Out of 10746, total of 8695 were correclty classified, which means accuracy percent of 80.9 percent
