# Use Decision tree and Random Forest model to predict adult dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
cols=['age','workclass','fnlwgt','education','education-num','marital-status','occupation',
     'relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']
data=pd.read_csv('desktop/data mining project/adult.data',names=cols)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# Data Preprocess

In [3]:
def preprocess(data):
    data=data.drop('native-country',axis=1)
    data = data[(data.astype(str) != ' ?').all(axis=1)]
    data['income_binary'] = data.apply(lambda row: 1 if '>50K'in row['income'] else 0, axis=1)
    data=data.drop('income',axis=1)
    data=pd.get_dummies(data,columns=['workclass','education','marital-status','occupation','relationship',
                                 'race','sex'])
    return data

In [4]:
data=preprocess(data)

In [5]:
data.shape

(30718, 64)

# Built Decision tree and Random Forest to train data

In [6]:
# Random Forest model
ensemble=RandomForestClassifier(max_depth=7)
X2=data.drop('income_binary',axis=1).values
y2=data['income_binary'].values
ensemble.fit(X2,y2)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
# Decision tree model
tree=DecisionTreeClassifier(max_depth=8)
X=data.drop('income_binary',axis=1).values
y=data['income_binary'].values
tree.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# Load Test data and preprocess it

extract the test real label

In [8]:
test=pd.read_csv('desktop/data mining project/adult.test',names=cols)
# drop first rows 
test=test.drop([0],axis=0)

In [9]:
test=preprocess(test)

In [10]:
test.shape

(15315, 64)

In [11]:
test_outcome=test['income_binary']

In [12]:
test=test.drop('income_binary',axis=1)

# Prediction

In [29]:
# Decision tree prediction
prediction=tree.predict(test.values)
predict=pd.DataFrame(prediction,columns=['predict'])

In [30]:
def Eval(predict):
    test2=pd.read_csv('desktop/data mining project/adult.test',names=cols)
    # drop first rows 
    test2=test2.drop([0],axis=0)
    test2=test2.drop('native-country',axis=1)
    test2=test2[(test2.astype(str) != ' ?').all(axis=1)]
    test2=test2.reset_index(drop=True)
    predict=predict.reset_index(drop=True)
    Eval_origion=pd.concat([test2,predict],axis=1)
    return Eval_origion

In [31]:
Eval_tree=Eval(predict)

In [32]:
# Random Forest Prediction
prediction2=ensemble.predict(test.values)
predict2=pd.DataFrame(prediction2,columns=['predict'])

In [33]:
Eval_esemble=Eval(predict2)

Eval_tree and Eval_esemble are outcomes 

Both shapes are (15315,15)

# Evaluate the accuracy

In [41]:
test_outcome=test_outcome.reset_index(drop=True)

In [42]:
# Decision tree model
Eval_tree_acc=pd.concat([Eval_tree,test_outcome],axis=1)
Eval_tree_acc['accuracy']=Eval_tree_acc['predict']-Eval_tree_acc['income_binary']
Eval_tree_acc['accuracy'].value_counts(normalize = True, dropna = False)

 0    0.850669
-1    0.115051
 1    0.034280
Name: accuracy, dtype: float64

In [43]:
# Ensemble tree model
Eval_esemble_acc=pd.concat([Eval_esemble,test_outcome],axis=1)
Eval_esemble_acc['accuracy']=Eval_esemble_acc['predict']-Eval_esemble_acc['income_binary']
Eval_esemble_acc['accuracy'].value_counts(normalize = True, dropna = False)

 0    0.845968
-1    0.115181
 1    0.038851
Name: accuracy, dtype: float64

The accuracy of Decision tree is 0.8506

The accuracy of Random forest is 0.8459