# Assignment_Week6

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd

In [2]:
df = pd.read_csv('adult.data', index_col=False)

In [3]:
golden = pd.read_csv('adult.test', index_col=False)

In [4]:
from sklearn import preprocessing

In [5]:
# Columns we want to transform
transform_columns = ['sex']

#Columns we can't use because non-numerical
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)

# For the following use the above `adult` dataset. 

# 1. Show the RandomForest outperforms the DecisionTree for a fixed `max_depth` by training using the train set and calculate `precision`, `recall`, `f1`, `confusion matrix` on golden-test set. Start with only numerical features/columns. (age, education-num, capital-gain, capital-loss, hours-per-week) 

## 1-A. preprocessing data

In [7]:
enc = preprocessing.OrdinalEncoder()

In [8]:
#Using x as datatable 
x = df.copy()

#Drop non-numerical variables
x = x.drop(non_num_columns, axis=1)

#Transfrom salary column into ordinal variable
enc.fit(df[["salary"]])
x["salary"] = enc.transform(df[["salary"]])

x.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary
0,39,77516,13,2174,0,40,0.0
1,50,83311,13,0,0,13,0.0
2,38,215646,9,0,0,40,0.0
3,53,234721,7,0,0,40,0.0
4,28,338409,13,0,0,40,0.0


In [9]:
#Do the same process to test set

xt = golden.copy()
xt = xt.drop(non_num_columns, axis=1)

enc.fit(golden[["salary"]])
xt["salary"] = enc.transform(golden[["salary"]])

xt.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary
0,25,226802,7,0,0,40,0.0
1,38,89814,9,0,0,50,0.0
2,28,336951,12,0,0,40,1.0
3,44,160323,10,7688,0,40,1.0
4,18,103497,10,0,0,30,0.0


## 1-B. Random Forest

In [10]:
#Define 'fmodel' for RandomForestClassifier model
fmodel = RandomForestClassifier(criterion='entropy')

#Fit the model
fmodel.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

#Make prediction values
predictionsf = fmodel.predict(xt.drop(['fnlwgt','salary'], axis=1))

print(list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, fmodel.feature_importances_)))

[('age', 0.3474655310361071), ('education-num', 0.1751361868719826), ('capital-gain', 0.2078324438313034), ('capital-loss', 0.08199394121222843), ('hours-per-week', 0.1875718970483784)]


In [11]:
#Print Classification Report and Confusion matrix
print(classification_report(xt.salary, predictionsf))
print(confusion_matrix(xt.salary, predictionsf))

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     12435
         1.0       0.68      0.47      0.56      3846

    accuracy                           0.82     16281
   macro avg       0.77      0.70      0.72     16281
weighted avg       0.81      0.82      0.81     16281

[[11592   843]
 [ 2032  1814]]


## 1-C. Decision Tree

### 1-C-1. Max_depth = None

In [12]:
#Define 'tmodel0' for DecisionTreeClassifier model of no max_depth value
tmodel0 = DecisionTreeClassifier(criterion='entropy', max_depth=None)

#Fit the model
tmodel0.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

#Make prediction values
predictionst0 = tmodel0.predict(xt.drop(['fnlwgt','salary'], axis=1))

print(list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, tmodel0.feature_importances_)))

[('age', 0.3280408409581943), ('education-num', 0.16582943303430064), ('capital-gain', 0.2482042931250784), ('capital-loss', 0.09803481103178795), ('hours-per-week', 0.15989062185063868)]


In [13]:
# Results
print(classification_report(xt.salary, predictionst0))
print(confusion_matrix(xt.salary, predictionst0))

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     12435
         1.0       0.66      0.46      0.54      3846

    accuracy                           0.82     16281
   macro avg       0.75      0.69      0.72     16281
weighted avg       0.80      0.82      0.81     16281

[[11519   916]
 [ 2063  1783]]


### Result: RandomForest(cell 11) outperforms DecisionTree(cell 13). precision, recall, f1-score are same or higher)

### 1-C-2. Max_depth=5

In [14]:
#Define 'tmodel5' for DecisionTreeClassifier model of max_depth=5
tmodel5 = DecisionTreeClassifier(criterion='entropy', max_depth=5)

#Fit the model
tmodel5.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

#Make prediction values
predictionst5 = tmodel5.predict(xt.drop(['fnlwgt','salary'], axis=1))

print(list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, tmodel5.feature_importances_)))

[('age', 0.28770534587003693), ('education-num', 0.2189135293354957), ('capital-gain', 0.36402587048698065), ('capital-loss', 0.10304106489113109), ('hours-per-week', 0.02631418941635568)]


In [15]:
# Results
print(classification_report(xt.salary, predictionst5))
print(confusion_matrix(xt.salary, predictionst5))

              precision    recall  f1-score   support

         0.0       0.84      0.95      0.89     12435
         1.0       0.74      0.42      0.53      3846

    accuracy                           0.83     16281
   macro avg       0.79      0.69      0.71     16281
weighted avg       0.82      0.83      0.81     16281

[[11862   573]
 [ 2245  1601]]


### Result: f1 score is same or higher in RandomForest(Cell 11), but precision score of 1.0 and recall score of 0.0 are higher in DecisionTree(Cell 15)

### 1-C-3. Max_depth=10

In [16]:
#Define 'tmodel10' for DecisionTreeClassifier model of max_depth=10
tmodel10 = DecisionTreeClassifier(criterion='entropy', max_depth=10)

#Fit the model
tmodel10.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)

#Make prediction values
predictionst10 = tmodel10.predict(xt.drop(['fnlwgt','salary'], axis=1))

print(list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, tmodel10.feature_importances_)))

[('age', 0.27954188436800814), ('education-num', 0.19633853301365783), ('capital-gain', 0.31790245343892365), ('capital-loss', 0.12593663111804534), ('hours-per-week', 0.08028049806136497)]


In [17]:
# Results
print(classification_report(xt.salary, predictionst10))
print(confusion_matrix(xt.salary, predictionst10))

              precision    recall  f1-score   support

         0.0       0.85      0.94      0.90     12435
         1.0       0.72      0.48      0.58      3846

    accuracy                           0.83     16281
   macro avg       0.79      0.71      0.74     16281
weighted avg       0.82      0.83      0.82     16281

[[11704   731]
 [ 1995  1851]]


### Result: DecisionTree(Cell 17) outperforms RandomForest(Cell 11)

# 2. Use a RandomForest or DecisionTree and the `adult` dataset, systematically add new columns, one by one, that are non-numerical but converted using the feature-extraction techniques we learned. Using the golden-test set show [`precision`, `recall`, `f1`, `confusion matrix`] for each additional feature added.

## I will use DecisionTree model and Use OrdinalEncoder to convert non-numerical values into numerical values

## 2-A. Add non-numerical column "workclass" to numerical columns

In [18]:
#Using OrdincalEncoder to convert non-numerical values into numerical values.
enc = preprocessing.OrdinalEncoder()
enc.fit(df[["workclass"]])

#make new train set table(x2)
x2 = x.copy()
x2["workclass"] = enc.fit_transform(df[["workclass"]])
x2.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,workclass
0,39,77516,13,2174,0,40,0.0,7.0
1,50,83311,13,0,0,13,0.0,6.0
2,38,215646,9,0,0,40,0.0,4.0
3,53,234721,7,0,0,40,0.0,4.0
4,28,338409,13,0,0,40,0.0,4.0


In [19]:
#Same Process for test set
enc.fit(golden[["workclass"]])

#make new test set table(xt2)
xt2 = xt.copy()
xt2["workclass"] = enc.fit_transform(golden[["workclass"]])
xt2.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,workclass
0,25,226802,7,0,0,40,0.0,4.0
1,38,89814,9,0,0,50,0.0,4.0
2,28,336951,12,0,0,40,1.0,2.0
3,44,160323,10,7688,0,40,1.0,4.0
4,18,103497,10,0,0,30,0.0,0.0


In [20]:
#Define 'tmodel2' for DecisionTreeClassifier model
tmodel2 = DecisionTreeClassifier(criterion='entropy', max_depth=None)

#Fit the model
tmodel2.fit(x2.drop(['fnlwgt','salary'], axis=1), x2.salary)

#Make prediction values
predictionst2 = tmodel2.predict(xt2.drop(['fnlwgt','salary'], axis=1))

print(list(zip(x2.drop(['fnlwgt','salary'], axis=1).columns, tmodel2.feature_importances_)))

[('age', 0.3291803636831587), ('education-num', 0.1544565119154207), ('capital-gain', 0.21296920223089014), ('capital-loss', 0.08238463949882367), ('hours-per-week', 0.14653256297798908), ('workclass', 0.0744767196937175)]


In [21]:
# Classification Report and Confusion Matrix
print(classification_report(xt2.salary, predictionst2))
print(confusion_matrix(xt2.salary, predictionst2))

              precision    recall  f1-score   support

         0.0       0.85      0.91      0.88     12435
         1.0       0.63      0.48      0.55      3846

    accuracy                           0.81     16281
   macro avg       0.74      0.70      0.71     16281
weighted avg       0.80      0.81      0.80     16281

[[11360  1075]
 [ 2002  1844]]


## Result : f1-score (Cell 21) became worse compared to the original prediction using Decision Tree (Cell 13) (0.89 -> 0.88 for predicting 0.0, 0.55 -> 0.54 for predicting 1.0)

## 2-B. Add non-numerial column "education" to numerical columns

In [22]:
#Using OrdincalEncoder to convert non-numerical values into numerical values.
enc.fit(df[["education"]])

#make new train set table(x3)
x3 = x.copy()
x3["education"] = enc.fit_transform(df[["education"]])
x3.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,education
0,39,77516,13,2174,0,40,0.0,9.0
1,50,83311,13,0,0,13,0.0,9.0
2,38,215646,9,0,0,40,0.0,11.0
3,53,234721,7,0,0,40,0.0,1.0
4,28,338409,13,0,0,40,0.0,9.0


In [23]:
#Same Process for test set
enc.fit(golden[["education"]])

#make new test set table(xt3)
xt3 = xt.copy()
xt3["education"] = enc.fit_transform(golden[["education"]])
xt3.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,education
0,25,226802,7,0,0,40,0.0,1.0
1,38,89814,9,0,0,50,0.0,11.0
2,28,336951,12,0,0,40,1.0,7.0
3,44,160323,10,7688,0,40,1.0,15.0
4,18,103497,10,0,0,30,0.0,15.0


In [24]:
#Define 'tmodel3' for DecisionTreeClassifier model
tmodel3 = DecisionTreeClassifier(criterion='entropy', max_depth=None)

#Fit the model
tmodel3.fit(x3.drop(['fnlwgt','salary'], axis=1), x3.salary)

#Make prediction values
predictionst3 = tmodel3.predict(xt3.drop(['fnlwgt','salary'], axis=1))

print(list(zip(x3.drop(['fnlwgt','salary'], axis=1).columns, tmodel3.feature_importances_)))

[('age', 0.32620517171267527), ('education-num', 0.14727495341034383), ('capital-gain', 0.24715878399582936), ('capital-loss', 0.09728804423160807), ('hours-per-week', 0.16316591913325745), ('education', 0.018907127516285822)]


In [25]:
# Classification Report and Confusion Matrix
print(classification_report(xt3.salary, predictionst3))
print(confusion_matrix(xt3.salary, predictionst3))

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     12435
         1.0       0.66      0.46      0.54      3846

    accuracy                           0.82     16281
   macro avg       0.75      0.69      0.71     16281
weighted avg       0.80      0.82      0.80     16281

[[11509   926]
 [ 2063  1783]]


## Result :  f1-score (Cell 25) became worse compared to the original prediction(Cell 13) (0.89 -> 0.89 for predicting 0.0, 0.55 -> 0.54 for predicting 1.0)

## 2-C. To find out input variables which could improve the performance, add all non-numerical columns to numerical columns at once

In [26]:
#make new train set table(x4)

x4 = x.copy()

#Using OrdincalEncoder to convert all non-numerical values into numerical values.

enc.fit(df[["workclass"]])
x4["workclass"] = enc.fit_transform(df[["workclass"]])

enc.fit(df[["education"]])
x4["education"] = enc.fit_transform(df[["education"]])

enc.fit(df[["marital-status"]])
x4["marital-status"] = enc.fit_transform(df[["marital-status"]])

enc.fit(df[["occupation"]])
x4["occupation"] = enc.fit_transform(df[["occupation"]])

enc.fit(df[["relationship"]])
x4["relationship"] = enc.fit_transform(df[["relationship"]])

enc.fit(df[["race"]])
x4["race"] = enc.fit_transform(df[["race"]])

enc.fit(df[["sex"]])
x4["sex"] = enc.fit_transform(df[["sex"]])

enc.fit(df[["native-country"]])
x4["native-country"] = enc.fit_transform(df[["native-country"]])

x4.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,39,77516,13,2174,0,40,0.0,7.0,9.0,4.0,1.0,1.0,4.0,1.0,39.0
1,50,83311,13,0,0,13,0.0,6.0,9.0,2.0,4.0,0.0,4.0,1.0,39.0
2,38,215646,9,0,0,40,0.0,4.0,11.0,0.0,6.0,1.0,4.0,1.0,39.0
3,53,234721,7,0,0,40,0.0,4.0,1.0,2.0,6.0,0.0,2.0,1.0,39.0
4,28,338409,13,0,0,40,0.0,4.0,9.0,2.0,10.0,5.0,2.0,0.0,5.0


In [27]:
#make new test set table(xt4)

xt4 = xt.copy()

#Same Process for test set

enc.fit(golden[["workclass"]])
xt4["workclass"] = enc.fit_transform(golden[["workclass"]])

enc.fit(golden[["education"]])
xt4["education"] = enc.fit_transform(golden[["education"]])

enc.fit(golden[["marital-status"]])
xt4["marital-status"] = enc.fit_transform(golden[["marital-status"]])

enc.fit(golden[["occupation"]])
xt4["occupation"] = enc.fit_transform(golden[["occupation"]])

enc.fit(golden[["relationship"]])
xt4["relationship"] = enc.fit_transform(golden[["relationship"]])

enc.fit(golden[["race"]])
xt4["race"] = enc.fit_transform(golden[["race"]])

enc.fit(golden[["sex"]])
xt4["sex"] = enc.fit_transform(golden[["sex"]])

enc.fit(golden[["native-country"]])
xt4["native-country"] = enc.fit_transform(golden[["native-country"]])

xt4.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,25,226802,7,0,0,40,0.0,4.0,1.0,4.0,7.0,3.0,2.0,1.0,38.0
1,38,89814,9,0,0,50,0.0,4.0,11.0,2.0,5.0,0.0,4.0,1.0,38.0
2,28,336951,12,0,0,40,1.0,2.0,7.0,2.0,11.0,0.0,4.0,1.0,38.0
3,44,160323,10,7688,0,40,1.0,4.0,15.0,2.0,7.0,0.0,2.0,1.0,38.0
4,18,103497,10,0,0,30,0.0,0.0,15.0,4.0,0.0,3.0,4.0,0.0,38.0


In [28]:
#Define 'tmodel4' for DecisionTreeClassifier model
tmodel4 = DecisionTreeClassifier(criterion='entropy', max_depth=None)

#Fit the model
tmodel4.fit(x4.drop(['fnlwgt','salary'], axis=1), x4.salary)

#Make prediction values
predictionst4 = tmodel4.predict(xt4.drop(['fnlwgt','salary'], axis=1))

print(list(zip(x4.drop(['fnlwgt','salary'], axis=1).columns, tmodel4.feature_importances_)))

[('age', 0.19212454432609516), ('education-num', 0.11450307248287368), ('capital-gain', 0.11874474753609517), ('capital-loss', 0.040279203774549625), ('hours-per-week', 0.10969545154679286), ('workclass', 0.04853534230881835), ('education', 0.017847339999083316), ('marital-status', 0.015765647814632373), ('occupation', 0.08119163086971316), ('relationship', 0.21343544275073023), ('race', 0.02030245057426389), ('sex', 0.004622654457148367), ('native-country', 0.022952471559203838)]


## Result : found out that non-numerical value 'relationship' has the highest importance(0.214) in predicting 'salary'

## 2-D. To find out how the value 'relationship' could improve the model, add column "relationship" to numerical columns

In [29]:
#Using OrdincalEncoder to convert non-numerical values into numerical values.
enc = preprocessing.OrdinalEncoder()
enc.fit(df[["relationship"]])

#make new train set table(x5)
x5 = x.copy()
x5["relationship"] = enc.fit_transform(df[["relationship"]])
x5.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,relationship
0,39,77516,13,2174,0,40,0.0,1.0
1,50,83311,13,0,0,13,0.0,0.0
2,38,215646,9,0,0,40,0.0,1.0
3,53,234721,7,0,0,40,0.0,0.0
4,28,338409,13,0,0,40,0.0,5.0


In [30]:
#Using OrdincalEncoder to convert non-numerical values into numerical values.
enc.fit(golden[["relationship"]])

#make new test set table(xt5)
xt5 = x.copy()
xt5["relationship"] = enc.fit_transform(df[["relationship"]])
xt5.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,relationship
0,39,77516,13,2174,0,40,0.0,1.0
1,50,83311,13,0,0,13,0.0,0.0
2,38,215646,9,0,0,40,0.0,1.0
3,53,234721,7,0,0,40,0.0,0.0
4,28,338409,13,0,0,40,0.0,5.0


In [31]:
#Define 'tmodel5' for DecisionTreeClassifier model
tmodel5 = DecisionTreeClassifier(criterion='entropy', max_depth=None)

#Fit the model
tmodel5.fit(x5.drop(['fnlwgt','salary'], axis=1), x5.salary)

#Make prediction values
predictionst5 = tmodel5.predict(xt5.drop(['fnlwgt','salary'], axis=1))

print(list(zip(x5.drop(['fnlwgt','salary'], axis=1).columns, tmodel5.feature_importances_)))

[('age', 0.2053599756583411), ('education-num', 0.15957969823965365), ('capital-gain', 0.1597256360131642), ('capital-loss', 0.057046689962673244), ('hours-per-week', 0.13804012715016595), ('relationship', 0.280247872976002)]


In [32]:
# Classification Report and Confusion Matrix
print(classification_report(xt5.salary, predictionst5))
print(confusion_matrix(xt5.salary, predictionst5))

              precision    recall  f1-score   support

         0.0       0.92      0.97      0.95     24720
         1.0       0.90      0.74      0.81      7841

    accuracy                           0.92     32561
   macro avg       0.91      0.86      0.88     32561
weighted avg       0.92      0.92      0.91     32561

[[24055   665]
 [ 2053  5788]]


## Result :  f1-score (Cell 32) was significantly improved compared to the original prediction(Cell 13) (0.89 -> 0.95 for predicting 0.0, 0.55 -> 0.81 for predicting 1.0)

# Overall Result of Assignment 2: Adding high-relevant input variable into model would improve performance; adding low-relevant input variable may not improve or harm performance 