# Assignment is below at the end

- https://scikit-learn.org/stable/modules/tree.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html

In [194]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 6)
plt.rcParams['font.size'] = 14
import pandas as pd


In [195]:
df = pd.read_csv('C:\\Users\\Michael\\Downloads\\adult.data', index_col=False)


In [196]:
golden = pd.read_csv('C:\\Users\\Michael\\Downloads\\adult.test', index_col=False)


In [197]:
golden.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [198]:
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [199]:
df.columns


Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [200]:
from sklearn import preprocessing


In [201]:
# Columns we want to transform
transform_columns = ['sex']

#Columns we can't use because non-numerical
non_num_columns = ['workclass', 'education', 'marital-status', 
                     'occupation', 'relationship', 'race', 'sex', 
                     'native-country']


## First let's try using `pandas.get_dummies()` to transform columns

In [202]:
dummies = pd.get_dummies(df[transform_columns])
dummies


Unnamed: 0,sex_ Female,sex_ Male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0
...,...,...
32556,1,0
32557,0,1
32558,1,0
32559,0,1


In [203]:
dummies.shape


(32561, 2)

## sklearn has a similar process for OneHot Encoding features

In [204]:
onehot = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)
onehot.fit(df[transform_columns])


OneHotEncoder(handle_unknown='ignore', sparse=False)

In [205]:
onehot.categories_


[array([' Female', ' Male'], dtype=object)]

In [206]:
sex = onehot.transform(df[transform_columns])
sex


array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [207]:
sex.shape


(32561, 2)

## In addition to OneHot encoding there is Ordinal Encoding 

In [208]:
enc = preprocessing.OrdinalEncoder()
enc.fit(df[["salary"]])
salary = enc.transform(df[["salary"]])
salary


array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

In [209]:
enc.categories_[0]


array([' <=50K', ' >50K'], dtype=object)

In [210]:
x = df.copy()

# transformed = pd.get_dummies(df[transform_columns])


onehot = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False).fit(df[transform_columns])

enc = preprocessing.OrdinalEncoder()

enc.fit(df[["salary"]])


transformed = onehot.transform(df[transform_columns])
new_cols = list(onehot.categories_[0].flatten())
df_trans = pd.get_dummies(df[transform_columns])


x = pd.concat(
    [
        x.drop(non_num_columns, axis=1), 
        df_trans
    ], 
    axis=1,)


x["salary"] = enc.transform(df[["salary"]])


In [211]:
x.head()


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,sex_ Female,sex_ Male
0,39,77516,13,2174,0,40,0.0,0,1
1,50,83311,13,0,0,13,0.0,0,1
2,38,215646,9,0,0,40,0.0,0,1
3,53,234721,7,0,0,40,0.0,0,1
4,28,338409,13,0,0,40,0.0,1,0


In [212]:
xt = golden.copy()

transformed = onehot.transform(xt[transform_columns])
new_cols = list(onehot.categories_[0].flatten())
df_trans = pd.DataFrame(transformed, columns=new_cols)

xt = pd.concat(
    [
        xt.drop(non_num_columns, axis=1), 
        df_trans
    ], 
    axis=1,)

xt["salary"] = enc.fit_transform(golden[["salary"]])


In [213]:
xt.salary.value_counts()


0.0    12435
1.0     3846
Name: salary, dtype: int64

In [214]:
x.head()


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,sex_ Female,sex_ Male
0,39,77516,13,2174,0,40,0.0,0,1
1,50,83311,13,0,0,13,0.0,0,1
2,38,215646,9,0,0,40,0.0,0,1
3,53,234721,7,0,0,40,0.0,0,1
4,28,338409,13,0,0,40,0.0,1,0


In [215]:
enc.categories_


[array([' <=50K.', ' >50K.'], dtype=object)]

In [216]:
xt.head()


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,Female,Male
0,25,226802,7,0,0,40,0.0,0.0,1.0
1,38,89814,9,0,0,50,0.0,0.0,1.0
2,28,336951,12,0,0,40,1.0,0.0,1.0
3,44,160323,10,7688,0,40,1.0,0.0,1.0
4,18,103497,10,0,0,30,0.0,1.0,0.0


In [217]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


#### Choose the model of your preference: DecisionTree or RandomForest

In [218]:
model = RandomForestClassifier(criterion='entropy')


In [219]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=None)


In [220]:
model.fit(x.drop(['fnlwgt','salary'], axis=1), x.salary)


DecisionTreeClassifier(criterion='entropy')

In [221]:
model.tree_.node_count


8331

In [222]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))


[('age', 0.3235601358938428),
 ('education-num', 0.16088618730446752),
 ('capital-gain', 0.22743335847003868),
 ('capital-loss', 0.07927833851211612),
 ('hours-per-week', 0.15328552389207126),
 ('sex_ Female', 0.021479698431940683),
 ('sex_ Male', 0.03407675749552305)]

In [223]:
list(zip(x.drop(['fnlwgt','salary'], axis=1).columns, model.feature_importances_))


[('age', 0.3235601358938428),
 ('education-num', 0.16088618730446752),
 ('capital-gain', 0.22743335847003868),
 ('capital-loss', 0.07927833851211612),
 ('hours-per-week', 0.15328552389207126),
 ('sex_ Female', 0.021479698431940683),
 ('sex_ Male', 0.03407675749552305)]

In [224]:
x.drop(['fnlwgt','salary'], axis=1).head()


Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,sex_ Female,sex_ Male
0,39,13,2174,0,40,0,1
1,50,13,0,0,13,0,1
2,38,9,0,0,40,0,1
3,53,7,0,0,40,0,1
4,28,13,0,0,40,1,0


In [225]:
set(x.columns) - set(xt.columns)


{'sex_ Female', 'sex_ Male'}

In [226]:
list(x.drop('salary', axis=1).columns)


['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'sex_ Female',
 'sex_ Male']

In [227]:
    predictions = model.predict(xt.drop(['fnlwgt','salary'], axis=1))
    predictionsx = model.predict(x.drop(['fnlwgt','salary'], axis=1))
    

Feature names unseen at fit time:
-  Female
-  Male
Feature names seen at fit time, yet now missing:
- sex_ Female
- sex_ Male



In [228]:
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, auc, roc_curve
)


In [229]:
accuracy_score(xt.salary, predictions)


0.8202198882132548

In [230]:
accuracy_score(xt.salary, predictions)


0.8202198882132548

In [231]:
confusion_matrix(xt.salary, predictions)


array([[11447,   988],
       [ 1939,  1907]], dtype=int64)

In [232]:
confusion_matrix(xt.salary, predictions)


              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.66      0.50      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [233]:
print(classification_report(xt.salary, predictions))


              precision    recall  f1-score   support

         0.0       0.86      0.92      0.89     12435
         1.0       0.66      0.50      0.57      3846

    accuracy                           0.82     16281
   macro avg       0.76      0.71      0.73     16281
weighted avg       0.81      0.82      0.81     16281



In [234]:
accuracy_score(x.salary, predictionsx)


0.8955806025613464

In [235]:
confusion_matrix(x.salary, predictionsx)


array([[24097,   623],
       [ 2777,  5064]], dtype=int64)

In [236]:
print(classification_report(x.salary, predictionsx))


              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.89      0.65      0.75      7841

    accuracy                           0.90     32561
   macro avg       0.89      0.81      0.84     32561
weighted avg       0.90      0.90      0.89     32561



In [237]:
print(classification_report(x.salary, predictionsx))


              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93     24720
         1.0       0.89      0.65      0.75      7841

    accuracy                           0.90     32561
   macro avg       0.89      0.81      0.84     32561
weighted avg       0.90      0.90      0.89     32561



# For the following use the above `adult` dataset. 

# 1. Show the RandomForest outperforms the DecisionTree for a fixed `max_depth` by training using the train set and calculate `precision`, `recall`, `f1`, `confusion matrix` on golden-test set. Start with only numerical features/columns. (age, education-num, capital-gain, capital-loss, hours-per-week) 

In [246]:
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

#Transform salary column by tidying up the data (removing impurities related to the string)
df['salary'] = df['salary'].astype(str).str.strip().str.replace('\.', '', regex=True)
golden['salary'] = golden['salary'].astype(str).str.strip().str.replace('\.', '', regex=True)

#Apply OrdinalEncoder to the salary column
enc = preprocessing.OrdinalEncoder()
enc.fit(df[['salary']])
df['salary'] = enc.transform(df[['salary']])
golden['salary'] = enc.transform(golden[['salary']])

#Apply OneHotEncoder for categorical variables
transform_columns = ['sex']
OH = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)
OH.fit(df[transform_columns])
trans_train = OH.transform(df[transform_columns])
trans_test = OH.transform(golden[transform_columns])
new_cols = OH.get_feature_names_out(transform_columns)
properTrans_train = pd.DataFrame(trans_train, columns=new_cols)
properTrans_test = pd.DataFrame(trans_test, columns=new_cols)

#Prepare datasets by concatenate + defining categorical columns
x = pd.concat([df.drop(transform_columns, axis=1), properTrans_train], axis=1)
xt = pd.concat([golden.drop(transform_columns, axis=1), properTrans_test], axis=1)
categories = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country', 'fnlwgt']

#Define x dataframe and xt dataframe + defining numerical columns
x = x.drop(categories, axis=1)
xt = xt.drop(categories, axis=1)
actual_numbers = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

#Define train/test split
xtrain = x[actual_numbers]
ytrain = x['salary']
ptest = xt[actual_numbers]
ytest = xt['salary']

#Define DecisionTree model by setting up max_depth_setting and training the model on the proper dataframes + obtain metrics
originalDecisionTree = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth_setting)
max_depth_setting = 15
originalDecisionTree.fit(xtrain, ytrain)
originalDecisionTree_pred = originalDecisionTree.predict(ptest)
originalDecisionTreeprec = precision_score(ytest, originalDecisionTree_pred, average='macro')
originalDecisionTreerecall = recall_score(ytest, originalDecisionTree_pred, average='macro')
originalDecisionTreef1 = f1_score(ytest, originalDecisionTree_pred, average='macro')
originalDecisionTreeconfmatrix = confusion_matrix(ytest, originalDecisionTree_pred)

#Define RandomForest model by training the model on the proper dataframes + obtain metrics
originalRandomForest = RandomForestClassifier(criterion='entropy', max_depth=max_depth_setting)
max_depth_setting = 15
originalRandomForest.fit(xtrain, ytrain)
originalRandomForest_pred = originalRandomForest.predict(ptest)
originalRandomForestprec = precision_score(ytest, originalRandomForest_pred, average='macro')
originalRandomForestrecall = recall_score(ytest, originalRandomForest_pred, average='macro')
originalRandomForestf1 = f1_score(ytest, originalRandomForest_pred, average='macro')
originalRandomForestconfmatrix = confusion_matrix(ytest, originalRandomForest_pred)

#DecisionTree metrics
print("DecisionTree Evaluation:")
print(f"Precision: {originalDecisionTreeprec}")
print(f"Recall: {originalDecisionTreerecall}")
print(f"F1-Score: {originalDecisionTreef1}")
print("Confusion Matrix:")
print(originalDecisionTreeconfmatrix)

#RandomForest metrics
print("Random Forest Evaluation:")
print(f"Precision: {originalRandomForestprec}")
print(f"Recall: {originalRandomForestrecall}")
print(f"F1-Score: {originalRandomForestf1}")
print("Confusion Matrix:")
print(originalRandomForestconfmatrix)


DecisionTree Evaluation:
Precision: 0.7787284848145027
Recall: 0.697490382124332
F1-Score: 0.7225636221392884
Confusion Matrix:
[[11724   711]
 [ 2107  1739]]
Random Forest Evaluation:
Precision: 0.8109770725267429
Recall: 0.705891384026893
F1-Score: 0.7360758196650257
Confusion Matrix:
[[11920   515]
 [ 2103  1743]]


In [247]:
print("In the information presented above, The RandomForest model seems to be performing better when compared to the DecisionTree. It has higher values associated with Precision, this is evident in the confusion matrix. The difference between the two models with regards to F1 score and recall are fairly small, with the DecisionTree model sometimes having higher recall/F1 scores relative to the RandomForest model. This particular output showcases higher precision, recall, F1 values for RandomForest.")

In the information presented above, The RandomForest model seems to be performing better when compared to the DecisionTree. It has higher values associated with Precision, this is evident in the confusion matrix. The difference between the two models with regards to F1 score and recall are fairly small, with the DecisionTree model sometimes having higher recall/F1 scores relative to the RandomForest model. This particular output showcases higher precision, recall, F1 values for RandomForest.


# 2. Use a RandomForest or DecisionTree and the `adult` dataset, systematically add new columns, one by one, that are non-numerical but converted using the feature-extraction techniques we learned. Using the golden-test set show [`precision`, `recall`, `f1`, `confusion matrix`] for each additional feature added.

In [248]:
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

#Encoding/Transforming native country + sex

#Clean salary column by tidying up the properties of the string
df['salary'] = df['salary'].astype(str).str.strip().str.replace('\.', '', regex=True)
golden['salary'] = golden['salary'].astype(str).str.strip().str.replace('\.', '', regex=True)

#Apply OrdinalEncoder to the salary column
enc = preprocessing.OrdinalEncoder()
enc.fit(df[['salary']])
df['salary'] = enc.transform(df[['salary']])
golden['salary'] = enc.transform(golden[['salary']])

#Define columns to transform + conduct OneHotEncoder for columns specified for transformation
transform_columns = ['sex', 'native-country']
OH2 = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)
OH2.fit(df[transform_columns])

#Transformation of the columns + production of dataframe with transformed values
transformed_train = OH2.transform(df[transform_columns])
transformed_test = OH2.transform(golden[transform_columns])
new_cols = OH2.get_feature_names_out(transform_columns)
properTrans_train1 = pd.DataFrame(transformed_train, columns=new_cols)
properTrans_test1 = pd.DataFrame(transformed_test, columns=new_cols)

#Prepare final datasets by concatenating numerical features with encoded categorical features
x = pd.concat([df.drop(transform_columns, axis=1), properTrans_train1], axis=1)
xt = pd.concat([golden.drop(transform_columns, axis=1), properTrans_test1], axis=1)

#Specify columns to exclude (non-numerical and not transformed)
categories_1 = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'fnlwgt']

#Define dataframes properties for RandomForest model
x = x.drop(categories_1 + ['salary'], axis=1)
xt = xt.drop(categories_1 + ['salary'], axis=1)

#Define RandomForest model + prediction
V1RandomForest = RandomForestClassifier(criterion='entropy', max_depth=max_depth_setting)
max_depth_setting = 15
V1RandomForest.fit(x, df['salary'])
V1RandomForestpred = V1RandomForest.predict(xt)

#V1RandomForest model evaluation
V1RandomForestprec = precision_score(golden['salary'], V1RandomForestpred, average='macro')
V1RandomForestrecall = recall_score(golden['salary'], V1RandomForestpred, average='macro')
V1RandomForestf1 = f1_score(golden['salary'], V1RandomForestpred, average='macro')
V1RandomForestconfmatrix = confusion_matrix(golden['salary'], V1RandomForestpred)

#############################################################################
#Encoding/Transforming native country + sex + marital-status

#Define columns to transform + conduct OneHotEncoder for columns specified for transformation
transform_columns2 = ['sex', 'native-country', 'marital-status']
OH3 = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)
OH3.fit(df[transform_columns2])

#Transformation of the columns + production of dataframe with transformed values
transformed_train2 = OH3.transform(df[transform_columns2])
transformed_test2 = OH3.transform(golden[transform_columns2])
new_cols2 = OH3.get_feature_names_out(transform_columns2)
properTrans_train2 = pd.DataFrame(transformed_train2, columns=new_cols2)
properTrans_test2 = pd.DataFrame(transformed_test2, columns=new_cols2)

#Prepare final datasets by concatenating numerical features with encoded categorical features
x2 = pd.concat([df.drop(transform_columns2, axis=1), properTrans_train2], axis=1)
xt2 = pd.concat([golden.drop(transform_columns2, axis=1), properTrans_test2], axis=1)

#Specify columns to exclude (non-numerical and not transformed)
categories_2 = ['workclass', 'education', 'occupation', 'relationship', 'race', 'fnlwgt']

#Define dataframes properties for RandomForest model
x2 = x2.drop(categories_2 + ['salary'], axis=1)
xt2 = xt2.drop(categories_2 + ['salary'], axis=1)

#Define RandomForest model + prediction
V2RandomForest = RandomForestClassifier(criterion='entropy', max_depth=max_depth_setting)
max_depth_setting = 15
V2RandomForest.fit(x2, df['salary'])
V2RandomForestpred = V2RandomForest.predict(xt2)

#V2RandomForest model evaluation
V2RandomForestprec = precision_score(golden['salary'], V2RandomForestpred, average='macro')
V2RandomForestrecall = recall_score(golden['salary'], V2RandomForestpred, average='macro')
V2RandomForestf1 = f1_score(golden['salary'], V2RandomForestpred, average='macro')
V2RandomForestconfmatrix = confusion_matrix(golden['salary'], V2RandomForestpred)

######################################################################################
#Encoding/Transforming native country + sex + marital-status + working class

#Define columns to transform + conduct OneHotEncoder for columns specified for transformation
transform_columns3 = ['sex', 'native-country', 'marital-status', 'workclass']
OH4 = preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)
OH4.fit(df[transform_columns3])

#Transformation of the columns + production of dataframe with transformed values
transformed_train3 = OH4.transform(df[transform_columns3])
transformed_test3 = OH4.transform(golden[transform_columns3])
new_cols3 = OH4.get_feature_names_out(transform_columns3)
properTrans_train3 = pd.DataFrame(transformed_train3, columns=new_cols3)
properTrans_test3 = pd.DataFrame(transformed_test3, columns=new_cols3)

#Prepare final datasets by concatenating numerical features with encoded categorical features
x3 = pd.concat([df.drop(transform_columns3, axis=1), properTrans_train3], axis=1)
xt3 = pd.concat([golden.drop(transform_columns3, axis=1), properTrans_test3], axis=1)

#Specify columns to exclude (non-numerical and not transformed)
non_num_columns3 = ['education', 'occupation', 'relationship', 'race', 'fnlwgt']

#Define dataframes properties for RandomForest model
x3 = x3.drop(non_num_columns3 + ['salary'], axis=1)
xt3 = xt3.drop(non_num_columns3 + ['salary'], axis=1)

#Define RandomForest model + prediction
V3RandomForest = RandomForestClassifier(criterion='entropy', max_depth=max_depth_setting)
max_depth_setting = 15
V3RandomForest.fit(x3, df['salary'])
V3RandomForestpred = V3RandomForest.predict(xt3)

#V3RandomForest model evaluation
V3RandomForestprec = precision_score(golden['salary'], V3RandomForestpred, average='macro')
V3RandomForestrecall = recall_score(golden['salary'], V3RandomForestpred, average='macro')
V3RandomForestf1 = f1_score(golden['salary'], V3RandomForestpred, average='macro')
V3RandomForestconfmatrix = confusion_matrix(golden['salary'], V3RandomForestpred)

#######################################################################################
#Metrics for RandomForest Model 1
print("Random Forest Evaluation (First Model):")
print(f"Precision: {V1RandomForestprec}")
print(f"Recall: {V1RandomForestrecall}")
print(f"F1-Score: {V1RandomForestf1}")
print("Confusion Matrix:")
print(V1RandomForestconfmatrix)

#Metrics for RandomForest Model 2
print("\nRandom Forest Evaluation (Second Model):")
print(f"Precision: {V2RandomForestprec}")
print(f"Recall: {V2RandomForestrecall}")
print(f"F1-Score: {V2RandomForestf1}")
print("Confusion Matrix:")
print(V2RandomForestconfmatrix)

#Metrics for RandomForest Model 3
print("\nRandom Forest Evaluation (Third Model):")
print(f"Precision: {V3RandomForestprec}")
print(f"Recall: {V3RandomForestrecall}")
print(f"F1-Score: {V3RandomForestf1}")
print("Confusion Matrix:")
print(V3RandomForestconfmatrix)


Random Forest Evaluation (First Model):
Precision: 0.8143845257484501
Recall: 0.7119212102621619
F1-Score: 0.7422005474065758
Confusion Matrix:
[[11918   517]
 [ 2056  1790]]

Random Forest Evaluation (Second Model):
Precision: 0.8372118866514666
Recall: 0.7465669531485724
F1-Score: 0.7769227918160979
Confusion Matrix:
[[11939   496]
 [ 1796  2050]]

Random Forest Evaluation (Third Model):
Precision: 0.8347740279675124
Recall: 0.752238880870072
F1-Score: 0.7809542498174833
Confusion Matrix:
[[11899   536]
 [ 1740  2106]]


In [250]:
print("For the first model, I decided to encode and transform sex and native-country variables. Once the transformation was completed, I integrated these values back to the proper dataframe and conducted a RandomTree analysis (after defining the model). Results for model 1 indicate a precision value of .814, with a recall rate of .711 and an F1 score of .742. We will use the confusion matrix values to compare and contrast against future models.For the second model, I decided to encode and transform the sex, native-country, and marital-status variables. Once again, I integrated these encoded/transformed variables back to the proper dataframe and defined my RandomTree model and conducted subsequent analysis on the model. The results from model 2 represent an improvement across precision, recall and F1 score compared to the previous model. Regarding the confusion matrix comparison vs. model 1, the second model improves with regards to reducing the frequency of false positive and negatives. For the final model, I decided to encode and transform, the sex, native-country, martial-status, and working class variables. After encoding and transforming the variables, I integrated them back to the original dataframe and once again defined my RandomTree model and conducted analysis. In comparison to model 2, model 3 achieved higher values for precision, recall, and F1 score. Additionally, it has a lower frequency count for false positive/negatives relative to the second model. Overall, I would say that Model 3 is an improvement over the initial model that I created.")

For the first model, I decided to encode and transform sex and native-country variables. Once the transformation was completed, I integrated these values back to the proper dataframe and conducted a RandomTree analysis (after defining the model). Results for model 1 indicate a precision value of .814, with a recall rate of .711 and an F1 score of .742. We will use the confusion matrix values to compare and contrast against future models.For the second model, I decided to encode and transform the sex, native-country, and marital-status variables. Once again, I integrated these encoded/transformed variables back to the proper dataframe and defined my RandomTree model and conducted subsequent analysis on the model. The results from model 2 represent an improvement across precision, recall and F1 score compared to the previous model. Regarding the confusion matrix comparison vs. model 1, the second model improves with regards to reducing the frequency of false positive and negatives. For t