In [None]:
## <center>Practice Project Question I
**1.** Download the data and read the descriptions in the file adult.names. Remove records with unknown (?) values from both train and test data sets and remove all continuous attributes. For each multi-domain categorical attribute, you can use one-hot encoding to transform data (this step is needed if you choose scikit-learn to build decision tree and naïve classifier; it is optional if you choose Weka). In your report, describe briefly how you develop your algorithm or apply software on the following two tasks and include 2-4 screenshots about your algorithm settings and output
- Build a decision tree classifier (single tree) and report accuracy by class including (TP rate, FP rate, precision, recall, F1) on the test data.
- Build a naïve Bayesian classifier and report accuracy by class including (TP rate, FP rate, precision, recall, F1) on the test data.
## Training Data Cleaning
# import pandas and numpy
import pandas as pd
import numpy as np
# read in csv dataset
train_df = pd.read_csv("dataset/adult_training.csv")
train_df
# added column keys since there is none provided from original dataset
train_df.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", ">50k"]
train_df.keys()
### Dropping continuous attributes

# dropping continuous attributes
train_df.drop(["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"], axis = 1, inplace=True)
train_df.head(20)
### Removing all rows that contains " ?" in data
# removing all rows that contains " ?" in data
for col in train_df.columns:
    train_df.drop(train_df.index[train_df[col] == " ?"], inplace=True)

train_df.head(20)
### One-hot encoding on training dataset
# one-hot encoding on training dataset
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
train_df = train_df.apply(le.fit_transform)

train_df
## Testing Data Cleaning
# loading in testing data
testing_df = pd.read_csv("dataset/adult_test.csv")
# added column keys since there is none provided from original dataset
testing_df.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", ">50k"]
### Dropping continuous attributes
# dropping continuous attributes
testing_df.drop(["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"], axis = 1, inplace=True)
# removing all rows that contains " ?" in data
for col in testing_df.columns:
    testing_df.drop(testing_df.index[testing_df[col] == " ?"], inplace=True)
### One-hot encoding
# one-hot encoding on testing dataset
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
testing_df = testing_df.apply(le.fit_transform)

testing_df
### Loading Data into CSV
#test data clean
testing_df.to_csv('dataset/part1_data_clean/adult_test_clean.csv',index = False)

train_df.to_csv('dataset/part1_data_clean/adult_train_clean.csv',index = False)
## Build Decision Classifier(Single Tree)
## code here
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# defining FEATURES | TARGET
FEATURES = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
TARGET = ">50k"

# training set 90% from adult_training.csv
X_train, X_test, y_train, y_test = train_test_split(train_df[FEATURES], train_df[TARGET], test_size=0.1, random_state=1)
# testing set 90% from adult_test.csv
X_train2, X_test2, y_train2, y_test2 = train_test_split(testing_df[FEATURES], testing_df[TARGET], test_size=0.9, random_state=1)

# create and fit training data into decision tree classifier
model = DecisionTreeClassifier()
test = model.fit(X_train, y_train)

# get prediction from model based on testing set
y_pred = test.predict(X_test2)

# output accuracy of model
print(f'Accuracy: {metrics.accuracy_score(y_test2, y_pred)*100:.4f}%')
## Decision Tree Accuracy
- TP Rate
## code for TP Rate
from sklearn.metrics import confusion_matrix

""" 2X2 Confusion Matrix
TP | FN
--- ---
FP | TN
"""

CM = confusion_matrix(y_test2, y_pred)

TP = CM[0][0]
FN = CM[1][0]
TN = CM[1][1]
FP = CM[0][1]


print(f'True Positive: {TP}')
- FP Rate
## code fpr FP Rate
CM = confusion_matrix(y_test2, y_pred)

""" 2X2 Confusion Matrix
TP | FN
--- ---
FP | TN
"""

TP = CM[0][0]
FN = CM[1][0]
TN = CM[1][1]
FP = CM[0][1]

print(f'False Positive: {FP}')
- Precision
## code for Precision
from sklearn.metrics import precision_score

#finding precision score with "weighted"
precision = precision_score(y_test2, y_pred, average='weighted')


print(f'Precison: {precision*100:.4f}%')
- Recall
## code Recall
from sklearn.metrics import recall_score

# finding recall score with "weighted"
recall = recall_score(y_test2, y_pred, average='weighted') 


print(f'Recall: {recall*100:.4f}%')
- F1
## code for F1
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score

# finding "F1_Score" with "weighted"
scores = cross_val_score(
    model,
    X_train[FEATURES],
    y_train,
    cv=5,
    scoring = make_scorer(f1_score, average ='weighted')
)

print(f'F1 Score: {scores.mean()*100:.4f}%')
## Build Naïve Bayesian Classifier
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

FEATURES = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
TARGET = ">50k"

X_train, X_test, y_train, y_test = train_test_split(train_df[FEATURES], train_df[TARGET], test_size=0.1, random_state=1)
X_train2, X_test2, y_train2, y_test2 = train_test_split(testing_df[FEATURES], testing_df[TARGET], test_size=0.9, random_state=1)


#Create a Gaussian Classifier
bayModel = GaussianNB()
bayModel.fit(X_train, y_train)


bay_pred = test.predict(X_test2)


print(f"Naïve Bayesian Accuracy: {metrics.accuracy_score(y_test2, bay_pred)*100:.4f}%")
## Naïve Bayesian Accuracy
- TP Rate
## code TP Rate
from sklearn.metrics import confusion_matrix

CM = confusion_matrix(y_test2, bay_pred)

TP = CM[0][0]
FN = CM[1][0]
TN = CM[1][1]
FP = CM[0][1]


print(f'True Positive: {TP}')
- FP Rate
## code for FP Rate
from sklearn.metrics import confusion_matrix

CM = confusion_matrix(y_test2, bay_pred)

TP = CM[0][0]
FN = CM[1][0]
TN = CM[1][1]
FP = CM[0][1]


print(f'False Positive: {FP}')
- Precision
## code for Precision
from sklearn.metrics import precision_score

#finding precision score with "weighted"
precision = precision_score(y_test2, bay_pred, average='weighted')

print(f'Precison: {precision*100:.4f}%')
- Recall
## code for Recall
from sklearn.metrics import recall_score

# finding recall score with "weighted"
recall = recall_score(y_test2, bay_pred, average='weighted') 


print(f'Recall: {recall*100:.4f}%')
- F1
## code for F1
from sklearn.metrics import f1_score, make_scorer

# finding "F1_Score" with "weighted"
scores = cross_val_score(
    bayModel,
    X_train[FEATURES],
    y_train,
    cv=5,
    scoring = make_scorer(f1_score, average ='weighted')
)

print(f'F1 Score: {scores.mean()*100:.4f}%')