In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import naive_bayes
from sklearn import metrics
import seaborn as sns

In [2]:
# Load the cleaned drug dataset from disk and display its (rows, columns) shape.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of cleaned_drug.csv.
drug_csv_path = "C:/Users/Rui Wu/Documents/Programming C Drive/Final Project/cleaned_drug.csv"
drug_data = pd.read_csv(drug_csv_path)
drug_data.shape

(1885, 20)

In [3]:
# Split the frame into the prediction target and the predictor columns.
# 'ID' is removed together with the target so it is not fed to the models.
target_column = 'Illicit Drugs'
drug_data_y = drug_data[target_column]
drug_data_x = drug_data.drop(columns=[target_column, 'ID'])

# One-hot encode the non-numeric predictors; numeric columns pass through as-is.
drug_dummies = pd.get_dummies(drug_data_x)
drug_dummies.head()

Unnamed: 0,Age,Education,Nscore,Escore,Oscore,Ascore,Cscore,Impulsive,SS,Gender_Female,...,Legal High_CL4,Legal High_CL5,Legal High_CL6,Nicotine_CL0,Nicotine_CL1,Nicotine_CL2,Nicotine_CL3,Nicotine_CL4,Nicotine_CL5,Nicotine_CL6
0,0.49788,-0.05921,0.31287,-0.57545,-0.58331,-0.91699,-0.00665,-0.21712,-1.18084,1,...,0,0,0,0,0,1,0,0,0,0
1,-0.07854,1.98437,-0.67825,1.93886,1.43533,0.76096,-0.14277,-0.71126,-0.21575,0,...,0,0,0,0,0,0,0,1,0,0
2,0.49788,-0.05921,-0.46725,0.80523,-0.84732,-1.6209,-1.0145,-1.37983,0.40148,0,...,0,0,0,1,0,0,0,0,0,0
3,-0.95197,1.16365,-0.14882,-0.80615,-0.01928,0.59042,0.58489,-1.37983,-1.18084,1,...,0,0,0,0,0,1,0,0,0,0
4,0.49788,1.98437,0.73545,-1.6334,-0.45174,-0.30172,1.30612,-0.21712,-0.21575,1,...,0,0,0,0,0,1,0,0,0,0


In [4]:
# 4-fold cross-validation of Gaussian Naive Bayes on the one-hot encoded features.
# Prints per-fold accuracy and confusion matrix, then pooled per-class rates and
# the mean accuracy (+/- two standard deviations across folds).
# NOTE(review): GaussianNB fits a per-class normal distribution to every column,
# including the 0/1 dummy columns -- confirm this is the intended baseline.
kf = KFold(n_splits=4, shuffle=True, random_state=0)

# Fix the label order once so every fold's confusion matrix has the same shape
# and the same row/column meaning, even if a fold happens to miss a class.
class_labels = np.unique(drug_data_y)
scores = []
total_cm = np.zeros((len(class_labels), len(class_labels)), dtype=np.int64)

for i, (train_index, test_index) in enumerate(kf.split(drug_dummies), start=1):
    print("Iteration " + str(i))
    x_train, x_test = drug_dummies.iloc[train_index], drug_dummies.iloc[test_index]
    y_train, y_test = drug_data_y.iloc[train_index], drug_data_y.iloc[test_index]

    gnb = naive_bayes.GaussianNB()
    gnb.fit(x_train, y_train)
    predictions = gnb.predict(x_test)
    # Score the predictions already in hand instead of predicting a second time.
    score = metrics.accuracy_score(y_test, predictions)
    scores.append(score)
    print("Accuracy is: " + str(score))

    cm = metrics.confusion_matrix(y_test, predictions, labels=class_labels)
    print(cm)

    # Pool the fold's counts into the running total (rows = actual, cols = predicted).
    total_cm += cm

# Each row of the pooled matrix holds one actual class, so the row sums are the
# class supports; derive them here rather than hard-coding dataset-specific counts.
class_totals = total_cm.sum(axis=1)
print("True Negatives Rate: " + str(total_cm[0][0] / class_totals[0]))
print("True Positives Rate: " + str(total_cm[1][1] / class_totals[1]))
print("False Negatives Rate: " + str(total_cm[1][0] / class_totals[1]))
print("False Positives Rate: " + str(total_cm[0][1] / class_totals[0]))

scores_mean = np.mean(scores)
scores_std = np.std(scores)
print(scores)
print("Overall Accuracy: %0.4f (+/- %0.4f)" % (scores_mean, scores_std * 2))

Iteration 1
Accuracy is: 0.7584745762711864
[[248  47]
 [ 67 110]]
Iteration 2
Accuracy is: 0.7600849256900213
[[225  40]
 [ 73 133]]
Iteration 3
Accuracy is: 0.7813163481953291
[[251  47]
 [ 56 117]]
Iteration 4
Accuracy is: 0.7898089171974523
[[252  44]
 [ 55 120]]
True Negatives Rate: 0.8457538994800693
True Positives Rate: 0.6566347469220246
False Negatives Rate: 0.3433652530779754
False Positives Rate: 0.15424610051993068
[0.7584745762711864, 0.7600849256900213, 0.7813163481953291, 0.7898089171974523]
Overall Accuracy: 0.7724 (+/- 0.0270)


In [5]:
# 4-fold cross-validation of Bernoulli Naive Bayes on the one-hot encoded features.
# Prints per-fold accuracy and confusion matrix, then pooled per-class rates and
# the mean accuracy (+/- two standard deviations across folds).
# NOTE(review): BernoulliNB thresholds every feature at its `binarize` value
# (0.0 by default), so continuous columns such as Age and Nscore are reduced to
# above/below zero -- confirm this is intended.
kf = KFold(n_splits=4, shuffle=True, random_state=0)

# Fix the label order once so every fold's confusion matrix has the same shape
# and the same row/column meaning, even if a fold happens to miss a class.
class_labels = np.unique(drug_data_y)
scores = []
total_cm = np.zeros((len(class_labels), len(class_labels)), dtype=np.int64)

for i, (train_index, test_index) in enumerate(kf.split(drug_dummies), start=1):
    print("Iteration " + str(i))
    x_train, x_test = drug_dummies.iloc[train_index], drug_dummies.iloc[test_index]
    y_train, y_test = drug_data_y.iloc[train_index], drug_data_y.iloc[test_index]

    bnb = naive_bayes.BernoulliNB()
    bnb.fit(x_train, y_train)
    predictions = bnb.predict(x_test)
    # Score the predictions already in hand instead of predicting a second time.
    score = metrics.accuracy_score(y_test, predictions)
    scores.append(score)
    print("Accuracy is: " + str(score))

    cm = metrics.confusion_matrix(y_test, predictions, labels=class_labels)
    print(cm)

    # Pool the fold's counts into the running total (rows = actual, cols = predicted).
    total_cm += cm

# Each row of the pooled matrix holds one actual class, so the row sums are the
# class supports; derive them here rather than hard-coding dataset-specific counts.
class_totals = total_cm.sum(axis=1)
print("True Negatives Rate: " + str(total_cm[0][0] / class_totals[0]))
print("True Positives Rate: " + str(total_cm[1][1] / class_totals[1]))
print("False Negatives Rate: " + str(total_cm[1][0] / class_totals[1]))
print("False Positives Rate: " + str(total_cm[0][1] / class_totals[0]))

scores_mean = np.mean(scores)
scores_std = np.std(scores)
print(scores)
print("Overall Accuracy: %0.4f (+/- %0.4f)" % (scores_mean, scores_std * 2))

Iteration 1
Accuracy is: 0.8029661016949152
[[231  64]
 [ 29 148]]
Iteration 2
Accuracy is: 0.7898089171974523
[[203  62]
 [ 37 169]]
Iteration 3
Accuracy is: 0.8089171974522293
[[239  59]
 [ 31 142]]
Iteration 4
Accuracy is: 0.7961783439490446
[[233  63]
 [ 33 142]]
True Negatives Rate: 0.7850953206239168
True Positives Rate: 0.8221614227086184
False Negatives Rate: 0.17783857729138167
False Positives Rate: 0.21490467937608318
[0.8029661016949152, 0.7898089171974523, 0.8089171974522293, 0.7961783439490446]
Overall Accuracy: 0.7995 (+/- 0.0143)
