In [30]:
# importing libraries
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'webster-data445-bucket'
bucket = s3.Bucket(bucket_name)

# defining the csv file
file_key = 'drug200.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
drug = pd.read_csv(file_content_stream)
# dropping missing values
drug = drug.dropna()
drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [31]:
# frequency table of Drug
drug['Drug'].value_counts(normalize = True)

DrugY    0.455
drugX    0.270
drugA    0.115
drugB    0.080
drugC    0.080
Name: Drug, dtype: float64

In [32]:
## creating Drug_numb
drug['Drug_numb'] = np.where(drug['Drug'] == 'drugA', 1, 
                                np.where(drug['Drug'] == 'drugB', 2,
                                        np.where(drug['Drug'] == 'drugC', 3,
                                                np.where(drug['Drug'] == 'drugX', 4, 5))))

drug.head(10)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Drug_numb
0,23,F,HIGH,HIGH,25.355,DrugY,5
1,47,M,LOW,HIGH,13.093,drugC,3
2,47,M,LOW,HIGH,10.114,drugC,3
3,28,F,NORMAL,HIGH,7.798,drugX,4
4,61,F,LOW,HIGH,18.043,DrugY,5
5,22,F,NORMAL,HIGH,8.607,drugX,4
6,49,F,NORMAL,HIGH,16.275,DrugY,5
7,41,M,LOW,HIGH,11.037,drugC,3
8,60,M,NORMAL,HIGH,15.171,DrugY,5
9,43,M,LOW,NORMAL,19.368,DrugY,5


In [33]:
# changing sex to dummy variables
drug['Sex_numb'] = np.where(drug['Sex'] == 'F', 0, 1)

# changing BP to dummy variables
drug = pd.concat([drug, pd.get_dummies(drug['BP'])], axis = 1)
drug = drug.rename(columns = {'HIGH' : 'BP_HIGH', 'LOW' : 'BP_LOW', 'NORMAL' : 'BP_NORMAL'})

# changing cholesterol to dummy variables
drug = pd.concat([drug, pd.get_dummies(drug['Cholesterol'])], axis = 1)
drug = drug.rename(columns = {'HIGH' : 'CHO_HIGH', 'NORMAL' : 'CHO_NORMAL'})

drug.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Drug_numb,Sex_numb,BP_HIGH,BP_LOW,BP_NORMAL,CHO_HIGH,CHO_NORMAL
0,23,F,HIGH,HIGH,25.355,DrugY,5,0,1,0,0,1,0
1,47,M,LOW,HIGH,13.093,drugC,3,1,0,1,0,1,0
2,47,M,LOW,HIGH,10.114,drugC,3,1,0,1,0,1,0
3,28,F,NORMAL,HIGH,7.798,drugX,4,0,0,0,1,1,0
4,61,F,LOW,HIGH,18.043,DrugY,5,0,0,1,0,1,0


In [37]:
# defining input and target variables
X = drug[['Age', 'Sex_numb', 'BP_HIGH', 'BP_LOW', 'CHO_HIGH', 'Na_to_K']]
Y = drug['Drug_numb']

# splitting into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [38]:
# random forest
one_vs_rest_RF = OneVsRestClassifier(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3)).fit(X_train, Y_train)

# predicting on test
one_vs_rest_RF_pred = one_vs_rest_RF.predict(X_test)

# compute classification resport
print(classification_report(Y_test, one_vs_rest_RF_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         3
           3       1.00      0.67      0.80         3
           4       0.92      1.00      0.96        11
           5       1.00      1.00      1.00        18

    accuracy                           0.97        40
   macro avg       0.98      0.93      0.95        40
weighted avg       0.98      0.97      0.97        40



In [39]:
# ada boost
one_vs_rest_AB = OneVsRestClassifier(estimator = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, 
                                                                   learning_rate = 0.01)).fit(X_train, Y_train)

# prediction on test
one_vs_rest_AB_pred = one_vs_rest_AB.predict(X_test)

# compute classification report
print(classification_report(Y_test, one_vs_rest_AB_pred))

              precision    recall  f1-score   support

           1       1.00      0.80      0.89         5
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         3
           4       0.92      1.00      0.96        11
           5       1.00      1.00      1.00        18

    accuracy                           0.97        40
   macro avg       0.98      0.96      0.97        40
weighted avg       0.98      0.97      0.97        40



In [None]:
## In my case, I would choose to use AdaBoost because it has a slightly higher average recall between all five dummy variables, while maintaining
## the same accuracy. The models are very close, however. 