In [None]:
###### Set Up #####
# verify our folder with the data and module assets is installed
# if it is installed make sure it is the latest
!test -e ds-assets && cd ds-assets && git pull && cd ..
# if it is not installed clone it 
!test ! -e ds-assets && git clone https://github.com/IndraniMandal/ds-assets.git
# point to the folder with the assets
home = "ds-assets/assets/" 
import sys
sys.path.append(home)  

Cloning into 'ds-assets'...
remote: Enumerating objects: 176, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 176 (delta 3), reused 2 (delta 0), pack-reused 164[K
Receiving objects: 100% (176/176), 9.35 MiB | 12.29 MiB/s, done.
Resolving deltas: 100% (63/63), done.


## Obtaining the data set

In [None]:
import pandas as pd

# CrohnD data set from the robustbase package, served as CSV by the Rdatasets mirror.
url = "https://vincentarelbundock.github.io/Rdatasets/csv/robustbase/CrohnD.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,ID,nrAdvE,BMI,height,country,sex,age,weight,treat
0,1,19908,4,25.22,163,c1,F,47,67,placebo
1,2,19909,4,23.80,164,c1,F,53,64,d1
2,3,19910,1,23.05,164,c1,F,68,62,placebo
3,4,20908,1,25.71,165,c1,F,48,70,d2
4,5,20909,2,25.95,170,c1,F,67,75,placebo
...,...,...,...,...,...,...,...,...,...,...
112,113,54933,2,26.45,165,c2,F,73,72,placebo
113,114,54934,1,19.11,150,c2,F,49,43,d2
114,115,54935,1,44.06,158,c2,F,47,110,d1
115,116,54936,0,25.81,155,c2,F,66,62,d1


## Data cleaning: drop the "ID" and "Unnamed: 0" columns, which carry no useful information, and replace the categorical "sex" and "country" values with numeric codes.

In [None]:
# Drop the identifier columns; they carry no predictive information.
# errors="ignore" keeps the cell re-runnable after the columns are already gone.
# NOTE(review): newer Rdatasets exports appear to name the row-number column
# "rownames" instead of leaving it blank ("Unnamed: 0") - confirm against the live CSV.
df = df.drop(columns=["ID", "Unnamed: 0", "rownames"], errors="ignore")

# Encode the two categorical features as 0/1 integers.
# The result is assigned back explicitly: calling replace(..., inplace=True) on
# df['col'] is a chained assignment, which pandas warns about and which does not
# modify df once Copy-on-Write is active.
# astype(int) fails loudly if a value outside the mapping is still present.
df = df.assign(
    sex=df["sex"].replace({"F": 0, "M": 1}).astype(int),
    country=df["country"].replace({"c1": 0, "c2": 1}).astype(int),
)
df

Unnamed: 0,nrAdvE,BMI,height,country,sex,age,weight,treat
0,4,25.22,163,0,0,47,67,placebo
1,4,23.80,164,0,0,53,64,d1
2,1,23.05,164,0,0,68,62,placebo
3,1,25.71,165,0,0,48,70,d2
4,2,25.95,170,0,0,67,75,placebo
...,...,...,...,...,...,...,...,...
112,2,26.45,165,1,0,73,72,placebo
113,1,19.11,150,1,0,49,43,d2
114,1,44.06,158,1,0,47,110,d1
115,0,25.81,155,1,0,66,62,d1


# SVM Grid Search

In [None]:
# set up: libraries used by the SVM cells below
import pandas as pd
import numpy as np
# print floats inside NumPy arrays with two decimals (e.g. the per-fold accuracies)
np.set_printoptions(formatter={'float_kind':"{:3.2f}".format})
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

## Finding the best SVM model

## Kernel: linear

In [None]:
# get data: every column except the target 'treat' is used as a feature
X = df.drop(columns=['treat'])
actual_y = df['treat']

# SVM model
model = SVC(kernel='linear', C=0.001, max_iter=10000)

# 5-fold cross validation: scores are computed on held-out folds
scores = cross_val_score(model, X, actual_y, cv=5)
print("Fold Accuracies: {}".format(scores))
print("Accuracy: {:3.2f}".format(scores.mean()))

# refit on the full data set and predict the same rows the model was trained on
model.fit(X, actual_y)
predict_y = model.predict(X)

# training-set accuracy was previously computed but never shown; it is optimistic
# compared with the cross-validated figure above, so it is labelled separately
acc = accuracy_score(actual_y, predict_y)
print("Training Accuracy: {:3.2f}".format(acc))

# confusion matrix of the training-set predictions (rows: actual, columns: predicted)
labels = ['placebo', 'd1', 'd2']
cm = confusion_matrix(actual_y, predict_y, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Fold Accuracies: [0.42 0.33 0.26 0.39 0.30]
Accuracy: 0.34
Confusion Matrix:
         placebo  d1  d2
placebo       27   4   8
d1            21  10   8
d2            15   4  20


### The results of the linear kernel are not great: its accuracy is just 34 percent, which is not reliable enough for deciding on treatment for patients who need special care, as there is a significant number of false positives and false negatives.

## Kernel: Poly

In [None]:
# get data: every column except the target 'treat' is used as a feature
X = df.drop(columns=['treat'])
actual_y = df['treat']

# SVM model
model = SVC(kernel='poly', C=0.001, max_iter=10000, degree=4, coef0=0.99)

# 5-fold cross validation: scores are computed on held-out folds
scores = cross_val_score(model, X, actual_y, cv=5)
print("Fold Accuracies: {}".format(scores))
print("Accuracy: {:3.2f}".format(scores.mean()))

# refit on the full data set and predict the same rows the model was trained on
model.fit(X, actual_y)
predict_y = model.predict(X)

# training-set accuracy was previously computed but never shown; it is optimistic
# compared with the cross-validated figure above, so it is labelled separately
acc = accuracy_score(actual_y, predict_y)
print("Training Accuracy: {:3.2f}".format(acc))

# confusion matrix of the training-set predictions (rows: actual, columns: predicted)
labels = ['placebo', 'd1', 'd2']
cm = confusion_matrix(actual_y, predict_y, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Fold Accuracies: [0.54 0.33 0.30 0.30 0.30]
Accuracy: 0.36
Confusion Matrix:
         placebo  d1  d2
placebo       14  21   4
d1            13  23   3
d2             7  22  10


### The results of the poly kernel are not great: its accuracy is 36 percent, which is not reliable enough for deciding on treatment for patients who need special care, as there is a significant number of false positives and false negatives. However, the poly kernel shows a slight improvement over the linear kernel.






## Kernel: RBF

In [None]:
# get data: every column except the target 'treat' is used as a feature
X = df.drop(columns=['treat'])
actual_y = df['treat']

# SVM model
model = SVC(kernel='rbf', C=10000, max_iter=10000, gamma="scale")

# 5-fold cross validation: scores are computed on held-out folds
scores = cross_val_score(model, X, actual_y, cv=5)
print("Fold Accuracies: {}".format(scores))
print("Accuracy: {:3.2f}".format(scores.mean()))

# refit on the full data set and predict the same rows the model was trained on
model.fit(X, actual_y)
predict_y = model.predict(X)

# training-set accuracy was previously computed but never shown; it is optimistic
# compared with the cross-validated figure above, so it is labelled separately
acc = accuracy_score(actual_y, predict_y)
print("Training Accuracy: {:3.2f}".format(acc))

# confusion matrix of the training-set predictions (rows: actual, columns: predicted)
labels = ['placebo', 'd1', 'd2']
cm = confusion_matrix(actual_y, predict_y, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))



Fold Accuracies: [0.38 0.29 0.35 0.30 0.17]
Accuracy: 0.30
Confusion Matrix:
         placebo  d1  d2
placebo       20  12   7
d1            16  17   6
d2            12   8  19


### The results of the rbf kernel are not great: its accuracy is just 30 percent, which is lower than both the linear and poly kernels. This indicates that this model is not the best one to use, especially in the medical field.

## Kernel: Sigmoid

In [None]:
# get data: every column except the target 'treat' is used as a feature
X = df.drop(columns=['treat'])
actual_y = df['treat']

# SVM model
model = SVC(kernel='sigmoid', C=0.001, max_iter=10000, coef0=0.80)

# 5-fold cross validation: scores are computed on held-out folds
scores = cross_val_score(model, X, actual_y, cv=5)
print("Fold Accuracies: {}".format(scores))
print("Accuracy: {:3.2f}".format(scores.mean()))

# refit on the full data set and predict the same rows the model was trained on
model.fit(X, actual_y)
predict_y = model.predict(X)

# training-set accuracy was previously computed but never shown; it is optimistic
# compared with the cross-validated figure above, so it is labelled separately
acc = accuracy_score(actual_y, predict_y)
print("Training Accuracy: {:3.2f}".format(acc))

# confusion matrix of the training-set predictions (rows: actual, columns: predicted)
# TODO (author's note): the confusion matrix still needs to be fixed because the
# parameter values are incorrect
labels = ['placebo', 'd1', 'd2']
cm = confusion_matrix(actual_y, predict_y, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Fold Accuracies: [0.33 0.33 0.30 0.30 0.30]
Accuracy: 0.32
Confusion Matrix:
         placebo  d1  d2
placebo       15  14  10
d1            13  13  13
d2             8  15  16


### The results of the sigmoid kernel are not great: its accuracy is just 32 percent, which is lower than both the linear and poly kernels. This indicates that this model is not the best one to use, especially in the medical field.

# Finding the Best Parameters

In [None]:
# classification_confint is called below but was only imported by the later MLP
# cell, so a fresh Restart & Run All raised NameError here. Import it in this cell.
# The confint module is presumably one of the ds-assets modules that the set-up
# cell puts on sys.path - confirm.
from confint import classification_confint

# get data: every column except the target 'treat' is used as a feature
X = df.drop(columns=['treat'])
actual_y = df['treat']

# SVM model
model = SVC(max_iter=10000)

# grid search over the kernel type, C, and (for the non-linear kernels) gamma
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['poly']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['sigmoid']}
 ]
grid = GridSearchCV(model, param_grid, cv=5)
grid.fit(X, actual_y)
print("Grid Search: best parameters: {}".format(grid.best_params_))
# mean held-out accuracy of the winning parameter combination across the 5 folds
print("Grid Search: best cross-validated accuracy: {:3.2f}".format(grid.best_score_))

# evaluate the best model on the same data it was fitted on (training-set accuracy)
best_model = grid.best_estimator_
predict_y = best_model.predict(X)
acc = accuracy_score(actual_y, predict_y)
# lower/upper bound returned by classification_confint for acc over X.shape[0] rows
lb,ub = classification_confint(acc,X.shape[0])
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

# confusion matrix of the training-set predictions (rows: actual, columns: predicted)
# TODO (author's note): the confusion matrix still needs to be fixed because the
# results are all the same for the four models
labels = ['placebo', 'd1', 'd2']
cm = confusion_matrix(actual_y, predict_y, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))



Grid Search: best parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'poly'}
Accuracy: 0.30 (0.22,0.38)
Confusion Matrix:
         placebo  d1  d2
placebo       15  15   9
d1            15  15   9
d2            16  18   5




## The best parameters are kernel: poly, gamma: 0.001, and C: 10

# MLP Model

In [None]:
# set up
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from confint import classification_confint

# get data (reloaded here so this cell does not depend on the earlier cleaning cell)
df = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/robustbase/CrohnD.csv")
# Drop the identifier columns; they carry no predictive information.
# NOTE(review): newer Rdatasets exports appear to name the row-number column
# "rownames" instead of leaving it blank ("Unnamed: 0") - confirm against the live CSV.
df = df.drop(columns=["ID", "Unnamed: 0", "rownames"], errors="ignore")
# Encode the categorical features as 0/1 integers. The result is assigned back
# explicitly: replace(..., inplace=True) on df['col'] is a chained assignment that
# pandas warns about and that does not modify df once Copy-on-Write is active.
df = df.assign(
    sex=df["sex"].replace({"F": 0, "M": 1}).astype(int),
    country=df["country"].replace({"c1": 0, "c2": 1}).astype(int),
)
X = df.drop(columns=['treat'])
y = df['treat']

# neural network (fixed random_state so the weight initialisation is repeatable)
model = MLPClassifier(max_iter=10000, random_state=1)

# grid search
# We set up a grid search over the architecture and activation functions.
# In the architecture search we limit ourselves to layers of 10 or 20 nodes and
# to a maximum of two hidden layers.
param_grid = {
    # search over different architectures
    'hidden_layer_sizes':
      [
      (10,), (20,),            # single layer MLP
      (10, 10), (20, 10),      # two layer MLP; (20, 10) was listed twice, which
      (20, 20)                 # only repeated the same fit, so the copy is removed
      ],
    # search different activation functions
    'activation' : ['relu', 'logistic']
}

# use 3-fold cross-validation otherwise grid search takes too long
grid = GridSearchCV(model, param_grid, cv=3)
grid.fit(X, y)
print("Grid Search: best parameters: {}".format(grid.best_params_))
# mean held-out accuracy of the winning parameter combination across the 3 folds
print("Grid Search: best cross-validated accuracy: {:3.2f}".format(grid.best_score_))

# evaluate the best model on the same data it was fitted on (training-set accuracy)
best_model = grid.best_estimator_
predict_y = best_model.predict(X)
acc = accuracy_score(y, predict_y)
# lower/upper bound returned by classification_confint for acc over X.shape[0] rows
lb,ub = classification_confint(acc,X.shape[0])
print("Accuracy: {:3.2f} ({:3.2f},{:3.2f})".format(acc,lb,ub))

Grid Search: best parameters: {'activation': 'logistic', 'hidden_layer_sizes': (20,)}
Accuracy: 0.91 (0.86,0.97)
