In [1]:
# libraries for data manipulation
import numpy as np
import pandas as pd

# helper functions
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import roc_auc_score,accuracy_score,confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neighbors import KNeighborsClassifier

# support vector classifier (used for support vector machines and maximal margin classifiers as well)
from sklearn.svm import SVC

import warnings
# Install a global "ignore" filter so no warning of any category is shown for the rest of the session.
# NOTE(review): this also hides any warnings emitted by the model fits in later cells;
# consider narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')


In [37]:
# Read the assignment CSV into a DataFrame and preview the first rows.
# NOTE(review): the path below is absolute and machine-specific; it is kept verbatim here
# so the cell keeps reading the same file.
df = pd.read_csv("/Users/jaimesong/desktop/cpsc393/assignment_01_data.csv")
print(df.head())

# Predictors are the eight columns X1 through X8; the label column is Y.
X = df[[f"X{i}" for i in range(1, 9)]]
y = df["Y"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=77,
)

         X1        X2        X3        X4        X5        X6        X7  \
0 -0.604285 -0.610629  0.026014  0.019710  0.406532  0.678796  0.524360   
1 -0.111772 -1.125178  0.744157  0.078315  0.088176  0.891009  0.018460   
2 -0.916802  1.965494  0.150022  0.388770  0.179276  0.064449  0.159279   
3 -0.280479  0.920669  0.208949  0.940153  0.854437  0.688172  0.365126   
4  1.856025  1.043214  0.167088  0.207002  0.979049  0.641019  0.628764   

         X8  Y  
0  0.404739  B  
1  0.536511  B  
2  0.951204  B  
3  0.985259  A  
4  0.045912  B  


In [38]:
# Support vector classifier: standardize the features, then pick the kernel settings
# with a 5-fold cross-validated grid search on the training split.
pipe = Pipeline([
    ('scaler', StandardScaler()),  # standard scale the data
    # probability=True makes SVC fit an extra cross-validated calibration step whose data
    # shuffling is random. Pinning random_state (same seed as the train/test split) keeps
    # predict_proba -- and the ROC AUC derived from it below -- identical across re-runs.
    ('svm', SVC(probability=True, random_state=77)),  # fit the SVM model
])

# initialize hyperparameters
param_grid = [
    {'svm__kernel': ['linear'],
     'svm__C': [0.001, 0.01, 1, 5, 25, 50]},  # try different values of C
    {'svm__kernel': ['rbf'],
     # try different values of gamma (C is not listed here, so it stays at the SVC default)
     'svm__gamma': [0.001, 0.01, 0.1, 0.5, 1, 2, 5]},
]

# No scoring argument is given, so candidates are ranked with the estimator's default score.
grid_search_svm = GridSearchCV(pipe, param_grid, cv=5)
grid_search_svm.fit(X_train, y_train)

# Predictions come from the search object, i.e. from the best candidate it selected.
y_train_pred = grid_search_svm.predict(X_train)
y_test_pred = grid_search_svm.predict(X_test)

print("Best Parameters:", grid_search_svm.best_params_)
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

# Column 1 of predict_proba is the probability of the second class label,
# which is the score handed to roc_auc_score.
y_test_prob = grid_search_svm.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_test_prob)
print("\nTest ROC AUC:", roc_auc)

cm = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:\n", cm)


Best Parameters: {'svm__C': 25, 'svm__kernel': 'linear'}
Train Accuracy: 0.7625
Test Accuracy: 0.745

Test ROC AUC: 0.828768544454819

Confusion Matrix:
 [[53 28]
 [23 96]]


In [34]:
# Logistic regression: standardize the features, then search over the
# regularization parameter C with 5-fold cross-validation.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression()),
])

param_grid = {'logreg__C': [0.001, 0.01, 1, 5, 25, 50]}

grid_search_log = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid_search_log.fit(X_train, y_train)

# Class predictions for both splits, plus the second-column class probability on the test split.
y_train_pred = grid_search_log.predict(X_train)
y_test_pred = grid_search_log.predict(X_test)
y_test_prob = grid_search_log.predict_proba(X_test)[:, 1]

# Test-set metrics.
roc_auc = roc_auc_score(y_test, y_test_prob)
cm = confusion_matrix(y_test, y_test_pred)

# Report.
print("Best Parameters:", grid_search_log.best_params_)
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nTest ROC AUC:", roc_auc)
print("\nConfusion Matrix:\n", cm)

Best Parameters: {'logreg__C': 5}
Train Accuracy: 0.76
Test Accuracy: 0.77

Test ROC AUC: 0.832244008714597

Confusion Matrix:
 [[55 26]
 [20 99]]


In [39]:
# K-nearest neighbors: standardize the features, then search over the
# number of neighbors with 5-fold cross-validation.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

param_grid = {'knn__n_neighbors': [1, 3, 5, 7, 9, 11, 15, 20]}

grid_search_knn = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid_search_knn.fit(X_train, y_train)

# Class predictions for both splits, plus the second-column class probability on the test split.
y_train_pred = grid_search_knn.predict(X_train)
y_test_pred = grid_search_knn.predict(X_test)
y_test_prob = grid_search_knn.predict_proba(X_test)[:, 1]

# Test-set metrics.
roc_auc = roc_auc_score(y_test, y_test_prob)
cm = confusion_matrix(y_test, y_test_pred)

# Report.
print("Best Parameters:", grid_search_knn.best_params_)
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nTest ROC AUC:", roc_auc)
print("\nConfusion Matrix:\n", cm)

Best Parameters: {'knn__n_neighbors': 7}
Train Accuracy: 0.8
Test Accuracy: 0.725

Test ROC AUC: 0.8181865338728085

Confusion Matrix:
 [[49 32]
 [23 96]]
