In [15]:
# install dependencies
import sys
# !{sys.executable} -m pip install pandas

In [239]:
# import dependencies
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.datasets import load_iris

import seaborn as sns
import matplotlib.pyplot as plt

In [126]:
##### data loading and feature extraction

dataset_1_path = './datasets/project3_dataset1.txt'
# load the CSV file as a numpy matrix
dataset_1 = np.loadtxt(dataset_1_path, delimiter="\t")
dataset_1_features =  dataset_1[:, 0:30]
dataset_1_label =  dataset_1[:, 30:31]



In [242]:
##### model implementation


test_split_ratio = 0.2
x_train, x_test, y_train, y_test = train_test_split(dataset_1_features, dataset_1_label, test_size=test_split_ratio, random_state=0)

# logistic regression with ridge regression
def logistic_regression(x_train,x_test,y_train,y_test,reg_param):
    # all parameters not specified are set to their defaults
    if(reg_param > 0):
        logisticRegr = LogisticRegression(penalty="l2",C=reg_param)
    else:
        logisticRegr = LogisticRegression(penalty="none") # default l2 reg param 
        
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_scaled_train = scaler.transform(x_train)
    x_scaled_test = scaler.transform(x_test)

    logisticRegr.fit(x_scaled_train, y_train.ravel())

    prediction = logisticRegr.predict(x_scaled_test)

    cnf_matrix = metrics.confusion_matrix(y_test, prediction)
    score = logisticRegr.score(x_scaled_test, y_test)
    print("Logistic Regression")
    print("Regularization Parameter : {0}\t Accuracy: {1}\n".format(reg_param,score))
#     print(metrics.classification_report(prediction, y_test))

#     plt.figure(figsize=(9,9))
#     sns.heatmap(cnf_matrix, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
#     plt.ylabel('Actual label');
#     plt.xlabel('Predicted label');
#     all_sample_title = 'Accuracy Score: {0}'.format(score)
#     plt.title(all_sample_title, size = 15);
    

# KNN
def knn(x_train,x_test,y_train,y_test,k):
    
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_scaled_train = scaler.transform(x_train)
    x_scaled_test = scaler.transform(x_test)
    
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_scaled_train, y_train.ravel())
    
    prediction = knn.predict(x_scaled_test)
    # Use score method to get accuracy of model
    score = knn.score(x_scaled_test, y_test)
    print("K Nearest Neighbor")
    print("No of Neighbors : {0}\n".format(k,score))
    print(metrics.classification_report(prediction, y_test))
    print("***\n")

    
def decision_tree(x_train,x_test,y_train,y_test):
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    print("Decision Tree")
    print(metrics.classification_report(prediction, y_test))

    
def svm(x_train,x_test,y_train,y_test,reg_param):
    clf = SVC(C=1/reg_param)
    clf = clf.fit(x_train, y_train.ravel())
    prediction = clf.predict(x_test)
    print("Support Vector Machine - SVM")
    print(metrics.classification_report(prediction, y_test))

In [254]:
# training and verification
print("\n*******************\n")

# logistic regression
logistic_regression(x_train,x_test,y_train,y_test,10**-5) # weak regularization leads to overfitting
logistic_regression(x_train,x_test,y_train,y_test,5)  # right amount of regularization improves perfomance 
logistic_regression(x_train,x_test,y_train,y_test,20)  # too strong regularization leads to underfitting

print("\n*******************\n")

# KNN classification
knn(x_train,x_test,y_train,y_test,1) # 1 neighbor would give perfect accuracy for training set, overfitting
knn(x_train,x_test,y_train,y_test,3) # 5 neighbors seems to generalize well
knn(x_train,x_test,y_train,y_test,100) # 100 neighbors will lead to underfitting

print("\n*******************\n")


# decision tree
decision_tree(x_train,x_test,y_train,y_test)


print("\n*******************\n")


# svm
svm(x_train,x_test,y_train,y_test,0.001)


*******************

Logistic Regression
Regularization Parameter : 1e-05	 Accuracy: 0.6491228070175439

Logistic Regression
Regularization Parameter : 5	 Accuracy: 0.9736842105263158

Logistic Regression
Regularization Parameter : 20	 Accuracy: 0.956140350877193


*******************

K Nearest Neighbor
No of Neighbors : 1

              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93        74
         1.0       0.88      0.88      0.88        40

    accuracy                           0.91       114
   macro avg       0.90      0.90      0.90       114
weighted avg       0.91      0.91      0.91       114

***

K Nearest Neighbor
No of Neighbors : 3

              precision    recall  f1-score   support

         0.0       0.99      0.95      0.97        77
         1.0       0.90      0.97      0.94        37

    accuracy                           0.96       114
   macro avg       0.94      0.96      0.95       114
weighted avg       0.96     