In [1]:
# install dependencies
import sys
# !{sys.executable} -m pip install pandas

In [2]:
# import dependencies
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

import warnings

#### Exploratory Data Analysis

In [3]:
##### data loading and feature extraction

dataset_1_path = './datasets/project3_dataset1.txt'
dataset_1 = pd.read_csv(dataset_1_path,sep='\t',header=None)
num_columns = len(dataset_1.columns)
num_features = num_columns - 1
dataset_1.columns=["F"+str(i) for i in range(1, num_columns + 1)]
label_column = "F{0}".format(num_columns)
dataset_1_features = dataset_1.loc[:, dataset_1.columns != label_column]
dataset_1_label = dataset_1.loc[:, dataset_1.columns == label_column]



dataset_2_path = './datasets/project3_dataset2.txt'
# load the CSV file as a dataframe
dataset_2 = pd.read_csv(dataset_2_path,sep='\t',header=None)
num_columns = len(dataset_2.columns)
num_features = num_columns - 1
dataset_2.columns=["F"+str(i) for i in range(1, num_columns + 1)]
label_column = "F{0}".format(num_columns)
custom_encoding = {'Present':1, 'Absent':0}
dataset_2["F5"] = dataset_2["F5"].map(custom_encoding)
dataset_2_features = dataset_2.loc[:, dataset_2.columns != label_column]
dataset_2_label = dataset_2.loc[:, dataset_2.columns == label_column]


In [4]:
X, Y = dataset_1_features, dataset_1_label

In [5]:
X2, Y2 = dataset_2_features, dataset_2_label

In [6]:
#X

In [7]:
#Y

In [8]:
#dataset_1

In [9]:
#dataset_2

In [10]:
##### model implementation


# test_split_ratio = 0.2
# x_train, x_test, y_train, y_test = train_test_split(dataset_1_features, dataset_1_label, test_size=test_split_ratio, random_state=0)

# # logistic regression with ridge regression
# def logistic_regression(x_train,x_test,y_train,y_test,reg_param):
#     # all parameters not specified are set to their defaults
#     if(reg_param > 0):
#         logisticRegr = LogisticRegression(penalty="l2",C=reg_param)
#     else:
#         logisticRegr = LogisticRegression(penalty="none") # default l2 reg param 
        
#     scaler = preprocessing.StandardScaler().fit(x_train)
#     x_scaled_train = scaler.transform(x_train)
#     x_scaled_test = scaler.transform(x_test)

#     logisticRegr.fit(x_scaled_train, y_train.values.ravel())

#     prediction = logisticRegr.predict(x_scaled_test)

#     cnf_matrix = metrics.confusion_matrix(y_test, prediction)
#     score = logisticRegr.score(x_scaled_test, y_test)
#     print("Logistic Regression")
#     print("Regularization Parameter : {0}\t Accuracy: {1}\n".format(reg_param,score))

    
#     print(metrics.classification_report(prediction, y_test))

#     plt.figure(figsize=(9,9))
#     sns.heatmap(cnf_matrix, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
#     plt.ylabel('Actual label');
#     plt.xlabel('Predicted label');
#     all_sample_title = 'Accuracy Score: {0}'.format(score)
#     plt.title(all_sample_title, size = 15);
    

# KNN
# def knn(x_train,x_test,y_train,y_test,k):
    
#     scaler = preprocessing.StandardScaler().fit(x_train)
#     x_scaled_train = scaler.transform(x_train)
#     x_scaled_test = scaler.transform(x_test)
    
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(x_scaled_train, y_train.values.ravel())
    
#     prediction = knn.predict(x_scaled_test)
#     # Use score method to get accuracy of model
#     score = knn.score(x_scaled_test, y_test)
#     print("K Nearest Neighbor")
#     print("No of Neighbors : {0}\n".format(k,score))
#     print(metrics.classification_report(prediction, y_test))
#     print("***\n")

    
# def decision_tree(x_train,x_test,y_train,y_test):
#     clf = tree.DecisionTreeClassifier()
#     clf = clf.fit(x_train, y_train)
#     prediction = clf.predict(x_test)
#     print("Decision Tree")
#     print(metrics.classification_report(prediction, y_test))


# def adaboost(x_train,x_test,y_train,y_test):
#     clf = AdaBoostClassifier(n_estimators=100, random_state=0)
#     clf = clf.fit(x_train, y_train.values.ravel())
#     prediction = clf.predict(x_test)
#     print("AdaBoost")
#     print(metrics.classification_report(prediction, y_test))
    
    
# def svm(x_train,x_test,y_train,y_test,reg_param):
#     clf = SVC(C=1/reg_param)
#     clf = clf.fit(x_train, y_train.values.ravel())
#     prediction = clf.predict(x_test)
#     print("Support Vector Machine - SVM")
#     print(metrics.classification_report(prediction, y_test))

#### Model Definitions

In [11]:
def svm(X,Y,reg_param):
    clf = SVC(C=1/reg_param)
    cv_results = cross_validate(clf, X,Y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

In [12]:
def adaboost(X,Y):
    clf = AdaBoostClassifier(n_estimators=100, random_state=0)
    clf = clf.fit(X, Y.values.ravel())
    cv_results = cross_validate(clf, X,Y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

In [13]:
def logistic_regression(X,y,reg_param, max_iter=100):
    # all parameters not specified are set to their defaults
    if(reg_param > 0):
        logisticRegr = LogisticRegression(penalty="l2",C=reg_param, max_iter=max_iter)
    else:
        logisticRegr = LogisticRegression(penalty="none", max_iter=max_iter) # default l2 reg param 
        
    scaler = preprocessing.StandardScaler().fit(X)
    x_scaled = scaler.transform(X)
    cv_results = cross_validate(logisticRegr, x_scaled,y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

In [14]:
# TODO: Can I use the sklearn implementation? Or should I use the decision tree classifier to create custom random forest methodß
def random_forest(X,y):
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    cv_results = cross_validate(clf, X,y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

In [15]:
def knn(X,y,k):
    
    scaler = preprocessing.StandardScaler().fit(X)
    x_scaled = scaler.transform(X)
    #x_scaled_test = scaler.transform(x_test)
    
    knn = KNeighborsClassifier(n_neighbors=k)
    
    cv_results = cross_validate(knn, x_scaled,y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

In [16]:
def decision_tree(X,y):
    clf = tree.DecisionTreeClassifier()
    cv_results = cross_validate(clf, X,y, cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

#### DataSet 1 Training and Evaluation

In [17]:
# logistic regression
#logistic_regression(x_train,x_test,y_train,y_test,10**-5) # weak regularization leads to overfitting
#logistic_regression(x_train,x_test,y_train,y_test,5)  # right amount of regularization improves perfomance 
#logistic_regression(x_train,x_test,y_train,y_test,20)  # too strong regularization leads to underfitting

In [18]:
warnings.filterwarnings('ignore')
logistic_regression(X,Y,10**-5) # weak regularization leads to overfitting
# Warning is due to overfitting: https://github.com/Berkeley-Data/hpt/issues/52#issuecomment-803665466
# Suppressed warning according to: https://stackoverflow.com/questions/43162506/undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi

{'fit_time': array([0.0081501 , 0.00448203, 0.003685  , 0.00308299, 0.00206208,
        0.00291514, 0.00336289, 0.00205708, 0.00213814, 0.00210786]),
 'score_time': array([0.00300097, 0.00201797, 0.00181103, 0.0023551 , 0.001791  ,
        0.00190806, 0.00198507, 0.00179195, 0.00178194, 0.001791  ]),
 'test_accuracy': array([0.61403509, 0.61403509, 0.63157895, 0.63157895, 0.63157895,
        0.63157895, 0.63157895, 0.63157895, 0.63157895, 0.625     ]),
 'train_accuracy': array([0.62890625, 0.62890625, 0.62695312, 0.62695312, 0.62695312,
        0.62695312, 0.62695312, 0.62695312, 0.62695312, 0.62768031]),
 'test_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [19]:
warnings.filterwarnings('always')

In [20]:
logistic_regression(X,Y,5)  # right amount of regularization improves perfomance 

{'fit_time': array([0.03279591, 0.0117631 , 0.01478815, 0.01623487, 0.01253414,
        0.00794435, 0.01182103, 0.0109489 , 0.01152515, 0.00843191]),
 'score_time': array([0.0020113 , 0.00187588, 0.00199175, 0.00184011, 0.00183082,
        0.00176883, 0.00180101, 0.00175905, 0.00176167, 0.00366998]),
 'test_accuracy': array([0.98245614, 1.        , 1.        , 0.98245614, 0.96491228,
        0.98245614, 0.94736842, 0.98245614, 0.96491228, 1.        ]),
 'train_accuracy': array([0.99023438, 0.98828125, 0.98828125, 0.9921875 , 0.98632812,
        0.9921875 , 0.99023438, 0.99023438, 0.99023438, 0.98830409]),
 'test_precision': array([0.95652174, 1.        , 1.        , 1.        , 1.        ,
        1.        , 0.90909091, 1.        , 0.95238095, 1.        ]),
 'train_precision': array([0.99465241, 0.99462366, 0.99465241, 1.        , 0.99462366,
        1.        , 0.99468085, 0.98947368, 0.99468085, 0.99465241]),
 'test_recall': array([1.        , 1.        , 1.        , 0.95238095, 0.9

In [21]:
logistic_regression(X,Y,20, max_iter=300)  # too strong regularization leads to underfitting
# Rishabh - I had to increase max_iter because the model wasn't converging earlier
# https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter
# TODO: Should we increase max_iter for all cases?

{'fit_time': array([0.02870488, 0.01867795, 0.01326489, 0.01514196, 0.01462889,
        0.01723528, 0.01634097, 0.02004886, 0.02353907, 0.02004194]),
 'score_time': array([0.00197411, 0.00190496, 0.00180101, 0.00179076, 0.0017941 ,
        0.00179791, 0.00183201, 0.00184703, 0.00201488, 0.00190902]),
 'test_accuracy': array([0.96491228, 1.        , 1.        , 0.96491228, 0.96491228,
        0.98245614, 0.94736842, 0.96491228, 0.94736842, 0.96428571]),
 'train_accuracy': array([0.99023438, 0.99023438, 0.99023438, 0.9921875 , 0.98828125,
        0.99414062, 0.9921875 , 0.9921875 , 0.99023438, 0.99025341]),
 'test_precision': array([0.95454545, 1.        , 1.        , 1.        , 1.        ,
        1.        , 0.90909091, 0.95238095, 0.95      , 0.95238095]),
 'train_precision': array([0.99465241, 0.99465241, 0.99468085, 1.        , 0.99465241,
        1.        , 0.99470899, 0.99470899, 0.99468085, 0.99468085]),
 'test_recall': array([0.95454545, 1.        , 1.        , 0.9047619 , 0.9

In [22]:
# KNN classification
# knn(x_train,x_test,y_train,y_test,1) # 1 neighbor would give perfect accuracy for training set, overfitting
# knn(x_train,x_test,y_train,y_test,3) # 5 neighbors seems to generalize well
# knn(x_train,x_test,y_train,y_test,100) # 100 neighbors will lead to underfitting

In [23]:
knn(X,Y,1) # 1 neighbor would give perfect accuracy for training set, overfitting

{'fit_time': array([0.00033307, 0.00027084, 0.00028419, 0.00026083, 0.00030208,
        0.00029016, 0.00025988, 0.00024295, 0.00025296, 0.00024819]),
 'score_time': array([0.00645685, 0.0035131 , 0.00348997, 0.0039742 , 0.00477123,
        0.00369215, 0.00497389, 0.00358725, 0.00345588, 0.00542307]),
 'test_accuracy': array([0.96491228, 0.94736842, 0.98245614, 0.92982456, 0.92982456,
        0.94736842, 0.9122807 , 0.96491228, 0.94736842, 0.94642857]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.95454545, 0.91304348, 0.95454545, 0.94736842, 0.9047619 ,
        0.95      , 1.        , 1.        , 0.95      , 0.875     ]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.95454545, 0.95454545, 1.        , 0.85714286, 0.9047619 ,
        0.9047619 , 0.76190476, 0.9047619 , 0.9047619 , 1.        ]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.95454545, 

In [24]:
knn(X,Y,3) # 5 neighbors seems to generalize well

{'fit_time': array([0.00030589, 0.00027108, 0.00027084, 0.00024605, 0.00024819,
        0.0002377 , 0.00024796, 0.00024509, 0.00023818, 0.0002408 ]),
 'score_time': array([0.0047791 , 0.00399494, 0.0037272 , 0.00367498, 0.00366211,
        0.00365114, 0.0038209 , 0.00367904, 0.00365901, 0.00413895]),
 'test_accuracy': array([0.98245614, 0.96491228, 0.96491228, 0.9122807 , 0.98245614,
        0.98245614, 0.94736842, 0.96491228, 0.96491228, 0.96428571]),
 'train_accuracy': array([0.98046875, 0.97851562, 0.97851562, 0.98632812, 0.98242188,
        0.98046875, 0.984375  , 0.98242188, 0.98242188, 0.98245614]),
 'test_precision': array([1.        , 0.95454545, 0.95238095, 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.91304348]),
 'train_precision': array([0.99450549, 0.99447514, 0.99450549, 1.        , 0.99456522,
        1.        , 0.99459459, 0.99456522, 0.99456522, 0.99456522]),
 'test_recall': array([0.95454545, 0.95454545, 0.95238095, 0.76190476, 0.9

In [25]:
knn(X,Y,100) # 100 neighbors will lead to underfitting

{'fit_time': array([0.00028419, 0.00026512, 0.00026798, 0.00024796, 0.00026298,
        0.00026894, 0.0002501 , 0.00026107, 0.00026488, 0.00026011]),
 'score_time': array([0.00610685, 0.00539184, 0.00514388, 0.00501704, 0.00500393,
        0.00493598, 0.00492978, 0.00557899, 0.00490189, 0.00484776]),
 'test_accuracy': array([0.96491228, 0.9122807 , 0.94736842, 0.85964912, 0.96491228,
        0.94736842, 0.9122807 , 0.94736842, 0.92982456, 0.94642857]),
 'train_accuracy': array([0.92773438, 0.93359375, 0.93359375, 0.94335938, 0.93164062,
        0.93359375, 0.9375    , 0.9296875 , 0.93554688, 0.93957115]),
 'test_precision': array([1.        , 0.94736842, 1.        , 1.        , 1.        ,
        1.        , 0.94444444, 1.        , 1.        , 1.        ]),
 'train_precision': array([0.98726115, 0.99367089, 0.99371069, 0.99390244, 0.9875    ,
        0.98757764, 1.        , 0.99363057, 0.99375   , 0.99382716]),
 'test_recall': array([0.90909091, 0.81818182, 0.85714286, 0.61904762, 0.9

In [26]:
# decision tree
#decision_tree(x_train,x_test,y_train,y_test)
decision_tree(X,Y)

{'fit_time': array([0.01040316, 0.00732517, 0.00635505, 0.00665975, 0.00657988,
        0.00608301, 0.006001  , 0.00612879, 0.00934887, 0.00630808]),
 'score_time': array([0.00454402, 0.00363398, 0.00349689, 0.00345397, 0.00348401,
        0.0035789 , 0.00342894, 0.00341892, 0.00336814, 0.00339484]),
 'test_accuracy': array([0.92982456, 0.84210526, 0.98245614, 0.87719298, 0.96491228,
        0.89473684, 0.9122807 , 0.94736842, 0.94736842, 0.94642857]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.875     , 0.76      , 0.95454545, 0.9375    , 1.        ,
        0.8       , 0.86363636, 0.95      , 0.95      , 0.90909091]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.95454545, 0.86363636, 1.        , 0.71428571, 0.9047619 ,
        0.95238095, 0.9047619 , 0.9047619 , 0.9047619 , 0.95238095]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.91304348, 

In [27]:
# svm
#svm(x_train,x_test,y_train,y_test,0.001)
svm(X,Y,0.001)

{'fit_time': array([0.00851393, 0.00681591, 0.00646114, 0.00605798, 0.00568604,
        0.00606918, 0.0066812 , 0.00598979, 0.0066011 , 0.00611615]),
 'score_time': array([0.00487685, 0.00376201, 0.00373888, 0.00358677, 0.00365806,
        0.00378513, 0.00359702, 0.00406718, 0.00396705, 0.00362802]),
 'test_accuracy': array([0.96491228, 0.94736842, 0.98245614, 0.9122807 , 0.92982456,
        0.96491228, 0.92982456, 0.98245614, 1.        , 0.92857143]),
 'train_accuracy': array([0.95898438, 0.95703125, 0.953125  , 0.96484375, 0.96289062,
        0.95507812, 0.95703125, 0.95507812, 0.95507812, 0.96296296]),
 'test_precision': array([0.95454545, 1.        , 1.        , 0.9       , 0.9047619 ,
        1.        , 0.9047619 , 1.        , 1.        , 0.9047619 ]),
 'train_precision': array([0.96174863, 0.9516129 , 0.94652406, 0.9726776 , 0.9673913 ,
        0.9516129 , 0.95187166, 0.95652174, 0.9516129 , 0.96236559]),
 'test_recall': array([0.95454545, 0.86363636, 0.95238095, 0.85714286, 0.9

In [28]:
#adaboost(x_train,x_test,y_train,y_test)
adaboost(X,Y)

{'fit_time': array([0.16954303, 0.16697717, 0.16718793, 0.16753387, 0.16661811,
        0.16650009, 0.16641688, 0.16662574, 0.1671629 , 0.16692185]),
 'score_time': array([0.01539993, 0.01510096, 0.01518798, 0.01512098, 0.01498795,
        0.01503181, 0.01502419, 0.01495218, 0.0152812 , 0.01506734]),
 'test_accuracy': array([0.96491228, 0.98245614, 0.98245614, 0.94736842, 0.98245614,
        0.98245614, 0.98245614, 0.94736842, 0.94736842, 0.98214286]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.95454545, 1.        , 1.        , 0.95      , 1.        ,
        1.        , 1.        , 0.95      , 0.95      , 0.95454545]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.95454545, 0.95454545, 0.95238095, 0.9047619 , 0.95238095,
        0.95238095, 0.95238095, 0.9047619 , 0.9047619 , 1.        ]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.95454545, 

In [29]:
#random_forest(x_train,x_test,y_train,y_test)
random_forest(X,Y)

{'fit_time': array([0.0853579 , 0.08226204, 0.08298492, 0.08229208, 0.08250999,
        0.08231592, 0.08229113, 0.08229303, 0.08249211, 0.08323812]),
 'score_time': array([0.01145792, 0.01112008, 0.01118183, 0.01107216, 0.01115084,
        0.01103115, 0.01101899, 0.01100898, 0.01115823, 0.01107693]),
 'test_accuracy': array([0.94736842, 0.92982456, 0.98245614, 0.9122807 , 0.94736842,
        0.94736842, 0.96491228, 0.96491228, 0.94736842, 0.92857143]),
 'train_accuracy': array([0.96289062, 0.96484375, 0.95898438, 0.96289062, 0.9609375 ,
        0.95703125, 0.95898438, 0.95703125, 0.95703125, 0.96101365]),
 'test_precision': array([0.95238095, 0.95      , 1.        , 0.94444444, 0.95      ,
        0.90909091, 1.        , 1.        , 0.90909091, 0.86956522]),
 'train_precision': array([0.97237569, 0.9673913 , 0.96703297, 0.96236559, 0.95238095,
        0.96174863, 0.95698925, 0.96174863, 0.95675676, 0.96721311]),
 'test_recall': array([0.90909091, 0.86363636, 0.95238095, 0.80952381, 0.9

#### DataSet 2 Training and Evaluation

In [30]:
#import warnings
warnings.filterwarnings('ignore')
logistic_regression(X2,Y2,10**-5) # weak regularization leads to overfitting
# Warning is due to overfitting: https://github.com/Berkeley-Data/hpt/issues/52#issuecomment-803665466
# Suppressed warning according to: https://stackoverflow.com/questions/43162506/undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi

{'fit_time': array([0.00190115, 0.00212908, 0.00210404, 0.00188708, 0.00173807,
        0.00186491, 0.00176215, 0.00171804, 0.00198388, 0.00171089]),
 'score_time': array([0.00174999, 0.00231981, 0.00210309, 0.00169897, 0.00165081,
        0.00170803, 0.00166392, 0.00163412, 0.00163293, 0.00162196]),
 'test_accuracy': array([0.65957447, 0.65957447, 0.65217391, 0.65217391, 0.65217391,
        0.65217391, 0.65217391, 0.65217391, 0.65217391, 0.65217391]),
 'train_accuracy': array([0.65301205, 0.65301205, 0.65384615, 0.65384615, 0.65384615,
        0.65384615, 0.65384615, 0.65384615, 0.65384615, 0.65384615]),
 'test_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [31]:
warnings.filterwarnings('always')

In [32]:
logistic_regression(X2,Y2,5)  # right amount of regularization improves perfomance 

{'fit_time': array([0.00187492, 0.00241518, 0.00179195, 0.00178313, 0.00164509,
        0.00163889, 0.0016551 , 0.00170541, 0.00172496, 0.00177932]),
 'score_time': array([0.00183725, 0.0018189 , 0.00187206, 0.00169182, 0.00162411,
        0.00172114, 0.00162792, 0.00161886, 0.00162816, 0.00160789]),
 'test_accuracy': array([0.74468085, 0.70212766, 0.7173913 , 0.73913043, 0.63043478,
        0.7173913 , 0.7826087 , 0.73913043, 0.67391304, 0.7173913 ]),
 'train_accuracy': array([0.73493976, 0.73493976, 0.73798077, 0.73798077, 0.74038462,
        0.75240385, 0.72836538, 0.74038462, 0.74038462, 0.73798077]),
 'test_precision': array([0.6       , 0.58333333, 0.58823529, 0.625     , 0.45454545,
        0.66666667, 0.8       , 0.64285714, 0.54545455, 0.61538462]),
 'train_precision': array([0.6440678 , 0.64166667, 0.64957265, 0.65486726, 0.65789474,
        0.67521368, 0.6302521 , 0.65517241, 0.64516129, 0.64957265]),
 'test_recall': array([0.75  , 0.4375, 0.625 , 0.625 , 0.3125, 0.375 , 0.5

In [33]:
logistic_regression(X2,Y2,20, max_iter=300)  # too strong regularization leads to underfitting
# Rishabh - I had to increase max_iter because the model wasn't converging earlier
# https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter
# TODO: Should we increase max_iter for all cases?

{'fit_time': array([0.00219321, 0.00217485, 0.0018518 , 0.00193095, 0.00178289,
        0.00173283, 0.00162387, 0.0016861 , 0.00161982, 0.00188994]),
 'score_time': array([0.00204086, 0.00198412, 0.00166607, 0.00197387, 0.0016489 ,
        0.00163913, 0.00163507, 0.00163078, 0.00161004, 0.00161004]),
 'test_accuracy': array([0.74468085, 0.70212766, 0.7173913 , 0.73913043, 0.63043478,
        0.7173913 , 0.7826087 , 0.73913043, 0.67391304, 0.7173913 ]),
 'train_accuracy': array([0.73493976, 0.7373494 , 0.73798077, 0.73798077, 0.74038462,
        0.75240385, 0.72836538, 0.74038462, 0.74038462, 0.73798077]),
 'test_precision': array([0.6       , 0.58333333, 0.58823529, 0.625     , 0.45454545,
        0.66666667, 0.8       , 0.64285714, 0.54545455, 0.61538462]),
 'train_precision': array([0.6440678 , 0.6446281 , 0.64957265, 0.65486726, 0.65789474,
        0.67521368, 0.6302521 , 0.65517241, 0.64516129, 0.64957265]),
 'test_recall': array([0.75  , 0.4375, 0.625 , 0.625 , 0.3125, 0.375 , 0.5

In [34]:
knn(X2,Y2,1) # 1 neighbor would give perfect accuracy for training set, overfitting

{'fit_time': array([0.00057983, 0.00036788, 0.00036097, 0.00033975, 0.00032902,
        0.00032091, 0.00032067, 0.00032115, 0.00032473, 0.00031996]),
 'score_time': array([0.00722909, 0.00306201, 0.00299811, 0.00301433, 0.003088  ,
        0.0029161 , 0.00294018, 0.0029428 , 0.002913  , 0.00292492]),
 'test_accuracy': array([0.61702128, 0.70212766, 0.69565217, 0.7173913 , 0.63043478,
        0.60869565, 0.69565217, 0.69565217, 0.60869565, 0.63043478]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.45454545, 0.54545455, 0.625     , 0.61538462, 0.45454545,
        0.4375    , 1.        , 0.57142857, 0.41666667, 0.47058824]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.625 , 0.75  , 0.3125, 0.5   , 0.3125, 0.4375, 0.125 , 0.5   ,
        0.3125, 0.5   ]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.52631579, 0.63157895, 0.41666667, 0.55172414, 0.37

In [35]:
knn(X2,Y2,3) # 5 neighbors seems to generalize well

{'fit_time': array([0.00055122, 0.00036693, 0.00036025, 0.00034308, 0.00032473,
        0.00032425, 0.00032783, 0.00032115, 0.00032187, 0.00032091]),
 'score_time': array([0.0036869 , 0.00329208, 0.00311685, 0.00307393, 0.00304222,
        0.00306368, 0.00304317, 0.00302792, 0.003016  , 0.00302911]),
 'test_accuracy': array([0.57446809, 0.63829787, 0.73913043, 0.73913043, 0.67391304,
        0.63043478, 0.69565217, 0.69565217, 0.67391304, 0.67391304]),
 'train_accuracy': array([0.80240964, 0.78554217, 0.79326923, 0.79807692, 0.80288462,
        0.8125    , 0.79807692, 0.79807692, 0.80048077, 0.79086538]),
 'test_precision': array([0.40909091, 0.47058824, 0.75      , 0.625     , 0.54545455,
        0.45454545, 0.75      , 0.58333333, 0.55555556, 0.52941176]),
 'train_precision': array([0.74603175, 0.72727273, 0.73387097, 0.75423729, 0.75833333,
        0.7704918 , 0.74193548, 0.74193548, 0.744     , 0.73553719]),
 'test_recall': array([0.5625, 0.5   , 0.375 , 0.625 , 0.375 , 0.3125, 0.1

In [36]:
knn(X2,Y2,100) # 100 neighbors will lead to underfitting

{'fit_time': array([0.00052214, 0.00037193, 0.00034404, 0.00033402, 0.00033164,
        0.00033188, 0.00033402, 0.00033474, 0.00033379, 0.00033379]),
 'score_time': array([0.00469017, 0.00459027, 0.00434303, 0.00437593, 0.00438237,
        0.00437593, 0.00437999, 0.00436521, 0.00434017, 0.00436711]),
 'test_accuracy': array([0.68085106, 0.72340426, 0.7173913 , 0.67391304, 0.65217391,
        0.63043478, 0.67391304, 0.69565217, 0.73913043, 0.69565217]),
 'train_accuracy': array([0.71325301, 0.69638554, 0.67788462, 0.69951923, 0.70673077,
        0.70673077, 0.70673077, 0.70913462, 0.69471154, 0.69711538]),
 'test_precision': array([0.66666667, 0.8       , 1.        , 0.6       , 0.5       ,
        0.        , 1.        , 0.75      , 1.        , 0.75      ]),
 'train_precision': array([0.83783784, 0.71428571, 0.85714286, 0.80645161, 0.86666667,
        0.78947368, 0.75      , 0.81081081, 0.81481481, 0.82142857]),
 'test_recall': array([0.125 , 0.25  , 0.1875, 0.1875, 0.0625, 0.    , 0.0

In [37]:
# decision tree
#decision_tree(x_train,x_test,y_train,y_test)
decision_tree(X2,Y2)

{'fit_time': array([0.00277591, 0.00271392, 0.00283504, 0.00241208, 0.00250292,
        0.0023191 , 0.00231194, 0.0024519 , 0.00238705, 0.00226784]),
 'score_time': array([0.00347996, 0.00320411, 0.00343275, 0.00306821, 0.00315404,
        0.00306082, 0.0030489 , 0.00305414, 0.00307202, 0.00304508]),
 'test_accuracy': array([0.57446809, 0.68085106, 0.65217391, 0.45652174, 0.60869565,
        0.60869565, 0.60869565, 0.63043478, 0.65217391, 0.65217391]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.38888889, 0.53846154, 0.5       , 0.23529412, 0.44444444,
        0.42857143, 0.4375    , 0.47058824, 0.5       , 0.5       ]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.4375, 0.4375, 0.4375, 0.25  , 0.5   , 0.375 , 0.4375, 0.5   ,
        0.4375, 0.375 ]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.41176471, 0.48275862, 0.46666667, 0.24242424, 0.47

In [38]:
# svm
#svm(x_train,x_test,y_train,y_test,0.001)
svm(X2,Y2,0.001)

{'fit_time': array([0.02203703, 0.0216949 , 0.02450514, 0.01970601, 0.02295089,
        0.02301812, 0.02016377, 0.02209902, 0.01975393, 0.02273917]),
 'score_time': array([0.0060401 , 0.00510597, 0.00499511, 0.00491619, 0.00485492,
        0.00485969, 0.00491309, 0.00496888, 0.00479484, 0.00482368]),
 'test_accuracy': array([0.65957447, 0.80851064, 0.80434783, 0.73913043, 0.67391304,
        0.7173913 , 0.86956522, 0.76086957, 0.65217391, 0.63043478]),
 'train_accuracy': array([0.80240964, 0.78795181, 0.78846154, 0.79326923, 0.81490385,
        0.80288462, 0.78125   , 0.79326923, 0.81009615, 0.80048077]),
 'test_precision': array([0.5       , 0.73333333, 0.76923077, 0.64285714, 0.55555556,
        0.71428571, 1.        , 0.72727273, 0.5       , 0.42857143]),
 'train_precision': array([0.79245283, 0.74561404, 0.75925926, 0.78431373, 0.78151261,
        0.75833333, 0.73451327, 0.76363636, 0.77310924, 0.7699115 ]),
 'test_recall': array([0.625 , 0.6875, 0.625 , 0.5625, 0.3125, 0.3125, 0.6

In [39]:
#adaboost(x_train,x_test,y_train,y_test)
adaboost(X2,Y2)

{'fit_time': array([0.07746696, 0.07737875, 0.0775218 , 0.07684183, 0.07675815,
        0.07670999, 0.07677817, 0.07802796, 0.07670617, 0.07661414]),
 'score_time': array([0.0151    , 0.01548529, 0.0147469 , 0.01467919, 0.01469398,
        0.01464796, 0.01510787, 0.01479602, 0.01552391, 0.01453686]),
 'test_accuracy': array([0.59574468, 0.63829787, 0.65217391, 0.63043478, 0.52173913,
        0.67391304, 0.69565217, 0.65217391, 0.69565217, 0.67391304]),
 'train_accuracy': array([0.85060241, 0.86506024, 0.85336538, 0.87740385, 0.86298077,
        0.87740385, 0.85336538, 0.84615385, 0.87259615, 0.86538462]),
 'test_precision': array([0.4       , 0.47619048, 0.5       , 0.47368421, 0.28571429,
        0.55555556, 0.58333333, 0.5       , 0.58333333, 0.57142857]),
 'train_precision': array([0.80597015, 0.86065574, 0.82170543, 0.88429752, 0.84251969,
        0.83941606, 0.81203008, 0.78985507, 0.85826772, 0.83846154]),
 'test_recall': array([0.375 , 0.625 , 0.4375, 0.5625, 0.25  , 0.3125, 0.4

In [40]:
#random_forest(x_train,x_test,y_train,y_test)
random_forest(X2,Y2)

{'fit_time': array([0.07062721, 0.06813478, 0.06928897, 0.06853294, 0.06882691,
        0.06925988, 0.06884289, 0.06789303, 0.06789613, 0.06750584]),
 'score_time': array([0.01136684, 0.01126599, 0.01143289, 0.01125884, 0.011379  ,
        0.01154709, 0.01125002, 0.01106405, 0.0109489 , 0.01092315]),
 'test_accuracy': array([0.72340426, 0.72340426, 0.76086957, 0.67391304, 0.69565217,
        0.69565217, 0.63043478, 0.67391304, 0.76086957, 0.73913043]),
 'train_accuracy': array([0.74457831, 0.74939759, 0.75721154, 0.75721154, 0.74759615,
        0.74519231, 0.74278846, 0.74519231, 0.75480769, 0.76682692]),
 'test_precision': array([0.63636364, 0.71428571, 0.77777778, 0.54545455, 0.75      ,
        0.66666667, 0.33333333, 0.57142857, 0.85714286, 0.83333333]),
 'train_precision': array([0.796875  , 0.8125    , 0.79452055, 0.83076923, 0.80952381,
        0.77941176, 0.76056338, 0.80645161, 0.81818182, 0.85074627]),
 'test_recall': array([0.4375, 0.3125, 0.4375, 0.375 , 0.1875, 0.25  , 0.0