In [2]:
# install dependencies
import sys
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install seaborn

In [3]:
# import dependencies
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

from IPython.display import display
from collections import Counter

import warnings

#### Exploratory Data Analysis

In [4]:
##### data loading and feature extraction

dataset_1_path = './datasets/project3_dataset1.txt'
dataset_1 = pd.read_csv(dataset_1_path,sep='\t',header=None)
num_columns = len(dataset_1.columns)
num_features = num_columns - 1
dataset_1.columns=["F"+str(i) for i in range(1, num_columns + 1)]
label_column = "F{0}".format(num_columns)
dataset_1_features = dataset_1.loc[:, dataset_1.columns != label_column]
dataset_1_label = dataset_1.loc[:, dataset_1.columns == label_column]



dataset_2_path = './datasets/project3_dataset2.txt'
# load the CSV file as a dataframe
dataset_2 = pd.read_csv(dataset_2_path,sep='\t',header=None)
num_columns = len(dataset_2.columns)
num_features = num_columns - 1
dataset_2.columns=["F"+str(i) for i in range(1, num_columns + 1)]
label_column = "F{0}".format(num_columns)
custom_encoding = {'Present':1, 'Absent':0}
dataset_2["F5"] = dataset_2["F5"].map(custom_encoding)
dataset_2_features = dataset_2.loc[:, dataset_2.columns != label_column]
dataset_2_label = dataset_2.loc[:, dataset_2.columns == label_column]


In [16]:
X, Y = dataset_1_features, dataset_1_label

In [17]:
X2, Y2 = dataset_2_features, dataset_2_label

In [7]:
#X

In [8]:
#Y

In [9]:
#dataset_1

In [10]:
#dataset_2

In [11]:
##### model implementation


# test_split_ratio = 0.2
# x_train, x_test, y_train, y_test = train_test_split(dataset_1_features, dataset_1_label, test_size=test_split_ratio, random_state=0)

# # logistic regression with ridge regression
# def logistic_regression(x_train,x_test,y_train,y_test,reg_param):
#     # all parameters not specified are set to their defaults
#     if(reg_param > 0):
#         logisticRegr = LogisticRegression(penalty="l2",C=reg_param)
#     else:
#         logisticRegr = LogisticRegression(penalty="none") # default l2 reg param 
        
#     scaler = preprocessing.StandardScaler().fit(x_train)
#     x_scaled_train = scaler.transform(x_train)
#     x_scaled_test = scaler.transform(x_test)

#     logisticRegr.fit(x_scaled_train, y_train.values.ravel())

#     prediction = logisticRegr.predict(x_scaled_test)

#     cnf_matrix = metrics.confusion_matrix(y_test, prediction)
#     score = logisticRegr.score(x_scaled_test, y_test)
#     print("Logistic Regression")
#     print("Regularization Parameter : {0}\t Accuracy: {1}\n".format(reg_param,score))

    
#     print(metrics.classification_report(prediction, y_test))

#     plt.figure(figsize=(9,9))
#     sns.heatmap(cnf_matrix, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
#     plt.ylabel('Actual label');
#     plt.xlabel('Predicted label');
#     all_sample_title = 'Accuracy Score: {0}'.format(score)
#     plt.title(all_sample_title, size = 15);
    

# KNN
# def knn(x_train,x_test,y_train,y_test,k):
    
#     scaler = preprocessing.StandardScaler().fit(x_train)
#     x_scaled_train = scaler.transform(x_train)
#     x_scaled_test = scaler.transform(x_test)
    
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(x_scaled_train, y_train.values.ravel())
    
#     prediction = knn.predict(x_scaled_test)
#     # Use score method to get accuracy of model
#     score = knn.score(x_scaled_test, y_test)
#     print("K Nearest Neighbor")
#     print("No of Neighbors : {0}\n".format(k,score))
#     print(metrics.classification_report(prediction, y_test))
#     print("***\n")

    
# def decision_tree(x_train,x_test,y_train,y_test):
#     clf = tree.DecisionTreeClassifier()
#     clf = clf.fit(x_train, y_train)
#     prediction = clf.predict(x_test)
#     print("Decision Tree")
#     print(metrics.classification_report(prediction, y_test))


# def adaboost(x_train,x_test,y_train,y_test):
#     clf = AdaBoostClassifier(n_estimators=100, random_state=0)
#     clf = clf.fit(x_train, y_train.values.ravel())
#     prediction = clf.predict(x_test)
#     print("AdaBoost")
#     print(metrics.classification_report(prediction, y_test))
    
    
# def svm(x_train,x_test,y_train,y_test,reg_param):
#     clf = SVC(C=1/reg_param)
#     clf = clf.fit(x_train, y_train.values.ravel())
#     prediction = clf.predict(x_test)
#     print("Support Vector Machine - SVM")
#     print(metrics.classification_report(prediction, y_test))

#### Model Definitions

In [78]:
def svm(X,Y,reg_param):
    clf = SVC(C=reg_param)
    cv_results = cross_validate(clf, X,Y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    mean_metrics(cv_results)
    return cv_results

In [7]:
def decision_tree(X,y):
    clf = tree.DecisionTreeClassifier()
    cv_results = cross_validate(clf, X,y, cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    mean_metrics(cv_results)
    return cv_results

In [8]:
def adaboost(X,Y):
    clf = AdaBoostClassifier(n_estimators=100, random_state=0)
    clf = clf.fit(X, Y.values.ravel())
    cv_results = cross_validate(clf, X,Y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    mean_metrics(cv_results)
    return cv_results

In [46]:
def logistic_regression(X,y,reg_param, max_iter=100):
    # all parameters not specified are set to their defaults
    #reg_param is inverse strength
    if(reg_param > 0):
        logisticRegr = LogisticRegression(penalty="l2",C=1/reg_param, max_iter=max_iter)
    else:
        logisticRegr = LogisticRegression(penalty="none", max_iter=max_iter) # default l2 reg param 
        
    scaler = preprocessing.StandardScaler().fit(X)
    x_scaled = scaler.transform(X)
    cv_results = cross_validate(logisticRegr, x_scaled,y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    mean_metrics(cv_results)
    return cv_results

In [31]:
# TODO: Can I use the sklearn implementation? Or should I use the decision tree classifier to create custom random forest methodß
# def random_forest(X,y):
#     clf = RandomForestClassifier(max_depth=2, random_state=0)
#     cv_results = cross_validate(clf, X,y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
#     mean_metrics(cv_results)
#     return cv_results


class RandomForest:
    def __init__(self, n_trees=10):
        self.n_trees = n_trees
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            X_sample, y_sample = self.bagging_samples(X, y)
            dec_tree = tree.DecisionTreeClassifier()
            dec_tree.fit(X_sample, y_sample)
            self.trees.append(dec_tree)

    def bagging_samples(self, X, y):
        n_samples = X.shape[0]
        X_sampled = X.sample(frac=1, replace=True, random_state=1)
        return X_sampled, y.loc[X_sampled.index]

    def find_majority_vote(self, y):
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(predictions, 0, 1)
        predictions = np.array([self.find_majority_vote(pred) for pred in tree_preds])
        return predictions
    

def random_forest(X, Y):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

    clf = RandomForest(n_trees=20)
    clf.fit(x_train, y_train)
    prediction = clf.predict(x_test)
    print(metrics.classification_report(prediction, y_test))


In [13]:
def knn(X,y,k):
    
    scaler = preprocessing.StandardScaler().fit(X)
    x_scaled = scaler.transform(X)
    #x_scaled_test = scaler.transform(x_test)
    
    knn = KNeighborsClassifier(n_neighbors=k)
    
    cv_results = cross_validate(knn, x_scaled,y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    mean_metrics(cv_results)
    return cv_results

In [38]:
def mean_metrics(results):
    # display(results)
    print('\nMean Test Accuracy: ',results['test_accuracy'].mean())
    print('Mean Train Accuracy: ',results['train_accuracy'].mean())
    print('Mean Test Precision: ',results['test_precision'].mean())
    print('Mean Train Precision: ',results['train_precision'].mean())
    print('Mean Test Recall: ',results['test_recall'].mean())
    print('Mean Train Recall: ',results['train_recall'].mean())
    print('Mean Test F1 Score: ',results['test_f1'].mean())
    print('Mean Train F1 Score: ',results['train_f1'].mean())

#### DataSet 1 Training and Evaluation

In [17]:
# logistic regression
#logistic_regression(x_train,x_test,y_train,y_test,10**-5) # weak regularization leads to overfitting
#logistic_regression(x_train,x_test,y_train,y_test,5)  # right amount of regularization improves perfomance 
#logistic_regression(x_train,x_test,y_train,y_test,20)  # too strong regularization leads to underfitting

In [59]:
warnings.filterwarnings('ignore')
logistic_regression(X,Y,10**-5) # weak regularization leads to overfitting
# Warning is due to overfitting: https://github.com/Berkeley-Data/hpt/issues/52#issuecomment-803665466
# Suppressed warning according to: https://stackoverflow.com/questions/43162506/undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi


Mean Test Accuracy:  0.9542293233082708
Mean Train Accuracy:  0.9968757614522417
Mean Test Precision:  0.9360987201204592
Mean Train Precision:  0.9968503428437294
Mean Test Recall:  0.9432900432900434
Mean Train Recall:  0.9947588867456598
Mean Test F1 Score:  0.939110502722429
Mean Train F1 Score:  0.9958004850697947


{'fit_time': array([0.03349876, 0.02950048, 0.02949739, 0.02749801, 0.03099799,
        0.02949739, 0.03050017, 0.03199983, 0.03050017, 0.02849984]),
 'score_time': array([0.00149989, 0.00149894, 0.00150204, 0.00150013, 0.00149965,
        0.00149989, 0.00100136, 0.002002  , 0.00150204, 0.0010016 ]),
 'test_accuracy': array([0.92982456, 0.96491228, 1.        , 0.94736842, 0.94736842,
        0.94736842, 0.96491228, 0.96491228, 0.96491228, 0.91071429]),
 'train_accuracy': array([1.        , 0.99414062, 0.9921875 , 1.        , 1.        ,
        0.99414062, 1.        , 1.        , 0.9921875 , 0.99610136]),
 'test_precision': array([0.875     , 0.95454545, 1.        , 0.95      , 0.95      ,
        0.95      , 0.91304348, 0.95238095, 0.95238095, 0.86363636]),
 'train_precision': array([1.        , 0.99470899, 0.9895288 , 1.        , 1.        ,
        0.99473684, 1.        , 1.        , 0.9895288 , 1.        ]),
 'test_recall': array([0.95454545, 0.95454545, 1.        , 0.9047619 , 0.9

In [19]:
warnings.filterwarnings('always')

In [49]:
logistic_regression(X,Y,1)  # right amount of regularization improves perfomance 


Mean Test Accuracy:  0.9824561403508772
Mean Train Accuracy:  0.9882835343567251
Mean Test Precision:  0.9813664596273292
Mean Train Precision:  0.9925501052524751
Mean Test Recall:  0.9714285714285713
Mean Train Recall:  0.975888674565996
Mean Test F1 Score:  0.9761749903213317
Mean Train F1 Score:  0.9841421558039956


{'fit_time': array([0.01249862, 0.01149893, 0.01149964, 0.01050043, 0.0095005 ,
        0.01100135, 0.01000023, 0.01049805, 0.00999928, 0.01049995]),
 'score_time': array([0.00100088, 0.00149941, 0.00099993, 0.00100088, 0.00100088,
        0.00100017, 0.00100088, 0.00150156, 0.00149965, 0.0015018 ]),
 'test_accuracy': array([0.98245614, 1.        , 1.        , 0.98245614, 0.96491228,
        0.98245614, 0.96491228, 0.98245614, 0.96491228, 1.        ]),
 'train_accuracy': array([0.98828125, 0.98632812, 0.98632812, 0.9921875 , 0.98632812,
        0.99023438, 0.98828125, 0.98828125, 0.98828125, 0.98830409]),
 'test_precision': array([0.95652174, 1.        , 1.        , 1.        , 0.95238095,
        1.        , 0.95238095, 1.        , 0.95238095, 1.        ]),
 'train_precision': array([0.99462366, 0.98930481, 0.9893617 , 1.        , 0.99462366,
        0.99468085, 0.98941799, 0.98941799, 0.98941799, 0.99465241]),
 'test_recall': array([1.        , 1.        , 1.        , 0.95238095, 0.9

In [50]:
logistic_regression(X,Y,10**5, max_iter=300)  # too strong regularization leads to underfitting
# Rishabh - I had to increase max_iter because the model wasn't converging earlier
# https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter
# TODO: Should we increase max_iter for all cases?


Mean Test Accuracy:  0.6274122807017544
Mean Train Accuracy:  0.6274164686890839
Mean Test Precision:  0.0
Mean Train Precision:  0.0
Mean Test Recall:  0.0
Mean Train Recall:  0.0
Mean Test F1 Score:  0.0
Mean Train F1 Score:  0.0


{'fit_time': array([0.00599694, 0.00400019, 0.00459456, 0.0049994 , 0.00399804,
        0.00450063, 0.00449944, 0.004498  , 0.00399899, 0.00399923]),
 'score_time': array([0.00149846, 0.00149846, 0.00150156, 0.00100064, 0.00100112,
        0.00099945, 0.00100064, 0.00100136, 0.00150108, 0.00099993]),
 'test_accuracy': array([0.61403509, 0.61403509, 0.63157895, 0.63157895, 0.63157895,
        0.63157895, 0.63157895, 0.63157895, 0.63157895, 0.625     ]),
 'train_accuracy': array([0.62890625, 0.62890625, 0.62695312, 0.62695312, 0.62695312,
        0.62695312, 0.62695312, 0.62695312, 0.62695312, 0.62768031]),
 'test_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [22]:
# KNN classification
# knn(x_train,x_test,y_train,y_test,1) # 1 neighbor would give perfect accuracy for training set, overfitting
# knn(x_train,x_test,y_train,y_test,3) # 5 neighbors seems to generalize well
# knn(x_train,x_test,y_train,y_test,100) # 100 neighbors will lead to underfitting

In [65]:
knn(X,Y,1) # 1 neighbor would give perfect accuracy for training set, overfitting


Mean Test Accuracy:  0.9472744360902257
Mean Train Accuracy:  1.0
Mean Test Precision:  0.9449264713166314
Mean Train Precision:  1.0
Mean Test Recall:  0.9147186147186147
Mean Train Recall:  1.0
Mean Test F1 Score:  0.9271241613470768
Mean Train F1 Score:  1.0


{'fit_time': array([0.        , 0.00049996, 0.        , 0.00049996, 0.00049758,
        0.        , 0.        , 0.        , 0.00049949, 0.00050044]),
 'score_time': array([0.0674994 , 0.01400018, 0.01400137, 0.01399994, 0.01449966,
        0.01399899, 0.01449943, 0.01499963, 0.01450038, 0.01500034]),
 'test_accuracy': array([0.96491228, 0.94736842, 0.98245614, 0.92982456, 0.92982456,
        0.94736842, 0.9122807 , 0.96491228, 0.94736842, 0.94642857]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.95454545, 0.91304348, 0.95454545, 0.94736842, 0.9047619 ,
        0.95      , 1.        , 1.        , 0.95      , 0.875     ]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.95454545, 0.95454545, 1.        , 0.85714286, 0.9047619 ,
        0.9047619 , 0.76190476, 0.9047619 , 0.9047619 , 1.        ]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.95454545, 

In [66]:
knn(X,Y,5) # 5 neighbors seems to generalize well


Mean Test Accuracy:  0.9648809523809524
Mean Train Accuracy:  0.9785198129873294
Mean Test Precision:  0.9772727272727273
Mean Train Precision:  0.9934273114519853
Mean Test Recall:  0.9290043290043289
Mean Train Recall:  0.9486332322953983
Mean Test F1 Score:  0.9507761786434503
Mean Train F1 Score:  0.9705000337380628


{'fit_time': array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.00049996, 0.00050044, 0.        ]),
 'score_time': array([0.01899886, 0.01550055, 0.01549983, 0.01550102, 0.01550102,
        0.01500154, 0.01450014, 0.01400089, 0.01549983, 0.01599836]),
 'test_accuracy': array([0.96491228, 0.96491228, 0.94736842, 0.9122807 , 0.96491228,
        0.98245614, 0.98245614, 0.96491228, 0.98245614, 0.98214286]),
 'train_accuracy': array([0.98046875, 0.97460938, 0.98046875, 0.97460938, 0.98046875,
        0.9765625 , 0.97851562, 0.97851562, 0.98242188, 0.9785575 ]),
 'test_precision': array([0.95454545, 0.95454545, 0.90909091, 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.95454545]),
 'train_precision': array([0.99450549, 0.99441341, 0.99453552, 0.98901099, 0.99453552,
        0.98907104, 0.98913043, 0.99450549, 0.99456522, 1.        ]),
 'test_recall': array([0.95454545, 0.95454545, 0.95238095, 0.76190476, 0.9

In [67]:
knn(X,Y,100) # 100 neighbors will lead to underfitting


Mean Test Accuracy:  0.9332393483709274
Mean Train Accuracy:  0.9345821150097466
Mean Test Precision:  0.9891812865497076
Mean Train Precision:  0.9924830536914842
Mean Test Recall:  0.8298701298701298
Mean Train Recall:  0.8307026729126481
Mean Test F1 Score:  0.9003975021429527
Mean Train F1 Score:  0.9043818963514262


{'fit_time': array([0.        , 0.00049973, 0.        , 0.0005002 , 0.        ,
        0.        , 0.0005002 , 0.        , 0.        , 0.        ]),
 'score_time': array([0.02199793, 0.01750135, 0.01650071, 0.01550007, 0.01700068,
        0.01599956, 0.01749969, 0.01700044, 0.01750016, 0.01749992]),
 'test_accuracy': array([0.96491228, 0.9122807 , 0.94736842, 0.85964912, 0.96491228,
        0.94736842, 0.9122807 , 0.94736842, 0.92982456, 0.94642857]),
 'train_accuracy': array([0.92773438, 0.93359375, 0.93359375, 0.94335938, 0.93164062,
        0.93359375, 0.9375    , 0.9296875 , 0.93554688, 0.93957115]),
 'test_precision': array([1.        , 0.94736842, 1.        , 1.        , 1.        ,
        1.        , 0.94444444, 1.        , 1.        , 1.        ]),
 'train_precision': array([0.98726115, 0.99367089, 0.99371069, 0.99390244, 0.9875    ,
        0.98757764, 1.        , 0.99363057, 0.99375   , 0.99382716]),
 'test_recall': array([0.90909091, 0.81818182, 0.85714286, 0.61904762, 0.9

In [72]:
# decision tree
#decision_tree(x_train,x_test,y_train,y_test)
decision_tree(X,Y)


Mean Test Accuracy:  0.9208646616541353
Mean Train Accuracy:  1.0
Mean Test Precision:  0.9045174447977651
Mean Train Precision:  1.0
Mean Test Recall:  0.8867965367965368
Mean Train Recall:  1.0
Mean Test F1 Score:  0.8920428557443033
Mean Train F1 Score:  1.0


{'fit_time': array([0.00550222, 0.00549793, 0.00549984, 0.00449991, 0.00500035,
        0.00399971, 0.00399995, 0.00449991, 0.00500011, 0.00400019]),
 'score_time': array([0.0034976 , 0.00299978, 0.00300002, 0.00299954, 0.0025003 ,
        0.00300026, 0.00250053, 0.00250006, 0.00300002, 0.00250006]),
 'test_accuracy': array([0.94736842, 0.85964912, 0.98245614, 0.85964912, 0.92982456,
        0.9122807 , 0.92982456, 0.94736842, 0.94736842, 0.89285714]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.95238095, 0.79166667, 0.95454545, 0.93333333, 0.94736842,
        0.83333333, 0.94736842, 0.95      , 0.90909091, 0.82608696]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.90909091, 0.86363636, 1.        , 0.66666667, 0.85714286,
        0.95238095, 0.85714286, 0.9047619 , 0.95238095, 0.9047619 ]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.93023256, 

In [79]:
svm(X,Y,0.001)


Mean Test Accuracy:  0.6274122807017544
Mean Train Accuracy:  0.6274164686890839
Mean Test Precision:  0.0
Mean Train Precision:  0.0
Mean Test Recall:  0.0
Mean Train Recall:  0.0
Mean Test F1 Score:  0.0
Mean Train F1 Score:  0.0


{'fit_time': array([0.00599933, 0.00499988, 0.00499988, 0.00500035, 0.00449991,
        0.00450039, 0.00550032, 0.00500059, 0.00499988, 0.00449991]),
 'score_time': array([0.00550079, 0.00450015, 0.00450015, 0.00399971, 0.00450015,
        0.0044992 , 0.00449991, 0.00449967, 0.00399995, 0.00399995]),
 'test_accuracy': array([0.61403509, 0.61403509, 0.63157895, 0.63157895, 0.63157895,
        0.63157895, 0.63157895, 0.63157895, 0.63157895, 0.625     ]),
 'train_accuracy': array([0.62890625, 0.62890625, 0.62695312, 0.62695312, 0.62695312,
        0.62695312, 0.62695312, 0.62695312, 0.62695312, 0.62768031]),
 'test_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [93]:
svm(X,Y,10**8)


Mean Test Accuracy:  0.9280075187969924
Mean Train Accuracy:  0.9765663072612085
Mean Test Precision:  0.8992205734768663
Mean Train Precision:  0.9741098443893849
Mean Test Recall:  0.90995670995671
Mean Train Recall:  0.9627886470101957
Mean Test F1 Score:  0.9033794758213363
Mean Train F1 Score:  0.9683582599160342


{'fit_time': array([1.49899983, 3.13100004, 3.29899859, 1.78350019, 2.0909996 ,
        5.88304949, 1.72449899, 0.94649768, 2.51899695, 0.58299637]),
 'score_time': array([0.00299907, 0.00250006, 0.00300121, 0.00299978, 0.00250053,
        0.00250173, 0.00250292, 0.00300312, 0.00300288, 0.00300002]),
 'test_accuracy': array([0.92982456, 0.9122807 , 0.96491228, 0.85964912, 0.92982456,
        0.98245614, 0.87719298, 0.92982456, 0.92982456, 0.96428571]),
 'train_accuracy': array([0.97265625, 0.9765625 , 0.96484375, 0.97851562, 0.9765625 ,
        0.98046875, 0.98046875, 0.9765625 , 0.97851562, 0.98050682]),
 'test_precision': array([0.84615385, 0.86956522, 0.95238095, 0.84210526, 0.9047619 ,
        0.95454545, 0.81818182, 0.94736842, 0.9047619 , 0.95238095]),
 'train_precision': array([0.97311828, 0.96354167, 0.96256684, 0.98387097, 0.96373057,
        0.96891192, 0.98918919, 0.97860963, 0.9787234 , 0.97883598]),
 'test_recall': array([1.        , 0.90909091, 0.95238095, 0.76190476, 0.9

In [96]:
# svm
#svm(x_train,x_test,y_train,y_test,0.001)
svm(X,Y,10**3)


Mean Test Accuracy:  0.9542606516290727
Mean Train Accuracy:  0.9582103587962963
Mean Test Precision:  0.9568831168831169
Mean Train Precision:  0.9573939295986861
Mean Test Recall:  0.91991341991342
Mean Train Recall:  0.9292449710664095
Mean Test F1 Score:  0.937492872980678
Mean Train F1 Score:  0.9430953836563052


{'fit_time': array([0.00349951, 0.00349689, 0.00349784, 0.00349951, 0.00299859,
        0.00349855, 0.00349784, 0.00299835, 0.00349998, 0.00299954]),
 'score_time': array([0.00349951, 0.00300193, 0.00250149, 0.00250053, 0.00250053,
        0.00200129, 0.00300193, 0.00250173, 0.00199842, 0.0025003 ]),
 'test_accuracy': array([0.96491228, 0.94736842, 0.98245614, 0.9122807 , 0.92982456,
        0.96491228, 0.92982456, 0.98245614, 1.        , 0.92857143]),
 'train_accuracy': array([0.95898438, 0.95703125, 0.953125  , 0.96484375, 0.96289062,
        0.95507812, 0.95703125, 0.95507812, 0.95507812, 0.96296296]),
 'test_precision': array([0.95454545, 1.        , 1.        , 0.9       , 0.9047619 ,
        1.        , 0.9047619 , 1.        , 1.        , 0.9047619 ]),
 'train_precision': array([0.96174863, 0.9516129 , 0.94652406, 0.9726776 , 0.9673913 ,
        0.9516129 , 0.95187166, 0.95652174, 0.9516129 , 0.96236559]),
 'test_recall': array([0.95454545, 0.86363636, 0.95238095, 0.85714286, 0.9

In [28]:
#adaboost(x_train,x_test,y_train,y_test)
adaboost(X,Y)


Mean Test Accuracy:  0.9701441102756891
Mean Train Accuracy:  1.0
Mean Test Precision:  0.975909090909091
Mean Train Precision:  1.0
Mean Test Recall:  0.9432900432900434
Mean Train Recall:  1.0
Mean Test Recall:  0.9432900432900434
Mean Train Recall:  1.0
Mean Test F1 Score:  0.9590960655906772
Mean Train F1 Score:  1.0


{'fit_time': array([0.17351604, 0.16961932, 0.17472911, 0.17262197, 0.17156291,
        0.17011189, 0.17003584, 0.16971111, 0.17230821, 0.17515922]),
 'score_time': array([0.01658797, 0.01640201, 0.01688766, 0.01641989, 0.01587915,
        0.01586509, 0.0161202 , 0.01645279, 0.01596808, 0.01698184]),
 'test_accuracy': array([0.96491228, 0.98245614, 0.98245614, 0.94736842, 0.98245614,
        0.98245614, 0.98245614, 0.94736842, 0.94736842, 0.98214286]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.95454545, 1.        , 1.        , 0.95      , 1.        ,
        1.        , 1.        , 0.95      , 0.95      , 0.95454545]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.95454545, 0.95454545, 0.95238095, 0.9047619 , 0.95238095,
        0.95238095, 0.95238095, 0.9047619 , 0.9047619 , 1.        ]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.95454545, 

In [33]:
random_forest(X,Y)

              precision    recall  f1-score   support

           0       0.97      0.94      0.96        72
           1       0.91      0.95      0.93        42

    accuracy                           0.95       114
   macro avg       0.94      0.95      0.94       114
weighted avg       0.95      0.95      0.95       114



#### DataSet 2 Training and Evaluation

In [63]:
#import warnings
warnings.filterwarnings('ignore')
logistic_regression(X2,Y2,10**-5) # weak regularization leads to overfitting
# Warning is due to overfitting: https://github.com/Berkeley-Data/hpt/issues/52#issuecomment-803665466
# Suppressed warning according to: https://stackoverflow.com/questions/43162506/undefinedmetricwarning-f-score-is-ill-defined-and-being-set-to-0-0-in-labels-wi


Mean Test Accuracy:  0.7164199814986124
Mean Train Accuracy:  0.7385750695088045
Mean Test Precision:  0.6121477052359404
Mean Train Precision:  0.6503517801019639
Mean Test Recall:  0.50625
Mean Train Recall:  0.5305555555555554
Mean Test F1 Score:  0.5459650840857738
Mean Train F1 Score:  0.5842935584350059


{'fit_time': array([0.00150347, 0.00150084, 0.00149918, 0.00150204, 0.00099921,
        0.00150061, 0.00099945, 0.00150013, 0.00049996, 0.00100017]),
 'score_time': array([0.00299621, 0.00199986, 0.00200033, 0.00199795, 0.00099969,
        0.0010004 , 0.00100017, 0.00099993, 0.00150228, 0.00150037]),
 'test_accuracy': array([0.74468085, 0.70212766, 0.7173913 , 0.73913043, 0.63043478,
        0.7173913 , 0.7826087 , 0.73913043, 0.67391304, 0.7173913 ]),
 'train_accuracy': array([0.73493976, 0.7373494 , 0.73798077, 0.73798077, 0.74038462,
        0.75240385, 0.72836538, 0.74038462, 0.73798077, 0.73798077]),
 'test_precision': array([0.6       , 0.58333333, 0.58823529, 0.625     , 0.45454545,
        0.66666667, 0.8       , 0.64285714, 0.54545455, 0.61538462]),
 'train_precision': array([0.6440678 , 0.6446281 , 0.64957265, 0.65486726, 0.65789474,
        0.67521368, 0.6302521 , 0.65517241, 0.64227642, 0.64957265]),
 'test_recall': array([0.75  , 0.4375, 0.625 , 0.625 , 0.3125, 0.375 , 0.5

In [19]:
warnings.filterwarnings('always')

In [62]:
logistic_regression(X2,Y2,5)  # right amount of regularization improves perfomance 


Mean Test Accuracy:  0.7251156336725254
Mean Train Accuracy:  0.7417018072289157
Mean Test Precision:  0.6466341991341992
Mean Train Precision:  0.659546809905361
Mean Test Recall:  0.49375
Mean Train Recall:  0.5256944444444444
Mean Test F1 Score:  0.5473101956741508
Mean Train F1 Score:  0.5849796570218101


{'fit_time': array([0.00099897, 0.00100136, 0.00099993, 0.00100112, 0.0005002 ,
        0.00049996, 0.00100136, 0.00100017, 0.00099993, 0.00099993]),
 'score_time': array([0.00150061, 0.0014987 , 0.00150013, 0.00149846, 0.00149989,
        0.00150061, 0.00149989, 0.00099945, 0.00100017, 0.00099993]),
 'test_accuracy': array([0.74468085, 0.70212766, 0.76086957, 0.73913043, 0.65217391,
        0.7173913 , 0.80434783, 0.73913043, 0.67391304, 0.7173913 ]),
 'train_accuracy': array([0.7373494 , 0.74216867, 0.73317308, 0.74278846, 0.74519231,
        0.75721154, 0.73076923, 0.73798077, 0.75      , 0.74038462]),
 'test_precision': array([0.6       , 0.58333333, 0.66666667, 0.625     , 0.5       ,
        0.66666667, 1.        , 0.64285714, 0.54545455, 0.63636364]),
 'train_precision': array([0.64957265, 0.65811966, 0.64864865, 0.66666667, 0.66964286,
        0.69026549, 0.63559322, 0.64957265, 0.66949153, 0.65789474]),
 'test_recall': array([0.75  , 0.4375, 0.625 , 0.625 , 0.3125, 0.375 , 0.4

In [64]:
logistic_regression(X2,Y2,10**5, max_iter=300)  # too strong regularization leads to underfitting
# Rishabh - I had to increase max_iter because the model wasn't converging earlier
# https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter
# TODO: Should we increase max_iter for all cases?


Mean Test Accuracy:  0.653654024051804
Mean Train Accuracy:  0.6536793327154774
Mean Test Precision:  0.0
Mean Train Precision:  0.0
Mean Test Recall:  0.0
Mean Train Recall:  0.0
Mean Test F1 Score:  0.0
Mean Train F1 Score:  0.0


{'fit_time': array([0.0009985 , 0.00099993, 0.00100017, 0.00100017, 0.00150061,
        0.00099993, 0.00099874, 0.00099993, 0.00099993, 0.00100017]),
 'score_time': array([0.00200152, 0.00149941, 0.00100136, 0.00099921, 0.00149989,
        0.00150013, 0.00099969, 0.0015018 , 0.00150013, 0.00099993]),
 'test_accuracy': array([0.65957447, 0.65957447, 0.65217391, 0.65217391, 0.65217391,
        0.65217391, 0.65217391, 0.65217391, 0.65217391, 0.65217391]),
 'train_accuracy': array([0.65301205, 0.65301205, 0.65384615, 0.65384615, 0.65384615,
        0.65384615, 0.65384615, 0.65384615, 0.65384615, 0.65384615]),
 'test_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [68]:
knn(X2,Y2,1) # 1 neighbor would give perfect accuracy for training set, overfitting


Mean Test Accuracy:  0.6601757631822387
Mean Train Accuracy:  1.0
Mean Test Precision:  0.5591113543319426
Mean Train Precision:  1.0
Mean Test Recall:  0.4375
Mean Train Recall:  1.0
Mean Test F1 Score:  0.4531702809357075
Mean Train F1 Score:  1.0


{'fit_time': array([0.00049973, 0.        , 0.        , 0.00049996, 0.0005002 ,
        0.00049996, 0.0005002 , 0.        , 0.00049973, 0.00049996]),
 'score_time': array([0.0025003 , 0.0025003 , 0.00249982, 0.00199962, 0.0019989 ,
        0.00200009, 0.00199962, 0.0025003 , 0.00199986, 0.00199986]),
 'test_accuracy': array([0.61702128, 0.70212766, 0.69565217, 0.7173913 , 0.63043478,
        0.60869565, 0.69565217, 0.69565217, 0.60869565, 0.63043478]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.45454545, 0.54545455, 0.625     , 0.61538462, 0.45454545,
        0.4375    , 1.        , 0.57142857, 0.41666667, 0.47058824]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.625 , 0.75  , 0.3125, 0.5   , 0.3125, 0.4375, 0.125 , 0.5   ,
        0.3125, 0.5   ]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.52631579, 0.63157895, 0.41666667, 0.55172414, 0.37

In [71]:
knn(X2,Y2,5) # 5 neighbors seems to generalize well


Mean Test Accuracy:  0.6604070305272896
Mean Train Accuracy:  0.7672034291010195
Mean Test Precision:  0.5195991426873781
Mean Train Precision:  0.7185349740483389
Mean Test Recall:  0.375
Mean Train Recall:  0.538888888888889
Mean Test F1 Score:  0.4227584052584052
Mean Train F1 Score:  0.6156671977944923


{'fit_time': array([0.00049996, 0.00049996, 0.00049877, 0.00050259, 0.00049925,
        0.00049996, 0.        , 0.00050068, 0.        , 0.        ]),
 'score_time': array([0.00300026, 0.00250053, 0.0025022 , 0.00249696, 0.00250196,
        0.00199986, 0.00249767, 0.00199938, 0.00249887, 0.0025003 ]),
 'test_accuracy': array([0.63829787, 0.57446809, 0.7173913 , 0.7173913 , 0.65217391,
        0.65217391, 0.65217391, 0.67391304, 0.63043478, 0.69565217]),
 'train_accuracy': array([0.78795181, 0.77831325, 0.75721154, 0.76442308, 0.75721154,
        0.77644231, 0.77644231, 0.75240385, 0.75961538, 0.76201923]),
 'test_precision': array([0.47619048, 0.375     , 0.66666667, 0.58823529, 0.5       ,
        0.5       , 0.5       , 0.54545455, 0.44444444, 0.6       ]),
 'train_precision': array([0.75925926, 0.73636364, 0.70093458, 0.7254902 , 0.70873786,
        0.73831776, 0.71794872, 0.69158879, 0.69642857, 0.71028037]),
 'test_recall': array([0.625, 0.375, 0.375, 0.625, 0.25 , 0.25 , 0.25 , 0.

In [70]:
knn(X2,Y2,100) # 100 neighbors will lead to underfitting


Mean Test Accuracy:  0.6882516188714153
Mean Train Accuracy:  0.7008196246524561
Mean Test Precision:  0.7066666666666667
Mean Train Precision:  0.8068912570101026
Mean Test Recall:  0.15
Mean Train Recall:  0.18194444444444446
Mean Test F1 Score:  0.24217406260749913
Mean Train F1 Score:  0.29385633503798536


{'fit_time': array([0.00049758, 0.00049925, 0.00049973, 0.00049973, 0.        ,
        0.00049996, 0.00050068, 0.        , 0.0005002 , 0.00049925]),
 'score_time': array([0.00450182, 0.00400066, 0.00300002, 0.00350165, 0.00350046,
        0.00300026, 0.00349951, 0.00349998, 0.00349951, 0.00300002]),
 'test_accuracy': array([0.68085106, 0.72340426, 0.7173913 , 0.67391304, 0.65217391,
        0.63043478, 0.67391304, 0.69565217, 0.73913043, 0.69565217]),
 'train_accuracy': array([0.71325301, 0.69638554, 0.67788462, 0.69951923, 0.70673077,
        0.70673077, 0.70673077, 0.70913462, 0.69471154, 0.69711538]),
 'test_precision': array([0.66666667, 0.8       , 1.        , 0.6       , 0.5       ,
        0.        , 1.        , 0.75      , 1.        , 0.75      ]),
 'train_precision': array([0.83783784, 0.71428571, 0.85714286, 0.80645161, 0.86666667,
        0.78947368, 0.75      , 0.81081081, 0.81481481, 0.82142857]),
 'test_recall': array([0.125 , 0.25  , 0.1875, 0.1875, 0.0625, 0.    , 0.0

In [73]:
# decision tree
#decision_tree(x_train,x_test,y_train,y_test)
decision_tree(X2,Y2)


Mean Test Accuracy:  0.6234042553191489
Mean Train Accuracy:  1.0
Mean Test Precision:  0.4682719241542771
Mean Train Precision:  1.0
Mean Test Recall:  0.45625
Mean Train Recall:  1.0
Mean Test F1 Score:  0.45985267952813585
Mean Train F1 Score:  1.0


{'fit_time': array([0.00250244, 0.00200295, 0.0019989 , 0.00200033, 0.00249958,
        0.00200081, 0.00199938, 0.00200009, 0.00200081, 0.00199938]),
 'score_time': array([0.00249791, 0.00299621, 0.00249815, 0.0035007 , 0.00249982,
        0.00249887, 0.00250006, 0.00249839, 0.0024991 , 0.00250006]),
 'test_accuracy': array([0.57446809, 0.65957447, 0.63043478, 0.45652174, 0.60869565,
        0.65217391, 0.67391304, 0.58695652, 0.73913043, 0.65217391]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.38888889, 0.5       , 0.47058824, 0.28571429, 0.44444444,
        0.5       , 0.53846154, 0.41176471, 0.64285714, 0.5       ]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.4375, 0.4375, 0.5   , 0.375 , 0.5   , 0.4375, 0.4375, 0.4375,
        0.5625, 0.4375]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.41176471, 0.46666667, 0.48484848, 0.32432432, 0.47

In [98]:
# svm
#svm(x_train,x_test,y_train,y_test,0.001)
svm(X2,Y2,0.001)


Mean Test Accuracy:  0.653654024051804
Mean Train Accuracy:  0.6536793327154774
Mean Test Precision:  0.0
Mean Train Precision:  0.0
Mean Test Recall:  0.0
Mean Train Recall:  0.0
Mean Test F1 Score:  0.0
Mean Train F1 Score:  0.0


{'fit_time': array([0.00300169, 0.00299978, 0.00249934, 0.00249958, 0.00250006,
        0.00250006, 0.00249982, 0.0025003 , 0.00199986, 0.00250101]),
 'score_time': array([0.00349855, 0.00400019, 0.00300002, 0.00300002, 0.00300002,
        0.00299978, 0.00300002, 0.00300002, 0.00300002, 0.00349855]),
 'test_accuracy': array([0.65957447, 0.65957447, 0.65217391, 0.65217391, 0.65217391,
        0.65217391, 0.65217391, 0.65217391, 0.65217391, 0.65217391]),
 'train_accuracy': array([0.65301205, 0.65301205, 0.65384615, 0.65384615, 0.65384615,
        0.65384615, 0.65384615, 0.65384615, 0.65384615, 0.65384615]),
 'test_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_precision': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_recall': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'test_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'train_f1': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [97]:
svm(X2,Y2,1000)


Mean Test Accuracy:  0.73159111933395
Mean Train Accuracy:  0.7974976830398518
Mean Test Precision:  0.6571106671106672
Mean Train Precision:  0.7662656174496109
Mean Test Recall:  0.48125
Mean Train Recall:  0.5979166666666667
Mean Test F1 Score:  0.5440935111632021
Mean Train F1 Score:  0.6712727450676905


{'fit_time': array([0.0150001 , 0.01399827, 0.01599741, 0.01200008, 0.01499987,
        0.01499748, 0.01249838, 0.01449895, 0.01250005, 0.0149982 ]),
 'score_time': array([0.00399899, 0.00350118, 0.00300074, 0.00300002, 0.00250149,
        0.00300169, 0.00300145, 0.00250196, 0.00300074, 0.00300097]),
 'test_accuracy': array([0.65957447, 0.80851064, 0.80434783, 0.73913043, 0.67391304,
        0.7173913 , 0.86956522, 0.76086957, 0.65217391, 0.63043478]),
 'train_accuracy': array([0.80240964, 0.78795181, 0.78846154, 0.79326923, 0.81490385,
        0.80288462, 0.78125   , 0.79326923, 0.81009615, 0.80048077]),
 'test_precision': array([0.5       , 0.73333333, 0.76923077, 0.64285714, 0.55555556,
        0.71428571, 1.        , 0.72727273, 0.5       , 0.42857143]),
 'train_precision': array([0.79245283, 0.74561404, 0.75925926, 0.78431373, 0.78151261,
        0.75833333, 0.73451327, 0.76363636, 0.77310924, 0.7699115 ]),
 'test_recall': array([0.625 , 0.6875, 0.625 , 0.5625, 0.3125, 0.3125, 0.6

In [99]:
svm(X2,Y2,10**8)


Mean Test Accuracy:  0.6361702127659575
Mean Train Accuracy:  0.8737227757182577
Mean Test Precision:  0.47717760309865564
Mean Train Precision:  0.8117340731546319
Mean Test Recall:  0.51875
Mean Train Recall:  0.8284722222222222
Mean Test F1 Score:  0.4900602110534198
Mean Train F1 Score:  0.8194888860734466


{'fit_time': array([47.16610336, 44.12800002, 60.31660581, 47.38600063, 55.51854849,
        37.29302812, 68.48807526, 21.50049973, 40.32664824, 75.24564171]),
 'score_time': array([0.00299907, 0.00300145, 0.00299931, 0.00300241, 0.00299978,
        0.00300002, 0.00299978, 0.00300002, 0.00299978, 0.00300074]),
 'test_accuracy': array([0.61702128, 0.74468085, 0.65217391, 0.76086957, 0.60869565,
        0.63043478, 0.60869565, 0.60869565, 0.58695652, 0.54347826]),
 'train_accuracy': array([0.82891566, 0.85783133, 0.88461538, 0.87259615, 0.86538462,
        0.86298077, 0.87019231, 0.90384615, 0.88701923, 0.90384615]),
 'test_precision': array([0.45833333, 0.6       , 0.5       , 0.66666667, 0.4       ,
        0.46666667, 0.45454545, 0.42857143, 0.42857143, 0.36842105]),
 'train_precision': array([0.75524476, 0.8057554 , 0.81168831, 0.82269504, 0.79333333,
        0.78064516, 0.80821918, 0.88235294, 0.80124224, 0.85616438]),
 'test_recall': array([0.6875, 0.75  , 0.4375, 0.625 , 0.25  , 0

In [27]:
#adaboost(x_train,x_test,y_train,y_test)
adaboost(X2,Y2)


Mean Test Accuracy:  0.6429694727104533
Mean Train Accuracy:  0.8624316496756256
Mean Test Precision:  0.4929239766081871
Mean Train Precision:  0.8353178980058636
Mean Test Recall:  0.40625
Mean Train Recall:  0.7520833333333334
Mean Test Recall:  0.40625
Mean Train Recall:  0.7520833333333334
Mean Test F1 Score:  0.43516538778810865
Mean Train F1 Score:  0.7911610803730593


{'fit_time': array([0.07100248, 0.07000113, 0.06950188, 0.07050061, 0.07000351,
        0.07050657, 0.06900358, 0.06850076, 0.06900382, 0.06800056]),
 'score_time': array([0.010499  , 0.01149774, 0.01099825, 0.01049781, 0.0109973 ,
        0.01049376, 0.01049972, 0.01099634, 0.00999808, 0.01050138]),
 'test_accuracy': array([0.59574468, 0.63829787, 0.65217391, 0.63043478, 0.52173913,
        0.67391304, 0.69565217, 0.65217391, 0.69565217, 0.67391304]),
 'train_accuracy': array([0.85060241, 0.86506024, 0.85336538, 0.87740385, 0.86298077,
        0.87740385, 0.85336538, 0.84615385, 0.87259615, 0.86538462]),
 'test_precision': array([0.4       , 0.47619048, 0.5       , 0.47368421, 0.28571429,
        0.55555556, 0.58333333, 0.5       , 0.58333333, 0.57142857]),
 'train_precision': array([0.80597015, 0.86065574, 0.82170543, 0.88429752, 0.84251969,
        0.83941606, 0.81203008, 0.78985507, 0.85826772, 0.83846154]),
 'test_recall': array([0.375 , 0.625 , 0.4375, 0.5625, 0.25  , 0.3125, 0.4

In [34]:
random_forest(X2,Y2)

              precision    recall  f1-score   support

           0       0.66      0.74      0.69        57
           1       0.48      0.39      0.43        36

    accuracy                           0.60        93
   macro avg       0.57      0.56      0.56        93
weighted avg       0.59      0.60      0.59        93

