In [2]:
# install dependencies
import sys
# !{sys.executable} -m pip install pandas

In [3]:
# import dependencies
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

import warnings
from sklearn.model_selection import GridSearchCV

#### Exploratory Data Analysis

In [4]:
##### data loading and feature extraction

dataset_1_path = './datasets/project3_dataset1.txt'
dataset_1 = pd.read_csv(dataset_1_path,sep='\t',header=None)
num_columns = len(dataset_1.columns)
num_features = num_columns - 1
dataset_1.columns=["F"+str(i) for i in range(1, num_columns + 1)]
label_column = "F{0}".format(num_columns)
dataset_1_features = dataset_1.loc[:, dataset_1.columns != label_column]
dataset_1_label = dataset_1.loc[:, dataset_1.columns == label_column]



dataset_2_path = './datasets/project3_dataset2.txt'
# load the CSV file as a dataframe
dataset_2 = pd.read_csv(dataset_2_path,sep='\t',header=None)
num_columns = len(dataset_2.columns)
num_features = num_columns - 1
dataset_2.columns=["F"+str(i) for i in range(1, num_columns + 1)]
label_column = "F{0}".format(num_columns)
custom_encoding = {'Present':1, 'Absent':0}
dataset_2["F5"] = dataset_2["F5"].map(custom_encoding)
dataset_2_features = dataset_2.loc[:, dataset_2.columns != label_column]
dataset_2_label = dataset_2.loc[:, dataset_2.columns == label_column]


In [5]:
X, Y = dataset_1_features, dataset_1_label

In [6]:
X2, Y2 = dataset_2_features, dataset_2_label

#### Hyperparameter Tuning Models

In [12]:
def tune_svm(X,Y):
    svc = SVC()
    parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    clf = GridSearchCV(svc, parameters, scoring='accuracy')
    clf.fit(X,Y.values.ravel())
    #cv_results = cross_validate(clf, X,Y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return clf.best_params_

In [18]:
def tune_adaboost(X,Y):
    ada = AdaBoostClassifier(random_state=0)
    parameters = {'n_estimators': [100,200,300]}
    clf = GridSearchCV(ada, parameters, scoring='accuracy')
    clf = clf.fit(X, Y.values.ravel())
    #cv_results = cross_validate(clf, X,Y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return clf.best_params_

In [27]:
def tune_logistic_regression(X,y, max_iter=100):
    logisticRegr = LogisticRegression(max_iter=max_iter)    
    scaler = preprocessing.StandardScaler().fit(X)
    x_scaled = scaler.transform(X)
    parameters={"C":np.logspace(-3,5,10)}
    clf = GridSearchCV(logisticRegr, parameters, scoring='accuracy', cv = 10)
    clf = clf.fit(x_scaled,y.values.ravel())
    return clf.best_params_
    #cv_results = cross_validate(logisticRegr, x_scaled,y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)

In [33]:
# TODO: Can I use the sklearn implementation? Or should I use the decision tree classifier to create custom random forest methodß
def tune_random_forest(X,y):
    param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
        }
    rf = RandomForestClassifier()
    clf = GridSearchCV(rf, param_grid, scoring='accuracy', cv = 10)
    clf = clf.fit(X,y.values.ravel())
    return clf.best_params_

In [37]:
def tune_knn(X,y):
    
    scaler = preprocessing.StandardScaler().fit(X)
    x_scaled = scaler.transform(X)
    #x_scaled_test = scaler.transform(x_test)
    k_range = list(range(1, 31))
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    clf = GridSearchCV(knn, param_grid, scoring='accuracy', cv = 10)
    clf = clf.fit(x_scaled,y.values.ravel())
    return clf.best_params_

In [39]:
def tune_decision_tree(X,y):
    params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
    dt = tree.DecisionTreeClassifier()
    clf = GridSearchCV(dt, params, scoring='accuracy', cv = 10)
    clf = clf.fit(X,y)
    return clf.best_params_

#### Dataset 1 Hyperparameter Tuning

In [13]:
tune_svm(X,Y)

{'C': 1, 'kernel': 'linear'}

In [19]:
tune_adaboost(X,Y)

{'n_estimators': 300}

In [30]:
warnings.filterwarnings('ignore')
print(tune_logistic_regression(X,Y, max_iter=1000))
warnings.filterwarnings('always')

{'C': 3.593813663804626}


In [34]:
tune_random_forest(X,Y)

{'criterion': 'gini',
 'max_depth': 5,
 'max_features': 'sqrt',
 'n_estimators': 500}

In [38]:
tune_knn(X,Y)

{'n_neighbors': 11}

In [40]:
tune_decision_tree(X,Y)

{'max_leaf_nodes': 13, 'min_samples_split': 3}

#### Dataset 2 Hyperparameter Tuning

In [67]:
tune_svm(X2,Y2)

{'C': 1, 'kernel': 'linear'}

In [68]:
tune_adaboost(X2,Y2)

{'n_estimators': 100}

In [69]:
warnings.filterwarnings('ignore')
print(tune_logistic_regression(X2,Y2, max_iter=1000))
warnings.filterwarnings('always')

{'C': 0.05994842503189409}


In [70]:
tune_random_forest(X2,Y2)

{'criterion': 'entropy',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 200}

In [71]:
tune_knn(X2,Y2)

{'n_neighbors': 25}

In [72]:
tune_decision_tree(X2,Y2)

{'max_leaf_nodes': 6, 'min_samples_split': 2}

#### Model Definitions

In [41]:
def svm(X,Y,C, kernel):
    clf = SVC(C=C, kernel=kernel)
    cv_results = cross_validate(clf, X,Y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

In [44]:
def adaboost(X,Y, n_estimators):
    clf = AdaBoostClassifier(n_estimators=n_estimators, random_state=0)
    clf = clf.fit(X, Y.values.ravel())
    cv_results = cross_validate(clf, X,Y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

In [50]:
def logistic_regression(X,y,reg_param, max_iter=100):
    # all parameters not specified are set to their defaults
    if(reg_param > 0):
        logisticRegr = LogisticRegression(penalty="l2",C=reg_param, max_iter=max_iter)
    else:
        logisticRegr = LogisticRegression(penalty="none", max_iter=max_iter) # default l2 reg param 
        
    scaler = preprocessing.StandardScaler().fit(X)
    x_scaled = scaler.transform(X)
    cv_results = cross_validate(logisticRegr, x_scaled,y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

In [56]:
# TODO: Can I use the sklearn implementation? Or should I use the decision tree classifier to create custom random forest methodß
def random_forest(X,y, criterion, max_depth, max_features, n_estimators):
    clf = RandomForestClassifier(max_depth=max_depth, random_state=0, criterion=criterion,
                                 max_features=max_features,n_estimators=n_estimators )
    cv_results = cross_validate(clf, X,y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

In [47]:
def knn(X,y,k):
    
    scaler = preprocessing.StandardScaler().fit(X)
    x_scaled = scaler.transform(X)
    #x_scaled_test = scaler.transform(x_test)
    
    knn = KNeighborsClassifier(n_neighbors=k)
    
    cv_results = cross_validate(knn, x_scaled,y.values.ravel(), cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

In [53]:
def decision_tree(X,y, max_leaf_nodes, min_samples_split):
    clf = tree.DecisionTreeClassifier(max_leaf_nodes=13, min_samples_split=3)
    cv_results = cross_validate(clf, X,y, cv=10, scoring=('accuracy', 'precision', 'recall', 'f1','roc_auc'), return_train_score=True)
    return cv_results

#### DataSet 1 Training and Evaluation

In [51]:
warnings.filterwarnings('ignore')
logistic_regression(X,Y,3.59,1000)

{'fit_time': array([0.05724692, 0.02053189, 0.01158905, 0.0113647 , 0.00954413,
        0.00871086, 0.00611567, 0.01367617, 0.00689602, 0.00942492]),
 'score_time': array([0.00268006, 0.00191736, 0.00175786, 0.0017612 , 0.00173807,
        0.00176406, 0.00173521, 0.00173092, 0.00176406, 0.00175714]),
 'test_accuracy': array([0.98245614, 1.        , 1.        , 0.98245614, 0.96491228,
        0.98245614, 0.94736842, 0.98245614, 0.96491228, 1.        ]),
 'train_accuracy': array([0.9921875 , 0.98828125, 0.98828125, 0.9921875 , 0.98632812,
        0.9921875 , 0.99023438, 0.99023438, 0.99023438, 0.98830409]),
 'test_precision': array([0.95652174, 1.        , 1.        , 1.        , 1.        ,
        1.        , 0.90909091, 1.        , 0.95238095, 1.        ]),
 'train_precision': array([1.        , 0.99462366, 0.99465241, 1.        , 0.99462366,
        1.        , 0.99468085, 0.98947368, 0.99468085, 0.99465241]),
 'test_recall': array([1.        , 1.        , 1.        , 0.95238095, 0.9

In [48]:
knn(X,Y,11) # 1 neighbor would give perfect accuracy for training set, overfitting

{'fit_time': array([0.00101805, 0.00033212, 0.00030613, 0.00025702, 0.00026011,
        0.00025702, 0.00023913, 0.00025606, 0.00029898, 0.00026584]),
 'score_time': array([0.01675582, 0.00589705, 0.00417495, 0.00410795, 0.00405502,
        0.00398707, 0.0039959 , 0.00485778, 0.00447893, 0.00551915]),
 'test_accuracy': array([0.98245614, 0.96491228, 0.98245614, 0.92982456, 0.96491228,
        0.98245614, 0.98245614, 0.96491228, 0.98245614, 0.98214286]),
 'train_accuracy': array([0.97265625, 0.97070312, 0.97070312, 0.9765625 , 0.97265625,
        0.97265625, 0.97265625, 0.97070312, 0.97460938, 0.97270955]),
 'test_precision': array([1.        , 0.95454545, 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.95454545]),
 'train_precision': array([0.98351648, 0.99435028, 0.98888889, 0.98907104, 0.98895028,
        0.98895028, 0.98895028, 0.98888889, 0.98901099, 0.99441341]),
 'test_recall': array([0.95454545, 0.95454545, 0.95238095, 0.80952381, 0.9

In [55]:
decision_tree(X,Y, max_leaf_nodes=13, min_samples_split=3)

{'fit_time': array([0.01672006, 0.01169086, 0.0063448 , 0.0065589 , 0.005548  ,
        0.0050559 , 0.00550818, 0.0050199 , 0.00557303, 0.00470018]),
 'score_time': array([0.00766492, 0.00634909, 0.00433612, 0.00363183, 0.00317383,
        0.00312614, 0.00313973, 0.00310922, 0.00312114, 0.00310397]),
 'test_accuracy': array([0.94736842, 0.87719298, 0.96491228, 0.87719298, 0.96491228,
        0.89473684, 0.94736842, 0.94736842, 0.96491228, 0.94642857]),
 'train_accuracy': array([0.98828125, 0.98828125, 0.9921875 , 0.9921875 , 0.99023438,
        0.99414062, 0.9921875 , 0.99023438, 0.98828125, 0.99415205]),
 'test_precision': array([0.95238095, 0.82608696, 1.        , 0.88888889, 1.        ,
        0.8       , 0.95      , 0.95      , 1.        , 0.90909091]),
 'train_precision': array([0.9893617 , 0.99462366, 1.        , 0.99470899, 1.        ,
        1.        , 0.99470899, 0.98947368, 0.98941799, 1.        ]),
 'test_recall': array([0.90909091, 0.86363636, 0.9047619 , 0.76190476, 0.9

In [42]:
svm(X,Y,1, 'linear')

{'fit_time': array([1.16835213, 1.83804297, 0.57287502, 0.63425207, 0.57767773,
        1.59166193, 0.48907924, 0.43592381, 0.83763003, 0.48390484]),
 'score_time': array([0.01042604, 0.00356507, 0.00351214, 0.00347996, 0.00310302,
        0.00320506, 0.00303674, 0.00290108, 0.00375772, 0.00367212]),
 'test_accuracy': array([0.9122807 , 0.96491228, 0.98245614, 0.9122807 , 0.9122807 ,
        0.96491228, 0.9122807 , 0.98245614, 1.        , 0.98214286]),
 'train_accuracy': array([0.96679688, 0.96289062, 0.97265625, 0.96875   , 0.97070312,
        0.96679688, 0.96875   , 0.96679688, 0.9609375 , 0.96491228]),
 'test_precision': array([0.84, 1.  , 1.  , 0.9 , 0.9 , 1.  , 0.9 , 1.  , 1.  , 1.  ]),
 'train_precision': array([0.96756757, 0.96721311, 0.97326203, 0.96296296, 0.97826087,
        0.95789474, 0.96791444, 0.96276596, 0.96216216, 0.96756757]),
 'test_recall': array([0.95454545, 0.90909091, 0.95238095, 0.85714286, 0.85714286,
        0.9047619 , 0.85714286, 0.95238095, 1.        , 0.9

In [45]:
#adaboost(x_train,x_test,y_train,y_test)
adaboost(X,Y, 300)

{'fit_time': array([0.60094094, 0.60139489, 0.60209799, 0.60245919, 0.60145712,
        0.60125303, 0.60193825, 0.60184598, 0.60057998, 0.59857702]),
 'score_time': array([0.03939295, 0.03889823, 0.03921771, 0.03888583, 0.03883767,
        0.03952289, 0.03896499, 0.03893495, 0.03880692, 0.03845191]),
 'test_accuracy': array([0.96491228, 0.98245614, 0.98245614, 0.92982456, 0.96491228,
        0.96491228, 0.98245614, 0.96491228, 0.98245614, 0.96428571]),
 'train_accuracy': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_precision': array([0.95454545, 1.        , 1.        , 0.94736842, 1.        ,
        0.95238095, 1.        , 1.        , 1.        , 0.91304348]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.95454545, 0.95454545, 0.95238095, 0.85714286, 0.9047619 ,
        0.95238095, 0.95238095, 0.9047619 , 0.95238095, 1.        ]),
 'train_recall': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_f1': array([0.95454545, 

In [58]:
#random_forest(x_train,x_test,y_train,y_test)
random_forest(X,Y, 'gini', 5, 'sqrt', 500)

{'fit_time': array([0.56493282, 0.52228475, 0.519835  , 0.5178082 , 0.52513003,
        0.52864885, 0.52830482, 0.52736306, 0.5302527 , 0.53744721]),
 'score_time': array([0.04388928, 0.0440371 , 0.04426312, 0.04310489, 0.04320478,
        0.044559  , 0.045259  , 0.04406786, 0.04443431, 0.04415202]),
 'test_accuracy': array([0.96491228, 0.96491228, 0.98245614, 0.92982456, 0.96491228,
        0.96491228, 0.96491228, 0.98245614, 0.98245614, 0.94642857]),
 'train_accuracy': array([0.99414062, 0.99023438, 0.99414062, 0.99414062, 0.99414062,
        0.99414062, 0.99609375, 0.99609375, 0.99414062, 0.99415205]),
 'test_precision': array([0.95454545, 0.95454545, 1.        , 0.94736842, 1.        ,
        0.95238095, 1.        , 1.        , 1.        , 0.90909091]),
 'train_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'test_recall': array([0.95454545, 0.95454545, 0.95238095, 0.85714286, 0.9047619 ,
        0.95238095, 0.9047619 , 0.95238095, 0.95238095, 0.95238095]),
 'train_r

#### DataSet 2 Training and Evaluation

In [73]:
warnings.filterwarnings('ignore')
logistic_regression(X2,Y2,0.0599,1000)

{'fit_time': array([0.00575995, 0.00285816, 0.00230193, 0.00244284, 0.00205302,
        0.00205112, 0.00199986, 0.00208735, 0.0018301 , 0.002074  ]),
 'score_time': array([0.00480008, 0.00293493, 0.00246191, 0.002666  , 0.00200415,
        0.00198197, 0.00200915, 0.00187182, 0.00185394, 0.00185585]),
 'test_accuracy': array([0.74468085, 0.70212766, 0.80434783, 0.73913043, 0.67391304,
        0.73913043, 0.80434783, 0.73913043, 0.67391304, 0.7173913 ]),
 'train_accuracy': array([0.75180723, 0.74457831, 0.73798077, 0.74038462, 0.75240385,
        0.76442308, 0.73798077, 0.74519231, 0.74759615, 0.74038462]),
 'test_precision': array([0.6       , 0.58333333, 0.81818182, 0.625     , 0.54545455,
        0.75      , 1.        , 0.64285714, 0.54545455, 0.63636364]),
 'train_precision': array([0.68807339, 0.67272727, 0.66666667, 0.67307692, 0.69158879,
        0.70909091, 0.65765766, 0.67592593, 0.67256637, 0.66071429]),
 'test_recall': array([0.75  , 0.4375, 0.5625, 0.625 , 0.375 , 0.375 , 0.4

In [74]:
knn(X2,Y2,25) # 1 neighbor would give perfect accuracy for training set, overfitting

{'fit_time': array([0.00139475, 0.00122786, 0.00041485, 0.00039768, 0.00039601,
        0.00035405, 0.00033498, 0.00032783, 0.00032687, 0.00032711]),
 'score_time': array([0.01184797, 0.00440097, 0.00419402, 0.00375724, 0.00384593,
        0.00365806, 0.00346303, 0.0034771 , 0.00342512, 0.00345993]),
 'test_accuracy': array([0.70212766, 0.72340426, 0.7826087 , 0.73913043, 0.7173913 ,
        0.67391304, 0.7173913 , 0.80434783, 0.69565217, 0.7173913 ]),
 'train_accuracy': array([0.75421687, 0.75662651, 0.73076923, 0.75240385, 0.73798077,
        0.75480769, 0.74759615, 0.73557692, 0.75721154, 0.75      ]),
 'test_precision': array([0.57142857, 0.63636364, 0.875     , 0.64285714, 0.66666667,
        0.66666667, 0.8       , 0.81818182, 0.625     , 0.66666667]),
 'train_precision': array([0.76923077, 0.7654321 , 0.73529412, 0.75308642, 0.72727273,
        0.72826087, 0.70967742, 0.68888889, 0.75294118, 0.75641026]),
 'test_recall': array([0.5   , 0.4375, 0.4375, 0.5625, 0.375 , 0.125 , 0.2

In [75]:
decision_tree(X2,Y2, max_leaf_nodes=6, min_samples_split=2)

{'fit_time': array([0.0088551 , 0.00265193, 0.00228691, 0.00222802, 0.00234175,
        0.00247407, 0.00244212, 0.00210571, 0.00211716, 0.00245094]),
 'score_time': array([0.00996184, 0.00390816, 0.00396609, 0.00354576, 0.00441408,
        0.00447106, 0.00395179, 0.00350714, 0.00391197, 0.00371289]),
 'test_accuracy': array([0.68085106, 0.72340426, 0.7173913 , 0.67391304, 0.63043478,
        0.69565217, 0.73913043, 0.73913043, 0.67391304, 0.69565217]),
 'train_accuracy': array([0.81204819, 0.79759036, 0.80288462, 0.79807692, 0.81009615,
        0.81971154, 0.80528846, 0.80288462, 0.80048077, 0.79326923]),
 'test_precision': array([0.52631579, 0.57894737, 0.6       , 0.53846154, 0.47058824,
        0.66666667, 0.7       , 0.64285714, 0.53846154, 0.57142857]),
 'train_precision': array([0.78448276, 0.69736842, 0.76271186, 0.80612245, 0.73049645,
        0.83495146, 0.75609756, 0.73484848, 0.74796748, 0.75438596]),
 'test_recall': array([0.625 , 0.6875, 0.5625, 0.4375, 0.5   , 0.25  , 0.4

In [76]:
svm(X2,Y2,1, 'linear')

{'fit_time': array([0.15509391, 0.152004  , 0.15908718, 0.10424972, 0.14359903,
        0.11258721, 0.10091591, 0.10709119, 0.1480062 , 0.13645005]),
 'score_time': array([0.00392914, 0.003824  , 0.00350475, 0.00379324, 0.00380588,
        0.00346899, 0.00336099, 0.00334287, 0.00339389, 0.0038619 ]),
 'test_accuracy': array([0.72340426, 0.74468085, 0.73913043, 0.76086957, 0.65217391,
        0.7173913 , 0.76086957, 0.73913043, 0.67391304, 0.73913043]),
 'train_accuracy': array([0.7373494 , 0.7373494 , 0.71634615, 0.74038462, 0.73076923,
        0.73557692, 0.73076923, 0.73317308, 0.74519231, 0.73557692]),
 'test_precision': array([0.57894737, 0.64285714, 0.61111111, 0.66666667, 0.5       ,
        0.71428571, 0.72727273, 0.64285714, 0.54545455, 0.66666667]),
 'train_precision': array([0.65217391, 0.64      , 0.60655738, 0.66071429, 0.63333333,
        0.6440678 , 0.63559322, 0.63636364, 0.64179104, 0.63492063]),
 'test_recall': array([0.6875, 0.5625, 0.6875, 0.625 , 0.375 , 0.3125, 0.5

In [77]:
#adaboost(x_train,x_test,y_train,y_test)
adaboost(X2,Y2, 100)

{'fit_time': array([0.08208704, 0.08234906, 0.08087993, 0.08074212, 0.08121991,
        0.08113885, 0.08078623, 0.08097887, 0.08483791, 0.08103228]),
 'score_time': array([0.01489592, 0.01481175, 0.01461101, 0.01456118, 0.01464701,
        0.01461029, 0.01458669, 0.01459098, 0.01498699, 0.01459098]),
 'test_accuracy': array([0.59574468, 0.63829787, 0.65217391, 0.63043478, 0.52173913,
        0.67391304, 0.69565217, 0.65217391, 0.69565217, 0.67391304]),
 'train_accuracy': array([0.85060241, 0.86506024, 0.85336538, 0.87740385, 0.86298077,
        0.87740385, 0.85336538, 0.84615385, 0.87259615, 0.86538462]),
 'test_precision': array([0.4       , 0.47619048, 0.5       , 0.47368421, 0.28571429,
        0.55555556, 0.58333333, 0.5       , 0.58333333, 0.57142857]),
 'train_precision': array([0.80597015, 0.86065574, 0.82170543, 0.88429752, 0.84251969,
        0.83941606, 0.81203008, 0.78985507, 0.85826772, 0.83846154]),
 'test_recall': array([0.375 , 0.625 , 0.4375, 0.5625, 0.25  , 0.3125, 0.4

In [78]:
#random_forest(x_train,x_test,y_train,y_test)
random_forest(X2,Y2, 'entropy', 4, 'auto', 200)

{'fit_time': array([0.19807792, 0.16062307, 0.15977669, 0.15927386, 0.15919614,
        0.15940094, 0.15931678, 0.15892196, 0.15923309, 0.15945911]),
 'score_time': array([0.01954484, 0.01935315, 0.01900005, 0.0191021 , 0.01902986,
        0.01900387, 0.01904631, 0.01900411, 0.01907706, 0.01906896]),
 'test_accuracy': array([0.74468085, 0.68085106, 0.76086957, 0.73913043, 0.67391304,
        0.69565217, 0.7173913 , 0.73913043, 0.7173913 , 0.7173913 ]),
 'train_accuracy': array([0.82168675, 0.81445783, 0.80528846, 0.82451923, 0.82211538,
        0.82692308, 0.8125    , 0.80769231, 0.82211538, 0.81730769]),
 'test_precision': array([0.61111111, 0.55555556, 0.72727273, 0.625     , 0.57142857,
        0.66666667, 0.66666667, 0.64285714, 0.71428571, 0.66666667]),
 'train_precision': array([0.84313725, 0.82524272, 0.82474227, 0.85858586, 0.85714286,
        0.84615385, 0.84375   , 0.81372549, 0.84313725, 0.84693878]),
 'test_recall': array([0.6875, 0.3125, 0.5   , 0.625 , 0.25  , 0.25  , 0.3