In [0]:
 # Import some common packages
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import files
import io

In [0]:
#Upload teh data hour.csv
uploaded_Hour = files.upload()

Saving hour.csv to hour.csv


In [0]:
bike = pd.read_csv(io.StringIO(uploaded_Hour["hour.csv"].decode("utf-8")))

In [0]:
#First couple of entries
bike.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [0]:
#explore the response variables
bike['cnt'].value_counts()[0:10]

5     260
6     236
4     231
3     224
2     208
7     198
8     182
1     158
10    155
11    147
Name: cnt, dtype: int64

In [0]:
#Percentiles
print("Mean: ", np.mean(bike["cnt"]))
print("Std: ", np.std(bike["cnt"]))
print("25%: ", np.percentile(bike["cnt"], 25))
print("75%: ", np.percentile(bike["cnt"], 75))

Mean:  189.46308763450142
Std:  181.38238043116962
25%:  40.0
75%:  281.0


In [0]:
##################################
# DATA CLEANING & FEATURE SCALING#
# ################################
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
try:
  from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
  from sklearn.preprocessing import Imputer as SimpleImputer
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20
X_labels = ['season','mnth','hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed']
X = bike[X_labels]
Y = bike[["cnt"]]
X_train_pre, X_test_pre, Y_train, Y_test = train_test_split(X,Y, random_state=1, test_size=0.2) # 80-20 Train-Test split

# Permission granted to use std_scaler 
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

x_num = ['hr', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed']
cat_attribs = ["season","mnth"]
#Make a full pipeline with num and onehot encoder
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, x_num),
    ("cat", OneHotEncoder(sparse=False), cat_attribs),
])
X_train = full_pipeline.fit_transform(X_train_pre)
X_test = full_pipeline.fit_transform(X_test_pre)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [0]:
#Classify the response variable into three classes 0, 1, 2
Y_train_class = []
Y_test_class = []
low = np.percentile(Y_train["cnt"], 25)
high = np.percentile(Y_train["cnt"], 75)


# Only supposed to use Training according to TA
for element in Y_train["cnt"]:
  if element <= low:
    Y_train_class.append(0)
  elif low < element <= high:
    Y_train_class.append(1)
  else:
    Y_train_class.append(2)
    
for element in Y_test["cnt"]:
  if element <= low:
    Y_test_class.append(0)
  elif low < element <= high:
    Y_test_class.append(1)
  else:
    Y_test_class.append(2)

In [0]:
#classes of response variable
print(Y_train_class[0:5])
print(Y_test_class[0:5])

[1, 0, 0, 2, 2]
[2, 1, 1, 1, 1]


In [0]:
print("X_train Dimension: ", len(X_train))
print("Y_train Dimension: ", len(Y_train_class))
print("X_test Dimension: ", len(X_test))
print("Y_test Dimension: ", len(Y_test_class))

X_train Dimension:  13903
Y_train Dimension:  13903
X_test Dimension:  3476
Y_test Dimension:  3476


In [0]:
from sklearn.linear_model import LogisticRegression
# Training your svm here
soft_clf = LogisticRegression(random_state=42, solver="lbfgs",multi_class = "multinomial")
soft_clf.fit(X_train,Y_train_class)
# Testing your svm here
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import mean_squared_error

prediction = soft_clf.predict(X_test)
print("Accuracy: ", accuracy_score(Y_test_class, prediction))
print("Precision: ",precision_score(Y_test_class, prediction,average=None))
print("Recall: ", recall_score(Y_test_class, prediction, average=None))
print("F-1: ", f1_score(Y_test_class, prediction, average=None))
mse = mean_squared_error(Y_test_class, prediction)
rmse = np.sqrt(mse)
print("MSE: ", mse)
print("RMSE: ", rmse)

Accuracy:  0.6682968929804373
Precision:  [0.71897436 0.64782836 0.65084746]
Recall:  [0.78149387 0.73124631 0.43340858]
F-1:  [0.74893162 0.68701443 0.5203252 ]
MSE:  0.3532796317606444
RMSE:  0.5943733101011892




In [0]:
##Fine Tune
##Explore different Cs
import warnings
warnings.filterwarnings("ignore")

soft_clf20 = LogisticRegression(C=20, random_state=42, solver="lbfgs",multi_class = "multinomial")
soft_clf20.fit(X_train, Y_train_class)
prediction = soft_clf20.predict(X_test)
print("C=20")
print("Accuracy: ", accuracy_score(Y_test_class, prediction))
print("Precision: ",precision_score(Y_test_class, prediction,average=None))
print("Recall: ", recall_score(Y_test_class, prediction, average=None))
print("F-1: ", f1_score(Y_test_class, prediction, average=None))
mse = mean_squared_error(Y_test_class, prediction)
rmse = np.sqrt(mse)
print("MSE: ", mse)
print("RMSE: ", rmse)

soft_clf5 = LogisticRegression(C=5, random_state=42, solver="lbfgs",multi_class = "multinomial")
soft_clf5.fit(X_train, Y_train_class)
prediction = soft_clf5.predict(X_test)
print("C=5")
print("Accuracy: ", accuracy_score(Y_test_class, prediction))
print("Precision: ",precision_score(Y_test_class, prediction,average=None))
print("Recall: ", recall_score(Y_test_class, prediction, average=None))
print("F-1: ", f1_score(Y_test_class, prediction, average=None))

soft_clf1_5 = LogisticRegression(C=1.5, random_state=42, solver="lbfgs",multi_class = "multinomial")
soft_clf1_5.fit(X_train, Y_train_class)
prediction = soft_clf1_5.predict(X_test)
print("C=1.5")
print("Accuracy: ", accuracy_score(Y_test_class, prediction))
print("Precision: ",precision_score(Y_test_class, prediction,average=None))
print("Recall: ", recall_score(Y_test_class, prediction, average=None))
print("F-1: ", f1_score(Y_test_class, prediction, average=None))

soft_clf1 = LogisticRegression(C=1, random_state=42, solver="lbfgs",multi_class = "multinomial")
soft_clf1.fit(X_train, Y_train_class)
prediction = soft_clf1.predict(X_test)
print("C=1")
print("Accuracy: ", accuracy_score(Y_test_class, prediction))
print("Precision: ",precision_score(Y_test_class, prediction,average=None))
print("Recall: ", recall_score(Y_test_class, prediction, average=None))
print("F-1: ", f1_score(Y_test_class, prediction, average=None))

C=20
Accuracy:  0.6694476409666283
Precision:  [0.71836735 0.64971159 0.65195246]
Recall:  [0.78483835 0.73183698 0.43340858]
F-1:  [0.75013319 0.68833333 0.52067797]
MSE:  0.3538550057537399
RMSE:  0.5948571305395438
C=5
Accuracy:  0.6694476409666283
Precision:  [0.71836735 0.64971159 0.65195246]
Recall:  [0.78483835 0.73183698 0.43340858]
F-1:  [0.75013319 0.68833333 0.52067797]
C=1.5
Accuracy:  0.6685845799769851
Precision:  [0.71807967 0.6486911  0.65076661]
Recall:  [0.78372352 0.73183698 0.43115124]
F-1:  [0.74946695 0.6877602  0.51866938]
C=1
Accuracy:  0.6682968929804373
Precision:  [0.71897436 0.64782836 0.65084746]
Recall:  [0.78149387 0.73124631 0.43340858]
F-1:  [0.74893162 0.68701443 0.5203252 ]


In [0]:
#Grid Search for our svm model
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
soft_clf = LogisticRegression(random_state=42, solver="lbfgs",multi_class = "multinomial")
param_distributions = {"C": np.arange(0, 10, 0.1)}
grid_search = GridSearchCV(soft_clf, param_distributions, cv=5, verbose=3, n_jobs=-1)
grid_search.fit(X_train, Y_train_class)
clf_regress = grid_search
print("best estimator: ", grid_search.best_params_)
print("best score: ", grid_search.best_score_)
grid_search.best_estimator_.fit(X_train, Y_train_class)
predictionb = grid_search.best_estimator_.predict(X_test)
print("C=Grid Search")
print("Accuracy: ", accuracy_score(Y_test_class, predictionb))
print("Precision: ",precision_score(Y_test_class, predictionb,average=None))
print("Recall: ", recall_score(Y_test_class, predictionb, average=None))
print("F-1: ", f1_score(Y_test_class, predictionb, average=None))

# Testing your svm here
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import mean_squared_error
print("")
print("C=1=default")
soft_clf = LogisticRegression(random_state=42, solver="lbfgs",multi_class = "multinomial")
soft_clf.fit(X_train,Y_train_class)
prediction = soft_clf.predict(X_test)
print("Accuracy: ", accuracy_score(Y_test_class, prediction))
print("Precision: ",precision_score(Y_test_class, prediction,average=None))
print("Recall: ", recall_score(Y_test_class, prediction, average=None))
print("F-1: ", f1_score(Y_test_class, prediction, average=None))
print("")
print("Difference between default and fine tuning")
print("Accuracy_difference: ", accuracy_score(Y_test_class, predictionb) - accuracy_score(Y_test_class, prediction))
print("Precision_difference: ",precision_score(Y_test_class, predictionb,average=None) - precision_score(Y_test_class, prediction,average=None))
print("Recall_difference: ", recall_score(Y_test_class, predictionb, average=None) - recall_score(Y_test_class, prediction, average=None))
print("F-1_difference: ", f1_score(Y_test_class, predictionb, average=None) - f1_score(Y_test_class, prediction, average=None))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  6.1min finished


best estimator:  {'C': 0.2}
best score:  0.6681291807523556
C=Grid Search
Accuracy:  0.6682968929804373
Precision:  [0.71954964 0.64733542 0.65128205]
Recall:  [0.78372352 0.73183698 0.43002257]
F-1:  [0.75026681 0.6869975  0.51801496]

C=1=default
Accuracy:  0.6682968929804373
Precision:  [0.71897436 0.64782836 0.65084746]
Recall:  [0.78149387 0.73124631 0.43340858]
F-1:  [0.74893162 0.68701443 0.5203252 ]

Difference between default and fine tuning
Accuracy_difference:  0.0
Precision_difference:  [ 0.00057528 -0.00049294  0.00043459]
Recall_difference:  [ 0.00222965  0.00059067 -0.003386  ]
F-1_difference:  [ 1.33518503e-03 -1.69235612e-05 -2.31024744e-03]


In [0]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train_class)
output = clf.predict(X_test)
print("Decision class 3")
print("Accurarcy: ", accuracy_score(Y_test_class,output))
print("Precision: ", precision_score(Y_test_class,output, average=None))
print("Recall: ", recall_score(Y_test_class,output, average=None))
print("f1: ", f1_score(Y_test_class,output, average=None))

Decision class 3
Accurarcy:  0.8331415420023015
Precision:  [0.9003517  0.82122261 0.79190101]
Recall:  [0.85618729 0.84111045 0.79458239]
f1:  [0.87771429 0.83104756 0.79323944]


In [0]:
#Fine tune Decision tree
##Original
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train_class)
output = clf.predict(X_test)
print("Decision class 3")
accprev = accuracy_score(Y_test_class,output)
precprev = precision_score(Y_test_class,output, average=None)
recallprev = recall_score(Y_test_class,output, average=None)
f1prev = f1_score(Y_test_class,output, average=None)
print("Accurarcy: ", accuracy_score(Y_test_class,output))
print("Precision: ", precision_score(Y_test_class,output, average=None))
print("Recall: ", recall_score(Y_test_class,output, average=None))
print("f1: ", f1_score(Y_test_class,output, average=None))

##Fine tune
from sklearn.tree import DecisionTreeClassifier

max_depths = np.linspace(1, 64, 64, endpoint=True)

best_accuracy = None
best_depth = None
best_prec = None
best_rec = None
best_f1 = None
clf_df = None
for max_depth in max_depths:
  dt = DecisionTreeClassifier(max_depth=max_depth)
  dt.fit(X_train, Y_train_class)
  train_pred = dt.predict(X_train)  
  output = dt.predict(X_test)
  
  
  if best_accuracy == None:
    best_accuracy = accuracy_score(Y_test_class,output)
    best_depth = max_depth
    best_prec = precision_score(Y_test_class,output, average=None)
    best_rec = recall_score(Y_test_class,output, average=None)
    best_f1 = f1_score(Y_test_class,output, average=None)
  if accuracy_score(Y_test_class,output) > best_accuracy:
    best_accuracy = accuracy_score(Y_test_class,output)
    best_depth = max_depth
    best_prec = precision_score(Y_test_class,output, average=None)
    best_rec = recall_score(Y_test_class,output, average=None)
    best_f1 = f1_score(Y_test_class,output, average=None)
    clf_df = dt
print("3 classes")
print("Accuracy: ", best_accuracy)
print("Depth: ", best_depth)
print("Precision: ", best_prec)
print("Recall: ", best_rec)
print("F-1: ", best_f1)
print("")
print("The difference")
print("Accuracy: ", best_accuracy - accprev)
print("Precision: ", best_prec - precprev)
print("Recall: ", best_rec - recallprev)
print("F-1: ", best_f1 - f1prev)


Decision class 3
Accurarcy:  0.8268124280782508
Precision:  [0.89929742 0.81426107 0.78142695]
Recall:  [0.85618729 0.83638512 0.77878104]
f1:  [0.87721302 0.82517483 0.78010175]
3 classes
Accuracy:  0.832566168009206
Depth:  59.0
Precision:  [0.9028103  0.81792237 0.79310345]
Recall:  [0.85953177 0.84642646 0.77878104]
F-1:  [0.88063963 0.83193033 0.78587699]

The difference
Accuracy:  0.005753739930955182
Precision:  [0.00351288 0.0036613  0.01167649]
Recall:  [0.00334448 0.01004135 0.        ]
F-1:  [0.00342661 0.00675551 0.00577524]


In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
clf.fit(X_train, Y_train_class)
output = clf.predict(X_test)
print("Accuracy: ", accuracy_score(Y_test_class,output))
print("Precision: ", precision_score(Y_test_class,output, average=None))
print("Recall: ", recall_score(Y_test_class,output, average=None))
print("f1: ", f1_score(Y_test_class,output, average=None))

Accuracy:  0.8075373993095513
Precision:  [0.92560976 0.74450812 0.87010676]
Recall:  [0.84615385 0.92085056 0.55191874]
f1:  [0.88410017 0.82334302 0.67541436]


In [0]:
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
import warnings
warnings.filterwarnings("ignore")
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
param_distributions = {"n_estimators": np.arange(53, 58, 1), 'max_features':[8, 10, 14, 16], 'bootstrap': [True]}
grid_search_random = GridSearchCV(clf, param_distributions, cv=5, verbose=3, n_jobs=-1)
grid_search_random.fit(X_train, Y_train_class)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': array([53, 54, 55, 56, 57]), 'max_features': [8, 10, 14, 16], 'bootstrap': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
clf.fit(X_train, Y_train_class)
output = clf.predict(X_test)
print("Old")
print("Accurarcy: ", accuracy_score(Y_test_class,output))
prevacc = accuracy_score(Y_test_class,output)
print("Precision: ", precision_score(Y_test_class,output, average=None))
prevpre = precision_score(Y_test_class,output, average=None)
print("Recall: ", recall_score(Y_test_class,output, average=None))
prevrec = recall_score(Y_test_class,output, average=None)
print("f1: ", f1_score(Y_test_class,output, average=None))
prevf1 = f1_score(Y_test_class,output, average=None)
print("")
print("Best: ")
print("best estimator: ", grid_search_random.best_params_)
print("best score: ", grid_search_random.best_score_)
output = grid_search_random.predict(X_test)
print("Accurarcy: ", accuracy_score(Y_test_class,output))
newacc = accuracy_score(Y_test_class,output)
print("Precision: ", precision_score(Y_test_class,output, average=None))
newprec = precision_score(Y_test_class,output, average=None)
print("Recall: ", recall_score(Y_test_class,output, average=None))
newrecall = recall_score(Y_test_class,output, average=None)
print("f1: ", f1_score(Y_test_class,output, average=None))
newf1 = f1_score(Y_test_class,output, average=None)
print("")
print("The difference of random forest")
print("Accuracy: ", newacc - prevacc)
print("Precision: ", newprec - prevpre)
print("Recall: ", newrecall - prevrec)
print("F1: ", newf1 - prevf1)

Old
Accurarcy:  0.8075373993095513
Precision:  [0.92560976 0.74450812 0.87010676]
Recall:  [0.84615385 0.92085056 0.55191874]
f1:  [0.88410017 0.82334302 0.67541436]

Best: 
best estimator:  {'bootstrap': True, 'max_features': 14, 'n_estimators': 56}
best score:  0.8616845285190247
Accurarcy:  0.8535673187571922
Precision:  [0.93050648 0.82644628 0.83374384]
Recall:  [0.88071349 0.88600118 0.76410835]
f1:  [0.90492554 0.85518814 0.79740872]

The difference of random forest
Accuracy:  0.0460299194476409
Precision:  [ 0.00489672  0.08193816 -0.03636292]
Recall:  [ 0.03455964 -0.03484938  0.21218962]
F1:  [0.02082537 0.03184513 0.12199435]


In [0]:
#lets use different voting (next one)
from sklearn.ensemble import VotingClassifier
named_estimators = [
    ("random_forest_clf", grid_search_random), #random forest
    ("decision_clf", clf_df), #decision tree
    ("regression_clf", clf_regress), #SVM
]
voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train, Y_train_class)
print("Score: ", voting_clf.score(X_test, Y_test_class))
output = voting_clf.predict(X_test)
print("Accurarcy: ", accuracy_score(Y_test_class,output))
print("Precision: ", precision_score(Y_test_class,output, average=None))
print("Recall: ", recall_score(Y_test_class,output, average=None))
print("f1: ", f1_score(Y_test_class,output, average=None))

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.7min finished


Score:  0.8420598388952819
Accurarcy:  0.8420598388952819
Precision:  [0.90804598 0.81658429 0.82802548]
Recall:  [0.88071349 0.8783225  0.73363431]
f1:  [0.89417091 0.84632897 0.77797726]


In [0]:
#KNN model
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=4)
knn_clf.fit(X_train, Y_train_class)
output = knn_clf.predict(X_test)
print("Accurarcy: ", accuracy_score(Y_test_class,output))
print("Precision: ", precision_score(Y_test_class,output, average=None))
print("Recall: ", recall_score(Y_test_class,output, average=None))
print("f1: ", f1_score(Y_test_class,output, average=None))

Accurarcy:  0.7491369390103567
Precision:  [0.76890756 0.75893398 0.70904926]
Recall:  [0.81605351 0.74010632 0.6986456 ]
f1:  [0.79177934 0.74940191 0.70380898]


In [0]:
from sklearn.ensemble import VotingClassifier
named_estimators = [
    ("random_forest_clf", grid_search_random),
    ("decision_clf", clf_df),
    ("KNN", knn_clf),
    ("regression_clf", clf_regress),
]
voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train, Y_train_class)
print("Score: ", voting_clf.score(X_test, Y_test_class))
output = voting_clf.predict(X_test)
print("Accurarcy: ", accuracy_score(Y_test_class,output))
print("Precision: ", precision_score(Y_test_class,output, average=None))
print("Recall: ", recall_score(Y_test_class,output, average=None))
print("f1: ", f1_score(Y_test_class,output, average=None))

In [0]:
#Voting class without SVM
from sklearn.ensemble import VotingClassifier
named_estimators = [
    ("random_forest_clf", grid_search_random),
    ("decision_clf", clf_df),
    ("KNN", knn_clf),
]
voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train, Y_train_class)
print("Score: ", voting_clf.score(X_test, Y_test_class))
output = voting_clf.predict(X_test)
print("Accurarcy: ", accuracy_score(Y_test_class,output))
print("Precision: ", precision_score(Y_test_class,output, average=None))
print("Recall: ", recall_score(Y_test_class,output, average=None))
print("f1: ", f1_score(Y_test_class,output, average=None))

In [0]:
#Voting classifier without SVM 
## We only have random forest, decision tree, and KNN
from sklearn.ensemble import VotingClassifier
named_estimators = [
    ("random_forest_clf", grid_search_random),
    ("decision_clf", clf_df),
    ("KNN", knn_clf),
]
#hard
print("Hard")
voting_clf = VotingClassifier(named_estimators,voting='hard')
voting_clf.fit(X_train, Y_train_class)
print("Score: ", voting_clf.score(X_test, Y_test_class))
output = voting_clf.predict(X_test)
print("Accurarcy: ", accuracy_score(Y_test_class,output))
print("Precision: ", precision_score(Y_test_class,output, average=None))
print("Recall: ", recall_score(Y_test_class,output, average=None))
print("f1: ", f1_score(Y_test_class,output, average=None))
print("")
#Soft
print("Soft")
voting_clf = VotingClassifier(named_estimators,voting='soft')
voting_clf.fit(X_train, Y_train_class)
print("Score: ", voting_clf.score(X_test, Y_test_class))
output = voting_clf.predict(X_test)
print("Accurarcy: ", accuracy_score(Y_test_class,output))
print("Precision: ", precision_score(Y_test_class,output, average=None))
print("Recall: ", recall_score(Y_test_class,output, average=None))
print("f1: ", f1_score(Y_test_class,output, average=None))

Our hard voting tended to have a higher acc score with the KNN, Random Forest, and Decision Tree classifiers.