# Import Data

In [61]:
# Import the DataFetcher module (DataFetcher.py); it exposes the DataFetcher class used below.
import os

import DataFetcher

# Location of the preprocessed dataset.
# The default is the original author's machine-specific path; set the
# CARDIAC_DATASET_PATH environment variable to run this notebook on another
# machine without editing the cell.
DATASET_PATH = os.environ.get(
    "CARDIAC_DATASET_PATH",
    "C:\\Users\\kruth\\OneDrive\\Desktop\\Cardiac_Events_ML\\preprocessing\\final_dataset.csv",
)

data_fetcher = DataFetcher.DataFetcher(DATASET_PATH)

# Train/test partitions as returned by the fetcher's getters.
X_train = data_fetcher.get_X_train()
X_test = data_fetcher.get_X_test()
y_train = data_fetcher.get_y_train()
y_test = data_fetcher.get_y_test()

# Feature names, target names and the full (unsplit) X / y.
features = data_fetcher.get_features()
target_names = data_fetcher.get_target_names()
X = data_fetcher.get_X()
y = data_fetcher.get_y()

# Hard and Soft Voting on 3 Classifiers

In [62]:
from pathlib import Path

import joblib

# Build the model paths with pathlib instead of 'joblib_dump\XXX.pkl' literals:
# the backslash sequences in those literals (\L, \D, \S) are invalid string
# escapes (deprecated, and reported as a warning by Python), and a backslash is
# only a directory separator on Windows.
MODEL_DIR = Path('joblib_dump')

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model dumps that come from a trusted source.
lr = joblib.load(MODEL_DIR / 'LRC.pkl')
dt = joblib.load(MODEL_DIR / 'DTC.pkl')
svm = joblib.load(MODEL_DIR / 'SVM.pkl')

In [63]:
from sklearn.ensemble import VotingClassifier as vc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# The same (name, estimator) pairs feed both ensembles; define them once so the
# hard and soft variants cannot drift apart.
estimators = [('lr', lr), ('svm', svm), ('dt', dt)]

# Hard voting: every estimator casts one vote (its predicted class label) and
# the majority label is returned.
# VotingClassifier.fit() trains clones of the estimators it is given, so the
# ensemble members are re-fitted on X_train while the loaded lr/svm/dt objects
# themselves are left untouched.
voting_clf_hard = vc(estimators=estimators, voting='hard')
voting_clf_hard.fit(X_train, y_train)

# Baseline: test-set predictions of each individually loaded model.
y_pred_lr = lr.predict(X_test)
y_pred_svm = svm.predict(X_test)
y_pred_dt = dt.predict(X_test)
y_pred_hard = voting_clf_hard.predict(X_test)

print("Accuracy of LR ", accuracy_score(y_test, y_pred_lr))
print("Accuracy of SVM ", accuracy_score(y_test, y_pred_svm))
print("Accuracy of DT ", accuracy_score(y_test, y_pred_dt),"\n")
print("Accuracy of hard voting: ", accuracy_score(y_test, y_pred_hard))



# Soft voting: the predicted class is the argmax of the summed class
# probabilities (predict_proba) over all estimators, so this is not a majority
# vote and every estimator must implement predict_proba.
voting_clf_soft = vc(estimators=estimators, voting='soft')
voting_clf_soft.fit(X_train, y_train)
y_pred_soft = voting_clf_soft.predict(X_test)
print("Accuracy of soft voting: ", accuracy_score(y_test, y_pred_soft), "\n")



Acurracy of LR  0.65
Accuracy of SVM  0.5
Accuracy of DT  0.9 

Acurracy of hard voting:  0.9
Acurracy of soft voting:  0.9333333333333333 



## Hard and Soft Voting on 7 Classifiers

In [64]:
from pathlib import Path

import joblib

# Build the model paths with pathlib instead of 'joblib_dump\XXX.pkl' literals:
# the backslash sequences in those literals (\L, \D, \S, \R, \A, \G, \K) are
# invalid string escapes (deprecated, and reported as a warning by Python), and
# a backslash is only a directory separator on Windows.
MODEL_DIR = Path('joblib_dump')

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model dumps that come from a trusted source.
lr = joblib.load(MODEL_DIR / 'LRC.pkl')
dt = joblib.load(MODEL_DIR / 'DTC.pkl')
svm = joblib.load(MODEL_DIR / 'SVM.pkl')
rf = joblib.load(MODEL_DIR / 'RFC.pkl')
ab = joblib.load(MODEL_DIR / 'ABC.pkl')
gb = joblib.load(MODEL_DIR / 'GBC.pkl')
knn = joblib.load(MODEL_DIR / 'KNN.pkl')

In [65]:
# The same (name, estimator) pairs feed both ensembles; define them once so the
# hard and soft variants cannot drift apart.
estimators = [('lr', lr), ('svm', svm), ('dt', dt), ('rf', rf), ('ab', ab), ('gb', gb), ('knn', knn)]

# Hard voting: every estimator casts one vote (its predicted class label) and
# the majority label is returned.
# VotingClassifier.fit() trains clones of the estimators it is given, so the
# ensemble members are re-fitted on X_train while the loaded models themselves
# are left untouched.
voting_clf_hard = vc(estimators=estimators, voting='hard')
voting_clf_hard.fit(X_train, y_train)

# Baseline: test-set predictions of each individually loaded model.
y_pred_lr = lr.predict(X_test)
y_pred_svm = svm.predict(X_test)
y_pred_dt = dt.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_ab = ab.predict(X_test)
y_pred_gb = gb.predict(X_test)
y_pred_knn = knn.predict(X_test)
y_pred_hard = voting_clf_hard.predict(X_test)

print("Accuracy of LR ", accuracy_score(y_test, y_pred_lr))
print("Accuracy of SVM ", accuracy_score(y_test, y_pred_svm))
print("Accuracy of DT ", accuracy_score(y_test, y_pred_dt))
print("Accuracy of RF ", accuracy_score(y_test, y_pred_rf))
print("Accuracy of AB ", accuracy_score(y_test, y_pred_ab))
print("Accuracy of GB ", accuracy_score(y_test, y_pred_gb))
print("Accuracy of KNN ", accuracy_score(y_test, y_pred_knn),"\n")

print("Accuracy of hard voting: ", accuracy_score(y_test, y_pred_hard))



# Soft voting: the predicted class is the argmax of the summed class
# probabilities (predict_proba) over all estimators, so this is not a majority
# vote and every estimator must implement predict_proba.
voting_clf_soft = vc(estimators=estimators, voting='soft')
voting_clf_soft.fit(X_train, y_train)
y_pred_soft = voting_clf_soft.predict(X_test)
print("Accuracy of soft voting: ", accuracy_score(y_test, y_pred_soft), "\n")


Acurracy of LR  0.65
Accuracy of SVM  0.5
Accuracy of DT  0.9
Accuracy of RF  0.95
Accuracy of AB  0.9333333333333333
Accuracy of GB  0.9
Accuracy of KNN  0.6166666666666667 

Acurracy of hard voting:  0.9166666666666666
Acurracy of soft voting:  0.8833333333333333 



# Bagging and Pasting

## Bagging on Decision Tree 

In [81]:
from sklearn.ensemble import BaggingClassifier as bc
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.metrics import accuracy_score

# Bagging (bootstrap=True): each tree is trained on a random sample of the
# training instances drawn WITH replacement.
# Prediction: the class with the highest mean predicted probability over all
# trees (plain voting is only used for estimators without predict_proba).
# Background:
#   RANDOM PATCHES   - sampling both training instances and features.
#   RANDOM SUBSPACES - sampling features while keeping all training instances.
#   Sampling features gives more predictor diversity, trading a bit more bias
#   for lower variance. (Neither is active here: max_features=1.0 keeps every feature.)
bagging_clf = bc(
    # Base estimator passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, so the positional form works on every version.
    dtc(),
    n_estimators=100,  # number of trees in the ensemble
    max_samples=50,  # int: number of instances drawn from X_train for each tree
    max_features=1.0,  # float: fraction of features drawn for each tree (1.0 = all)
    bootstrap=True,  # instances are drawn with replacement
    bootstrap_features=False,  # features are drawn without replacement
    oob_score=True,  # estimate generalization accuracy on the out-of-bag instances
    random_state=42,  # fixed seed so the OOB score and accuracy are reproducible
    n_jobs=-1,
    verbose=1,
)

bagging_clf.fit(X_train, y_train)
# Mean accuracy measured on the out-of-bag instances.
print("OOb Score: ", bagging_clf.oob_score_)
# Out-of-bag class probabilities, one row per training instance.
print("Decision function: ", bagging_clf.oob_decision_function_)
y_pred_bagging = bagging_clf.predict(X_test)
print("Accuracy of bagging: ", accuracy_score(y_test, y_pred_bagging))

OOb Score:  0.7866108786610879
Decision function:  [[0.23684211 0.76315789]
 [0.54320988 0.45679012]
 [0.92307692 0.07692308]
 [0.08860759 0.91139241]
 [0.275      0.725     ]
 [0.2        0.8       ]
 [0.48809524 0.51190476]
 [0.24050633 0.75949367]
 [0.73863636 0.26136364]
 [0.93243243 0.06756757]
 [0.34146341 0.65853659]
 [0.90909091 0.09090909]
 [0.32051282 0.67948718]
 [0.4939759  0.5060241 ]
 [0.30263158 0.69736842]
 [0.34146341 0.65853659]
 [0.05194805 0.94805195]
 [0.2987013  0.7012987 ]
 [0.85185185 0.14814815]
 [0.41558442 0.58441558]
 [0.6547619  0.3452381 ]
 [0.48148148 0.51851852]
 [0.3375     0.6625    ]
 [0.59722222 0.40277778]
 [0.55696203 0.44303797]
 [0.17283951 0.82716049]
 [0.04819277 0.95180723]
 [0.13580247 0.86419753]
 [0.37837838 0.62162162]
 [0.58441558 0.41558442]
 [0.6835443  0.3164557 ]
 [0.01123596 0.98876404]
 [0.8313253  0.1686747 ]
 [0.38961039 0.61038961]
 [0.33333333 0.66666667]
 [0.85897436 0.14102564]
 [0.35897436 0.64102564]
 [0.52941176 0.47058824]

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.0s finished


## Pasting with Decision Tree

In [86]:
# Pasting (bootstrap=False): each tree is trained on a random sample of the
# training instances drawn WITHOUT replacement.
# Prediction works exactly as for bagging: the class with the highest mean
# predicted probability over all trees (plain voting is only used for
# estimators without predict_proba).
pasting_clf = bc(
    # Base estimator passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, so the positional form works on every version.
    dtc(),
    n_estimators=100,  # number of trees in the ensemble
    max_samples=50,  # int: number of instances drawn from X_train for each tree
    max_features=1.0,  # float: fraction of features drawn for each tree (1.0 = all)
    bootstrap=False,  # instances are drawn without replacement (pasting)
    bootstrap_features=False,  # features are drawn without replacement
    random_state=42,  # fixed seed so the reported accuracy is reproducible
    n_jobs=-1,
    verbose=1,
)

pasting_clf.fit(X_train, y_train)
y_pred_pasting = pasting_clf.predict(X_test)
print("Accuracy of pasting: ", accuracy_score(y_test, y_pred_pasting))


Acurracy of pasting:  0.8666666666666667


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.0s finished


# Random Forest

In [108]:
# A Random Forest is an ensemble of Decision Trees, generally trained via the
# bagging method (or sometimes pasting), typically with max_samples set to the
# size of the training set.
# The algorithm adds extra randomness when growing trees: instead of searching
# for the very best feature when splitting a node, it searches for the best
# feature among a random subset of features.

from sklearn.ensemble import RandomForestClassifier as rfc

rf_clf = rfc(
    n_estimators=1000,
    max_depth=2,
    # Untuned alternatives kept from experimentation:
    #max_features=2, #if 2, accuracy = 0.95
    #max_samples=50,
    #max_leaf_nodes=4,
    bootstrap=True,  # bagging: each tree sees a bootstrap sample
    min_samples_leaf=8,
    min_samples_split=4,
    random_state=42,  # fixed seed for reproducible results
    n_jobs=-1,
    verbose=1,
)

rf_clf.fit(X_train, y_train)
# NOTE: this rebinds y_pred_rf, which the earlier voting cell also assigns
# (there it holds the predictions of the loaded RFC.pkl model).
y_pred_rf = rf_clf.predict(X_test)
print("Accuracy of Random Forest: ", accuracy_score(y_test, y_pred_rf))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    0.7s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s


Acurracy of Random Forest:  0.95


[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    0.0s finished
