In [2]:
!pip install lightgbm



In [7]:
import sys
sys.path.append('..')
import numpy as np
import pickle
from data import dataset_loader
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
#from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
import warnings
warnings.filterwarnings('ignore') 
#warnings.filterwarnings("ignore", category=FutureWarning)
#warnings.filterwarnings("ignore", category=ConvergenceWarning)
#warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

def run_metrics(clf,train_x,train_y,test_x,test_y):
    """
    Print ML classifier metrics on train and test
    """
    accuracyTrain = clf.score(train_x, train_y)
    print(f'Accuracy on train set: {accuracyTrain}')
    
    accuracy = clf.score(test_x, test_y)
    print(f'Accuracy on test set: {accuracy}')

    precisionTrain = precision_score(train_y, clf.predict(train_x), average="macro")
    print(f'Precision on train set: {precisionTrain}')
    
    precision = precision_score(test_y, clf.predict(test_x), average="macro")
    print(f'Precision on test set: {precision}')

    recallTrain = recall_score(train_y, clf.predict(train_x), average="macro")
    print(f'Recall on train set: {recallTrain}')
    
    recall = recall_score(test_y, clf.predict(test_x), average="macro")
    print(f'Recall on test set: {recall}')
    
    f1Train = f1_score(train_y, clf.predict(train_x), average="macro")
    print(f'F1-Score on train set: {f1Train}')
    
    f1 = f1_score(test_y, clf.predict(test_x), average="macro")
    print(f'F1-Score on test set: {f1}')    
    
    #rocTrain = roc_auc_score(train_y, clf.predict(train_x))
    #print(f'ROC on train set: {rocTrain}')
    
    #roc = roc_auc_score(test_y, clf.predict(test_x))
    #print(f'ROC on test set: {rocTest}')

def run_naive_bayes(train_x, train_y, test_x, test_y):
    clf = GaussianNB()
    clf.fit(train_x, train_y)
    print("Naive Bayes")
    run_metrics(clf,train_x,train_y,test_x,test_y)
    
def run_logistic_regression(train_x, train_y, test_x, test_y):
    clf = LogisticRegression()
    clf.fit(train_x, train_y)
    print("Logistic Regression")
    run_metrics(clf,train_x,train_y,test_x,test_y)
    
def run_elastic_net(train_x, train_y, test_x, test_y):
    clf = LogisticRegression(penalty='elasticnet',max_iter=200,solver='saga',l1_ratio=.9)
    clf.fit(train_x, train_y)
    print("Elastic Net")
    run_metrics(clf,train_x,train_y,test_x,test_y)
    
def run_knn(train_x, train_y, test_x, test_y):
    clf = KNeighborsClassifier(n_neighbors=9)
    clf.fit(train_x, train_y)
    print("KNN")
    run_metrics(clf,train_x,train_y,test_x,test_y)

def run_decision_tree(train_x, train_y, test_x, test_y):
    clf = DecisionTreeClassifier()
    clf.fit(train_x, train_y)
    print("Decision Tree")
    run_metrics(clf,train_x,train_y,test_x,test_y)
    
def run_random_forest(train_x, train_y, test_x, test_y, num_trees):
    clf = RandomForestClassifier(n_estimators=num_trees)
    clf.fit(train_x, train_y)
    print("Random Forest")
    run_metrics(clf,train_x,train_y,test_x,test_y)
    
def run_xgboost(train_x, train_y, test_x, test_y, num_trees):
    clf = GradientBoostingClassifier(n_estimators=num_trees)
    clf.fit(train_x, train_y)
    print("XG Boost")
    run_metrics(clf,train_x,train_y,test_x,test_y)
    
def run_adaboost(train_x, train_y, test_x, test_y, num_trees):
    clf = AdaBoostClassifier(n_estimators=num_trees)
    clf.fit(train_x, train_y)
    print("ADA Boost")
    run_metrics(clf,train_x,train_y,test_x,test_y)
    
def run_ensemble(train_x, train_y, test_x, test_y):
    #Elastic Net
    el = LogisticRegression(penalty='elasticnet',max_iter=200,solver='saga',l1_ratio=.9)
    el.fit(train_x, train_y)
    
    #Random Forest
    rf = RandomForestClassifier(n_estimators=1000)
    rf.fit(train_x, train_y)
    
    #KNN
    knn = KNeighborsClassifier(n_neighbors=9)
    knn.fit(train_x, train_y)    
    
    #Record Classifiers
    estimators=[('elastic-net',el), ('random-forest',rf), ('k-nearest-neighbors',knn)]

    #Create Ensemble
    ensemble = VotingClassifier(estimators,voting='hard')
    ensemble.fit(train_x, train_y)
    run_metrics(ensemble,train_x,train_y,test_x,test_y)

def main():
    # Only do these lines once.
    np.random.seed(2)
    df = dataset_loader.get_dataset_df('../data/csvs/f1_public.csv', '../data/csvs/translated_dataset.csv')
    train_x, train_y, test_x, test_y = dataset_loader.get_train_test_split(df)
    
    # Add model runs here.
    #run_naive_bayes(train_x, train_y, test_x, test_y)
    #run_logistic_regression(train_x, train_y, test_x, test_y)
    #run_elastic_net(train_x, train_y, test_x, test_y)
    #run_knn(train_x, train_y, test_x, test_y)
    #run_decision_tree(train_x, train_y, test_x, test_y)
    #run_random_forest(train_x, train_y, test_x, test_y, num_trees=1000)
    #run_xgboost(train_x, train_y, test_x, test_y, num_trees=1000)
    #run_adaboost(train_x, train_y, test_x, test_y, num_trees=1000)
    run_ensemble(train_x, train_y, test_x, test_y)
    
if __name__ == '__main__':
    main()

input dataset loaded from ../data/csvs/f1_public.csv
output dataset loaded to ../data/csvs/translated_dataset.csv
Dataset shape: (21049, 80)
0.8990450852772103 : 0.10095491472278968 split for testing starting at injury date 2006.
Accuracy on train set: 0.8618685267385331
Accuracy on test set: 0.7152941176470589
Precision on train set: 0.8430562592825199
Precision on test set: 0.466065283706925
Recall on train set: 0.6169163202153569
Recall on test set: 0.4537585336732389
F1-Score on train set: 0.6236285610508108
F1-Score on test set: 0.4466021589607382
