In [1]:
import os
import pandas as  pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

### File Retrieval and Preprocessing

In [29]:
files = os.listdir('../final_stats/preprocessed/')
files[:2]

def split_target(df, target_col='away_winner_wts'):
    df = df.copy()
    target = df.pop(target_col)
    return df, target

def normalize_df(df):
    scaler = MinMaxScaler()
    df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    data, target = split_target(df)
    return data, target

def get_data(file):
    df = pd.read_csv(f'../final_stats/preprocessed/{file}', index_col=[0])
    data, target = normalize_df(df)
    return data, target


def remove_last_1(df):
    df = df.copy()
    df.drop(df.iloc[:, 1:52], inplace=True, axis=1)
    df.drop(df.iloc[:, 53:103], inplace=True, axis=1)
    return df

def get_data_skip_last_1(file):
    df = pd.read_csv(f'../final_stats/preprocessed/{file}', index_col=[0])
    df = remove_last_1(df)
    data, target = normalize_df(df)
    return data, target

In [30]:
print(files[0])
x, y = get_data(files[0])
x.head()

preprocessed_no_sent_last_1.csv


Unnamed: 0,home_score_team_last_1,home_score_opp_last_1,home_pass_cmp_off_last_1,home_pass_att_off_last_1,home_pass_yds_off_last_1,home_pass_tds_off_last_1,home_ints_off_last_1,home_sacks_off_last_1,home_sacks_yds_off_last_1,home_pass_yds_per_att_last_1,...,away_ravens,away_saints,away_seahawks,away_steelers,away_texans,away_titans,away_vikings,day_Sat,day_Sun,day_Thu
0,0.470588,0.414634,0.515152,0.315789,0.732323,0.75,0.333333,0.428571,0.278689,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.392157,0.756098,0.636364,0.631579,0.722222,0.75,0.0,0.142857,0.131148,0.54321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.705882,0.414634,0.575758,0.421053,0.671717,0.75,0.0,0.142857,0.098361,0.703704,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.392157,0.243902,0.242424,0.263158,0.406566,0.0,0.0,0.0,0.0,0.444444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.317073,0.424242,0.421053,0.35101,0.0,0.666667,0.571429,0.442623,0.296296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Chart Code

In [7]:
def get_average_acc_history(acc_histories):
    num_epochs = len(acc_histories[0])
    return [np.mean([x[i] for x in acc_histories]) for i in range(num_epochs)]

def get_average_loss_history(loss_histories):
    num_epochs = len(loss_histories[0])
    return [np.mean([x[i] for x in loss_histories]) for i in range(num_epochs)]
        
def plot_ave_acc_and_loss_histories(acc_histories, loss_histories):
    average_acc_history = get_average_acc_history(acc_histories)
    average_loss_history = get_average_loss_history(loss_histories)

    plt.figure(figsize=(15,5))
    
    plt.subplot(1, 2, 1)
    plt.plot(range(1, len(average_acc_history) + 1), average_acc_history)
    plt.xlabel("Epochs")
    plt.ylabel("Validation Accuracy")

    plt.subplot(1, 2, 2)

    plt.plot(range(1, len(average_loss_history) + 1), average_loss_history)
    plt.xlabel("Epochs")
    plt.ylabel("Validation Loss")
    plt.show()

def compare_ave_loss_acc_histories(acc_hist_1, acc_hist_2, loss_hist_1, loss_hist_2):
    min_epochs = min(len(acc_hist_1[0]), len(acc_hist_2[0]))
    
    ave_acc_hist_1 = get_average_acc_history(acc_hist_1)
    ave_loss_hist_1 = get_average_loss_history(loss_hist_1)
    
    ave_acc_hist_2 = get_average_acc_history(acc_hist_2)
    ave_loss_hist_2 = get_average_loss_history(loss_hist_2)


    blue_dots = 'bo'
    solid_blue_line = 'b'
    red_dots = 'ro'
    solid_red_line = 'r'

    epochs = range(1, min_epochs + 1)
    plt.figure(figsize=(15,5))
    
    plt.subplot(1, 2, 1)
    plt.plot(epochs, ave_acc_hist_1, solid_blue_line, label='Model 1')
    plt.plot(epochs, ave_acc_hist_2, solid_red_line, label='Model 2')
    plt.xlabel("Epochs")
    plt.ylabel("Validation Accuracy")
    plt.legend()

    plt.subplot(1, 2, 2)

    plt.plot(epochs, ave_loss_hist_1, solid_blue_line, label='Model 1')
    plt.plot(epochs, ave_loss_hist_2, solid_red_line, label='Model 2')
    plt.xlabel("Epochs")
    plt.ylabel("Validation Loss")
    plt.legend()
    
    plt.show()
    

In [6]:
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

### Repeated K-Fold Cross Validation
Limited in dataset so want to run a few times
Grid search of the learning rate and the n-estimators perameter
Hold out test set of 0.2 

In [36]:

n_ests = [2, 5, 10, 25, 50, 100, 200, 300]
learn_rates = [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]
base_est_depths = list(range(1,11,1))

def get_ada_classifier(n, lr, random_state=99):
    return AdaBoostClassifier(n_estimators=n, learning_rate=lr, random_state=random_state)

def grid_search(X_train, y_train, n_ests, learn_rates, base_depths, base_est='dt'):
    max_score = {
        'scores_mean': 0
    }
    
    all_scores = []
    
    for n in n_ests:
        for rate in learn_rates:
            for depth in base_depths:
                if base_est == 'dt':
                    base = DecisionTreeClassifier(max_depth=depth)
                elif base_est == 'svm':
                    base = SVC(probability=True, kernel='linear')

                clf = AdaBoostClassifier(
                    base_estimator = base,
                    n_estimators=n, 
                    learning_rate=rate, 
                    random_state=99
                )

                cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

                scores = cross_val_score(
                    clf, 
                    X_train, 
                    y_train, 
                    scoring='accuracy', 
                    cv=cv, 
                    n_jobs=-1
                )
                
                scores_dict = {
                    'scores_mean': scores.mean(),
                    'scores_std': scores.std(),
                    'n': n,
                    'depth': depth,
                    'learn_rate': rate,
                    'scores': scores
                }
                all_scores.append(scores_dict)

                if scores.mean() > max_score['scores_mean']:
                    max_score = scores_dict
    return all_scores, max_score


In [31]:
all_scores, max_score = grid_search(x,y, n_ests, learn_rates, base_est_depths[:1])

In [16]:
all_scores_svm, max_score_svm = grid_search(x, y, n_ests, learn_rates, base_depths=[1], base_est='svm')

In [32]:
def get_best_params(all_scores):
    ms = max(all_scores, key=lambda x:x['scores_mean'])
    print(f"Accuracy: {round(ms['scores_mean']*100,2)}%")
    print(f"Best Paramaters: \nn_estimators: {ms['n']} \t learning_rate: {ms['learn_rate']} \t depth: {ms['depth']}")

#### Best Params for DT Base

In [33]:
get_best_params(all_scores)

Accuracy: 60.58%
Best Paramaters: 
n_estimators: 100 	 learning_rate: 0.1 	 depth: 1


#### Best Params for SVM Base


In [19]:
get_best_params(all_scores_svm)

Accuracy: 51.26%
Best Paramaters: 
n_estimators: 2 	 learning_rate: 0.5 	 depth: 1


Decision Tree best option

### Train Base Model and Test

In [37]:
def train_base_model(max_score, X_train, y_train, X_test, y_test):
    base_model = get_ada_classifier(max_score['n'], max_score['learn_rate'], max_score['depth'])
    base_model.fit(X_train, y_train)

    score = base_model.score(X_test, y_test)

    print(f"Final Accuracy: {round(score*100, 2)}%")
    
    return score

### Base model

### Run for every dataset on base model??


In [41]:
def all_datasets(files):
    n_ests = [10, 50, 100, 200, 300]
    learn_rates = [0.01, 0.05, 0.1, 1, 5]
    base_est_depths = list(range(1,4,1))

    results = []

    for file in files:
        x, y = get_data(file)
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

        all_scores, max_score = grid_search(X_train, y_train, n_ests, learn_rates, base_est_depths)

        print('Data: ', file)
        test_score = train_base_model(max_score, X_train, y_train, X_test, y_test)

        results.append([file, test_score, max_score])
    files2 = [f for f in files if 'last_1' not in f]
    for file in files2:
        x,y = get_data_skip_last_1(file)
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
        all_scores, max_score = grid_search(X_train, y_train, n_ests, learn_rates, base_est_depths)
        test_score = train_base_model(max_score, X_train, y_train, X_test, y_test)
        filename = file + 'no_last_1'
        results.append([filename, test_score, max_score])

    return results

results = all_datasets(files)


Data:  preprocessed_no_sent_last_1.csv
Final Accuracy: 57.14%
Data:  preprocessed_no_sent_last_10.csv
Final Accuracy: 57.14%
Data:  preprocessed_no_sent_last_3.csv
Final Accuracy: 42.86%
Data:  preprocessed_no_sent_last_5.csv
Final Accuracy: 66.67%
Data:  preprocessed_no_sent_last_7.csv
Final Accuracy: 61.9%
Data:  preprocessed_sent_24_last_1.csv
Final Accuracy: 57.14%
Data:  preprocessed_sent_24_last_10.csv
Final Accuracy: 57.14%
Data:  preprocessed_sent_24_last_3.csv
Final Accuracy: 42.86%
Data:  preprocessed_sent_24_last_5.csv
Final Accuracy: 52.38%
Data:  preprocessed_sent_24_last_7.csv
Final Accuracy: 61.9%
Data:  preprocessed_sent_96_last_1.csv
Final Accuracy: 71.43%
Data:  preprocessed_sent_96_last_10.csv
Final Accuracy: 57.14%
Data:  preprocessed_sent_96_last_3.csv
Final Accuracy: 42.86%
Data:  preprocessed_sent_96_last_5.csv
Final Accuracy: 57.14%
Data:  preprocessed_sent_96_last_7.csv
Final Accuracy: 61.9%
Data:  preprocessed_sent_cross_last_1.csv
Final Accuracy: 57.14%
Data:

In [47]:
def get_top_score(results):
    top_score = 0
    top_index = 0
    for i, result in enumerate(results):
        # print(f'Test Accuracy: {result[1]}')
        if result[1] >= top_score:
            top_score = result[1]
            top_index = i
            print(i, top_score)

    print('Top Score: ', top_score, '\t Top Data: ', results[top_index][0])
    return results[top_index]

get_top_score(results)

0 0.5714285714285714
1 0.5714285714285714
3 0.6666666666666666
10 0.7142857142857143
30 0.7142857142857143
Top Score:  0.7142857142857143 	 Top Data:  preprocessed_sent_cross_last_5.csvno_last_1


['preprocessed_sent_cross_last_5.csvno_last_1',
 0.7142857142857143,
 {'scores_mean': 0.5601851851851852,
  'scores_std': 0.18416385022891585,
  'n': 200,
  'depth': 1,
  'learn_rate': 0.1,
  'scores': array([0.44444444, 0.44444444, 0.88888889, 0.66666667, 0.5       ,
         0.25      , 0.75      , 0.5       , 0.625     , 0.625     ,
         0.66666667, 0.55555556, 0.33333333, 0.55555556, 0.75      ,
         0.5       , 0.5       , 0.5       , 0.75      , 0.625     ,
         0.77777778, 0.55555556, 0.22222222, 0.44444444, 0.25      ,
         0.875     , 0.25      , 0.875     , 0.625     , 0.5       ])}]

In [48]:
results[10][0]

'preprocessed_sent_96_last_1.csv'

### Old Chart Code Functions

In [63]:
def plot_train_val_loss(history):
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epochs = range(1, len(loss) + 1)

    blue_dots = 'bo'
    solid_blue_line = 'b'

    plt.plot(epochs, loss, blue_dots, label = 'Training loss')
    plt.plot(epochs, val_loss, solid_blue_line, label = 'Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.show()
    
def plot_train_val_acc(history):
    plt.clf()

    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
        
    epochs = range(1, len(acc) + 1)

    blue_dots = 'bo'
    solid_blue_line = 'b'
        
    plt.plot(epochs, acc, blue_dots, label = 'Training Accuracy')
    plt.plot(epochs, val_acc, solid_blue_line, label = 'Validation Accuracy')
    plt.title('Training and validation acc')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

def plot_train_val_loss_acc(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']

    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epochs_range = range(1, len(acc) + 1)
    blue_dots = 'bo'
    solid_blue_line = 'b'

    plt.figure(figsize=(15, 5))

    plt.subplot(1, 2, 1)
    # plt.plot(epochs_range, acc, label='Training Accuracy')
    # plt.plot(epochs_range, val_acc, label='Validation Accuracy')
    
 
    plt.plot(epochs_range, acc, blue_dots, label = 'Training Accuracy')
    plt.plot(epochs_range, val_acc, solid_blue_line, label = 'Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, loss, blue_dots, label='Training Loss')
    plt.plot(epochs_range, val_loss, solid_blue_line, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.show()
  
def compare_loss_and_acc(hist1, hist2, title1, title2): 
    min_epochs = min(len(hist1.history['loss']), len(hist2.history['loss']))
    
    train_loss_1 = hist1['loss'][:min_epochs]
    val_loss_1 = hist1['val_loss'][:min_epochs]

    train_loss_2 = hist2['loss'][:min_epochs]
    val_loss_2 = hist2['val_loss'][:min_epochs]


    train_acc_1 = hist1['accuracy'][:min_epochs]
    val_acc_1 = hist1['val_accuracy'][:min_epochs]

    train_acc_2 = hist2['accuracy'][:min_epochs]
    val_acc_2 = hist2['val_accuracy'][:min_epochs]

    epochs = range(1, min_epochs + 1)

    blue_dots = 'bo'
    solid_blue_line = 'b'
    red_dots = 'ro'
    solid_red_line = 'r'

    plt.figure(figsize=(15,5))
    plt.subplot(1, 2, 1)

    plt.plot(epochs, train_loss_1, blue_dots, label = f'{title1} Training Loss')
    plt.plot(epochs, val_loss_1, solid_blue_line, label = f'{title1} Validation Loss')
    plt.plot(epochs, train_loss_2, red_dots, label = f'{title2} Training Loss')
    plt.plot(epochs, val_loss_2, solid_red_line, label = f'{title2} Validation Loss')
    # plt.title('Loss With Dropout Layers')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_acc_1, blue_dots, label = f'{title1} Training Accuracy')
    plt.plot(epochs, val_acc_1, solid_blue_line, label = f'{title1} Validation Accuracy')
    plt.plot(epochs, train_acc_2, red_dots, label = f'{title2} Training Accuracy')
    plt.plot(epochs, val_acc_2, solid_red_line, label = f'{title2} Validation Accuracy')
    # plt.title('Accuracy With Dropout Layers')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')

    plt.show()

def compare_acc(hist1, hist2, title1, title2): 
    min_epochs = min(len(hist1.history['accuracy']), len(hist2.history['accuracy']))
    acc = hist1.history['accuracy'][:min_epochs]
    val_acc = hist1.history['val_accuracy'][:min_epochs]
    acc_drop = hist2.history['accuracy'][:min_epochs]
    val_acc_drop = hist2.history['val_accuracy'][:min_epochs]
    
    epochs = range(1, min_epochs + 1)

    blue_dots = 'bo'
    solid_blue_line = 'b'
    red_dots = 'ro'
    solid_red_line = 'r'

    plt.plot(epochs, acc, blue_dots, label = f'{title1} Training Accuracy')
    plt.plot(epochs, val_acc, solid_blue_line, label = f'{title1} Validation Accuracy')
    plt.plot(epochs, acc_drop, red_dots, label = f'{title2} Training Accuracy')
    plt.plot(epochs, val_acc_drop, solid_red_line, label = f'{title2} Validation Accuracy')
    # plt.title('Accuracy With Dropout Layers')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.show()