In [40]:
import os
import pandas as  pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

### File Retrieval and Preprocessing

In [73]:
files = os.listdir('../final_stats/preprocessed/')
files[:2]

def split_target(df, target_col='away_winner_wts'):
    df = df.copy()
    target = df.pop(target_col)
    return df, target

def normalize_df(df):
    scaler = MinMaxScaler()
    df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    data, target = split_target(df)
    return data, target

def get_data(file):
    df = pd.read_csv(f'../final_stats/preprocessed/{file}', index_col=[0])
    data, target = normalize_df(df)
    return data, target

def remove_last_1(df):
    df = df.copy()
    df.drop(df.iloc[:, 1:52], inplace=True, axis=1)
    df.drop(df.iloc[:, 53:103], inplace=True, axis=1)
    return df

def get_data_skip_last_1(file):
    df = pd.read_csv(f'../final_stats/preprocessed/{file}', index_col=[0])
    df = remove_last_1(df)
    data, target = normalize_df(df)
    return data, target


In [65]:
print(files[1])
x, y = get_data(files[1])
x.head()

preprocessed_no_sent_last_10.csv


Unnamed: 0,home_score_team_last_1,home_score_opp_last_1,home_pass_cmp_off_last_1,home_pass_att_off_last_1,home_pass_yds_off_last_1,home_pass_tds_off_last_1,home_ints_off_last_1,home_sacks_off_last_1,home_sacks_yds_off_last_1,home_pass_yds_per_att_last_1,...,away_ravens,away_saints,away_seahawks,away_steelers,away_texans,away_titans,away_vikings,day_Sat,day_Sun,day_Thu
0,0.470588,0.414634,0.515152,0.315789,0.732323,0.75,0.333333,0.428571,0.278689,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.392157,0.756098,0.636364,0.631579,0.722222,0.75,0.0,0.142857,0.131148,0.54321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.705882,0.414634,0.575758,0.421053,0.671717,0.75,0.0,0.142857,0.098361,0.703704,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.392157,0.243902,0.242424,0.263158,0.406566,0.0,0.0,0.0,0.0,0.444444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.317073,0.424242,0.421053,0.35101,0.0,0.666667,0.571429,0.442623,0.296296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Chart Code

In [4]:
def get_average_acc_history(acc_histories):
    num_epochs = len(acc_histories[0])
    return [np.mean([x[i] for x in acc_histories]) for i in range(num_epochs)]

def get_average_loss_history(loss_histories):
    num_epochs = len(loss_histories[0])
    return [np.mean([x[i] for x in loss_histories]) for i in range(num_epochs)]
        
def plot_ave_acc_and_loss_histories(acc_histories, loss_histories):
    average_acc_history = get_average_acc_history(acc_histories)
    average_loss_history = get_average_loss_history(loss_histories)

    plt.figure(figsize=(15,5))
    
    plt.subplot(1, 2, 1)
    plt.plot(range(1, len(average_acc_history) + 1), average_acc_history)
    plt.xlabel("Epochs")
    plt.ylabel("Validation Accuracy")

    plt.subplot(1, 2, 2)

    plt.plot(range(1, len(average_loss_history) + 1), average_loss_history)
    plt.xlabel("Epochs")
    plt.ylabel("Validation Loss")
    plt.show()

def compare_ave_loss_acc_histories(acc_hist_1, acc_hist_2, loss_hist_1, loss_hist_2):
    min_epochs = min(len(acc_hist_1[0]), len(acc_hist_2[0]))
    
    ave_acc_hist_1 = get_average_acc_history(acc_hist_1)
    ave_loss_hist_1 = get_average_loss_history(loss_hist_1)
    
    ave_acc_hist_2 = get_average_acc_history(acc_hist_2)
    ave_loss_hist_2 = get_average_loss_history(loss_hist_2)


    blue_dots = 'bo'
    solid_blue_line = 'b'
    red_dots = 'ro'
    solid_red_line = 'r'

    epochs = range(1, min_epochs + 1)
    plt.figure(figsize=(15,5))
    
    plt.subplot(1, 2, 1)
    plt.plot(epochs, ave_acc_hist_1, solid_blue_line, label='Model 1')
    plt.plot(epochs, ave_acc_hist_2, solid_red_line, label='Model 2')
    plt.xlabel("Epochs")
    plt.ylabel("Validation Accuracy")
    plt.legend()

    plt.subplot(1, 2, 2)

    plt.plot(epochs, ave_loss_hist_1, solid_blue_line, label='Model 1')
    plt.plot(epochs, ave_loss_hist_2, solid_red_line, label='Model 2')
    plt.xlabel("Epochs")
    plt.ylabel("Validation Loss")
    plt.legend()
    
    plt.show()
    

In [33]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.naive_bayes import GaussianNB


### Repeated K-Fold Cross Validation
Limited in dataset so want to run a few times
Grid search of the learning rate and the n-estimators perameter
Hold out test set of 0.2 

In [91]:


def get_nb_model(smoothing=1e-9):
    clf = GaussianNB(var_smoothing=smoothing)
    return clf

def grid_search(X_train, y_train, smoothings):
    max_score = {
        'scores_mean': 0
    }

    all_scores = []
    
    for n in smoothings:
        clf = get_nb_model(n)

        cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

        scores = cross_val_score(
            clf, 
            X_train, 
            y_train, 
            scoring='accuracy', 
            cv=cv, 
            n_jobs=-1
        )
        scores_dict = {
            'scores_mean': scores.mean(),
            'scores_std': scores.std(),
            'smoothing': n,
            'scores': scores
        }
        all_scores.append(scores_dict)

        if scores.mean() > max_score['scores_mean']:
            max_score = scores_dict
    return all_scores, max_score


In [92]:
smoothings = np.logspace(0,-9, num=100)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

all_scores_1, max_score_1 = grid_search(X_train,y_train, smoothings)

In [48]:
print(len(all_scores_1))
max_score_1

100


{'scores_mean': 0.562037037037037,
 'scores_std': 0.17084776110910707,
 'smoothing': 1.0,
 'scores': array([0.33333333, 0.44444444, 0.55555556, 0.66666667, 0.5       ,
        0.75      , 0.625     , 0.5       , 0.75      , 0.625     ,
        0.55555556, 0.33333333, 0.33333333, 0.77777778, 0.625     ,
        0.25      , 0.75      , 0.5       , 0.625     , 0.75      ,
        0.55555556, 0.44444444, 0.55555556, 0.55555556, 0.125     ,
        0.75      , 0.375     , 0.75      , 0.75      , 0.75      ])}

In [49]:
def get_best_params(all_scores):
    ms = max(all_scores, key=lambda x:x['scores_mean'])
    print(f"Accuracy: {round(ms['scores_mean']*100,2)}%")
    print(f"Best Paramaters: \n var_smoothing: {ms['smoothing']}")



In [50]:
get_best_params(all_scores_1)

Accuracy: 56.2%
Best Paramaters: 
 var_smoothing: 1.0


### Train Base Model and Test

In [95]:
def test_model(max_score, X_train, y_train, X_test, y_test):
    base_model = get_nb_model(max_score['smoothing'])
    base_model.fit(X_train, y_train)

    score = base_model.score(X_test, y_test)
    print(f"Final Accuracy: {round(score*100, 2)}%")

    return score

In [52]:
score = test_model(max_score_1, x, y)

Final Accuracy: 42.86%


### Run for every dataset on base model??


In [96]:

# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

def all_datasets(files):
    results = [ ]
    smoothings = np.logspace(0,-9, num=100)

    for file in files:
        print(file)
        x, y = get_data(file)
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
        # print (np.shape(X_train), np.shape(X_test))
        all_scores, max_score = grid_search(X_train, y_train, smoothings)
        # print(max_score)
        test_score = test_model(max_score, X_train, y_train, X_test, y_test)
        results.append([file, test_score, max_score])
    return results

results = all_datasets(files)


preprocessed_no_sent_last_1.csv
Final Accuracy: 42.86%
preprocessed_no_sent_last_10.csv
Final Accuracy: 42.86%
preprocessed_no_sent_last_3.csv
Final Accuracy: 42.86%
preprocessed_no_sent_last_5.csv
Final Accuracy: 57.14%
preprocessed_no_sent_last_7.csv
Final Accuracy: 47.62%
preprocessed_sent_24_last_1.csv
Final Accuracy: 47.62%
preprocessed_sent_24_last_10.csv
Final Accuracy: 52.38%
preprocessed_sent_24_last_3.csv
Final Accuracy: 42.86%
preprocessed_sent_24_last_5.csv
Final Accuracy: 57.14%
preprocessed_sent_24_last_7.csv
Final Accuracy: 47.62%
preprocessed_sent_96_last_1.csv
Final Accuracy: 52.38%
preprocessed_sent_96_last_10.csv
Final Accuracy: 42.86%
preprocessed_sent_96_last_3.csv
Final Accuracy: 42.86%
preprocessed_sent_96_last_5.csv
Final Accuracy: 57.14%
preprocessed_sent_96_last_7.csv
Final Accuracy: 47.62%
preprocessed_sent_cross_last_1.csv
Final Accuracy: 42.86%
preprocessed_sent_cross_last_10.csv
Final Accuracy: 42.86%
preprocessed_sent_cross_last_3.csv
Final Accuracy: 47.6

In [97]:
def all_datasets_skip_last_1(files):
    results = [ ]
    smoothings = np.logspace(0,-9, num=100)
    files = [f for f in files if 'last_1' not in f]
    for file in files:
        print(file)
        x, y = get_data_skip_last_1(file)

        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

        all_scores, max_score = grid_search(X_train, y_train, smoothings)
        test_score = test_model(max_score, X_train, y_train, X_test, y_test)
        results.append([file, test_score, max_score])
    return results

results2= all_datasets_skip_last_1(files)

preprocessed_no_sent_last_3.csv
Final Accuracy: 57.14%
preprocessed_no_sent_last_5.csv
Final Accuracy: 42.86%
preprocessed_no_sent_last_7.csv
Final Accuracy: 47.62%
preprocessed_sent_24_last_3.csv
Final Accuracy: 47.62%
preprocessed_sent_24_last_5.csv
Final Accuracy: 52.38%
preprocessed_sent_24_last_7.csv
Final Accuracy: 42.86%
preprocessed_sent_96_last_3.csv
Final Accuracy: 47.62%
preprocessed_sent_96_last_5.csv
Final Accuracy: 52.38%
preprocessed_sent_96_last_7.csv
Final Accuracy: 42.86%
preprocessed_sent_cross_last_3.csv
Final Accuracy: 52.38%
preprocessed_sent_cross_last_5.csv
Final Accuracy: 47.62%
preprocessed_sent_cross_last_7.csv
Final Accuracy: 42.86%


In [85]:
def get_top_score(results):
    top_score = 0
    top_index = 0
    for i, result in enumerate(results):
        # print(f'Test Accuracy: {result[1]}')
        if result[1] >= top_score:
            top_score = result[1]
            top_index = i
            print(i, top_score)

    print('Top Score: ', top_score, '\t Top Data: ', results[top_index][0])
    return results[top_index]

In [98]:
top1 = get_top_score(results)

0 0.42857142857142855
1 0.42857142857142855
2 0.42857142857142855
3 0.5714285714285714
8 0.5714285714285714
13 0.5714285714285714
18 0.5714285714285714
Top Score:  0.5714285714285714 	 Top Data:  preprocessed_sent_cross_last_5.csv


In [99]:
top2 = get_top_score(results2)

0 0.5714285714285714
Top Score:  0.5714285714285714 	 Top Data:  preprocessed_no_sent_last_3.csv


In [100]:
print(results[3][0])
print(results[8][0])
print(results[13][0])
print(results[18][0])

print(results2[0][0])

preprocessed_no_sent_last_5.csv
preprocessed_sent_24_last_5.csv
preprocessed_sent_96_last_5.csv
preprocessed_sent_cross_last_5.csv
preprocessed_no_sent_last_3.csv


### 5 Top models Achieving 57% accuracy on test holdout set
1. 96 Hour Sentiment model + last 1 + last 5
2. 96 Hour Sentiment model + last 3
3. 24 Hour Sentiment model + last 3
