## NCAA Win Predictor Using Kenpom 

This program uses college basketball stats from kenpom to determine winners of selected matchups using machine learning

In [299]:
import kenpompy
import kenpompy.summary as kp
import kenpompy.team as kpt
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn import tree
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')

# For deep learning:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [300]:
# Logging into kenpom
from kenpompy.utils import login

# credentials.txt is read from the working directory: its first line is used as
# the username and its second line as the password (surrounding whitespace is
# stripped), so no credentials are written in the notebook itself.
with open("credentials.txt") as f:
    lines = f.readlines()
    username = lines[0].strip()
    password = lines[1].strip()

# `browser` is the object returned by login(); it is passed to every kenpompy call below.
browser = login(username, password)

## Fetching all of the data being used and saving the raw versions as CSVs

In [301]:
# Stats for 2021 Season
# eff_stats = kp.get_efficiency(browser, season=2021)
# four_factors = kp.get_fourfactors(browser, season=2021)
# team_stats = kp.get_teamstats(browser, season=2021)
# point_dist = kp.get_pointdist(browser, season=2021)
# heights = kp.get_height(browser, season=2021)
# player_stats = kp.get_playerstats(browser, season=2021)

In [302]:
# Download each kenpom table. No `season=` argument is passed here, so kenpompy's
# default season is used (the 2021-specific calls are kept commented out in the
# previous cell).
eff_stats = kp.get_efficiency(browser)
four_factors = kp.get_fourfactors(browser)
team_stats = kp.get_teamstats(browser)
point_dist = kp.get_pointdist(browser)
# `heights` and `player_stats` are written to CSV below but are not merged into `data`.
heights = kp.get_height(browser)
player_stats = kp.get_playerstats(browser)

In [303]:
# Write each raw table to its own CSV under ./Stats_CSVs/ (file names kept exactly
# as they were, including the "hegihts" spelling).
raw_exports = [
    ("./Stats_CSVs/eff_stats.csv", eff_stats),
    ("./Stats_CSVs/four_factors.csv", four_factors),
    ("./Stats_CSVs/team_stats.csv", team_stats),
    ("./Stats_CSVs/point_dist.csv", point_dist),
    ("./Stats_CSVs/hegihts.csv", heights),
    ("./Stats_CSVs/top_player_stats.csv", player_stats),
]
for csv_path, raw_table in raw_exports:
    raw_table.to_csv(csv_path)

## Getting the teams ranked by kenpom (kenpom.com landing page)

In [304]:
# Value returned by kpt.get_valid_teams(); used below to filter the stat tables
# and each team's schedule, and iterated over when predicting records.
valid_teams = kpt.get_valid_teams(browser)

## Cleaning the data

In [305]:
# Function to strip the ranking, raw-value and conference columns from a stats table
def remove_rank_cols(df):
    """Return ``df`` without columns whose name contains 'Rank', 'Raw' or 'Conference'.

    The three patterns are applied one after another as regular-expression
    searches against each column name. The input frame is not modified; a new
    frame holding the remaining columns (in their original order) is returned.
    """
    for unwanted in ('Rank', 'Raw', 'Conference'):
        kept = [name for name in df.columns if re.search(unwanted, str(name)) is None]
        df = df[kept]
    return df

# Clean the four team-level tables in one pass; each name is rebound to its
# cleaned version (heights and player_stats are left as downloaded).
eff_stats, four_factors, team_stats, point_dist = (
    remove_rank_cols(table)
    for table in (eff_stats, four_factors, team_stats, point_dist)
)

In [306]:
# Function to remove all of the teams that are not in valid_teams
def remove_low_rank_teams(df, teams=None):
    """Keep only the rows of ``df`` whose 'Team' is in ``teams``, sorted by team name.

    Despite the name, no ranking is consulted: rows are kept purely by
    membership of the 'Team' column in ``teams``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Team' column.
    teams : iterable of str, optional
        Team names to keep. Defaults to the notebook-level ``valid_teams``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) sorted by 'Team'.
    """
    if teams is None:
        # Looked up at call time so the notebook-level list can be refreshed.
        teams = valid_teams
    kept = df[df['Team'].isin(teams)]
    return kept.sort_values(by="Team")

# Restrict the four team-level tables to the teams in valid_teams and sort each
# by team name; every name is rebound to its filtered version.
eff_stats, four_factors, team_stats, point_dist = (
    remove_low_rank_teams(table)
    for table in (eff_stats, four_factors, team_stats, point_dist)
)

Creating a final dataset of just the top 100 teams with all of their stats

In [307]:
# Join the four cleaned tables on the team name (default inner join), giving one
# wide row per team. Column names that appear in more than one table receive
# pandas' _x / _y suffixes.
data = (
    eff_stats
    .merge(four_factors, on="Team")
    .merge(team_stats, on="Team")
    .merge(point_dist, on="Team")
)

In [308]:
# Save the merged per-team table next to the raw exports.
data.to_csv("./Stats_CSVs/full_stats.csv")

In [309]:
# Alias only (no copy): `full_stats` and `data` refer to the same DataFrame.
# The matchup helpers further down read `full_stats`; clean_schedules reads `data`.
full_stats = data

## Using data to predict the outcome of team_name's games

In [310]:
# Function to clean the schedule of a given team
def clean_schedules(team_name, season=2021):
    """Build one feature row per usable game on ``team_name``'s schedule.

    The schedule is fetched with ``kpt.get_schedule`` and reduced to rows that
      * are against an opponent listed in ``valid_teams``,
      * are against an opponent that has a row in ``data``,
      * have a ``Result`` string shorter than 11 characters, and
      * are not repeated header rows (``Result == 'Result'``).

    Team stats, opponent stats and scores are all derived from that single
    filtered set of games, so the three always have the same length and order.

    Each returned row contains, in this order:
      * every column of ``data`` for ``team_name`` except ``Team``,
      * the same columns for the opponent, prefixed with ``opp_``,
      * ``team_score`` and ``opp_score`` (strings taken from the result text),
      * ``Result`` (1 when the result text starts with 'W', otherwise 0).

    Relies on the notebook-level names ``browser``, ``valid_teams`` and ``data``.

    Parameters
    ----------
    team_name : str
        Team whose schedule is fetched; must appear in ``data['Team']``.
    season : int, default 2021
        Season passed through to ``kpt.get_schedule``.

    Returns
    -------
    pandas.DataFrame
        One row per usable game with a fresh 0..n-1 index. The frame is empty
        (but has all columns) when no game survives the filters.

    Raises
    ------
    ValueError
        If ``data`` has no row for ``team_name``.
    IndexError
        If a result string cannot be split into outcome, team score and
        opponent score.
    """
    own_stats = data[data['Team'] == team_name]
    if own_stats.empty:
        raise ValueError("No stats found in `data` for team {!r}".format(team_name))

    schedule = kpt.get_schedule(browser, team_name, season=season)
    schedule = schedule.rename(columns={"Opponent Name": "Opponent"})

    # Decide once which schedule rows are usable games; everything below is
    # built from this single selection so rows cannot drift out of alignment.
    result_text = schedule['Result']
    usable = (
        schedule['Opponent'].isin(valid_teams)
        & schedule['Opponent'].isin(data['Team'])
        & (result_text.str.len() < 11)
        & (result_text != 'Result')
    )
    games = schedule.loc[usable, ['Opponent', 'Result']].reset_index(drop=True)

    # Strip whitespace, then split on ',' and '-': e.g. "W, 97-63" -> ["W", "97", "63"].
    parsed = [re.split(r"[-,]", re.sub(r"\s", "", text)) for text in games['Result']]
    results = [1 if parts[0] == 'W' else 0 for parts in parsed]
    team_scores = [parts[1] for parts in parsed]
    opp_scores = [parts[2] for parts in parsed]

    # The team's own stats, repeated once per game.
    own_features = (
        own_stats.iloc[[0] * len(games)]
        .drop(columns='Team')
        .reset_index(drop=True)
    )

    # Opponent stats looked up by name. A left merge keeps the games in schedule
    # order, and de-duplicating the lookup guarantees exactly one row per game.
    opp_lookup = data.drop_duplicates(subset='Team').add_prefix('opp_')
    opp_features = (
        games[['Opponent']]
        .merge(opp_lookup, how='left', left_on='Opponent', right_on='opp_Team')
        .drop(columns=['Opponent', 'opp_Team'])
    )

    # Both halves carry a fresh 0..n-1 index, so they line up row for row.
    final_df = pd.concat([own_features, opp_features], axis=1)

    # Turn text columns that hold numbers into numeric dtypes so the stat
    # columns can be fed straight to a model.
    for column in final_df.columns:
        if final_df[column].dtype == object:
            try:
                final_df[column] = pd.to_numeric(final_df[column])
            except (ValueError, TypeError):
                pass  # genuinely non-numeric column: leave it untouched

    final_df['team_score'] = team_scores
    final_df['opp_score'] = opp_scores
    final_df['Result'] = results

    return final_df

## Building and training the network:

<b>This next part takes a while to run: it fetches each team's schedule and builds a dataframe with 10,000+ rows.</b>

In [311]:
# season_df = pd.DataFrame()

# for team in valid_teams:
#     try:
#         team_data = clean_schedules(team)
#         season_df = season_df.append(team_data)
#     except (IndexError) as e:
#         print("Error processing ", team, e)
#         continue

# season_df.to_csv("./Stats_CSVs/full_2021_season_full_stats.csv")

<b>Reading stats from the preloaded CSV</b>

In [312]:
# Load the cached per-game table and drop the unnamed first column.
# NOTE(review): the commented-out builder cell above saves to
# "full_2021_season_full_stats.csv", not to this file name - confirm the
# intended file is the one on disk.
season_df = (
    pd.read_csv("./Stats_CSVs/full_season_full_stats.csv")
    .drop(columns='Unnamed: 0')
)
season_df

Unnamed: 0,Tempo-Adj,Avg. Poss Length-Offense,Avg. Poss Length-Defense,Off. Efficiency-Adj,Def. Efficiency-Adj,AdjTempo,AdjOE_x,Off-eFG%,Off-TO%,Off-OR%,...,opp_AdjOE_y,opp_Off-FT,opp_Off-2P,opp_Off-3P,opp_Def-FT,opp_Def-2P,opp_Def-3P,team_score,opp_score,Result
0,72.5,14.5,17.8,121.8,88.8,72.5,121.8,59.4,15.9,29.0,...,37.3,20.1,51.5,28.4,17.6,52.9,29.5,97,63,1
1,72.5,14.5,17.8,121.8,88.8,72.5,121.8,59.4,15.9,29.0,...,36.0,19.5,52.4,28.1,20.9,51.9,27.2,86,74,1
2,72.5,14.5,17.8,121.8,88.8,72.5,121.8,59.4,15.9,29.0,...,29.5,20.0,55.7,24.3,20.1,50.7,29.2,84,57,1
3,72.5,14.5,17.8,121.8,88.8,72.5,121.8,59.4,15.9,29.0,...,41.9,13.4,51.8,34.9,16.3,45.6,38.0,92,50,1
4,72.5,14.5,17.8,121.8,88.8,72.5,121.8,59.4,15.9,29.0,...,40.1,18.1,46.7,35.2,19.3,47.0,33.7,107,54,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10586,62.3,20.0,17.7,79.5,111.1,62.3,79.5,42.4,26.1,30.0,...,31.8,17.4,57.1,25.5,16.4,52.4,31.2,66,56,1
10587,62.3,20.0,17.7,79.5,111.1,62.3,79.5,42.4,26.1,30.0,...,38.7,18.0,49.9,32.1,17.3,52.1,30.6,74,61,0
10588,62.3,20.0,17.7,79.5,111.1,62.3,79.5,42.4,26.1,30.0,...,34.7,17.4,53.8,28.8,19.1,45.1,35.8,66,54,0
10589,62.3,20.0,17.7,79.5,111.1,62.3,79.5,42.4,26.1,30.0,...,34.4,19.7,55.3,25.0,18.7,53.6,27.8,67,41,0


Creating and training the model:

In [313]:
# Features: every column except the outcome and the two final scores.
X_ta = season_df.drop(['Result', 'team_score', 'opp_score'], axis=1).values
# Target: the 0/1 `Result` column.
y_ta = season_df['Result'].values

# A regression tree is fitted to the 0/1 target; its numeric output is compared
# against 0.5 wherever a winner is picked later in the notebook.
model = tree.DecisionTreeRegressor()

model.fit(X_ta, y_ta)

# In-sample predictions (the same rows the tree was fitted on); not used further in this cell.
predictions = model.predict(X_ta)
predictions = np.round(predictions, 2)

# Getting the model's score
# NOTE(review): this is computed on the training rows themselves, and for a
# scikit-learn regressor score() is R^2 rather than classification accuracy -
# confirm before quoting it as a win-prediction accuracy.
model.score(X_ta, y_ta)

0.7647971128245763

Loop to get each team's predicted record based on the above model:

In [314]:
all_teams_pred = []

# For each selected team (currently only the first entry of valid_teams), run the
# model over every game on its cleaned schedule and tally predicted wins/losses.
for team in valid_teams[:1]:
    team_data = clean_schedules(team)
    X_ta = team_data.drop(columns=['Result', 'team_score', 'opp_score']).values

    # Round the raw outputs to two decimals first, then call anything strictly
    # above 0.5 a win (1.0) and everything else a loss (0.0).
    rounded = np.round(model.predict(X_ta), 2)
    predictions = np.where(rounded > 0.5, 1.0, 0.0)

    wins = sum(predictions)
    losses = len(predictions) - wins

    all_teams_pred.append((team, wins, losses))

In [315]:
all_teams_pred

[('Gonzaga', 27.0, 5.0)]

## Predicting Individual Games:

In [316]:
# Function that puts two teams' stats into one data frame and makes a prediction on the game
def predict_game(team1, team2):
    """Return the name of the team the fitted ``model`` picks to win.

    One feature row is assembled from ``full_stats``: ``team1``'s stat columns
    followed by ``team2``'s stat columns prefixed with ``opp_`` (the ``Team``
    name columns are left out). ``team1`` is returned when the model's output
    for that row is strictly greater than 0.5, otherwise ``team2``.

    Relies on the notebook-level names ``full_stats`` and ``model``.

    Parameters
    ----------
    team1, team2 : str
        Team names exactly as they appear in ``full_stats['Team']``.

    Returns
    -------
    str
        ``team1`` or ``team2``.

    Raises
    ------
    ValueError
        If either name does not match exactly one row of ``full_stats``.
    """
    halves = []
    for team, prefix in ((team1, None), (team2, 'opp_')):
        row = full_stats.loc[full_stats['Team'] == team]
        if len(row) != 1:
            raise ValueError(
                "Expected exactly one row in full_stats for team {!r}, found {}".format(team, len(row))
            )
        # drop()/add_prefix()/reset_index() all return new frames, so the
        # shared full_stats table is never written to.
        half = row.drop(columns='Team').reset_index(drop=True)
        if prefix is not None:
            # Adding opp_ prefix to all of the opponent columns
            half = half.add_prefix(prefix)
        halves.append(half)

    # Both halves have a single row at index 0, so they sit side by side.
    game_df = pd.concat(halves, axis=1)

    # The model is fitted on plain arrays (`.values`), so predict on one too.
    prediction = model.predict(game_df.values)

    return team1 if prediction[0] > 0.5 else team2

## March Madness Predictions:

In [317]:
# Play-in pairings. NOTE: this list is not referenced when round_64 is built;
# the same four pairings are written out inline in its predict_game() calls.
first_four = [
    ['Wyoming', 'Indiana'],
    ['Bryant', 'Wright St.'],
    ['Texas Southern', 'Texas A&M Corpus Chris'],
    ['Rutgers', 'Notre Dame']
]

# First-round matchups as [team, opponent] pairs. Winners of consecutive entries
# are paired against each other in the following rounds, so the order of this
# list matters. The four predict_game(...) entries are evaluated once, when this
# list is built, using whatever `model` is fitted at that moment.
round_64 = [
    ['Gonzaga', 'Georgia St.'],
    ['Boise St.', 'Memphis'], 
    ['Connecticut', 'New Mexico St.'], 
    ['Arkansas', 'Vermont'],
    ['Alabama', predict_game('Rutgers', 'Notre Dame')],
    ['Texas Tech', 'Montana St.'],
    ['Michigan St.', 'Davidson'],
    ['Duke', 'Cal St. Fullerton'],
    ['Baylor', 'Norfolk St.'],
    ['North Carolina', 'Marquette'],
    ["Saint Mary's", predict_game('Wyoming', 'Indiana')],
    ['UCLA', 'Akron'],
    ['Texas', 'Virginia Tech'],
    ['Purdue', 'Yale'],
    ['Murray St.', 'San Francisco'],
    ['Kentucky', "Saint Peter's"],
    ['Arizona', predict_game('Bryant', 'Wright St.')],
    ['Seton Hall', 'TCU'],
    ['Houston', 'UAB'],
    ['Illinois', 'Chattanooga'],
    ['Colorado St.', 'Michigan'],
    ['Tennessee', 'Longwood'],
    ['Ohio St.', 'Loyola Chicago'],
    ['Villanova', 'Delaware'],
    ['Kansas', predict_game('Texas Southern', 'Texas A&M Corpus Chris')],
    ['San Diego St.', 'Creighton'],
    ['Iowa', 'Richmond'],
    ['Providence', 'South Dakota St.'],
    ['LSU', 'Iowa St.'],
    ['Wisconsin', 'Colgate'],
    ['USC', 'Miami FL'],
    ['Auburn', "Jacksonville St."]
]

# round_64_2021 = [
#     ['Gonzaga', 'Norfolk St.'],
#     ['Oklahoma', 'Missouri'], 
#     ['Creighton', 'UC Santa Barbara'], 
#     ['Virginia', 'Ohio'],
#     ['USC', 'Drake'],
#     ['Kansas', 'Eastern Washington'],
#     ['Oregon', 'Oregon'],
#     ['Iowa', 'Grand Canyon'],
#     ['Baylor', 'Hartford'],
#     ['North Carolina', 'Wisconsin'],
#     ["Winthrop", 'Villanova'],
#     ['Purdue', 'North Texas'],
#     ['Utah St.', 'Texas Tech'],
#     ['Arkansas', 'Colgate'],
#     ['Virginia Tech', 'Florida'],
#     ['Ohio St.', "Oral Roberts"],
#     ['Drexel', 'Illinois'],
#     ['Loyola Chicago', 'Georgia Tech'],
#     ['Tennessee', 'Oregon St.'],
#     ['Oklahoma St.', 'Liberty'],
#     ['Syracuse', 'San Diego St.'],
#     ['West Virginia', 'Morehead St.'],
#     ['Rutgers', 'Clemson'],
#     ['Houston', 'Cleveland St.'],
#     ['Alabama', 'Iona'],
#     ['Connecticut', 'Maryland'],
#     ['Abilene Christian', 'Texas'],
#     ['UCLA', 'BYU'],
#     ['Florida St.', 'UNC Greensboro'],
#     ['Georgetown', 'Colorado'],
#     ['St. Bonaventure', 'LSU'],
#     ['Texas Southern', "Michigan"]
# ]

Re-fitting the model in the next step can produce a different set of predictions, because the decision tree is created without a fixed random seed:

In [318]:
# Re-fit the tree on the full season table before simulating the bracket.
X_ta = season_df.drop(columns=['Result', 'team_score', 'opp_score']).values
y_ta = season_df['Result'].values

model.fit(X_ta, y_ta)
predictions = np.round(model.predict(X_ta), 2)
# The score is computed here but its value is not displayed (it is not the cell's last expression).
model.score(X_ta, y_ta)

# Round of 64: one prediction per listed matchup.
preds_64 = [predict_game(game[0], game[1]) for game in round_64]

# Every later round pairs up consecutive winners of the round before it.
preds_32 = [predict_game(preds_64[i], preds_64[i + 1]) for i in range(0, len(preds_64), 2)]
preds_16 = [predict_game(preds_32[i], preds_32[i + 1]) for i in range(0, len(preds_32), 2)]
preds_8 = [predict_game(preds_16[i], preds_16[i + 1]) for i in range(0, len(preds_16), 2)]
preds_4 = [predict_game(preds_8[i], preds_8[i + 1]) for i in range(0, len(preds_8), 2)]

champion = predict_game(preds_4[0], preds_4[1])

champion

'Arizona'

## Creating a deep learning model with Keras:

Current way of doing this: https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/

Better way to do this: https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/

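The network's training accuracy tends to plateau around 74%, compared with a score of about 0.76 for the decision tree (the tree's figure comes from `model.score` on a regressor, so the two numbers are not directly comparable). Even so, the network's results appear to be more "realistic".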

In [319]:
# Same feature/target split as for the decision tree.
X_ta = season_df.drop(['Result', 'team_score', 'opp_score'], axis=1).values
y_ta = season_df['Result'].values

# Function to create the baseline model:
def create_baseline(input_dim=60):
    """Build and compile the baseline binary classifier.

    Architecture: Dense(60, relu) -> Dense(20, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy, the Adam optimizer and an accuracy metric.

    Parameters
    ----------
    input_dim : int, default 60
        Number of feature columns the first layer expects. The default keeps
        the previous hard-coded value, so ``create_baseline()`` is unchanged.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    dl_model = Sequential()
    # Adding layers:
    dl_model.add(Dense(60, input_dim=input_dim, activation='relu'))
    dl_model.add(Dense(20, activation='relu'))
#     dl_model.add(Dense(20, activation='relu'))
    dl_model.add(Dense(1, activation='sigmoid'))
    # Compiling:
    dl_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return dl_model

# Size the input layer from the data so it cannot drift out of sync with the
# number of feature columns in season_df.
deep_model = create_baseline(input_dim=X_ta.shape[1])
deep_model.fit(X_ta, y_ta, epochs=30, batch_size = 50)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x118893c71f0>

In [320]:
deep_model.summary()

Model: "sequential_32"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_109 (Dense)           (None, 60)                3660      
                                                                 
 dense_110 (Dense)           (None, 20)                1220      
                                                                 
 dense_111 (Dense)           (None, 1)                 21        
                                                                 
Total params: 4,901
Trainable params: 4,901
Non-trainable params: 0
_________________________________________________________________


In [321]:
# Function that puts two teams' stats into one single-row data frame (no prediction is made here)
def get_matchup_data(team1, team2):
    """Return a one-row feature frame describing ``team1`` playing ``team2``.

    The row holds ``team1``'s stat columns from ``full_stats`` followed by
    ``team2``'s stat columns prefixed with ``opp_``; the ``Team`` name columns
    are left out. Column dtypes are whatever ``full_stats`` holds.

    Relies on the notebook-level name ``full_stats``.

    Parameters
    ----------
    team1, team2 : str
        Team names exactly as they appear in ``full_stats['Team']``.

    Returns
    -------
    pandas.DataFrame
        A single row with index 0.

    Raises
    ------
    ValueError
        If either name does not match exactly one row of ``full_stats``.
    """
    halves = []
    for team, prefix in ((team1, None), (team2, 'opp_')):
        row = full_stats.loc[full_stats['Team'] == team]
        if len(row) != 1:
            raise ValueError(
                "Expected exactly one row in full_stats for team {!r}, found {}".format(team, len(row))
            )
        # drop()/add_prefix()/reset_index() all return new frames, so the
        # shared full_stats table is never written to.
        half = row.drop(columns='Team').reset_index(drop=True)
        if prefix is not None:
            # Adding opp_ prefix to all of the opponent columns
            half = half.add_prefix(prefix)
        halves.append(half)

    # Both halves have a single row at index 0, so they sit side by side.
    game_df = pd.concat(halves, axis=1)

    return game_df

In [338]:
def dl_predict_game(team1, team2):
    """Pick the winner of ``team1`` vs ``team2`` with the Keras model.

    The matchup row from ``get_matchup_data`` is cast to float and passed to
    ``deep_model.predict``; the first output value is printed and then compared
    with 0.5. ``team1`` is returned when it is strictly greater, else ``team2``.
    """
    features = get_matchup_data(team1, team2).astype(float)
    prob = deep_model.predict(features)[0][0]
    print(prob)
    return team1 if prob > 0.5 else team2
        

In [337]:
dl_predict_game('Arizona', 'Gonzaga')

0.20696631


'Gonzaga'

In [333]:
# Simulate the bracket with the Keras model. Each dl_predict_game call also
# prints the model's output for that matchup.

# Round of 64: one prediction per listed matchup.
preds_64 = [dl_predict_game(game[0], game[1]) for game in round_64]

# Every later round pairs up consecutive winners of the round before it.
preds_32 = [dl_predict_game(preds_64[i], preds_64[i + 1]) for i in range(0, len(preds_64), 2)]
preds_16 = [dl_predict_game(preds_32[i], preds_32[i + 1]) for i in range(0, len(preds_32), 2)]
preds_8 = [dl_predict_game(preds_16[i], preds_16[i + 1]) for i in range(0, len(preds_16), 2)]
preds_4 = [dl_predict_game(preds_8[i], preds_8[i + 1]) for i in range(0, len(preds_8), 2)]

champion = dl_predict_game(preds_4[0], preds_4[1])

0.9647269
0.54405147
0.6664855
0.5401206
0.43647772
0.8769855
0.40961155
0.92154896
0.9450418
0.4877281
0.55255204
0.8614359
0.4929427
0.8812754
0.36406246
0.8939565
0.9471556
0.45017922
0.7052376
0.6367272
0.3819822
0.8887901
0.3346734
0.90320575
0.93036103
0.5220697
0.71926
0.44399852
0.59110284
0.70369536
0.45733023
0.9038265
0.8226677
0.39108354
0.15639734
0.13221863
0.71862924
0.26891875
0.2201775
0.19858763
0.78992283
0.49301577
0.18012393
0.25858036
0.61826885
0.6709782
0.439933
0.11407995
0.77432925
0.38716888
0.41825482
0.29172775
0.59901035
0.44319463
0.41142285
0.22015774
0.5695279
0.33432388
0.54960996
0.3511964
0.56840473
0.5196368
0.44378442


['Gonzaga', 'Arizona']