## NCAA Win Predictor Using Kenpom 

This program uses college basketball stats from kenpom to determine winners of selected matchups using machine learning

In [5]:
import kenpompy
import kenpompy.summary as kp
import kenpompy.team as kpt
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn import tree
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')

# For deep learning:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [6]:
# Logging into kenpom
from kenpompy.utils import login
browser = login("jayminwest@gmail.com", "G0mfnZags")

## Fetching all of the data being used. Putting the raw versions into csv's

In [7]:
eff_stats = kp.get_efficiency(browser)
four_factors = kp.get_fourfactors(browser)
team_stats = kp.get_teamstats(browser)
point_dist = kp.get_pointdist(browser)
heights = kp.get_height(browser)
player_stats = kp.get_playerstats(browser)

In [8]:
eff_stats.to_csv("eff_stats.csv")
four_factors.to_csv("four_factors.csv")
team_stats.to_csv("team_stats.csv")
point_dist.to_csv("point_dist.csv")
heights.to_csv("hegihts.csv")
player_stats.to_csv("top_player_stats.csv")

## Getting the teams ranked by kenpom (kenpom.com landing page)

In [9]:
# Getting a list of all the teams and then only grabbing the first 100 for the sake of speed
valid_teams = kpt.get_valid_teams(browser)
# valid_teams = valid_teams[0:100]

## Cleaning the data

In [10]:
# Function to remove all of the columns that have "rank" cleans up the data
def remove_rank_cols(df):
    df = df[df.columns.drop(list(df.filter(regex='Rank')))]
    df = df[df.columns.drop(list(df.filter(regex='Raw')))]
    df = df[df.columns.drop(list(df.filter(regex='Conference')))]
    return df

eff_stats = remove_rank_cols(eff_stats)
four_factors = remove_rank_cols(four_factors)
team_stats = remove_rank_cols(team_stats)
point_dist = remove_rank_cols(point_dist)

In [9]:
# Function to remove all of the teams that are not in valid_teams
def remove_low_rank_teams(df):
    df = df[df['Team'].isin(valid_teams)]
    df = df.sort_values(by="Team")
    return df

eff_stats = remove_low_rank_teams(eff_stats)
four_factors = remove_low_rank_teams(four_factors)
team_stats = remove_low_rank_teams(team_stats)
point_dist = remove_low_rank_teams(point_dist)

Creating a final dataset of just the top 100 teams with all of their stats

In [11]:
data = pd.merge(eff_stats, four_factors, on="Team")
data = pd.merge(data, team_stats, on="Team")
data = pd.merge(data, point_dist, on="Team")

In [12]:
data.to_csv("full_stats.csv")

In [13]:
full_stats = data

## Using data to predict the outcome of team_name's games

In [14]:
# Function to clean the schedule of a given team
def clean_schedules(team_name):
    # Uses 'data' data frame to compile a new dataframe that has all of the entered team name's stats with their opponents
    schedule = kpt.get_schedule(browser, team_name)
    
    schedule.rename(columns={"Opponent Name":"Opponent"},inplace=True)
    
    opponents = schedule['Opponent']
    results = schedule['Result']
    
    schedule = schedule.drop(columns='Date')
#     schedule = schedule[(schedule.Opponent.str).isin(valid_teams)]
    schedule = schedule[schedule['Opponent'].isin(valid_teams)]
    schedule = schedule[schedule.Result.str.len() < 11]
    schedule = schedule[schedule.Result != 'Result']
    
    schedule = schedule.reset_index()
    
    full_results = schedule['Result']

    if len(full_results[0]) > 3:
        results = []
        team_scores = []
        opp_scores = []
        for i in range(len(full_results)):
            full_results[i] = re.sub("\s", "", full_results[i])
            full_results[i] = re.split(r"[-,]", full_results[i])
            results.append(full_results[i][0])
            team_scores.append(full_results[i][1])
            opp_scores.append(full_results[i][2])
       
    main_team_stats = data[data['Team'] == team_name]
    final_df = pd.DataFrame()
    
    # Filling first part of df with Gonzaga Stats
    for item in opponents:
        final_df = final_df.append(main_team_stats)
           
    # Creating opponent dataframe and filling it 
    opp_df = pd.DataFrame()
    for opp in opponents:
        # This if statement skips over lines in the data that do not show game data
        if (opp in valid_teams):
            opp_df = opp_df.append(data[data['Team'] == opp])
    
    #Writing and reading from a csv to reset the indecies
    final_df.to_csv('clean_schedules_1.csv')
    final_df = pd.read_csv('clean_schedules_1.csv')
    opp_df.to_csv('clean_schedules_2.csv')
    opp_df = pd.read_csv('clean_schedules_2.csv')
    
    # Adding opp_ prefix to all of the opponent columns
    opp_df = opp_df.add_prefix('opp_')
    
    # Combining the two dataframes
    final_df = pd.concat([final_df, opp_df], axis=1, join='inner')
    
    # Removing the extra indexing
    final_df.drop('opp_Unnamed: 0', inplace=True, axis=1)
    final_df.drop('Unnamed: 0', inplace=True, axis=1)
    
    # Literally just removing the opponent name:
    final_df.drop('opp_Team', inplace=True, axis=1)
    final_df.drop('Team', inplace=True, axis=1)
    
    final_df['team_score'] = team_scores
    final_df['opp_score'] = opp_scores
    
    for i in range(len(results)):
        if results[i] == 'W':
            results[i] = 1
        else:
            results[i] = 0
    
    final_df['Result'] = results
    
    return final_df

## Building and training the network:

Adding the first all teams' data to one dataframe for predictions:

# This next part takes a while to run. Gets each team's schedule. 10000+ rows in a dataframe

In [15]:
# season_df = pd.DataFrame()

# for team in valid_teams:
#     try:
#         team_data = clean_schedules(team)
#         season_df = season_df.append(team_data)
#     except (IndexError) as e:
#         print("Error processing ", team, e)
#         continue

In [16]:
season_df = pd.read_csv("full_season_full_stats.csv")
season_df = season_df.drop(columns='Unnamed: 0')
# season_df.to_csv("full_season_full_stats.csv")
season_df

Unnamed: 0,Tempo-Adj,Avg. Poss Length-Offense,Avg. Poss Length-Defense,Off. Efficiency-Adj,Def. Efficiency-Adj,AdjTempo,AdjOE_x,Off-eFG%,Off-TO%,Off-OR%,...,opp_AdjOE_y,opp_Off-FT,opp_Off-2P,opp_Off-3P,opp_Def-FT,opp_Def-2P,opp_Def-3P,team_score,opp_score,Result
0,72.5,14.5,17.8,121.8,88.8,72.5,121.8,59.4,15.9,29.0,...,37.3,20.1,51.5,28.4,17.6,52.9,29.5,97,63,1
1,72.5,14.5,17.8,121.8,88.8,72.5,121.8,59.4,15.9,29.0,...,36.0,19.5,52.4,28.1,20.9,51.9,27.2,86,74,1
2,72.5,14.5,17.8,121.8,88.8,72.5,121.8,59.4,15.9,29.0,...,29.5,20.0,55.7,24.3,20.1,50.7,29.2,84,57,1
3,72.5,14.5,17.8,121.8,88.8,72.5,121.8,59.4,15.9,29.0,...,41.9,13.4,51.8,34.9,16.3,45.6,38.0,92,50,1
4,72.5,14.5,17.8,121.8,88.8,72.5,121.8,59.4,15.9,29.0,...,40.1,18.1,46.7,35.2,19.3,47.0,33.7,107,54,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10586,62.3,20.0,17.7,79.5,111.1,62.3,79.5,42.4,26.1,30.0,...,31.8,17.4,57.1,25.5,16.4,52.4,31.2,66,56,1
10587,62.3,20.0,17.7,79.5,111.1,62.3,79.5,42.4,26.1,30.0,...,38.7,18.0,49.9,32.1,17.3,52.1,30.6,74,61,0
10588,62.3,20.0,17.7,79.5,111.1,62.3,79.5,42.4,26.1,30.0,...,34.7,17.4,53.8,28.8,19.1,45.1,35.8,66,54,0
10589,62.3,20.0,17.7,79.5,111.1,62.3,79.5,42.4,26.1,30.0,...,34.4,19.7,55.3,25.0,18.7,53.6,27.8,67,41,0


Creating and training the model:

In [18]:
X_ta = season_df.drop(['Result', 'team_score', 'opp_score'], axis=1).values

# y_ta = season_df[['team_score','opp_score']].values
y_ta = season_df['Result'].values

model = tree.DecisionTreeRegressor()
# model = RandomForestClassifier(n_estimators=20)

model.fit(X_ta, y_ta)
predictions = model.predict(X_ta)
predictions = np.round(predictions, 2)

# Getting the model's score
model.score(X_ta, y_ta)
# accuracy_score(predictions, y_ta)

0.7647971128245763

Loop to get each team's predicted record based on the above model:

In [19]:
all_teams_pred = []


# Loop to get each team's actual wins/losses and compares them to the model's guesses
for team in valid_teams[0:1]:
    team_data = clean_schedules(team)
    X_ta = team_data.drop(['Result', 'team_score', 'opp_score'], axis=1).values

    predictions = model.predict(X_ta)
    predictions = np.round(predictions, 2)

    for i in range(len(predictions)):
        if predictions[i] > 0.5:
            predictions[i] = 1
        else:
            predictions[i] = 0
    wins = sum(predictions)
    losses = len(predictions) - wins
    
    team_tuple = (team, wins, losses)
    all_teams_pred.append(team_tuple)

In [20]:
all_teams_pred

[('Gonzaga', 28.0, 2.0)]

## Predicting Individual Games:

In [21]:
# Function that puts two teams' stats into one data frame and makes a prediction on the game
def predict_game(team1, team2):
    team1_df = full_stats.loc[full_stats['Team'] == team1]
    team2_df = full_stats.loc[full_stats['Team'] == team2]

    # Adding opp_ prefix to all of the opponent columns
    team2_df = team2_df.add_prefix('opp_')

    # Adding join col to make the merge easy
    team1_df['join'] = [1]
    team2_df['join'] = [1]

    # Merging the two dataframes on the 'join' col
    game_df = pd.merge(team1_df, team2_df, on=['join'])

    # Dropping the unneccessary columns
    game_df.drop('Team', inplace=True, axis=1)
    game_df.drop('opp_Team', inplace=True, axis=1)
    game_df.drop('join', inplace=True, axis=1)
    prediction = model.predict(game_df)

    if(prediction[0] > 0.5):
        return team1
    else:
        return team2

## March Maddness Predictions:

In [22]:
first_four = [
    ['Wyoming', 'Indiana'],
    ['Bryant', 'Wright St.'],
    ['Texas Southern', 'Texas A&M Corpus Chris'],
    ['Rutgers', 'Notre Dame']
]

round_64 = [
    ['Gonzaga', 'Georgia St.'],
    ['Boise St.', 'Memphis'], 
    ['Connecticut', 'New Mexico St.'], 
    ['Arkansas', 'Vermont'],
    ['Alabama', predict_game('Rutgers', 'Notre Dame')],
    ['Texas Tech', 'Montana St.'],
    ['Michigan St.', 'Davidson'],
    ['Duke', 'Cal St. Fullerton'],
    ['Baylor', 'Norfolk St.'],
    ['North Carolina', 'Marquette'],
    ["Saint Mary's", predict_game('Wyoming', 'Indiana')],
    ['UCLA', 'Akron'],
    ['Texas', 'Virginia Tech'],
    ['Purdue', 'Yale'],
    ['Murray St.', 'San Francisco'],
    ['Kentucky', "Saint Peter's"],
    ['Arizona', predict_game('Bryant', 'Wright St.')],
    ['Seton Hall', 'TCU'],
    ['Houston', 'UAB'],
    ['Illinois', 'Chattanooga'],
    ['Colorado St.', 'Michigan'],
    ['Tennessee', 'Longwood'],
    ['Ohio St.', 'Loyola Chicago'],
    ['Villanova', 'Delaware'],
    ['Kansas', predict_game('Texas Southern', 'Texas A&M Corpus Chris')],
    ['San Diego St.', 'Creighton'],
    ['Iowa', 'Richmond'],
    ['Providence', 'South Dakota St.'],
    ['LSU', 'Iowa St.'],
    ['Wisconsin', 'Colgate'],
    ['USC', 'Miami FL'],
    ['Auburn', "Jacksonville St."]
]

In [23]:
X_ta = season_df.drop(['Result', 'team_score', 'opp_score'], axis=1).values

# y_ta = season_df[['team_score','opp_score']].values
y_ta = season_df['Result'].values

model = tree.DecisionTreeRegressor()
# model = RandomForestClassifier(n_estimators=20)

model.fit(X_ta, y_ta)
predictions = model.predict(X_ta)
predictions = np.round(predictions, 2)

# Getting the model's score
model.score(X_ta, y_ta)
# accuracy_score(predictions, y_ta)

preds_64 = []
preds_32 = []
preds_16 = []
preds_8 = []
preds_4 = []

for game in round_64:
    preds_64.append(predict_game(game[0], game[1]))

for i in range(0, len(preds_64), 2):
    preds_32.append(predict_game(preds_64[i],preds_64[i+1]))

for i in range(0, len(preds_32), 2):
    preds_16.append(predict_game(preds_32[i],preds_32[i+1]))

for i in range(0, len(preds_16), 2):
    preds_8.append(predict_game(preds_16[i],preds_16[i+1]))

for i in range(0, len(preds_8), 2):
    preds_4.append(predict_game(preds_8[i],preds_8[i+1]))

champion = predict_game(preds_4[0], preds_4[1])

champion

'Arizona'

In [24]:
preds_4

["Saint Mary's", 'Arizona']

In [25]:
predict_game("UCLA", "Saint Mary's")

'UCLA'

## Running Model Many Times for Best Prediction:

In [26]:
X_ta = season_df.drop(['Result', 'team_score', 'opp_score'], axis=1).values
y_ta = season_df['Result'].values

def get_many_outcomes(num_runs):
    winners = []
    for i in range(num_runs):
        X_ta = season_df.drop(['Result', 'team_score', 'opp_score'], axis=1).values

        # y_ta = season_df[['team_score','opp_score']].values
        y_ta = season_df['Result'].values

        model = tree.DecisionTreeRegressor()
        # model = RandomForestClassifier(n_estimators=20)

        model.fit(X_ta, y_ta)
        predictions = model.predict(X_ta)
        predictions = np.round(predictions, 2)

        # Getting the model's score
        model.score(X_ta, y_ta)
        # accuracy_score(predictions, y_ta)

        preds_64 = []
        preds_32 = []
        preds_16 = []
        preds_8 = []
        preds_4 = []

        for game in round_64:
            preds_64.append(predict_game(game[0], game[1]))

        for i in range(0, len(preds_64), 2):
            preds_32.append(predict_game(preds_64[i],preds_64[i+1]))

        for i in range(0, len(preds_32), 2):
            preds_16.append(predict_game(preds_32[i],preds_32[i+1]))

        for i in range(0, len(preds_16), 2):
            preds_8.append(predict_game(preds_16[i],preds_16[i+1]))

        for i in range(0, len(preds_8), 2):
            preds_4.append(predict_game(preds_8[i],preds_8[i+1]))

        champion = predict_game(preds_4[0], preds_4[1])

        winners.append(champion)
        
    return winners

In [27]:
winners = get_many_outcomes(10)
winners

['Arizona',
 'Arizona',
 'Arizona',
 'Arizona',
 'Arizona',
 'Arizona',
 'Arizona',
 'Arizona',
 'Arizona',
 'Arizona']

## Creating a deep learning model with Keras:

In [28]:
X_ta = season_df.drop(['Result', 'team_score', 'opp_score'], axis=1).values
y_ta = season_df['Result'].values

encoder = LabelEncoder()
encoder.fit(y_ta)
encoded_Y = encoder.transform(y_ta)

# Function to create the baseline model:
def create_baseline():
    dl_model = Sequential()
    dl_model.add(Dense(60, input_dim=60, activation='relu'))
    dl_model.add(Dense(20, activation='relu'))
    dl_model.add(Dense(1, activation='sigmoid'))
    # Compiling:
    dl_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return dl_model

deep_model = create_baseline()
deep_model.fit(X_ta, y_ta, epochs=5, batch_size = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x118fba4ae80>

In [29]:
deep_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 60)                3660      
                                                                 
 dense_1 (Dense)             (None, 20)                1220      
                                                                 
 dense_2 (Dense)             (None, 1)                 21        
                                                                 
Total params: 4,901
Trainable params: 4,901
Non-trainable params: 0
_________________________________________________________________


In [51]:
# Function that puts two teams' stats into one data frame and makes a prediction on the game
def dl_predict_game(team1, team2):
    team1_df = full_stats.loc[full_stats['Team'] == team1]
    team2_df = full_stats.loc[full_stats['Team'] == team2]

    # Adding opp_ prefix to all of the opponent columns
    team2_df = team2_df.add_prefix('opp_')

    # Adding join col to make the merge easy
    team1_df['join'] = [1]
    team2_df['join'] = [1]

    # Merging the two dataframes on the 'join' col
    game_df = pd.merge(team1_df, team2_df, on=['join'])

    # Dropping the unneccessary columns
    game_df.drop('Team', inplace=True, axis=1)
    game_df.drop('opp_Team', inplace=True, axis=1)
    game_df.drop('join', inplace=True, axis=1)
    
    return game_df

In [56]:
deep_model.predict((dl_predict_game('Gonzaga', 'Bellarmine')).astype(float))[0][0]

0.93343663

In [50]:
dl_predict_game('Gonzaga', 'Baylor')

UnimplementedError: Graph execution error:

Detected at node 'sequential/Cast' defined at (most recent call last):
    File "C:\Users\jaymi\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\jaymi\anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\jaymi\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\Users\jaymi\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "C:\Users\jaymi\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\jaymi\anaconda3\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "C:\Users\jaymi\anaconda3\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "C:\Users\jaymi\anaconda3\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "C:\Users\jaymi\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
      await result
    File "C:\Users\jaymi\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "C:\Users\jaymi\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_cell
      result = self._run_cell(
    File "C:\Users\jaymi\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2947, in _run_cell
      return runner(coro)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3172, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\jaymi\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3364, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\Users\jaymi\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\jaymi\AppData\Local\Temp/ipykernel_15608/491815658.py", line 1, in <module>
      dl_predict_game('Gonzaga', 'Baylor')
    File "C:\Users\jaymi\AppData\Local\Temp/ipykernel_15608/2526544572.py", line 20, in dl_predict_game
      prediction = (deep_model.predict(game_df)).astype(float)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\engine\training.py", line 1982, in predict
      tmp_batch_outputs = self.predict_function(iterator)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\engine\training.py", line 1801, in predict_function
      return step_function(self, iterator)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\engine\training.py", line 1790, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\engine\training.py", line 1783, in run_step
      outputs = model.predict_step(data)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
      return self(x, training=False)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\engine\base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\engine\sequential.py", line 374, in call
      return super(Sequential, self).call(inputs, training=training, mask=mask)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\engine\functional.py", line 451, in call
      return self._run_internal_graph(
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\engine\functional.py", line 571, in _run_internal_graph
      y = self._conform_to_reference_input(y, ref_input=x)
    File "C:\Users\jaymi\anaconda3\lib\site-packages\keras\engine\functional.py", line 671, in _conform_to_reference_input
      tensor = tf.cast(tensor, dtype=ref_input.dtype)
Node: 'sequential/Cast'
Cast string to float is not supported
	 [[{{node sequential/Cast}}]] [Op:__inference_predict_function_43263]