# Build a neural network to predict tournament game outcomes

In [1]:
#Import dependencies
import numpy as np
import pandas as pd
import tensorflow

## Prepare data for NN

In [2]:
# Tournament inputs: the seed assigned to each team and the result of each
# game, both keyed by season
tourney_seed = pd.read_csv('WDataFiles_Stage1/WNCAATourneySeeds.csv')
tourney_result = pd.read_csv('WDataFiles_Stage1/WNCAATourneyCompactResults.csv')

# Derived regular-season statistics
reg_season = pd.read_csv("derived_stats.csv")

In [3]:
# Keep only the season and the winning / losing team ids of every game
result_columns = ['Season', 'WTeamID', 'LTeamID']
tourney_result = tourney_result.loc[:, result_columns]
tourney_result

Unnamed: 0,Season,WTeamID,LTeamID
0,1998,3104,3422
1,1998,3112,3365
2,1998,3163,3193
3,1998,3198,3266
4,1998,3203,3208
...,...,...,...
1381,2019,3124,3234
1382,2019,3323,3390
1383,2019,3124,3332
1384,2019,3323,3163


In [4]:
#Preview the seed table (columns: Season, Seed, TeamID)
tourney_seed

Unnamed: 0,Season,Seed,TeamID
0,1998,W01,3330
1,1998,W02,3163
2,1998,W03,3112
3,1998,W04,3301
4,1998,W05,3272
...,...,...,...
1403,2019,Z12,3416
1404,2019,Z13,3195
1405,2019,Z14,3200
1406,2019,Z15,3340


In [5]:
# Attach the winner's seed: left-join the seed table on (Season, WTeamID),
# label the joined seed as WSeed and discard the redundant TeamID join key
result_seedW = (
    tourney_result
    .merge(tourney_seed, how='left',
           left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
    .rename(columns={'Seed': 'WSeed'})
    .drop(columns='TeamID')
)

In [6]:
# Attach the loser's seed the same way: left-join on (Season, LTeamID),
# label the joined seed as LSeed and discard the redundant TeamID join key
result_seed = (
    result_seedW
    .merge(tourney_seed, how='left',
           left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
    .rename(columns={'Seed': 'LSeed'})
    .drop(columns='TeamID')
)

In [7]:
#Create function that removes the letter before the seed
def remove_region(region_seed):
    """Return the numeric part of a region-prefixed seed string as an int.

    The leading character (the region letter) is skipped and the following
    two characters are parsed as the seed number, e.g. 'W01' -> 1 and
    'Z15' -> 15. Anything after those two characters is ignored.
    """
    seed_digits = region_seed[1:3]
    return int(seed_digits)

In [8]:
# Replace the region-prefixed seed strings with their integer seed numbers
for seed_col in ('WSeed', 'LSeed'):
    result_seed[seed_col] = result_seed[seed_col].map(remove_region)

In [9]:
# Keep seasons 2010 through 2019 so the games line up with the seasons
# available in the derived stats dataframe, then renumber the rows from 0
derived_stat_seasons = list(range(2010, 2020))
in_derived_seasons = result_seed['Season'].isin(derived_stat_seasons)
result_seed = result_seed[in_derived_seasons].reset_index(drop=True)
result_seed

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed
0,2010,3124,3201,4,13
1,2010,3173,3395,8,9
2,2010,3181,3214,2,15
3,2010,3199,3256,3,14
4,2010,3207,3265,5,12
...,...,...,...,...,...
625,2019,3124,3234,1,2
626,2019,3323,3390,1,2
627,2019,3124,3332,1,2
628,2019,3323,3163,1,2


In [10]:
# Build two views of every game with the teams relabelled as 1 and 2:
# winner listed first (Result = 1) and loser listed first (Result = 0)
winner_first = {'WTeamID': 'TeamID_1', 'LTeamID': 'TeamID_2', 'WSeed': 'Seed_1', 'LSeed': 'Seed_2'}
loser_first = {'WTeamID': 'TeamID_2', 'LTeamID': 'TeamID_1', 'WSeed': 'Seed_2', 'LSeed': 'Seed_1'}
rs_win = result_seed.rename(columns=winner_first).assign(Result=1)
rs_lose = result_seed.rename(columns=loser_first).assign(Result=0)

In [11]:
# Stack the winner-first and loser-first frames and renumber the rows from 0
rs_both = pd.concat([rs_win, rs_lose], sort=False, ignore_index=True)
rs_both

Unnamed: 0,Season,TeamID_1,TeamID_2,Seed_1,Seed_2,Result
0,2010,3124,3201,4,13,1
1,2010,3173,3395,8,9,1
2,2010,3181,3214,2,15,1
3,2010,3199,3256,3,14,1
4,2010,3207,3265,5,12,1
...,...,...,...,...,...,...
1255,2019,3234,3124,2,1,0
1256,2019,3390,3323,2,1,0
1257,2019,3332,3124,2,1,0
1258,2019,3163,3323,2,1,0


In [12]:
# Show every column name available in the derived stats dataframe
reg_season.columns.tolist()

['GameID',
 'Season',
 'DayNum',
 'Loc',
 'NumOT',
 'TeamID',
 'Score',
 'FGM',
 'FGA',
 'FGM3',
 'FGA3',
 'FTM',
 'FTA',
 'OR',
 'DR',
 'Ast',
 'TO',
 'Stl',
 'Blk',
 'PF',
 'OTeamID',
 'OScore',
 'OFGM',
 'OFGA',
 'OFGM3',
 'OFGA3',
 'OFTM',
 'OFTA',
 'OOR',
 'ODR',
 'OAst',
 'OTO',
 'OStl',
 'OBlk',
 'OPF',
 'Result',
 'Poss',
 'OPoss',
 'Pace',
 'OFF',
 'DEF',
 'AvgOFF',
 'AvgPace',
 'AvgDEF',
 'Wins',
 'Games',
 'WinP',
 'OWinP',
 'OppOWinP',
 'RPI']

In [13]:
# Keep the average pace / offensive / defensive ratings together with the
# team's winning percentage, opponents' winning percentage, opponents'
# opponents' winning percentage and RPI, then average them per (Season, TeamID)
# NOTE(review): in the displayed output OWinP and OppOWinP are identical in
# every row shown - worth confirming how derived_stats.csv computes OppOWinP
stat_columns = ['Season', 'TeamID', 'AvgPace', 'AvgOFF', 'AvgDEF', 'WinP', 'OWinP', 'OppOWinP', 'RPI']
reg_season = reg_season[stat_columns]
reg_group = reg_season.groupby(['Season', 'TeamID'], as_index=False).mean()
reg_group

Unnamed: 0,Season,TeamID,AvgPace,AvgOFF,AvgDEF,WinP,OWinP,OppOWinP,RPI
0,2010,3102,64.854018,79.972377,92.024747,0.035714,0.554309,0.554309,0.424660
1,2010,3103,68.811250,90.605658,90.253684,0.566667,0.478130,0.478130,0.500264
2,2010,3104,74.860345,85.556421,91.057582,0.379310,0.529278,0.529278,0.491786
3,2010,3105,74.012963,82.125294,83.274580,0.518519,0.408657,0.408657,0.436122
4,2010,3106,69.392672,79.593698,81.661585,0.413793,0.436372,0.436372,0.430727
...,...,...,...,...,...,...,...,...,...
3464,2019,3462,69.659483,86.952443,94.711457,0.344828,0.545711,0.545711,0.495490
3465,2019,3463,70.988362,88.854816,91.206258,0.551724,0.492282,0.492282,0.507143
3466,2019,3464,67.784914,100.523964,88.400773,0.689655,0.458233,0.458233,0.516089
3467,2019,3465,78.640500,97.313376,88.670115,0.560000,0.420086,0.420086,0.455065


In [14]:
# Join the per-season team averages twice: first for team 1, then for team 2.
# The second join suffixes the overlapping stat columns with _1 / _2, after
# which the identifier columns are dropped so only model inputs and Result remain
rts_1 = (
    rs_both
    .merge(reg_group, how='left',
           left_on=['Season', 'TeamID_1'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
)
rts = (
    rts_1
    .merge(reg_group, how='left',
           left_on=['Season', 'TeamID_2'], right_on=['Season', 'TeamID'],
           suffixes=('_1', '_2'))
    .drop(columns=['TeamID_1', 'TeamID_2', 'TeamID', 'Season'])
)
rts

Unnamed: 0,Seed_1,Seed_2,Result,AvgPace_1,AvgOFF_1,AvgDEF_1,WinP_1,OWinP_1,OppOWinP_1,RPI_1,AvgPace_2,AvgOFF_2,AvgDEF_2,WinP_2,OWinP_2,OppOWinP_2,RPI_2
0,4,13,1,72.175781,99.658607,95.616172,0.718750,0.594413,0.594413,0.625498,72.037879,101.686032,90.513622,0.818182,0.507732,0.507732,0.585344
1,8,9,1,72.897115,97.271769,90.130587,0.807692,0.536687,0.536687,0.604438,73.391250,98.346002,91.242103,0.733333,0.523685,0.523685,0.576097
2,2,15,1,73.216797,97.418218,92.736424,0.843750,0.612990,0.612990,0.670680,69.771667,90.506963,85.620101,0.633333,0.465572,0.465572,0.507512
3,3,14,1,73.925833,100.833748,91.287937,0.833333,0.545861,0.545861,0.617729,76.128226,97.814863,89.049872,0.741935,0.478864,0.478864,0.544632
4,5,12,1,70.179583,97.228980,90.336751,0.800000,0.546405,0.546405,0.609804,67.431061,100.934661,88.719918,0.787879,0.505540,0.505540,0.576125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1255,2,1,0,70.774609,113.157432,98.300952,0.812500,0.608047,0.608047,0.659161,71.764453,111.740902,97.748523,0.968750,0.606965,0.606965,0.697411
1256,2,1,0,70.356250,106.042042,101.249238,0.875000,0.603816,0.603816,0.671612,75.517803,117.857692,100.128861,0.909091,0.655450,0.655450,0.718860
1257,2,1,0,69.493750,122.527953,99.927352,0.875000,0.595451,0.595451,0.665338,71.764453,111.740902,97.748523,0.968750,0.606965,0.606965,0.697411
1258,2,1,0,70.557955,116.680039,93.450763,0.939394,0.558694,0.558694,0.653869,75.517803,117.857692,100.128861,0.909091,0.655450,0.655450,0.718860


## Data pre-processing

In [15]:
# The Result column is the target; every remaining column is a feature
y = rts['Result']
X = rts.drop(columns='Result')
print(X.shape, y.shape)

(1260, 16) (1260,)


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [17]:
#Split into training and test data in 80/20 split, stratified on the result.
#A fixed random_state keeps the split (and therefore the reported test
#metrics) the same after every kernel restart.
#NOTE(review): rts contains each game twice (winner-first and loser-first
#rows), so a row-wise split can put a game's mirrored row in the other
#partition and inflate the test score - consider splitting by game instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=.2, random_state=42)

In [18]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both the training and the test features
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Create model

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [20]:
#Create model with one hidden layer of 100 relu nodes and a single sigmoid output
#Seed TensorFlow so the randomly initialised weights are repeatable between runs
tensorflow.random.set_seed(42)
#Take the input width from the scaled training data instead of hard-coding 16,
#so the model keeps working if feature columns are added or removed upstream
n_features = X_train_scaled.shape[1]
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=n_features))
model.add(Dense(units=1, activation='sigmoid'))

In [21]:
# Binary cross-entropy is the log loss being optimised; accuracy is tracked
# alongside it as a reporting metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [22]:
#Print the layer-by-layer summary with output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               1700      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 1,801
Trainable params: 1,801
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Train on the scaled training split for 100 epochs with shuffling enabled,
# logging one line per epoch
fit_options = dict(epochs=100, shuffle=True, verbose=2)
model.fit(X_train_scaled, y_train, **fit_options)

Train on 1008 samples
Epoch 1/100
1008/1008 - 0s - loss: 0.5276 - accuracy: 0.7560
Epoch 2/100
1008/1008 - 0s - loss: 0.4347 - accuracy: 0.7887
Epoch 3/100
1008/1008 - 0s - loss: 0.4222 - accuracy: 0.7956
Epoch 4/100
1008/1008 - 0s - loss: 0.4179 - accuracy: 0.8036
Epoch 5/100
1008/1008 - 0s - loss: 0.4161 - accuracy: 0.8026
Epoch 6/100
1008/1008 - 0s - loss: 0.4122 - accuracy: 0.8056
Epoch 7/100
1008/1008 - 0s - loss: 0.4114 - accuracy: 0.8056
Epoch 8/100
1008/1008 - 0s - loss: 0.4090 - accuracy: 0.8016
Epoch 9/100
1008/1008 - 0s - loss: 0.4067 - accuracy: 0.8075
Epoch 10/100
1008/1008 - 0s - loss: 0.4056 - accuracy: 0.8075
Epoch 11/100
1008/1008 - 0s - loss: 0.4037 - accuracy: 0.8075
Epoch 12/100
1008/1008 - 0s - loss: 0.4032 - accuracy: 0.8155
Epoch 13/100
1008/1008 - 0s - loss: 0.4009 - accuracy: 0.8135
Epoch 14/100
1008/1008 - 0s - loss: 0.4006 - accuracy: 0.8115
Epoch 15/100
1008/1008 - 0s - loss: 0.3982 - accuracy: 0.8125
Epoch 16/100
1008/1008 - 0s - loss: 0.3978 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x13b56a390>

In [24]:
# Score the held-out test split; the result unpacks into the loss followed by
# the accuracy metric the model was compiled with
test_scores = model.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = test_scores
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

252/252 - 0s - loss: 0.4875 - accuracy: 0.7421
Loss: 0.4875278226912968, Accuracy: 0.7420634627342224


## Test model using submission stage 1 file

In [25]:
#Load the stage 1 sample submission; its ID column names the matchups to predict
test_df = pd.read_csv('WSampleSubmissionStage1_2020.csv')

In [26]:
# Each ID looks like <season>_<team 1 id>_<team 2 id>; cut out the three
# fixed-width pieces and store them as integer columns
id_slices = {
    'Season': slice(0, 4),
    'TeamID_1': slice(5, 9),
    'TeamID_2': slice(10, 14),
}
for column, part in id_slices.items():
    test_df[column] = test_df['ID'].map(lambda game_id, part=part: int(game_id[part]))
test_df

Unnamed: 0,ID,Pred,Season,TeamID_1,TeamID_2
0,2015_3106_3107,0.5,2015,3106,3107
1,2015_3106_3110,0.5,2015,3106,3110
2,2015_3106_3113,0.5,2015,3106,3113
3,2015_3106_3114,0.5,2015,3106,3114
4,2015_3106_3116,0.5,2015,3106,3116
...,...,...,...,...,...
10075,2019_3413_3417,0.5,2019,3413,3417
10076,2019_3413_3460,0.5,2019,3413,3460
10077,2019_3416_3417,0.5,2019,3416,3417
10078,2019_3416_3460,0.5,2019,3416,3460


In [27]:
# Look up the seed of team 1 and then of team 2 for every matchup; after each
# left-join the seed column is renamed and the redundant TeamID key is dropped
test_seedW = (
    test_df
    .merge(tourney_seed, how='left',
           left_on=['Season', 'TeamID_1'], right_on=['Season', 'TeamID'])
    .rename(columns={'Seed': 'Seed_1'})
    .drop(columns='TeamID')
)
test_seed = (
    test_seedW
    .merge(tourney_seed, how='left',
           left_on=['Season', 'TeamID_2'], right_on=['Season', 'TeamID'])
    .rename(columns={'Seed': 'Seed_2'})
    .drop(columns='TeamID')
)
test_seed

Unnamed: 0,ID,Pred,Season,TeamID_1,TeamID_2,Seed_1,Seed_2
0,2015_3106_3107,0.5,2015,3106,3107,Y15,X13
1,2015_3106_3110,0.5,2015,3106,3110,Y15,Z14
2,2015_3106_3113,0.5,2015,3106,3113,Y15,Y03
3,2015_3106_3114,0.5,2015,3106,3114,Y15,Y11
4,2015_3106_3116,0.5,2015,3106,3116,Y15,Z10
...,...,...,...,...,...,...,...
10075,2019_3413_3417,0.5,2019,3413,3417,X15,W06
10076,2019_3413_3460,0.5,2019,3413,3460,X15,X13
10077,2019_3416_3417,0.5,2019,3416,3417,Z12,W06
10078,2019_3416_3460,0.5,2019,3416,3460,Z12,X13


In [28]:
# Join the per-season team averages for team 1 and then team 2 (the second
# join suffixes the overlapping stat columns with _1 / _2), and drop every
# identifier column so only the model inputs remain
test_1 = (
    test_seed
    .merge(reg_group, how='left',
           left_on=['Season', 'TeamID_1'], right_on=['Season', 'TeamID'])
    .drop(columns='TeamID')
)
test = (
    test_1
    .merge(reg_group, how='left',
           left_on=['Season', 'TeamID_2'], right_on=['Season', 'TeamID'],
           suffixes=('_1', '_2'))
    .drop(columns=['TeamID_1', 'TeamID_2', 'TeamID', 'Season', 'ID', 'Pred'])
)

# Convert the region-prefixed seed strings into integer seed numbers
for seed_col in ('Seed_1', 'Seed_2'):
    test[seed_col] = test[seed_col].map(remove_region)
test

Unnamed: 0,Seed_1,Seed_2,AvgPace_1,AvgOFF_1,AvgDEF_1,WinP_1,OWinP_1,OppOWinP_1,RPI_1,AvgPace_2,AvgOFF_2,AvgDEF_2,WinP_2,OWinP_2,OppOWinP_2,RPI_2
0,15,13,72.978125,83.187459,82.331697,0.5000,0.411379,0.411379,0.433534,67.858468,101.786831,90.061243,0.741935,0.465344,0.465344,0.534492
1,15,14,72.978125,83.187459,82.331697,0.5000,0.411379,0.411379,0.433534,65.419141,97.199834,90.206581,0.750000,0.513614,0.513614,0.572710
2,15,3,72.978125,83.187459,82.331697,0.5000,0.411379,0.411379,0.433534,66.206250,102.225293,94.288897,0.843750,0.554526,0.554526,0.626832
3,15,11,72.978125,83.187459,82.331697,0.5000,0.411379,0.411379,0.433534,65.747656,99.518629,89.532780,0.875000,0.485701,0.485701,0.583026
4,15,10,72.978125,83.187459,82.331697,0.5000,0.411379,0.411379,0.433534,65.814583,91.444377,93.148383,0.566667,0.618365,0.618365,0.605440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10075,15,6,69.186667,103.044230,90.366138,0.8000,0.442233,0.442233,0.531675,70.850000,103.337474,100.571447,0.625000,0.606097,0.606097,0.610822
10076,15,13,69.186667,103.044230,90.366138,0.8000,0.442233,0.442233,0.531675,71.598047,97.946389,89.259834,0.812500,0.457051,0.457051,0.545913
10077,12,6,68.406641,90.381588,92.700613,0.8125,0.561353,0.561353,0.624140,70.850000,103.337474,100.571447,0.625000,0.606097,0.606097,0.610822
10078,12,13,68.406641,90.381588,92.700613,0.8125,0.561353,0.561353,0.624140,71.598047,97.946389,89.259834,0.812500,0.457051,0.457051,0.545913


In [29]:
#Scale the new test data with the scaler that was fitted on the training data.
#Select the columns in the training order (X.columns) first so every feature
#lines up with the column the scaler and the model were fitted on, instead of
#relying on the two merge pipelines happening to produce the same order.
test_scaled = X_scaler.transform(test[X.columns])

In [30]:
#Use model to predict the probability that team 1 wins each matchup.
#Sequential.predict_proba was deprecated and later removed from tf.keras; with
#a single sigmoid output unit, predict returns the same probabilities.
predictions = model.predict(test_scaled)

In [31]:
#Reload the submission csv to get format
submission_df = pd.read_csv('WSampleSubmissionStage1_2020.csv')

#The model output has shape (n_rows, 1); flatten it to one value per row and
#make sure it still lines up with the submission rows before overwriting Pred
pred_values = np.asarray(predictions).ravel()
assert len(pred_values) == len(submission_df), (
    "number of predictions does not match the number of submission rows")

#Overwrite Pred column with model's predictions
submission_df['Pred'] = pred_values

#Save predictions to csv to load to competition
submission_df.to_csv('test_submissions.csv', index=False)

In [32]:
#Inspect the raw predicted probabilities
predictions

array([[9.8399557e-03],
       [1.5100473e-02],
       [4.2065259e-04],
       ...,
       [2.2014739e-02],
       [6.7856884e-01],
       [9.7624618e-01]], dtype=float32)