In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import datetime as dt
import random
from sportsreference.ncaab.teams import Teams
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
teams_df = pd.DataFrame()
i = 2010
while i <=2018:
    print("Downloading data for the", i, "season.")
    start = dt.datetime.now()
    teams = Teams(year = i)
    end = dt.datetime.now()
    teams_temp = teams.dataframes
    teams_temp['Season'] = i
    teams_df = pd.concat([teams_df, teams_temp])
    span = end - start
    print(i, "took", span, "seconds to download.")
    i += 1

Downloading data for the 2010 season.
2010 took 0:00:34.315943 seconds to download.
Downloading data for the 2011 season.
2011 took 0:00:37.216888 seconds to download.
Downloading data for the 2012 season.
2012 took 0:00:32.471001 seconds to download.
Downloading data for the 2013 season.
2013 took 0:00:34.300984 seconds to download.
Downloading data for the 2014 season.
2014 took 0:00:33.075729 seconds to download.
Downloading data for the 2015 season.
2015 took 0:00:35.611277 seconds to download.
Downloading data for the 2016 season.
2016 took 0:00:34.297576 seconds to download.
Downloading data for the 2017 season.
2017 took 0:00:30.656765 seconds to download.
Downloading data for the 2018 season.
2018 took 0:00:33.198010 seconds to download.


In [3]:
teams_df.head(10)

Unnamed: 0,abbreviation,assist_percentage,assists,away_losses,away_wins,block_percentage,blocks,conference,conference_losses,conference_wins,...,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_field_goal_attempts,two_point_field_goal_percentage,two_point_field_goals,win_percentage,wins,Season
AIR-FORCE,AIR-FORCE,61.6,389,10,0,6.2,51,mwc,15,1,...,851,0.529,19.0,394,866,0.527,456,0.323,10,2010
AKRON,AKRON,53.9,473,4,7,8.5,114,mac,4,12,...,1269,0.521,16.4,464,1330,0.483,642,0.686,24,2010
ALABAMA-AM,ALABAMA-AM,48.1,288,9,3,12.7,143,swac,10,8,...,1004,0.463,18.8,444,1197,0.41,491,0.407,11,2010
ALABAMA-BIRMINGHAM,ALABAMA-BIRMINGHAM,51.1,390,4,9,7.3,91,cusa,5,11,...,1248,0.518,17.0,451,1237,0.474,586,0.735,25,2010
ALABAMA-STATE,ALABAMA-STATE,60.0,400,11,6,11.1,129,swac,6,12,...,1101,0.499,20.0,501,1064,0.448,477,0.516,16,2010
ALABAMA,ALABAMA,54.2,429,6,3,10.9,127,sec,10,6,...,1126,0.525,16.4,410,1294,0.476,616,0.531,17,2010
ALBANY-NY,ALBANY-NY,53.5,380,15,3,8.2,97,america-east,14,2,...,1156,0.503,20.6,517,1151,0.448,516,0.219,7,2010
ALCORN-STATE,ALCORN-STATE,41.3,269,18,0,7.4,98,swac,16,2,...,1065,0.456,23.7,642,1280,0.403,516,0.065,2,2010
AMERICAN,AMERICAN,65.6,447,12,4,6.6,81,patriot,7,7,...,1097,0.518,19.6,458,1069,0.465,497,0.355,11,2010
APPALACHIAN-STATE,APPALACHIAN-STATE,46.9,447,8,8,7.7,102,southern,5,13,...,1424,0.581,18.6,544,1303,0.518,675,0.649,24,2010


In [4]:
team_names = teams_df[['abbreviation']]

In [5]:
team_names.head()

Unnamed: 0,abbreviation
AIR-FORCE,AIR-FORCE
AKRON,AKRON
ALABAMA-AM,ALABAMA-AM
ALABAMA-BIRMINGHAM,ALABAMA-BIRMINGHAM
ALABAMA-STATE,ALABAMA-STATE


In [6]:
#Only needed this on first run
#team_names.to_csv(r'data/team_names.csv')
#teams_df.to_csv(r'data/team_data.csv')

In [7]:
games = pd.read_csv(r'data/TourneyCompactResults.csv')

In [8]:
games = games[['Season','WTeamID','LTeamID']]
games['Winner'] = games['WTeamID']
games = games.rename(columns = {'WTeamID' : 'Team_A', 'LTeamID': 'Team_B'})
games.head()

Unnamed: 0,Season,Team_A,Team_B,Winner
0,1985,1116,1234,1116
1,1985,1120,1345,1120
2,1985,1207,1250,1207
3,1985,1229,1425,1229
4,1985,1242,1325,1242


In [9]:
i = 0
games['Team_X'] = 0
games['Team_Y'] = 0
while i < len(games):
    games['Team_X'][i] = random.choice(([games['Team_A'][i],games['Team_B'][i]]))
    i += 1
games['Team_Y'] = np.where(games['Team_X'] == games['Team_A'], games['Team_B'], games['Team_A'])
games['Result'] = np.where(games['Winner'] == games['Team_X'], 'X', 'Y')
games = games.drop(columns = ['Team_A','Team_B'], axis = 1)

In [10]:
games.head(25)

Unnamed: 0,Season,Winner,Team_X,Team_Y,Result
0,1985,1116,1234,1116,Y
1,1985,1120,1345,1120,Y
2,1985,1207,1207,1250,X
3,1985,1229,1229,1425,X
4,1985,1242,1325,1242,Y
5,1985,1246,1449,1246,Y
6,1985,1256,1256,1338,X
7,1985,1260,1260,1233,X
8,1985,1314,1292,1314,Y
9,1985,1323,1323,1333,X


In [11]:
team_data = teams_df[['Season','abbreviation','assist_percentage','block_percentage','effective_field_goal_percentage','field_goal_percentage','free_throw_attempt_rate','free_throw_percentage','free_throws_per_field_goal_attempt','net_rating','offensive_rating','offensive_rebound_percentage','opp_assist_percentage','opp_block_percentage','opp_effective_field_goal_percentage','opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throw_percentage','opp_free_throws_per_field_goal_attempt','opp_offensive_rating','opp_offensive_rebound_percentage','opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage','opp_total_rebound_percentage','opp_true_shooting_percentage','opp_two_point_field_goal_percentage','pace','simple_rating_system','steal_percentage','strength_of_schedule','three_point_attempt_rate','three_point_field_goal_percentage','total_rebound_percentage','true_shooting_percentage','turnover_percentage','two_point_field_goal_percentage','win_percentage']]

In [12]:
games = games.loc[games['Season'] >= 2010]
games.head()

Unnamed: 0,Season,Winner,Team_X,Team_Y,Result
1584,2010,1115,1457,1115,Y
1585,2010,1124,1358,1124,Y
1586,2010,1139,1139,1431,X
1587,2010,1140,1196,1140,Y
1588,2010,1242,1250,1242,Y


In [13]:
team_abb = pd.read_csv(r'data/team_names.csv')

In [14]:
team_abb = team_abb[['abbreviation','Team_Id']]

In [15]:
team_data_abb = team_data.merge(team_abb, left_on = 'abbreviation', right_on = 'abbreviation', how = 'left').dropna()
team_data_abb['Team_Id'] = team_data_abb['Team_Id'].astype(np.int64)

In [16]:
team_data_abb.head()

Unnamed: 0,Season,abbreviation,assist_percentage,block_percentage,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,free_throws_per_field_goal_attempt,net_rating,...,steal_percentage,strength_of_schedule,three_point_attempt_rate,three_point_field_goal_percentage,total_rebound_percentage,true_shooting_percentage,turnover_percentage,two_point_field_goal_percentage,win_percentage,Team_Id
0,2010,AIR-FORCE,61.6,6.2,0.504,0.443,0.367,0.635,0.233,-10.0,...,8.4,3.13,0.394,0.313,46.8,0.529,19.0,0.527,0.323,1102
1,2010,AKRON,53.9,8.5,0.491,0.433,0.363,0.657,0.239,7.6,...,9.0,-1.5,0.343,0.339,51.6,0.521,16.4,0.483,0.686,1103
2,2010,ALABAMA-AM,48.1,12.7,0.416,0.382,0.474,0.635,0.301,-5.2,...,12.5,-13.71,0.237,0.291,46.8,0.463,18.8,0.41,0.407,1105
3,2010,ALABAMA-BIRMINGHAM,51.1,7.3,0.471,0.422,0.457,0.694,0.317,10.0,...,10.0,2.9,0.315,0.311,53.6,0.518,17.0,0.474,0.735,1412
4,2010,ALABAMA-STATE,60.0,11.1,0.462,0.404,0.448,0.641,0.287,-2.1,...,10.8,-12.02,0.356,0.324,51.3,0.499,20.0,0.448,0.516,1106


In [17]:
games_a = games.merge(team_data_abb, left_on = ['Team_X','Season'], right_on = ['Team_Id','Season'], how = 'left')
games_b = games_a.merge(team_data_abb, left_on = ['Team_Y','Season'], right_on = ['Team_Id','Season'], how = 'left')
games_b = games_b.dropna()
print(games_b.head(15))

    Season  Winner  Team_X  Team_Y Result     abbreviation_x  \
0     2010    1115    1457    1115      Y           WINTHROP   
1     2010    1124    1358    1124      Y  SAM-HOUSTON-STATE   
2     2010    1139    1139    1431      X             BUTLER   
3     2010    1140    1196    1140      Y            FLORIDA   
4     2010    1242    1250    1242      Y             LEHIGH   
5     2010    1243    1317    1243      Y        NORTH-TEXAS   
6     2010    1246    1246    1190      X           KENTUCKY   
7     2010    1293    1293    1435      X       MURRAY-STATE   
8     2010    1307    1307    1285      X         NEW-MEXICO   
9     2010    1320    1424    1320      Y   NEVADA-LAS-VEGAS   
10    2010    1325    1207    1325      Y         GEORGETOWN   
11    2010    1330    1323    1330      Y         NOTRE-DAME   
12    2010    1388    1350    1388      Y           RICHMOND   
13    2010    1397    1361    1397      Y    SAN-DIEGO-STATE   
14    2010    1437    1437    1352      

In [18]:
games_b['Winner'] = np.where(games_b['Winner'] == games_b['Team_X'], games_b['abbreviation_x'], games_b['abbreviation_y'])
games_b['Team_X'] = games_b['abbreviation_x']
games_b['Team_Y'] = games_b['abbreviation_y']

In [19]:
games_b.dtypes

Season                                        int64
Winner                                       object
Team_X                                       object
Team_Y                                       object
Result                                       object
abbreviation_x                               object
assist_percentage_x                         float64
block_percentage_x                          float64
effective_field_goal_percentage_x           float64
field_goal_percentage_x                     float64
free_throw_attempt_rate_x                   float64
free_throw_percentage_x                     float64
free_throws_per_field_goal_attempt_x        float64
net_rating_x                                float64
offensive_rating_x                          float64
offensive_rebound_percentage_x              float64
opp_assist_percentage_x                     float64
opp_block_percentage_x                      float64
opp_effective_field_goal_percentage_x       float64
opp_field_go

In [20]:
print(games_b.head(15))

    Season               Winner             Team_X                Team_Y  \
0     2010  ARKANSAS-PINE-BLUFF           WINTHROP   ARKANSAS-PINE-BLUFF   
1     2010               BAYLOR  SAM-HOUSTON-STATE                BAYLOR   
2     2010               BUTLER             BUTLER         TEXAS-EL-PASO   
3     2010        BRIGHAM-YOUNG            FLORIDA         BRIGHAM-YOUNG   
4     2010               KANSAS             LEHIGH                KANSAS   
5     2010         KANSAS-STATE        NORTH-TEXAS          KANSAS-STATE   
6     2010             KENTUCKY           KENTUCKY  EAST-TENNESSEE-STATE   
7     2010         MURRAY-STATE       MURRAY-STATE            VANDERBILT   
8     2010           NEW-MEXICO         NEW-MEXICO               MONTANA   
9     2010        NORTHERN-IOWA   NEVADA-LAS-VEGAS         NORTHERN-IOWA   
10    2010                 OHIO         GEORGETOWN                  OHIO   
11    2010         OLD-DOMINION         NOTRE-DAME          OLD-DOMINION   
12    2010  

In [21]:
ml_input = games_b.drop(columns = ['Season','Winner','Team_X','Team_Y','abbreviation_x','abbreviation_y','Team_Id_x','Team_Id_y','win_percentage_x','win_percentage_y'], axis = 1)
ml_input.dtypes

Result                                       object
assist_percentage_x                         float64
block_percentage_x                          float64
effective_field_goal_percentage_x           float64
field_goal_percentage_x                     float64
free_throw_attempt_rate_x                   float64
free_throw_percentage_x                     float64
free_throws_per_field_goal_attempt_x        float64
net_rating_x                                float64
offensive_rating_x                          float64
offensive_rebound_percentage_x              float64
opp_assist_percentage_x                     float64
opp_block_percentage_x                      float64
opp_effective_field_goal_percentage_x       float64
opp_field_goal_percentage_x                 float64
opp_free_throw_attempt_rate_x               float64
opp_free_throw_percentage_x                 float64
opp_free_throws_per_field_goal_attempt_x    float64
opp_offensive_rating_x                      float64
opp_offensiv

In [22]:
X_all = ml_input.drop(['Result'],1)
y_all = ml_input['Result']

In [23]:
cols = [['assist_percentage_x','block_percentage_x','effective_field_goal_percentage_x','field_goal_percentage_x','free_throw_attempt_rate_x','free_throw_percentage_x','free_throws_per_field_goal_attempt_x','net_rating_x','offensive_rating_x','offensive_rebound_percentage_x','opp_assist_percentage_x','opp_block_percentage_x','opp_effective_field_goal_percentage_x','opp_field_goal_percentage_x','opp_free_throw_attempt_rate_x','opp_free_throw_percentage_x','opp_free_throws_per_field_goal_attempt_x','opp_offensive_rating_x','opp_offensive_rebound_percentage_x','opp_steal_percentage_x','opp_three_point_attempt_rate_x','opp_three_point_field_goal_percentage_x','opp_total_rebound_percentage_x','opp_true_shooting_percentage_x','opp_two_point_field_goal_percentage_x','pace_x','simple_rating_system_x','steal_percentage_x','strength_of_schedule_x','three_point_attempt_rate_x','three_point_field_goal_percentage_x','total_rebound_percentage_x','true_shooting_percentage_x','turnover_percentage_x','two_point_field_goal_percentage_x','assist_percentage_y','block_percentage_y','effective_field_goal_percentage_y','field_goal_percentage_y','free_throw_attempt_rate_y','free_throw_percentage_y','free_throws_per_field_goal_attempt_y','net_rating_y','offensive_rating_y','offensive_rebound_percentage_y','opp_assist_percentage_y','opp_block_percentage_y','opp_effective_field_goal_percentage_y','opp_field_goal_percentage_y','opp_free_throw_attempt_rate_y','opp_free_throw_percentage_y','opp_free_throws_per_field_goal_attempt_y','opp_offensive_rating_y','opp_offensive_rebound_percentage_y','opp_steal_percentage_y','opp_three_point_attempt_rate_y','opp_three_point_field_goal_percentage_y','opp_total_rebound_percentage_y','opp_true_shooting_percentage_y','opp_two_point_field_goal_percentage_y','pace_y','simple_rating_system_y','steal_percentage_y','strength_of_schedule_y','three_point_attempt_rate_y','three_point_field_goal_percentage_y','total_rebound_percentage_y','true_shooting_percentage_y','turnover_percentage_y','two_point_field_goal_percentage_y']]
for col in cols:
    X_all[col] = scale(X_all[col])

In [24]:
len(X_all)

533

In [25]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                   test_size = 100,
                                                   random_state = 2,
                                                   stratify = y_all)



In [26]:
#for measuring training time
from time import time 
# F1 score (also F-score or F-measure) is a measure of a test's accuracy. 
#It considers both the precision p and the recall r of the test to compute 
#the score: p is the number of correct positive results divided by the number of 
#all positive results, and r is the number of correct positive results divided by 
#the number of positive results that should have been returned. The F1 score can be 
#interpreted as a weighted average of the precision and recall, where an F1 score 
#reaches its best value at 1 and worst at 0.
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    time_taken = end - start
    
    # Print the results
    print('Trained model', clf.__class__.__name__, 'in', time_taken, ' seconds')

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    
    end = time()
    # Print and return results
    time_taken = end - start
    print("Made predictions in", time_taken, "seconds.")
    
    return f1_score(target, y_pred, pos_label='X'), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    #print("Training something using a training set size of %d. . .").format(len(X_train))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print("F1 score and accuracy score for training set:", f1, " , ", acc)
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set:", f1, " , ", acc)

In [27]:
clf_A = LogisticRegression(random_state = 64)
clf_B = SVC(random_state = 64, kernel = 'rbf')
clf_C = xgb.XGBClassifier(seed = 64)

train_predict(clf_A, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_B, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_C, X_train, y_train, X_test, y_test)
print('')

Trained model LogisticRegression in 0.2737720012664795  seconds
Made predictions in 0.003008127212524414 seconds.
F1 score and accuracy score for training set: 0.7528868360277137  ,  0.7528868360277137
Made predictions in 0.0 seconds.
F1 score and accuracy score for test set: 0.7722772277227723  ,  0.77

Trained model SVC in 0.021053075790405273  seconds
Made predictions in 0.01654338836669922 seconds.
F1 score and accuracy score for training set: 0.8787185354691075  ,  0.8775981524249422
Made predictions in 0.003509521484375 seconds.
F1 score and accuracy score for test set: 0.7378640776699028  ,  0.73

Trained model XGBClassifier in 0.3313789367675781  seconds
Made predictions in 0.008519411087036133 seconds.
F1 score and accuracy score for training set: 0.9931034482758622  ,  0.9930715935334873
Made predictions in 0.006516933441162109 seconds.
F1 score and accuracy score for test set: 0.6728971962616822  ,  0.65



  if diff:
  if diff:


In [28]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer


# TODO: Create the parameters list you wish to tune
parameters = { 'learning_rate' : [0.1],
               'n_estimators' : [40],
               'max_depth': [3],
               'min_child_weight': [3],
               'gamma':[0.4],
               'subsample' : [0.8],
               'colsample_bytree' : [0.8],
               'scale_pos_weight' : [1],
               'reg_alpha':[1e-5]
             }  

# TODO: Initialize the classifier
clf = xgb.XGBClassifier(seed=2)

# TODO: Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score,pos_label='X')

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train,y_train)

# Get the estimator
clf = grid_obj.best_estimator_
print(clf)

# Report the final F1 score for training and testing after parameter tuning
f1, acc = predict_labels(clf, X_train, y_train)
print("F1 score and accuracy score for training set:", f1, " , ", acc)
    
f1, acc = predict_labels(clf, X_test, y_test)
print("F1 score and accuracy score for test set:", f1, " , ", acc)

  if diff:
  if diff:
  if diff:
  if diff:


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=3, missing=None,
       n_estimators=40, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=1e-05,
       reg_lambda=1, scale_pos_weight=1, seed=2, silent=True,
       subsample=0.8)
Made predictions in 0.004517555236816406 seconds.
F1 score and accuracy score for training set: 0.8995433789954338  ,  0.8983833718244804
Made predictions in 0.0025017261505126953 seconds.
F1 score and accuracy score for test set: 0.7184466019417476  ,  0.71


  if diff:
  if diff:
  if diff:


In [29]:
X_all.head()

Unnamed: 0,assist_percentage_x,block_percentage_x,effective_field_goal_percentage_x,field_goal_percentage_x,free_throw_attempt_rate_x,free_throw_percentage_x,free_throws_per_field_goal_attempt_x,net_rating_x,offensive_rating_x,offensive_rebound_percentage_x,...,pace_y,simple_rating_system_y,steal_percentage_y,strength_of_schedule_y,three_point_attempt_rate_y,three_point_field_goal_percentage_y,total_rebound_percentage_y,true_shooting_percentage_y,turnover_percentage_y,two_point_field_goal_percentage_y
0,-1.548259,0.323555,-4.063465,-3.577455,-0.435558,-1.879694,-1.058887,-2.04358,-3.588619,0.084751,...,0.198303,-2.940622,-0.094953,-2.817779,-1.488934,-2.12553,0.957865,-2.267817,3.225681,-2.145726
1,3.793433,-1.268948,0.636121,0.180869,-0.236371,-0.148678,-0.281074,0.131144,0.321262,0.281047,...,0.135743,0.493223,-0.150386,0.475267,-0.202707,0.93054,1.039467,0.993155,1.150821,0.944728
2,0.000633,-1.434834,-0.349276,-0.770032,1.73558,0.878196,2.107923,0.179831,-0.621119,-0.823114,...,0.949024,-0.20113,1.346294,-0.797094,-0.468119,-0.634764,-0.551769,0.07241,0.676568,0.876806
3,-0.2178,-1.202594,-0.955674,-0.67947,-0.953444,-0.148678,-0.97555,-0.761467,-0.500815,0.796322,...,1.324385,0.604372,1.069131,-0.579095,-0.100626,2.123152,0.468254,1.606985,-0.568348,0.265508
4,0.775079,-1.368479,-0.197677,-0.22666,0.97867,0.291411,1.107878,-1.13474,-0.681271,-0.970336,...,0.823904,1.370645,1.124564,0.521382,-0.570201,1.638653,1.121068,1.069884,0.320877,0.639079


In [30]:
teams_19 = Teams(year = 2019)
teams_df_19 = teams.dataframes
teams_df_19 = teams_df_19[['abbreviation','assist_percentage','block_percentage','effective_field_goal_percentage','field_goal_percentage','free_throw_attempt_rate','free_throw_percentage','free_throws_per_field_goal_attempt','net_rating','offensive_rating','offensive_rebound_percentage','opp_assist_percentage','opp_block_percentage','opp_effective_field_goal_percentage','opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throw_percentage','opp_free_throws_per_field_goal_attempt','opp_offensive_rating','opp_offensive_rebound_percentage','opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage','opp_total_rebound_percentage','opp_true_shooting_percentage','opp_two_point_field_goal_percentage','pace','simple_rating_system','steal_percentage','strength_of_schedule','three_point_attempt_rate','three_point_field_goal_percentage','total_rebound_percentage','true_shooting_percentage','turnover_percentage','two_point_field_goal_percentage']]

In [31]:
teams_df_19.head()

Unnamed: 0,abbreviation,assist_percentage,block_percentage,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,free_throws_per_field_goal_attempt,net_rating,offensive_rating,...,pace,simple_rating_system,steal_percentage,strength_of_schedule,three_point_attempt_rate,three_point_field_goal_percentage,total_rebound_percentage,true_shooting_percentage,turnover_percentage,two_point_field_goal_percentage
ABILENE-CHRISTIAN,ABILENE-CHRISTIAN,55.4,11.6,0.521,0.464,0.309,0.701,0.217,3.5,102.2,...,71.6,-9.14,11.3,-6.82,0.35,0.325,49.7,0.549,17.7,0.539
AIR-FORCE,AIR-FORCE,60.7,8.1,0.49,0.419,0.318,0.734,0.233,-5.7,100.8,...,67.7,-4.31,9.5,1.72,0.431,0.331,48.7,0.527,16.3,0.485
AKRON,AKRON,52.7,7.5,0.518,0.435,0.319,0.696,0.222,-5.1,102.6,...,69.1,-6.82,8.4,-1.92,0.467,0.358,49.2,0.547,17.3,0.502
ALABAMA-AM,ALABAMA-AM,50.5,3.9,0.45,0.397,0.314,0.647,0.203,-23.2,88.1,...,68.3,-23.97,5.8,-8.04,0.354,0.303,48.2,0.48,20.9,0.448
ALABAMA-BIRMINGHAM,ALABAMA-BIRMINGHAM,59.3,11.6,0.545,0.488,0.291,0.75,0.218,10.1,109.8,...,69.5,4.9,7.7,-0.65,0.334,0.345,54.8,0.575,16.5,0.559


In [78]:
game_list = [['RHODE-ISLAND','VIRGINIA-COMMONWEALTH'],['FAIRLEIGH-DICKINSON','SAINT-FRANCIS-PA'],['FLORIDA-STATE','VIRGINIA'],['DUKE','NORTH-CAROLINA'],['OHIO-STATE','MICHIGAN-STATE'],['SAINT-MARYS-CA','GONZAGA']]
game_df_teams = pd.DataFrame(game_list)
game_df_teams.columns = ['Away','Home']
game_df_teams

Unnamed: 0,Away,Home
0,RHODE-ISLAND,VIRGINIA-COMMONWEALTH
1,FAIRLEIGH-DICKINSON,SAINT-FRANCIS-PA
2,FLORIDA-STATE,VIRGINIA
3,DUKE,NORTH-CAROLINA
4,OHIO-STATE,MICHIGAN-STATE
5,SAINT-MARYS-CA,GONZAGA


In [79]:
game_df = pd.DataFrame(game_list)
game_df.columns = ['Away','Home']
game_df = game_df.merge(teams_df_19, left_on = 'Away', right_on = 'abbreviation', how = 'left')
game_df = game_df.merge(teams_df_19, left_on = 'Home', right_on = 'abbreviation', how = 'left')
print(game_df)

                  Away                   Home       abbreviation_x  \
0         RHODE-ISLAND  VIRGINIA-COMMONWEALTH         RHODE-ISLAND   
1  FAIRLEIGH-DICKINSON       SAINT-FRANCIS-PA  FAIRLEIGH-DICKINSON   
2        FLORIDA-STATE               VIRGINIA        FLORIDA-STATE   
3                 DUKE         NORTH-CAROLINA                 DUKE   
4           OHIO-STATE         MICHIGAN-STATE           OHIO-STATE   
5       SAINT-MARYS-CA                GONZAGA       SAINT-MARYS-CA   

   assist_percentage_x  block_percentage_x  effective_field_goal_percentage_x  \
0                 54.5                 9.5                              0.511   
1                 49.4                 8.7                              0.508   
2                 52.6                14.0                              0.532   
3                 57.6                12.4                              0.559   
4                 52.5                11.3                              0.541   
5                 54.9 

In [80]:
game_input = game_df.drop(columns = ['Away','Home','abbreviation_x','abbreviation_y'], axis = 1)

In [81]:
game_input

Unnamed: 0,assist_percentage_x,block_percentage_x,effective_field_goal_percentage_x,field_goal_percentage_x,free_throw_attempt_rate_x,free_throw_percentage_x,free_throws_per_field_goal_attempt_x,net_rating_x,offensive_rating_x,offensive_rebound_percentage_x,...,pace_y,simple_rating_system_y,steal_percentage_y,strength_of_schedule_y,three_point_attempt_rate_y,three_point_field_goal_percentage_y,total_rebound_percentage_y,true_shooting_percentage_y,turnover_percentage_y,two_point_field_goal_percentage_y
0,54.5,9.5,0.511,0.455,0.301,0.697,0.209,10.3,108.9,30.8,...,71.4,3.55,8.8,2.42,0.386,0.352,51.0,0.549,16.3,0.524
1,49.4,8.7,0.508,0.451,0.307,0.726,0.223,-1.6,104.0,30.8,...,70.9,-6.13,9.4,-6.71,0.363,0.373,48.3,0.571,14.3,0.519
2,52.6,14.0,0.532,0.468,0.37,0.691,0.256,9.9,109.4,31.7,...,60.6,22.21,11.2,9.07,0.35,0.383,51.2,0.555,12.3,0.501
3,57.6,12.4,0.559,0.492,0.346,0.71,0.246,20.7,118.5,38.6,...,71.7,20.08,8.0,11.78,0.351,0.359,56.4,0.551,14.0,0.51
4,52.5,11.3,0.541,0.48,0.311,0.731,0.227,12.6,110.9,29.8,...,68.2,22.41,6.0,7.1,0.363,0.4,57.6,0.601,16.4,0.552
5,54.9,6.8,0.587,0.515,0.304,0.769,0.234,19.5,119.8,29.2,...,69.8,18.43,8.9,2.59,0.394,0.368,56.0,0.593,13.9,0.582


In [82]:
y_pred_C = clf_C.predict(game_input)
y_pred_B = clf_B.predict(game_input)
y_pred_A = clf_A.predict(game_input)
Y_pred = clf.predict(game_input)

  if diff:
  if diff:


In [83]:
print(y_pred_C,y_pred_B,y_pred_A,Y_pred)

['Y' 'Y' 'Y' 'Y' 'Y' 'Y'] ['Y' 'Y' 'Y' 'Y' 'Y' 'Y'] ['X' 'X' 'X' 'X' 'X' 'X'] ['Y' 'Y' 'Y' 'Y' 'Y' 'Y']


In [73]:
test = pd.DataFrame(Y_pred)

In [74]:
test

Unnamed: 0,0
0,Y
1,Y
2,Y
3,Y


In [75]:
games_df_out = game_df_teams.merge(test, left_index = True, right_index = True)
games_df_out.columns = ['Away','Home','Forecast']

In [76]:
games_df_out['Winner'] = np.where(games_df_out['Forecast'] == 'Y', games_df_out['Home'], games_df_out['Away'])

In [77]:
games_df_out[['Away','Home','Winner']]

Unnamed: 0,Away,Home,Winner
0,FLORIDA-STATE,VIRGINIA,VIRGINIA
1,DUKE,NORTH-CAROLINA,NORTH-CAROLINA
2,OHIO-STATE,MICHIGAN-STATE,MICHIGAN-STATE
3,SAINT-MARYS-CA,GONZAGA,GONZAGA
