In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import datetime as dt
import random
from sportsreference.ncaab.teams import Teams
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
teams_df = pd.DataFrame()
i = 2010
while i <=2018:
    print("Downloading data for the", i, "season.")
    start = dt.datetime.now()
    teams = Teams(year = i)
    end = dt.datetime.now()
    teams_temp = teams.dataframes
    teams_temp['Season'] = i
    teams_df = pd.concat([teams_df, teams_temp])
    span = end - start
    print(i, "took", span, "seconds to download.")
    i += 1

Downloading data for the 2010 season.
2010 took 0:00:25.118594 seconds to download.
Downloading data for the 2011 season.
2011 took 0:00:27.356081 seconds to download.
Downloading data for the 2012 season.
2012 took 0:00:28.633548 seconds to download.
Downloading data for the 2013 season.
2013 took 0:00:28.535353 seconds to download.
Downloading data for the 2014 season.
2014 took 0:00:28.533286 seconds to download.
Downloading data for the 2015 season.
2015 took 0:00:28.541572 seconds to download.
Downloading data for the 2016 season.
2016 took 0:00:28.391020 seconds to download.
Downloading data for the 2017 season.
2017 took 0:00:28.140616 seconds to download.
Downloading data for the 2018 season.
2018 took 0:00:28.292149 seconds to download.


In [3]:
teams_df.head(10)

Unnamed: 0,abbreviation,assist_percentage,assists,away_losses,away_wins,block_percentage,blocks,conference,conference_losses,conference_wins,...,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_field_goal_attempts,two_point_field_goal_percentage,two_point_field_goals,win_percentage,wins,Season
AIR-FORCE,AIR-FORCE,61.6,389,10,0,6.2,51,mwc,15,1,...,851,0.529,19.0,394,866,0.527,456,0.323,10,2010
AKRON,AKRON,53.9,473,4,7,8.5,114,mac,4,12,...,1269,0.521,16.4,464,1330,0.483,642,0.686,24,2010
ALABAMA-AM,ALABAMA-AM,48.1,288,9,3,12.7,143,swac,10,8,...,1004,0.463,18.8,444,1197,0.41,491,0.407,11,2010
ALABAMA-BIRMINGHAM,ALABAMA-BIRMINGHAM,51.1,390,4,9,7.3,91,cusa,5,11,...,1248,0.518,17.0,451,1237,0.474,586,0.735,25,2010
ALABAMA-STATE,ALABAMA-STATE,60.0,400,11,6,11.1,129,swac,6,12,...,1101,0.499,20.0,501,1064,0.448,477,0.516,16,2010
ALABAMA,ALABAMA,54.2,429,6,3,10.9,127,sec,10,6,...,1126,0.525,16.4,410,1294,0.476,616,0.531,17,2010
ALBANY-NY,ALBANY-NY,53.5,380,15,3,8.2,97,america-east,14,2,...,1156,0.503,20.6,517,1151,0.448,516,0.219,7,2010
ALCORN-STATE,ALCORN-STATE,41.3,269,18,0,7.4,98,swac,16,2,...,1065,0.456,23.7,642,1280,0.403,516,0.065,2,2010
AMERICAN,AMERICAN,65.6,447,12,4,6.6,81,patriot,7,7,...,1097,0.518,19.6,458,1069,0.465,497,0.355,11,2010
APPALACHIAN-STATE,APPALACHIAN-STATE,46.9,447,8,8,7.7,102,southern,5,13,...,1424,0.581,18.6,544,1303,0.518,675,0.649,24,2010


In [4]:
team_names = teams_df[['abbreviation']]

In [5]:
team_names.head()

Unnamed: 0,abbreviation
AIR-FORCE,AIR-FORCE
AKRON,AKRON
ALABAMA-AM,ALABAMA-AM
ALABAMA-BIRMINGHAM,ALABAMA-BIRMINGHAM
ALABAMA-STATE,ALABAMA-STATE


In [None]:
#Only needed this on first run
#team_names.to_csv(r'data/team_names.csv')
#teams_df.to_csv(r'data/team_data.csv')

In [6]:
games = pd.read_csv(r'data/TourneyCompactResults.csv')

In [7]:
games = games[['Season','WTeamID','LTeamID']]
games['Winner'] = games['WTeamID']
games = games.rename(columns = {'WTeamID' : 'Team_A', 'LTeamID': 'Team_B'})
games.head()

Unnamed: 0,Season,Team_A,Team_B,Winner
0,1985,1116,1234,1116
1,1985,1120,1345,1120
2,1985,1207,1250,1207
3,1985,1229,1425,1229
4,1985,1242,1325,1242


In [8]:
i = 0
games['Team_X'] = 0
games['Team_Y'] = 0
while i < len(games):
    games['Team_X'][i] = random.choice(([games['Team_A'][i],games['Team_B'][i]]))
    i += 1
games['Team_Y'] = np.where(games['Team_X'] == games['Team_A'], games['Team_B'], games['Team_A'])
games['Result'] = np.where(games['Winner'] == games['Team_X'], 'X', 'Y')
games = games.drop(columns = ['Team_A','Team_B'], axis = 1)

In [9]:
games.head(25)

Unnamed: 0,Season,Winner,Team_X,Team_Y,Result
0,1985,1116,1234,1116,Y
1,1985,1120,1345,1120,Y
2,1985,1207,1250,1207,Y
3,1985,1229,1229,1425,X
4,1985,1242,1325,1242,Y
5,1985,1246,1246,1449,X
6,1985,1256,1338,1256,Y
7,1985,1260,1233,1260,Y
8,1985,1314,1314,1292,X
9,1985,1323,1333,1323,Y


In [75]:
team_data = teams_df[['Season','abbreviation','assist_percentage','block_percentage','effective_field_goal_percentage','field_goal_percentage','free_throw_attempt_rate','free_throw_percentage','free_throws_per_field_goal_attempt','net_rating','offensive_rating','offensive_rebound_percentage','opp_assist_percentage','opp_block_percentage','opp_effective_field_goal_percentage','opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throw_percentage','opp_free_throws_per_field_goal_attempt','opp_offensive_rating','opp_offensive_rebound_percentage','opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage','opp_total_rebound_percentage','opp_true_shooting_percentage','opp_two_point_field_goal_percentage','pace','simple_rating_system','steal_percentage','strength_of_schedule','three_point_attempt_rate','three_point_field_goal_percentage','total_rebound_percentage','true_shooting_percentage','turnover_percentage','two_point_field_goal_percentage','win_percentage']]

,'opp_assist_percentage','opp_block_percentage','opp_effective_field_goal_percentage','opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throw_percentage','opp_free_throws_per_field_goal_attempt','opp_offensive_rating','opp_offensive_rebound_percentage','opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage','opp_total_rebound_percentage','opp_true_shooting_percentage','opp_two_point_field_goal_percentage'

In [76]:
games = games.loc[games['Season'] >= 2010]
games.head()

Unnamed: 0,Season,Winner,Team_X,Team_Y,Result
1584,2010,1115,1457,1115,Y
1585,2010,1124,1124,1358,X
1586,2010,1139,1431,1139,Y
1587,2010,1140,1140,1196,X
1588,2010,1242,1250,1242,Y


In [77]:
team_abb = pd.read_csv(r'data/team_names.csv')

In [78]:
team_abb = team_abb[['abbreviation','Team_Id']]

In [79]:
team_data_abb = team_data.merge(team_abb, left_on = 'abbreviation', right_on = 'abbreviation', how = 'left').dropna()
team_data_abb['Team_Id'] = team_data_abb['Team_Id'].astype(np.int64)

In [80]:
team_data_abb.head()

Unnamed: 0,Season,abbreviation,assist_percentage,block_percentage,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,free_throws_per_field_goal_attempt,net_rating,...,steal_percentage,strength_of_schedule,three_point_attempt_rate,three_point_field_goal_percentage,total_rebound_percentage,true_shooting_percentage,turnover_percentage,two_point_field_goal_percentage,win_percentage,Team_Id
0,2010,AIR-FORCE,61.6,6.2,0.504,0.443,0.367,0.635,0.233,-10.0,...,8.4,3.13,0.394,0.313,46.8,0.529,19.0,0.527,0.323,1102
1,2010,AKRON,53.9,8.5,0.491,0.433,0.363,0.657,0.239,7.6,...,9.0,-1.5,0.343,0.339,51.6,0.521,16.4,0.483,0.686,1103
2,2010,ALABAMA-AM,48.1,12.7,0.416,0.382,0.474,0.635,0.301,-5.2,...,12.5,-13.71,0.237,0.291,46.8,0.463,18.8,0.41,0.407,1105
3,2010,ALABAMA-BIRMINGHAM,51.1,7.3,0.471,0.422,0.457,0.694,0.317,10.0,...,10.0,2.9,0.315,0.311,53.6,0.518,17.0,0.474,0.735,1412
4,2010,ALABAMA-STATE,60.0,11.1,0.462,0.404,0.448,0.641,0.287,-2.1,...,10.8,-12.02,0.356,0.324,51.3,0.499,20.0,0.448,0.516,1106


In [81]:
games_a = games.merge(team_data_abb, left_on = ['Team_X','Season'], right_on = ['Team_Id','Season'], how = 'left')
games_b = games_a.merge(team_data_abb, left_on = ['Team_Y','Season'], right_on = ['Team_Id','Season'], how = 'left')
games_b = games_b.dropna()
print(games_b.head(15))

    Season  Winner  Team_X  Team_Y Result    abbreviation_x  \
0     2010    1115    1457    1115      Y          WINTHROP   
1     2010    1124    1124    1358      X            BAYLOR   
2     2010    1139    1431    1139      Y     TEXAS-EL-PASO   
3     2010    1140    1140    1196      X     BRIGHAM-YOUNG   
4     2010    1242    1250    1242      Y            LEHIGH   
5     2010    1243    1243    1317      X      KANSAS-STATE   
6     2010    1246    1246    1190      X          KENTUCKY   
7     2010    1293    1293    1435      X      MURRAY-STATE   
8     2010    1307    1307    1285      X        NEW-MEXICO   
9     2010    1320    1424    1320      Y  NEVADA-LAS-VEGAS   
10    2010    1325    1207    1325      Y        GEORGETOWN   
11    2010    1330    1330    1323      X      OLD-DOMINION   
12    2010    1388    1350    1388      Y          RICHMOND   
13    2010    1397    1397    1361      X         TENNESSEE   
14    2010    1437    1352    1437      Y     ROBERT-MO

In [82]:
games_b['Winner'] = np.where(games_b['Winner'] == games_b['Team_X'], games_b['abbreviation_x'], games_b['abbreviation_y'])
games_b['Team_X'] = games_b['abbreviation_x']
games_b['Team_Y'] = games_b['abbreviation_y']

In [83]:
games_b.dtypes

Season                                        int64
Winner                                       object
Team_X                                       object
Team_Y                                       object
Result                                       object
abbreviation_x                               object
assist_percentage_x                         float64
block_percentage_x                          float64
effective_field_goal_percentage_x           float64
field_goal_percentage_x                     float64
free_throw_attempt_rate_x                   float64
free_throw_percentage_x                     float64
free_throws_per_field_goal_attempt_x        float64
net_rating_x                                float64
offensive_rating_x                          float64
offensive_rebound_percentage_x              float64
opp_assist_percentage_x                     float64
opp_block_percentage_x                      float64
opp_effective_field_goal_percentage_x       float64
opp_field_go

In [84]:
print(games_b.head(15))

    Season               Winner            Team_X                Team_Y  \
0     2010  ARKANSAS-PINE-BLUFF          WINTHROP   ARKANSAS-PINE-BLUFF   
1     2010               BAYLOR            BAYLOR     SAM-HOUSTON-STATE   
2     2010               BUTLER     TEXAS-EL-PASO                BUTLER   
3     2010        BRIGHAM-YOUNG     BRIGHAM-YOUNG               FLORIDA   
4     2010               KANSAS            LEHIGH                KANSAS   
5     2010         KANSAS-STATE      KANSAS-STATE           NORTH-TEXAS   
6     2010             KENTUCKY          KENTUCKY  EAST-TENNESSEE-STATE   
7     2010         MURRAY-STATE      MURRAY-STATE            VANDERBILT   
8     2010           NEW-MEXICO        NEW-MEXICO               MONTANA   
9     2010        NORTHERN-IOWA  NEVADA-LAS-VEGAS         NORTHERN-IOWA   
10    2010                 OHIO        GEORGETOWN                  OHIO   
11    2010         OLD-DOMINION      OLD-DOMINION            NOTRE-DAME   
12    2010       SAINT-MA

In [85]:
ml_input = games_b.drop(columns = ['Season','Winner','Team_X','Team_Y','abbreviation_x','abbreviation_y','Team_Id_x','Team_Id_y','win_percentage_x','win_percentage_y'], axis = 1)
ml_input.dtypes

Result                                       object
assist_percentage_x                         float64
block_percentage_x                          float64
effective_field_goal_percentage_x           float64
field_goal_percentage_x                     float64
free_throw_attempt_rate_x                   float64
free_throw_percentage_x                     float64
free_throws_per_field_goal_attempt_x        float64
net_rating_x                                float64
offensive_rating_x                          float64
offensive_rebound_percentage_x              float64
opp_assist_percentage_x                     float64
opp_block_percentage_x                      float64
opp_effective_field_goal_percentage_x       float64
opp_field_goal_percentage_x                 float64
opp_free_throw_attempt_rate_x               float64
opp_free_throw_percentage_x                 float64
opp_free_throws_per_field_goal_attempt_x    float64
opp_offensive_rating_x                      float64
opp_offensiv

In [86]:
X_all = ml_input.drop(['Result'],1)
y_all = ml_input['Result']

In [87]:
cols = [['assist_percentage_x','block_percentage_x','effective_field_goal_percentage_x','field_goal_percentage_x','free_throw_attempt_rate_x','free_throw_percentage_x','free_throws_per_field_goal_attempt_x','net_rating_x','offensive_rating_x','offensive_rebound_percentage_x','opp_assist_percentage_x','opp_block_percentage_x','opp_effective_field_goal_percentage_x','opp_field_goal_percentage_x','opp_free_throw_attempt_rate_x','opp_free_throw_percentage_x','opp_free_throws_per_field_goal_attempt_x','opp_offensive_rating_x','opp_offensive_rebound_percentage_x','opp_steal_percentage_x','opp_three_point_attempt_rate_x','opp_three_point_field_goal_percentage_x','opp_total_rebound_percentage_x','opp_true_shooting_percentage_x','opp_two_point_field_goal_percentage_x','pace_x','simple_rating_system_x','steal_percentage_x','strength_of_schedule_x','three_point_attempt_rate_x','three_point_field_goal_percentage_x','total_rebound_percentage_x','true_shooting_percentage_x','turnover_percentage_x','two_point_field_goal_percentage_x','assist_percentage_y','block_percentage_y','effective_field_goal_percentage_y','field_goal_percentage_y','free_throw_attempt_rate_y','free_throw_percentage_y','free_throws_per_field_goal_attempt_y','net_rating_y','offensive_rating_y','offensive_rebound_percentage_y','opp_assist_percentage_y','opp_block_percentage_y','opp_effective_field_goal_percentage_y','opp_field_goal_percentage_y','opp_free_throw_attempt_rate_y','opp_free_throw_percentage_y','opp_free_throws_per_field_goal_attempt_y','opp_offensive_rating_y','opp_offensive_rebound_percentage_y','opp_steal_percentage_y','opp_three_point_attempt_rate_y','opp_three_point_field_goal_percentage_y','opp_total_rebound_percentage_y','opp_true_shooting_percentage_y','opp_two_point_field_goal_percentage_y','pace_y','simple_rating_system_y','steal_percentage_y','strength_of_schedule_y','three_point_attempt_rate_y','three_point_field_goal_percentage_y','total_rebound_percentage_y','true_shooting_percentage_y','turnover_percentage_y','two_point_field_goal_percentage_y']]
for col in cols:
    X_all[col] = scale(X_all[col])

In [88]:
len(X_all)

533

In [89]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                   test_size = 100,
                                                   random_state = 2,
                                                   stratify = y_all)

In [90]:
#for measuring training time
from time import time 
# F1 score (also F-score or F-measure) is a measure of a test's accuracy. 
#It considers both the precision p and the recall r of the test to compute 
#the score: p is the number of correct positive results divided by the number of 
#all positive results, and r is the number of correct positive results divided by 
#the number of positive results that should have been returned. The F1 score can be 
#interpreted as a weighted average of the precision and recall, where an F1 score 
#reaches its best value at 1 and worst at 0.
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    time_taken = end - start
    
    # Print the results
    print('Trained model', clf.__class__.__name__, 'in', time_taken, ' seconds')

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    
    end = time()
    # Print and return results
    time_taken = end - start
    print("Made predictions in", time_taken, "seconds.")
    
    return f1_score(target, y_pred, pos_label='X'), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    #print("Training something using a training set size of %d. . .").format(len(X_train))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print("F1 score and accuracy score for training set:", f1, " , ", acc)
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set:", f1, " , ", acc)

In [91]:
clf_A = LogisticRegression(random_state = 64)
clf_B = SVC(random_state = 64, kernel = 'rbf')
clf_C = xgb.XGBClassifier(seed = 64)

train_predict(clf_A, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_B, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_C, X_train, y_train, X_test, y_test)
print('')

Trained model LogisticRegression in 0.00899362564086914  seconds
Made predictions in 0.0 seconds.
F1 score and accuracy score for training set: 0.7409200968523003  ,  0.7528868360277137
Made predictions in 0.0 seconds.
F1 score and accuracy score for test set: 0.6956521739130435  ,  0.72

Trained model SVC in 0.015990018844604492  seconds
Made predictions in 0.012990951538085938 seconds.
F1 score and accuracy score for training set: 0.8786407766990291  ,  0.8845265588914549
Made predictions in 0.003999233245849609 seconds.
F1 score and accuracy score for test set: 0.6595744680851064  ,  0.68

Trained model XGBClassifier in 0.290820837020874  seconds
Made predictions in 0.003997087478637695 seconds.
F1 score and accuracy score for training set: 0.9928400954653939  ,  0.9930715935334873
Made predictions in 0.0019989013671875 seconds.
F1 score and accuracy score for test set: 0.68  ,  0.68



  if diff:
  if diff:


In [92]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer


# TODO: Create the parameters list you wish to tune
parameters = { 'learning_rate' : [0.1],
               'n_estimators' : [40],
               'max_depth': [3],
               'min_child_weight': [3],
               'gamma':[0.4],
               'subsample' : [0.8],
               'colsample_bytree' : [0.8],
               'scale_pos_weight' : [1],
               'reg_alpha':[1e-5]
             }  

# TODO: Initialize the classifier
clf = xgb.XGBClassifier(seed=2)

# TODO: Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score,pos_label='X')

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train,y_train)

# Get the estimator
clf = grid_obj.best_estimator_
print(clf)

# Report the final F1 score for training and testing after parameter tuning
f1, acc = predict_labels(clf, X_train, y_train)
print("F1 score and accuracy score for training set:", f1, " , ", acc)
    
f1, acc = predict_labels(clf, X_test, y_test)
print("F1 score and accuracy score for test set:", f1, " , ", acc)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=3, missing=None,
       n_estimators=40, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=1e-05,
       reg_lambda=1, scale_pos_weight=1, seed=2, silent=True,
       subsample=0.8)
Made predictions in 0.00299835205078125 seconds.
F1 score and accuracy score for training set: 0.8846153846153846  ,  0.8891454965357968
Made predictions in 0.0029985904693603516 seconds.
F1 score and accuracy score for test set: 0.6597938144329897  ,  0.67


  if diff:
  if diff:


In [93]:
X_all.head()

Unnamed: 0,assist_percentage_x,block_percentage_x,effective_field_goal_percentage_x,field_goal_percentage_x,free_throw_attempt_rate_x,free_throw_percentage_x,free_throws_per_field_goal_attempt_x,net_rating_x,offensive_rating_x,offensive_rebound_percentage_x,...,pace_y,simple_rating_system_y,steal_percentage_y,strength_of_schedule_y,three_point_attempt_rate_y,three_point_field_goal_percentage_y,total_rebound_percentage_y,true_shooting_percentage_y,turnover_percentage_y,two_point_field_goal_percentage_y
0,-1.538693,0.345975,-3.805478,-3.466166,-0.441544,-1.948029,-1.12126,-1.936346,-3.532127,0.096324,...,0.258294,-3.017458,-0.146633,-2.71633,-1.566322,-2.227977,0.927778,-2.396183,3.140625,-2.2595
1,-0.711101,2.238346,1.048826,1.194863,-0.483106,0.455373,-0.300418,0.57258,0.767046,1.243606,...,1.243637,-1.407187,0.50421,-1.837739,1.197943,0.395536,0.460418,0.539548,0.547404,0.58188
2,0.015565,-0.194702,0.304736,0.62857,0.306566,-1.087552,-0.095208,0.323246,-0.652864,-1.259554,...,-0.75784,0.02327,0.449973,-0.115211,1.260295,-0.700857,0.109898,0.213356,0.662658,0.042377
3,0.116491,-0.566418,1.155125,1.020619,0.140319,2.384029,1.077424,1.569917,1.220629,-0.659839,...,-0.018833,-0.309519,0.287262,0.308796,-0.298501,-1.836408,0.070952,-1.050639,0.144014,-0.065523
4,0.822972,-1.377434,-0.191324,-0.24265,1.033896,0.247672,1.165371,-1.063676,-0.672585,-1.024883,...,0.874133,1.406761,1.046579,0.530991,-0.631044,1.726871,1.083564,1.151159,0.316895,0.689781


In [95]:
teams_19 = Teams(year = 2019)
teams_df_19 = teams.dataframes
teams_df_19 = teams_df_19[['abbreviation','assist_percentage','block_percentage','effective_field_goal_percentage','field_goal_percentage','free_throw_attempt_rate','free_throw_percentage','free_throws_per_field_goal_attempt','net_rating','offensive_rating','offensive_rebound_percentage','opp_assist_percentage','opp_block_percentage','opp_effective_field_goal_percentage','opp_field_goal_percentage','opp_free_throw_attempt_rate','opp_free_throw_percentage','opp_free_throws_per_field_goal_attempt','opp_offensive_rating','opp_offensive_rebound_percentage','opp_steal_percentage','opp_three_point_attempt_rate','opp_three_point_field_goal_percentage','opp_total_rebound_percentage','opp_true_shooting_percentage','opp_two_point_field_goal_percentage','pace','simple_rating_system','steal_percentage','strength_of_schedule','three_point_attempt_rate','three_point_field_goal_percentage','total_rebound_percentage','true_shooting_percentage','turnover_percentage','two_point_field_goal_percentage']]

In [96]:
teams_df_19.head()

Unnamed: 0,abbreviation,assist_percentage,block_percentage,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,free_throws_per_field_goal_attempt,net_rating,offensive_rating,...,pace,simple_rating_system,steal_percentage,strength_of_schedule,three_point_attempt_rate,three_point_field_goal_percentage,total_rebound_percentage,true_shooting_percentage,turnover_percentage,two_point_field_goal_percentage
ABILENE-CHRISTIAN,ABILENE-CHRISTIAN,55.4,11.6,0.521,0.464,0.309,0.701,0.217,3.5,102.2,...,71.6,-9.14,11.3,-6.82,0.35,0.325,49.7,0.549,17.7,0.539
AIR-FORCE,AIR-FORCE,60.7,8.1,0.49,0.419,0.318,0.734,0.233,-5.7,100.8,...,67.7,-4.31,9.5,1.72,0.431,0.331,48.7,0.527,16.3,0.485
AKRON,AKRON,52.7,7.5,0.518,0.435,0.319,0.696,0.222,-5.1,102.6,...,69.1,-6.82,8.4,-1.92,0.467,0.358,49.2,0.547,17.3,0.502
ALABAMA-AM,ALABAMA-AM,50.5,3.9,0.45,0.397,0.314,0.647,0.203,-23.2,88.1,...,68.3,-23.97,5.8,-8.04,0.354,0.303,48.2,0.48,20.9,0.448
ALABAMA-BIRMINGHAM,ALABAMA-BIRMINGHAM,59.3,11.6,0.545,0.488,0.291,0.75,0.218,10.1,109.8,...,69.5,4.9,7.7,-0.65,0.334,0.345,54.8,0.575,16.5,0.559


In [103]:
game_list = [['NORTH-CAROLINA-STATE','VIRGINIA'],['VIRGINIA-TECH','FLORIDA-STATE'],['LOUISVILLE','NORTH-CAROLINA'],['SYRACUSE','DUKE']]
game_df = pd.DataFrame(game_list)
game_df.columns = ['Away','Home']
game_df

Unnamed: 0,Away,Home
0,NORTH-CAROLINA-STATE,VIRGINIA
1,VIRGINIA-TECH,FLORIDA-STATE
2,LOUISVILLE,NORTH-CAROLINA
3,SYRACUSE,DUKE


In [108]:
game_df = pd.DataFrame(game_list)
game_df.columns = ['Away','Home']
game_df = game_df.merge(teams_df_19, left_on = 'Away', right_on = 'abbreviation', how = 'left')
game_df = game_df.merge(teams_df_19, left_on = 'Home', right_on = 'abbreviation', how = 'left')
print(game_df)

                   Away            Home        abbreviation_x  \
0  NORTH-CAROLINA-STATE        VIRGINIA  NORTH-CAROLINA-STATE   
1         VIRGINIA-TECH   FLORIDA-STATE         VIRGINIA-TECH   
2            LOUISVILLE  NORTH-CAROLINA            LOUISVILLE   
3              SYRACUSE            DUKE              SYRACUSE   

   assist_percentage_x  block_percentage_x  effective_field_goal_percentage_x  \
0                 55.0                 9.2                              0.534   
1                 58.2                 6.6                              0.577   
2                 50.5                15.0                              0.518   
3                 47.5                17.4                              0.469   

   field_goal_percentage_x  free_throw_attempt_rate_x  \
0                    0.470                      0.342   
1                    0.499                      0.365   
2                    0.453                      0.319   
3                    0.417              

In [109]:
game_input = game_df.drop(columns = ['Away','Home','abbreviation_x','abbreviation_y'], axis = 1)

In [110]:
game_input

Unnamed: 0,assist_percentage_x,block_percentage_x,effective_field_goal_percentage_x,field_goal_percentage_x,free_throw_attempt_rate_x,free_throw_percentage_x,free_throws_per_field_goal_attempt_x,net_rating_x,offensive_rating_x,offensive_rebound_percentage_x,...,pace_y,simple_rating_system_y,steal_percentage_y,strength_of_schedule_y,three_point_attempt_rate_y,three_point_field_goal_percentage_y,total_rebound_percentage_y,true_shooting_percentage_y,turnover_percentage_y,two_point_field_goal_percentage_y
0,55.0,9.2,0.534,0.47,0.342,0.696,0.238,8.6,112.3,33.3,...,60.6,22.21,11.2,9.07,0.35,0.383,51.2,0.555,12.3,0.501
1,58.2,6.6,0.577,0.499,0.365,0.709,0.259,10.8,112.9,23.7,...,72.2,15.79,9.4,8.5,0.368,0.35,51.8,0.562,15.4,0.536
2,50.5,15.0,0.518,0.453,0.319,0.736,0.235,7.4,106.9,28.6,...,71.7,20.08,8.0,11.78,0.351,0.359,56.4,0.551,14.0,0.51
3,47.5,17.4,0.469,0.417,0.405,0.736,0.298,4.4,102.2,34.1,...,70.7,24.44,10.4,9.71,0.363,0.372,55.5,0.586,14.7,0.56


In [115]:
y_pred_C = clf_C.predict(game_input)
y_pred_B = clf_B.predict(game_input)
y_pred_A = clf_A.predict(game_input)
Y_pred = clf.predict(game_input)

  if diff:
  if diff:


In [116]:
print(y_pred_C,y_pred_B,y_pred_A,Y_pred)

['Y' 'Y' 'Y' 'Y'] ['Y' 'Y' 'Y' 'Y'] ['Y' 'X' 'X' 'Y'] ['Y' 'Y' 'Y' 'Y']
