In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import RandomUnderSampler

In [24]:
# IMPORT DATA
tourney_data = pd.read_csv("./TourneyStatsAndSeeds.csv")
tourney_data.head()
tourney_data.columns

Index(['Season', 'WTeamID', 'LTeamID', 'WSeed', 'LSeed', 'WW', 'WL', 'WScore',
       'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst',
       'WTO', 'WStl', 'WBlk', 'WPF', 'LW', 'LL', 'LScore', 'LFGM', 'LFGA',
       'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl',
       'LBlk', 'LPF'],
      dtype='object')

In [25]:
# CREATE COLUMN FOR DIFFERENCE IN SEEDING
seed_diff = np.array(tourney_data['WSeed'] - tourney_data['LSeed'])
tourney_data['seed_diff'] = seed_diff
tourney_data.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,WW,WL,WScore,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,seed_diff
0,2003,1421,1411,16,16,13,16,71.206897,24.37931,56.793103,...,17.4,28.066667,13.166667,24.8,14.2,15.233333,6.433333,2.233333,18.3,0
1,2003,1112,1436,1,16,25,3,85.214286,30.321429,65.714286,...,12.862069,19.551724,12.965517,25.724138,14.206897,14.068966,6.862069,2.965517,15.896552,-15
2,2003,1112,1211,1,9,25,3,85.214286,30.321429,65.714286,...,17.774194,24.645161,11.935484,25.322581,15.741935,14.548387,6.806452,3.516129,18.645161,-8
3,2003,1112,1323,1,5,25,3,85.214286,30.321429,65.714286,...,17.354839,22.83871,11.387097,26.870968,16.903226,12.774194,7.451613,5.645161,16.225806,-4
4,2003,1113,1272,10,7,18,11,75.965517,27.206897,56.896552,...,14.965517,22.896552,14.068966,25.965517,16.62069,13.793103,7.37931,5.068966,18.758621,3


In [26]:
# FILTER OUT GAMES WHERE DIFFERENCE WAS NOT AT LEAST 5
potential_upsets = tourney_data.loc[(tourney_data['seed_diff'] < -4) | (tourney_data['seed_diff'] > 4)]

In [27]:
# CREATE UPSET LABEL
upsets = potential_upsets.apply(lambda row: row['WSeed'] > row['LSeed'], axis=1)
potential_upsets['UPSET'] = upsets
potential_upsets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,WW,WL,WScore,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,seed_diff,UPSET
1,2003,1112,1436,1,16,25,3,85.214286,30.321429,65.714286,...,19.551724,12.965517,25.724138,14.206897,14.068966,6.862069,2.965517,15.896552,-15,False
2,2003,1112,1211,1,9,25,3,85.214286,30.321429,65.714286,...,24.645161,11.935484,25.322581,15.741935,14.548387,6.806452,3.516129,18.645161,-8,False
5,2003,1141,1166,11,6,23,6,79.344828,26.62069,52.689655,...,20.030303,10.878788,23.181818,16.818182,13.363636,8.393939,4.454545,17.272727,5,True
7,2003,1163,1140,5,12,21,9,80.033333,29.533333,62.2,...,24.16129,10.870968,24.419355,13.419355,13.741935,6.935484,2.516129,21.419355,-7,False
9,2003,1181,1161,3,14,24,6,81.966667,27.366667,60.333333,...,24.266667,10.8,23.466667,15.5,16.133333,5.333333,4.233333,20.566667,-11,False


In [28]:
# RELABEL COLUMNS SO THAT HIGH SEED DATA FIRST THEN LOW SEED DATA. 
# FROM NOW ON W = HIGH SEED and L = LOW SEED
correctly_labeled = potential_upsets[potential_upsets.WSeed < potential_upsets.LSeed]
fix_labels = potential_upsets[potential_upsets.WSeed > potential_upsets.LSeed]
fix_labels.columns = ['Season', 'LTeamID', 'WTeamID', 'LSeed', 'WSeed', 'LW', 'LL', 'LScore',
       'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst',
       'LTO', 'LStl', 'LBlk', 'LPF', 'WW', 'WL', 'WScore', 'WFGM', 'WFGA',
       'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl',
       'WBlk', 'WPF', 'seed_diff', 'UPSET']

In [29]:
cleaned_data = pd.concat([correctly_labeled, fix_labels])
cleaned_data = cleaned_data[['Season', 'WTeamID', 'LTeamID', 'WSeed', 'LSeed', 'WW', 'WL', 'WScore',
       'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst',
       'WTO', 'WStl', 'WBlk', 'WPF', 'LW', 'LL', 'LScore', 'LFGM', 'LFGA',
       'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl',
       'LBlk', 'LPF', 'seed_diff', 'UPSET']]
cleaned_data.sample(n=5)

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,WW,WL,WScore,WFGM,WFGA,...,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,seed_diff,UPSET
894,2016,1332,1386,1,8,27,6,78.545455,27.30303,58.878788,...,22.617647,10.147059,28.441176,15.117647,10.117647,5.147059,3.382353,16.029412,-7,False
28,2003,1390,1360,4,13,23,8,72.258065,25.645161,58.032258,...,26.321429,11.535714,26.357143,16.392857,18.178571,5.392857,3.428571,20.142857,-9,False
189,2005,1458,1320,6,11,22,8,67.166667,23.2,53.066667,...,17.866667,8.766667,24.566667,15.666667,11.666667,6.1,4.333333,17.633333,-5,False
210,2006,1261,1401,4,12,22,8,74.766667,27.7,58.266667,...,23.034483,10.37931,21.310345,16.758621,13.413793,8.655172,2.965517,21.586207,-8,False
528,2011,1163,1137,3,14,26,9,73.4,26.0,60.057143,...,18.727273,8.151515,25.969697,14.727273,11.363636,4.757576,3.181818,18.242424,-11,False


In [30]:
# PERCENTAGE OF GAMES WITH SEED DIFERENCE >=5 THAT RESULTED IN UPSET
potential_upsets.UPSET.mean()

0.21626297577854672

In [31]:
# DROP UNNECESSARY/BAD COLUMNS
saved_attributes = cleaned_data[['WTeamID', 'LTeamID','Season','WSeed', 'LSeed']]
potential_upsets = cleaned_data.drop(['WTeamID', 'LTeamID','Season','WSeed', 'LSeed', 'seed_diff'], axis= 1)
#saved_attributes = cleaned_data[['WTeamID', 'LTeamID','Season']]
#potential_upsets = cleaned_data.drop(['WTeamID', 'LTeamID','Season','seed_diff'], axis= 1)

In [32]:
# Sample .7
potential_upsets_train = potential_upsets.sample(frac=0.7)

# Remaining .3 for testing
potential_upsets_test = potential_upsets.loc[~potential_upsets.index.isin(potential_upsets_train.index)]


In [33]:
# UNDER SAMPLE SO THAT 50/50 UPSET VS NOT UPSET IN TRAINING DATA
x_col = potential_upsets_train.columns[:-1]
X = potential_upsets_train.iloc[:,:-1]
y = np.array(potential_upsets_train.iloc[:,-1:]['UPSET'])
rus = RandomUnderSampler(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, y)

In [34]:
under_sampled_data = pd.DataFrame(X_resampled, columns=x_col )
under_sampled_data['UPSET'] = y_resampled
under_sampled_data.head()

Unnamed: 0,WW,WL,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,UPSET
0,23.0,10.0,76.272727,26.424242,57.363636,9.939394,25.484848,13.484848,18.939394,11.454545,...,15.59375,23.75,11.1875,24.9375,12.0,14.8125,6.53125,4.75,19.09375,False
1,24.0,6.0,71.166667,25.266667,56.533333,7.466667,18.966667,13.166667,18.6,12.133333,...,18.033333,25.8,12.966667,24.266667,10.266667,19.466667,8.0,2.9,20.733333,False
2,32.0,2.0,70.735294,24.382353,52.764706,6.852941,18.617647,15.117647,22.882353,11.676471,...,15.484848,22.727273,11.636364,23.666667,13.060606,12.0,5.818182,2.606061,20.060606,False
3,24.0,5.0,78.413793,26.137931,57.241379,6.103448,17.724138,20.034483,26.62069,14.758621,...,18.333333,24.814815,14.0,24.037037,14.037037,18.740741,10.703704,5.259259,19.518519,False
4,24.0,8.0,79.5,28.65625,59.84375,5.75,15.03125,16.4375,23.53125,12.625,...,15.354839,20.354839,13.032258,26.741935,15.483871,14.645161,7.741935,2.645161,17.935484,False


In [35]:
# FIT MODEL TO UNDER SAMPLED DATA
model = LogisticRegression()
model.fit(X_resampled, y_resampled)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [36]:
# TEST MODEL ON UNDERSAMPLED DATA
def predict( train, test, feat_cols, label_col, model ):
    model.fit( train[feat_cols], train[label_col] )
    return model.score( test[feat_cols], test[label_col] )

accs = []
for i in range(10):
    train = under_sampled_data.sample( frac=.7 )
    test = under_sampled_data.drop( train.index )
    accs += [predict(train, test, x_col[:],['UPSET'], model )]
sum(accs) / len(accs)

  y = column_or_1d(y, warn=True)


0.53584905660377358

In [37]:
# TEST MODEL ON TRAIN SET
accs = []
for i in range(10):
    train = potential_upsets_train.sample( frac=.99 )
    test = potential_upsets_train.drop( train.index )
    accs += [predict(train, test, x_col[:],['UPSET'], model )]
sum(accs) / len(accs)

  y = column_or_1d(y, warn=True)


0.72499999999999998

In [38]:
# TEST MODEL ON TEST SET
(model.predict(X=potential_upsets_test.iloc[:,:-1]) == potential_upsets_test.iloc[:,-1]).mean()

0.75722543352601157

In [39]:
# ADD OUTPUT COLUMN TO DATASET
output = model.predict(X=potential_upsets.iloc[:,:-1])
potential_upsets['output'] = output

In [40]:
# ADD PROBABILITY OF UPSET TO DATAFRAME
probs = model.predict_proba(X=potential_upsets.iloc[:,:-2])
output_probs = [item[1] for item in probs]
potential_upsets['probability'] = output_probs

final_results = pd.concat([saved_attributes, potential_upsets], axis=1)

In [41]:
final_results = pd.concat([saved_attributes, potential_upsets], axis=1)

In [42]:
# SORT BY MOST LIKELY UPSETS
final_results.sort_values(by='probability', ascending= False)

Unnamed: 0,WTeamID,LTeamID,Season,WSeed,LSeed,WW,WL,WScore,WFGM,WFGA,...,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,UPSET,output,probability
63,1462,1407,2003,3,14,25,5,78.233333,26.800000,59.200000,...,15.066667,24.700000,15.066667,13.733333,9.533333,5.200000,16.600000,False,True,0.731853
841,1242,1455,2015,2,7,26,8,71.205882,24.176471,54.970588,...,11.612903,23.258065,13.645161,9.354839,7.064516,3.774194,16.612903,True,True,0.730888
933,1278,1292,2017,5,12,24,9,75.272727,26.151515,60.181818,...,9.696970,25.424242,15.030303,10.363636,6.939394,2.848485,18.272727,True,True,0.704766
849,1112,1455,2016,6,11,25,8,81.212121,28.060606,58.242424,...,11.612903,25.870968,14.322581,9.838710,7.258065,3.580645,20.225806,True,True,0.670489
551,1452,1155,2011,5,12,20,11,69.548387,23.677419,55.419355,...,11.468750,23.437500,12.906250,13.343750,8.031250,4.875000,18.375000,False,True,0.650940
812,1462,1209,2015,6,14,21,13,73.558824,26.029412,55.000000,...,9.375000,23.218750,13.218750,10.687500,8.906250,4.250000,18.437500,False,True,0.643182
453,1139,1293,2010,5,13,28,4,70.093750,22.718750,50.187500,...,12.000000,24.250000,15.187500,14.500000,9.281250,5.406250,17.468750,False,True,0.616824
499,1338,1324,2010,3,14,24,8,68.093750,23.562500,52.843750,...,13.593750,24.437500,12.875000,13.718750,6.406250,4.937500,17.937500,False,True,0.614583
465,1435,1293,2010,4,13,23,8,77.548387,26.064516,55.225806,...,12.000000,24.250000,15.187500,14.500000,9.281250,5.406250,17.468750,True,True,0.592641
936,1345,1436,2017,4,13,25,7,80.125000,27.937500,58.250000,...,9.181818,24.424242,13.242424,11.151515,5.575758,4.454545,16.090909,False,True,0.589905


In [43]:
# FEATURE IMPORTANCE
x_col = np.array(x_col)
model.coef_
pd.DataFrame({'Attribute': x_col, 'Val': model.coef_[0]}).sort_values(by = 'Val')

Unnamed: 0,Attribute,Val
21,LFGM3,-0.408639
28,LTO,-0.31915
13,WStl,-0.287732
3,WFGM,-0.226788
9,WOR,-0.209042
20,LFGA,-0.199324
2,WScore,-0.145574
23,LFTM,-0.043507
27,LAst,-0.035556
0,WW,-0.031664


In [44]:
final_results['output'].mean()

0.06228373702422145