In [664]:
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier

In [665]:
# IMPORT DATA
tourney_data = pd.read_csv("./TourneyStatsAndSeeds.csv")
tourney_data.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,WW,WL,WScore,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,1421,1411,16,16,13,16,71.206897,24.37931,56.793103,...,18.5,17.4,28.066667,13.166667,24.8,14.2,15.233333,6.433333,2.233333,18.3
1,2003,1112,1436,1,16,25,3,85.214286,30.321429,65.714286,...,15.482759,12.862069,19.551724,12.965517,25.724138,14.206897,14.068966,6.862069,2.965517,15.896552
2,2003,1112,1211,1,9,25,3,85.214286,30.321429,65.714286,...,19.064516,17.774194,24.645161,11.935484,25.322581,15.741935,14.548387,6.806452,3.516129,18.645161
3,2003,1112,1323,1,5,25,3,85.214286,30.321429,65.714286,...,21.774194,17.354839,22.83871,11.387097,26.870968,16.903226,12.774194,7.451613,5.645161,16.225806
4,2003,1113,1272,10,7,18,11,75.965517,27.206897,56.896552,...,20.068966,14.965517,22.896552,14.068966,25.965517,16.62069,13.793103,7.37931,5.068966,18.758621


In [666]:
# CREATE COLUMN FOR DIFFERENCE IN SEEDING
seed_diff = np.array(tourney_data['WSeed'] - tourney_data['LSeed'])
tourney_data['seed_diff'] = seed_diff
tourney_data.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,WW,WL,WScore,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,seed_diff
0,2003,1421,1411,16,16,13,16,71.206897,24.37931,56.793103,...,17.4,28.066667,13.166667,24.8,14.2,15.233333,6.433333,2.233333,18.3,0
1,2003,1112,1436,1,16,25,3,85.214286,30.321429,65.714286,...,12.862069,19.551724,12.965517,25.724138,14.206897,14.068966,6.862069,2.965517,15.896552,-15
2,2003,1112,1211,1,9,25,3,85.214286,30.321429,65.714286,...,17.774194,24.645161,11.935484,25.322581,15.741935,14.548387,6.806452,3.516129,18.645161,-8
3,2003,1112,1323,1,5,25,3,85.214286,30.321429,65.714286,...,17.354839,22.83871,11.387097,26.870968,16.903226,12.774194,7.451613,5.645161,16.225806,-4
4,2003,1113,1272,10,7,18,11,75.965517,27.206897,56.896552,...,14.965517,22.896552,14.068966,25.965517,16.62069,13.793103,7.37931,5.068966,18.758621,3


In [667]:
# FILTER OUT GAMES WHERE DIFFERENCE WAS NOT AT LEAST 5
potential_upsets = tourney_data.loc[(tourney_data['seed_diff'] < -4) | (tourney_data['seed_diff'] > 4)]

In [668]:
# CREATE UPSET LABEL
upsets = potential_upsets.apply(lambda row: row['WSeed'] > row['LSeed'], axis=1)
potential_upsets['UPSET'] = upsets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [669]:
# PERCENTAGE OF GAMES WITH SEED DIFERENCE >=5 THAT RESULTED IN UPSET
potential_upsets.UPSET.mean()

0.21626297577854672

In [670]:
# DROP UNNECESSARY/BAD COLUMNS
saved_attributes = potential_upsets[['WTeamID', 'LTeamID','Season','WSeed', 'LSeed']]
potential_upsets = potential_upsets.drop(['WTeamID', 'LTeamID','Season','WSeed', 'LSeed', 'seed_diff'], axis= 1)

In [671]:
# Sample .7
potential_upsets_train = potential_upsets.sample(frac=0.7)

# Remaining .3 for testing
potential_upsets_test = potential_upsets.loc[~potential_upsets.index.isin(potential_upsets_train.index)]


In [672]:
# UNDER SAMPLE SO THAT 50/50 UPSET VS NOT UPSET IN TRAINING DATA
x_col = potential_upsets_train.columns[:-1]
X = potential_upsets_train.iloc[:,:-1]
y = np.array(potential_upsets_train.iloc[:,-1:]['UPSET'])
rus = RandomUnderSampler(return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(X, y)

In [673]:
under_sampled_data = pd.DataFrame(X_resampled, columns=x_col )
under_sampled_data['UPSET'] = y_resampled
under_sampled_data.head()

Unnamed: 0,WW,WL,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,UPSET
0,26.0,7.0,74.272727,26.090909,56.0,6.727273,17.848485,15.363636,20.424242,12.121212,...,15.28125,22.25,9.5625,25.25,11.375,12.5625,5.875,3.25,17.09375,False
1,25.0,7.0,79.46875,26.25,58.53125,6.71875,18.84375,20.25,30.46875,15.3125,...,19.933333,27.766667,10.533333,25.9,11.966667,14.766667,4.433333,3.0,18.1,False
2,22.0,8.0,68.433333,24.233333,52.866667,5.466667,15.933333,14.5,20.833333,11.3,...,16.757576,22.727273,11.515152,24.787879,17.333333,12.212121,7.363636,2.242424,18.272727,False
3,23.0,10.0,81.454545,27.424242,59.757576,9.272727,23.969697,17.333333,24.0,12.242424,...,15.433333,21.933333,11.833333,25.4,13.233333,11.433333,6.9,4.166667,23.866667,False
4,19.0,10.0,70.655172,25.517241,57.310345,10.034483,29.310345,9.586207,12.862069,7.827586,...,12.516129,18.193548,11.83871,20.967742,12.290323,14.322581,8.580645,3.193548,19.645161,False


In [674]:
# FIT MODEL TO UNDER SAMPLED DATA

model = LogisticRegression()
model.fit(X_resampled, y_resampled)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [675]:
# TEST MODEL ON UNDERSAMPLED DATA
accs = []
for i in range(10):
    train = under_sampled_data.sample( frac=.7 )
    test = under_sampled_data.drop( train.index )
    accs += [predict(train, test, x_col[:],['UPSET'], model )]
sum(accs) / len(accs)

  y = column_or_1d(y, warn=True)


0.80181818181818199

In [676]:
# TEST MODEL ON TRAIN SET
accs = []
for i in range(10):
    train = potential_upsets.sample( frac=.99 )
    test = potential_upsets.drop( train.index )
    accs += [predict(train, test, x_col[:],['UPSET'], model )]
sum(accs) / len(accs)


  y = column_or_1d(y, warn=True)


0.84999999999999998

In [677]:
# TEST MODEL ON TEST SET
(model.predict(X=potential_upsets_test.iloc[:,:-1]) == potential_upsets_test.iloc[:,-1]).mean()

0.86127167630057799

In [678]:
# ADD OUTPUT COLUMN TO DATASET
output = model.predict(X=potential_upsets.iloc[:,:-1])
potential_upsets['output'] = output

In [679]:
# ADD PROBABILITY OF UPSET TO DATAFRAME
probs = model.predict_proba(X=potential_upsets.iloc[:,:-2])
output_probs = [item[1] for item in probs]
potential_upsets['probability'] = output_probs

final_results = pd.concat([saved_attributes, potential_upsets], axis=1)

In [680]:
final_results = pd.concat([saved_attributes, potential_upsets], axis=1)

In [681]:
# SORT BY MOST LIKELY UPSETS
final_results.sort_values(by='probability', ascending= False)

Unnamed: 0,WTeamID,LTeamID,Season,WSeed,LSeed,WW,WL,WScore,WFGM,WFGA,...,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,UPSET,output,probability
373,1360,1163,2008,13,4,20,13,65.000000,23.666667,53.090909,...,13.531250,28.375000,14.812500,13.593750,5.750000,8.781250,15.218750,True,True,0.997400
884,1292,1277,2016,15,2,22,9,72.322581,25.903226,57.709677,...,12.294118,29.558824,20.558824,11.794118,4.411765,5.147059,19.088235,True,True,0.987531
236,1206,1163,2006,11,1,23,7,69.400000,25.733333,53.200000,...,16.300000,28.666667,15.933333,14.200000,6.633333,9.333333,15.400000,True,True,0.985845
560,1199,1323,2011,10,2,21,10,69.322581,24.483871,56.161290,...,10.937500,26.000000,16.718750,10.687500,4.593750,3.687500,15.031250,True,True,0.985503
468,1320,1242,2010,9,1,28,4,63.312500,21.500000,49.906250,...,12.941176,27.588235,17.147059,12.970588,8.411765,6.411765,18.264706,True,True,0.983826
519,1433,1242,2011,11,1,23,11,71.529412,24.264706,55.764706,...,11.676471,26.882353,17.882353,13.470588,7.941176,4.147059,17.882353,True,True,0.978469
975,1376,1181,2017,7,2,21,10,71.483871,24.258065,59.064516,...,10.457143,25.857143,13.085714,11.285714,5.914286,4.485714,18.057143,True,True,0.978309
658,1217,1307,2013,14,3,18,9,68.925926,22.703704,47.037037,...,9.382353,25.852941,14.617647,11.500000,6.117647,3.911765,15.441176,True,True,0.978132
176,1301,1163,2005,10,2,19,13,73.843750,25.187500,54.875000,...,15.689655,30.448276,17.413793,15.206897,6.172414,9.137931,15.827586,True,True,0.976663
66,1104,1390,2004,8,1,17,12,72.206897,24.896552,55.000000,...,11.100000,25.200000,16.366667,13.966667,6.366667,4.433333,17.266667,True,True,0.970469
