## Overview ##

Based on Starter Kernel [Logistic Regression on Tournament Seeds by Kasper P. Lauritzen](https://www.kaggle.com/kplauritzen/notebookde27b18258?scriptVersionId=804590) 

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from subprocess import check_output
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load the training data ##
We're keeping it simple & using only 2 files for this model: the Tourney seeds & Compact results.

In [133]:
data_dir = '../input/'
df_seeds = pd.read_csv(data_dir + 'NCAATourneySeeds.csv')
#df_tour = pd.read_csv(data_dir + 'NCAATourneyDetailedResults.csv')
df_tour = pd.read_csv(data_dir + 'RegularSeasonCompactResults.csv')
teams = pd.read_csv(data_dir + 'Teams.csv')

def seed_to_int(seed):
    s_int = int(seed[1:3])
    return s_int
df_seeds['seed_int'] = df_seeds.Seed.apply(seed_to_int)
df_seeds.drop(labels=['Seed'], inplace=True, axis=1) # This is the string label
df_tour.drop(labels=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], inplace=True, axis=1)
df_tour.head()


Unnamed: 0,Season,WTeamID,LTeamID
0,1985,1228,1328
1,1985,1106,1354
2,1985,1112,1223
3,1985,1165,1432
4,1985,1192,1447


Build out the id string (year_wteam_lteam)
Create a prediction at the end.

In [159]:
df_winseeds = df_seeds.rename(columns={'TeamID':'WTeamID', 'seed_int':'WSeed'})
df_lossseeds = df_seeds.rename(columns={'TeamID':'LTeamID', 'seed_int':'LSeed'})
df_dummy = pd.merge(left=df_tour, right=df_winseeds, how='left', on=['Season', 'WTeamID'])
df_concat = pd.merge(left=df_dummy, right=df_lossseeds, on=['Season', 'LTeamID'])
df_true=df_concat.copy()

columnsTitles=["LTeamID","WTeamID"]
df_false = df_concat.rename(columns={'WTeamID': 'LTeamID_', 'LTeamID': 'WTeamID_'}) #idk if i have to do this I just didnt want to create issues
df_false = df_false.rename(columns={'LTeamID_': 'LTeamID', 'WTeamID_': 'WTeamID'})
df_false = df_false.rename(columns={'WSeed': 'LSeed_', 'LSeed': 'WSeed_'}) #idk if i have to do this I just didnt want to create issues
df_false = df_false.rename(columns={'LSeed_': 'LSeed', 'WSeed_': 'WSeed'})


df_true['label'] = 1
df_false['label'] = 0

df = df_true.append(df_false, ignore_index=True)
df = df.fillna(17)

#df['WTID']=df.apply(build_wteamyear, axis=1) #again, probably better way to do this and save a function
#df['LTID']=df.apply(build_lteamyear, axis=1)
df['SeedDiff'] =  df.WSeed - df.LSeed #Lower seeds are better, so Negative means that the lower seed team won.
df = df[df['Season'] > 2013] # We want to train on 2014 and above.

df = df[['WTeamID', 'LTeamID', 'SeedDiff', 'label']]
df.head()



Unnamed: 0,WTeamID,LTeamID,SeedDiff,label
15289,1103,1157,1.0,1
15290,1279,1157,1.0,1
15291,1278,1157,1.0,1
15292,1418,1157,-2.0,1
15293,1155,1157,1.0,1


## Encode and, Scale and Split ##

In [160]:
X = df.iloc[:, 0:3].values
y = df.iloc[:, 3].values


enc_1 = LabelEncoder()
X[:, 0] = enc_1.fit_transform(X[:, 0])
enc_2 = LabelEncoder()
X[:, 1] = enc_2.fit_transform(X[:, 1])
onehotencoder = OneHotEncoder(categorical_features = [0,1],handle_unknown='error') #handle_unknown='unknown'or impute
X = onehotencoder.fit_transform(X).toarray()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

ds_width=X_test.shape[1]


## Build the classifier and hit the button ##

In [162]:
from keras.layers import LeakyReLU
from keras import metrics
from keras import optimizers

# 61%  //Dense(6, he_normal)//leakyrelu alpha 0.2//Dense(16, he_normal)
# leakyrelu alpha 0.2//Dense(16, he_normal)//leakyrelu alpha 0.2//Dense(1, he_normal, sigmoid)//SGD, Mean_Squared_Error
# Third Attempt.
classifier = Sequential()
classifier.add(Dense(13,input_shape=(ds_width,),kernel_initializer = 'normal',activation='relu'))
#classifier.add(LeakyReLU(alpha=0.2))
classifier.add(Dense(6, activation='relu',kernel_initializer = 'normal',))
#classifier.add(LeakyReLU(alpha=0.2))
#classifier.add(Dense(64, activation='relu',kernel_initializer = 'normal',))
classifier.add(Dense(1,kernel_initializer = 'normal' ))
classifier.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics=['binary_accuracy']) #binary_crossentropy
print("Cooking... Please wait")
classifier.fit(X_train, y_train, batch_size = 250, epochs = 200, verbose=1)
print("*ding*... Testing Accuracy on test set")
#Run Predictions on the test set
pred_test = classifier.predict(X_test)

pred_test = (pred_test > 0.5)
cm = confusion_matrix(pred_test, y_test)
print ((cm[0][0]+cm[1][1])/(sum(cm[0])+sum(cm[1])))

Cooking... Please wait
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200


Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200


Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200
*ding*... Testing Accuracy on test set
0.6536485097636177


## Make the prediction ##

In [None]:
def format_dataset(df_sample):
    df_pred = pd.DataFrame()
    df_pred['Season'], df_pred['WTeamID'], df_pred['LTeamID'] = df_sample_sub['ID'].str.split('_', 2).str
    df_pred = df_pred.astype(int) #DO I NEED THIS?
    df_winseeds = df_seeds.rename(columns={'TeamID':'WTeamID', 'seed_int':'WSeed'})
    df_lossseeds = df_seeds.rename(columns={'TeamID':'LTeamID', 'seed_int':'LSeed'})
    df_preddummy = pd.merge(left=df_pred, right=df_winseeds, how='left', on=['Season', 'WTeamID'])
    df_predmerge = pd.merge(left=df_preddummy, right=df_lossseeds, on=['Season', 'LTeamID'])
    df_predmerge['SeedDiff'] = df_predmerge.WSeed - df_predmerge.LSeed
    df_predmerge = df_predmerge.rename(columns={'Season':'Year'})
    df_predictions = df_predmerge[['WTeamID','LTeamID','SeedDiff']] # We really should add year to this somewhere
    df_predictions['WTeamID'] = df_predictions['WTeamID'].astype(str)
    df_predictions['LTeamID'] = df_predictions['LTeamID'].astype(str)
    for x in df_predictions.columns:
        df_predictions[x]=df_predictions[x].astype(df_preenc[x].dtypes.name)
    return df_predictions.iloc[:, 0:3].values


#Convert the prediction table to our format
df_predX = format_dataset(pd.read_csv(data_dir + 'SampleSubmissionStage1.csv'))
#Encode the Predictions
df_predX[:, 0] = enc_1.transform(df_predX[:, 0])
df_predX[:, 1] = enc_2.transform(df_predX[:, 1])
df_predX = onehotencoder.transform(df_predX).toarray()
df_predX = sc.transform(df_predX)
#Generate the predictions and save
y_p = classifier.predict(df_predX)
clipped_preds = np.clip(y_p, 0.0001, 0.999)
df_out=df_sample_sub
df_out.Pred = clipped_preds
df_out.to_csv('out.csv', index=False)
display(df_out)
print(df_out['Pred'].std())
