In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from subprocess import check_output
import csv
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
%matplotlib inline



In [3]:
def get_train_test_data(year = 2017):
    train_data = pd.read_csv('../ouput/train_'+str(year)+'.csv')

    train_data['WTseed']= train_data['WTseed'].fillna(18)####
    train_data['LTseed']= train_data['LTseed'].fillna(18)####

    train_data['ID'] = train_data.apply(lambda r: '_'.join(map(str, [r['Season']]+sorted([r['WTeamID'],r['LTeamID']]))), axis=1)
    train_data['IDTeams'] = train_data.apply(lambda r: '_'.join(map(str, sorted([r['WTeamID'],r['LTeamID']]))), axis=1)
    train_data['Team1'] = train_data.apply(lambda r: sorted([r['WTeamID'],r['LTeamID']])[0], axis=1)
    train_data['Team2'] = train_data.apply(lambda r: sorted([r['WTeamID'],r['LTeamID']])[1], axis=1)
    train_data['IDTeam1'] = train_data.apply(lambda r: '_'.join(map(str, [r['Season'], r['Team1']])), axis=1)
    train_data['IDTeam2'] = train_data.apply(lambda r: '_'.join(map(str, [r['Season'], r['Team2']])), axis=1)
    train_data['Pred'] = train_data.apply(lambda r: 1. if sorted([r['WTeamID'],r['LTeamID']])[0]==r['WTeamID'] else 0., axis=1)
    train_data['ScoreDiff'] = train_data['WScore']-train_data['LScore']
    train_data['ScoreDiff'] = train_data.apply(lambda r: r['ScoreDiff'] * -1 if r['Pred'] == 0. else r['ScoreDiff'], axis=1)
    train_data['Team1Score'] = train_data.apply(lambda r: r['WScore'] if r['Pred'] == 1. else r['LScore'],axis=1)
    train_data['Team2Score'] = train_data.apply(lambda r: r['WScore'] if r['Pred'] == 0. else r['LScore'],axis=1)
    train_data['Team1Seed'] = train_data.apply(lambda r: r['WTseed']if r['Pred'] == 1. else r['LTseed'],axis=1)
    train_data['Team2Seed'] = train_data.apply(lambda r: r['WTseed']if r['Pred'] == 0. else r['LTseed'],axis=1)
    train_data['SeedDiff'] = train_data.apply(lambda r: r['Team1Seed']-r['Team2Seed'],axis = 1)
    train_data['Team1ScoreRatio'] = train_data['Team1Score']/(train_data['Team1Score']+train_data['Team2Score'])
    train_data['Team2ScoreRatio'] = train_data['Team2Score']/(train_data['Team1Score']+train_data['Team2Score'])
    train_data['Team1Loc'] = train_data.apply(lambda r: r['WLoc'] if r['Pred'] == 1 else None,axis=1)
    train_data['Team2Loc'] = train_data.apply(lambda r: r['WLoc'] if r['Pred'] == 0 else None,axis=1)
    train_data['Team1Loc'] = train_data[['Team1Loc','Team2Loc']].apply(lambda r: 'N' if r[1]=='N' else r[0],axis=1)
    train_data['Team2Loc'] = train_data[['Team1Loc','Team2Loc']].apply(lambda r: 'N' if r[0]=='N' else r[1],axis=1)
    train_data['Team1Loc'] = train_data[['Team1Loc','Team2Loc']].apply(lambda r: 'A' if r[1]=='H' else r[0],axis=1)
    train_data['Team1Loc'] = train_data[['Team1Loc','Team2Loc']].apply(lambda r: 'H' if r[1]=='A' else r[0],axis=1)
    train_data['Team2Loc'] = train_data[['Team1Loc','Team2Loc']].apply(lambda r: 'A' if r[0]=='H' else r[1],axis=1)
    train_data['Team2Loc'] = train_data[['Team1Loc','Team2Loc']].apply(lambda r: 'H' if r[0]=='A' else r[1],axis=1)
    
    train_data = train_data.drop(['index','DayNum','WScore','LScore','WTseed','LTseed','WTeamID','LTeamID'],axis=1)


    team1d_score_spread = train_data.groupby(['Season', 'Team1'])[['ScoreDiff', 'Team1Score','Team1ScoreRatio']].mean().reset_index()\
    .set_index('Season').rename(columns = {'ScoreDiff' : 'ScoreDiff1','Team1ScoreRatio':'Team1ScoreRatio_avg','Team1Score':'Team1Score_avg'})
    team2d_score_spread = train_data.groupby(['Season', 'Team2'])[['ScoreDiff', 'Team2Score','Team2ScoreRatio']].mean().reset_index()\
    .set_index('Season').rename(columns = {'ScoreDiff' : 'ScoreDiff2','Team2ScoreRatio':'Team2ScoreRatio_avg','Team2Score':'Team2Score_avg'})
    score_spread = team1d_score_spread.join(team2d_score_spread).reset_index()

    X_train = pd.merge(train_data,score_spread,on=['Team1','Team2','Season'])

    y_train = X_train['Pred']
    X_train = X_train.drop(['WLoc','NumOT','Pred','Team1Score','Team2Score','Team1ScoreRatio','Team2ScoreRatio','ScoreDiff'],axis=1)
    
    test_data = pd.read_csv('../ouput/test_'+str(year)+'.csv')
    test_data = test_data.drop(['index','DayNum','NumOT'],axis=1)
    test_data['WTseed']= test_data['WTseed'].fillna(18)####
    test_data['LTseed']= test_data['LTseed'].fillna(18)####
    test_data['ID'] = test_data.apply(lambda r: '_'.join(map(str, [r['Season']]+sorted([r['WTeamID'],r['LTeamID']]))), axis=1)
    test_data['IDTeams'] = test_data.apply(lambda r: '_'.join(map(str, sorted([r['WTeamID'],r['LTeamID']]))), axis=1)
    test_data['Team1'] = test_data.apply(lambda r: sorted([r['WTeamID'],r['LTeamID']])[0], axis=1)
    test_data['Team2'] = test_data.apply(lambda r: sorted([r['WTeamID'],r['LTeamID']])[1], axis=1)
    test_data['IDTeam1'] = test_data.apply(lambda r: '_'.join(map(str, [r['Season'], r['Team1']])), axis=1)
    test_data['IDTeam2'] = test_data.apply(lambda r: '_'.join(map(str, [r['Season'], r['Team2']])), axis=1)
    test_data['Pred'] = test_data.apply(lambda r: 1. if sorted([r['WTeamID'],r['LTeamID']])[0]==r['WTeamID'] else 0., axis=1)
    test_data['Team1Seed'] = test_data.apply(lambda r: r['WTseed']if r['Pred'] == 1. else r['LTseed'],axis=1)
    test_data['Team2Seed'] = test_data.apply(lambda r: r['WTseed']if r['Pred'] == 0. else r['LTseed'],axis=1)
    test_data['SeedDiff'] = test_data.apply(lambda r: r['Team1Seed']-r['Team2Seed'],axis = 1)
    test_data = test_data.drop(['WScore','LScore','WTseed','LTseed','WTeamID','LTeamID'],axis=1)
    test_data['Team1Loc'] = test_data.apply(lambda r: r['WLoc'] if r['Pred'] == 1 else None,axis=1)
    test_data['Team2Loc'] = test_data.apply(lambda r: r['WLoc'] if r['Pred'] == 0 else None,axis=1)
    test_data['Team1Loc'] = test_data[['Team1Loc','Team2Loc']].apply(lambda r: 'N' if r[1]=='N' else r[0],axis=1)
    test_data['Team2Loc'] = test_data[['Team1Loc','Team2Loc']].apply(lambda r: 'N' if r[0]=='N' else r[1],axis=1)
    test_data['Team1Loc'] = test_data[['Team1Loc','Team2Loc']].apply(lambda r: 'A' if r[1]=='H' else r[0],axis=1)
    test_data['Team1Loc'] = test_data[['Team1Loc','Team2Loc']].apply(lambda r: 'H' if r[1]=='A' else r[0],axis=1)
    test_data['Team2Loc'] = test_data[['Team1Loc','Team2Loc']].apply(lambda r: 'A' if r[0]=='H' else r[1],axis=1)
    test_data['Team2Loc'] = test_data[['Team1Loc','Team2Loc']].apply(lambda r: 'H' if r[0]=='A' else r[1],axis=1)
    X_test = pd.merge(test_data,score_spread,on=['Team1','Team2','Season'])
    y_test = X_test['Pred']
    X_test = X_test.drop(['Pred',"WLoc"],axis=1)
    
    return X_train,y_train,X_test,y_test


In [4]:
X_train,y_train,X_test,y_test = get_train_test_data()

In [5]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,Lsea,Season,Wsea,ID,IDTeams,Team1,Team2,IDTeam1,IDTeam2,...,Team2Seed,SeedDiff,Team1Loc,Team2Loc,ScoreDiff1,Team1Score_avg,Team1ScoreRatio_avg,ScoreDiff2,Team2Score_avg,Team2ScoreRatio_avg
0,0,,2017,,2017_1104_1157,1104_1157,1104,1157,2017_1104,2017_1157,...,18.0,0.0,H,A,4.151515,68.666667,0.516269,7.1,68.5,0.476035
1,1,,2017,,2017_1107_1336,1107_1336,1107,1336,2017_1107,2017_1336,...,18.0,0.0,A,H,4.787879,71.030303,0.515335,0.72,73.0,0.497694
2,2,Y,2017,X,2017_1112_1277,1112_1277,1112,1277,2017_1112,2017_1277,...,9.0,-7.0,N,N,10.823529,76.264706,0.538339,6.272727,65.909091,0.476381
3,3,,2017,,2017_1113_1340,1113_1340,1113,1340,2017_1113,2017_1340,...,18.0,0.0,H,A,-2.354839,79.290323,0.490979,1.913043,80.913043,0.493927
4,4,,2017,Z,2017_1116_1236,1116_1236,1116,1236,2017_1116,2017_1236,...,18.0,-10.0,H,A,5.393939,79.575758,0.517562,-3.5,83.125,0.509532


In [6]:
def train_process1(df):
    df =df.drop(['Unnamed: 0',"Season","ID","IDTeams","Team1","Team2","IDTeam1","IDTeam2"],axis=1)
    Lsea=pd.get_dummies(df["Lsea"],prefix="Lsea")
    Wsea=pd.get_dummies(df["Wsea"],prefix="Wsea")
    df=pd.concat([df,Lsea,Wsea],axis=1)
    df["Team1Loc"]=df["Team1Loc"].astype("category")
    df["Team2Loc"]=df["Team2Loc"].astype("category")
    df.Team1Loc=df.Team1Loc.cat.rename_categories([0,2,1])
    df.Team2Loc=df.Team2Loc.cat.rename_categories([0,2,1])
    df=df.drop(["Lsea","Wsea"],axis=1)
    return df

In [7]:
def test_process2(df):
    df =df.drop(['Unnamed: 0',"Season","ID","IDTeams","Team1","Team2","IDTeam1","IDTeam2"],axis=1)
    Lsea=pd.get_dummies(df["Lsea"],prefix="Lsea")
    Wsea=pd.get_dummies(df["Wsea"],prefix="Wsea")
    df=pd.concat([df,Lsea,Wsea],axis=1)
    df["Team1Loc"]=df["Team1Loc"].astype("category")
    df["Team2Loc"]=df["Team2Loc"].astype("category")
    df.Team1Loc=df.Team1Loc.cat.rename_categories([1])
    df.Team2Loc=df.Team2Loc.cat.rename_categories([1])
    df=df.drop(["Lsea","Wsea"],axis=1)
    return df

In [8]:
def logloss(y,y_hat):
    scale = len(y)
    loss = -np.sum(y*np.log(y_hat[:,1])+(1-y)*np.log(1-y_hat[:,0]))/scale
    return loss

In [9]:
X_train=train_process1(X_train)

In [10]:
X_test=test_process2(X_test)

In [11]:
X_train.head()

Unnamed: 0,Team1Seed,Team2Seed,SeedDiff,Team1Loc,Team2Loc,ScoreDiff1,Team1Score_avg,Team1ScoreRatio_avg,ScoreDiff2,Team2Score_avg,Team2ScoreRatio_avg,Lsea_W,Lsea_X,Lsea_Y,Lsea_Z,Wsea_W,Wsea_X,Wsea_Y,Wsea_Z
0,18.0,18.0,0.0,2,0,4.151515,68.666667,0.516269,7.1,68.5,0.476035,0,0,0,0,0,0,0,0
1,18.0,18.0,0.0,0,2,4.787879,71.030303,0.515335,0.72,73.0,0.497694,0,0,0,0,0,0,0,0
2,2.0,9.0,-7.0,1,1,10.823529,76.264706,0.538339,6.272727,65.909091,0.476381,0,0,1,0,0,1,0,0
3,18.0,18.0,0.0,2,0,-2.354839,79.290323,0.490979,1.913043,80.913043,0.493927,0,0,0,0,0,0,0,0
4,8.0,18.0,-10.0,2,0,5.393939,79.575758,0.517562,-3.5,83.125,0.509532,0,0,0,0,0,0,0,1


# Baseline Model

In [12]:
#Logistic Regression & Feature selection

In [13]:
lr=LogisticRegression()

In [14]:
Find_features=RFECV(lr,step=1,cv =5,scoring='neg_log_loss')

In [15]:
Find_features.fit(X_train,y_train)

RFECV(cv=5,
   estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
   n_jobs=1, scoring='neg_log_loss', step=1, verbose=0)

In [29]:
key=[]
for i in range(len(Find_features.support_)):
    if Find_features.support_[i] == True:
        key.append(X_train.keys()[i])


In [17]:
y_hat = Find_features.predict_proba(X_test)

In [18]:
logloss_=logloss(y_test,y_hat)

In [36]:
radm = RandomForestClassifier(n_estimators=1000,
                                  max_depth=None,
                                  min_samples_split=10,
                                  class_weight="balanced",
                                  random_state=2).fit(X_train, y_train)
y_val_1 = radm.predict_proba(X_test)
print(logloss(y_test,y_val_1))

0.730775613967


In [35]:
#####

log_reg = LogisticRegression().fit(X_train, y_train)
y_val_2 = log_reg.predict_proba(X_test)
print(logloss(y_test,y_val_2))

0.755504933372


In [43]:
#####
gnb = GaussianNB().fit(X_train,y_train)
y_val_3 = gnb.predict_proba(X_test)
print(logloss(y_test,y_val_3))

1.42151467504


In [44]:
#####
kNN = KNeighborsClassifier(n_neighbors=25).fit(X_train, y_train)
y_val_4 = kNN.predict_proba(X_test)
print(logloss(y_test,y_val_4))

0.837050069743


In [45]:
####
svm = SVC(C=1,kernel='rbf',probability = True).fit(X_train,y_train)

y_val_5 = svm.predict_proba(X_test)
print(logloss(y_test,y_val_5))

0.756483374805


In [46]:
####
ada = AdaBoostClassifier(n_estimators=400, learning_rate=0.1).fit(X_train,y_train)
y_val_6 = ada.predict_proba(X_test)
print(logloss(y_test,y_val_6))

0.684435890754


In [None]:
fpr_5