In [177]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split

In [92]:
# Seed value substituted for missing seeds.
# NOTE(review): presumably chosen to rank below every real seed - confirm.
_MISSING_SEED = 18

# Location code of the losing team implied by the winner's `WLoc` code
# (a neutral site is neutral for both sides).
_OPPOSITE_LOC = {'N': 'N', 'H': 'A', 'A': 'H'}


def _add_matchup_features(games):
    """Return a copy of `games` with winner/loser columns re-expressed per Team1/Team2.

    Team1 is the lower of the two team ids and Team2 the higher, so the same
    pairing always gets the same ID regardless of who won. Added columns, in
    order: ID, IDTeams, Team1, Team2, IDTeam1, IDTeam2, Pred, Team1Seed,
    Team2Seed, SeedDiff, Team1Loc, Team2Loc. Missing seeds in WTseed/LTseed
    are filled with `_MISSING_SEED`. The input frame is not modified.
    """
    games = games.copy()
    games['WTseed'] = games['WTseed'].fillna(_MISSING_SEED)
    games['LTseed'] = games['LTseed'].fillna(_MISSING_SEED)

    team_ids = games[['WTeamID', 'LTeamID']]
    team1 = team_ids.min(axis=1)
    team2 = team_ids.max(axis=1)
    # True when the winner is the lower-id team, i.e. Team1 won the game.
    team1_won = games['WTeamID'] <= games['LTeamID']

    season_str = games['Season'].astype(str)
    team1_str = team1.astype(str)
    team2_str = team2.astype(str)

    games['ID'] = season_str + '_' + team1_str + '_' + team2_str
    games['IDTeams'] = team1_str + '_' + team2_str
    games['Team1'] = team1
    games['Team2'] = team2
    games['IDTeam1'] = season_str + '_' + team1_str
    games['IDTeam2'] = season_str + '_' + team2_str
    games['Pred'] = team1_won.astype(float)

    games['Team1Seed'] = np.where(team1_won, games['WTseed'], games['LTseed'])
    games['Team2Seed'] = np.where(team1_won, games['LTseed'], games['WTseed'])
    games['SeedDiff'] = games['Team1Seed'] - games['Team2Seed']

    # WLoc describes the winner's side; the loser gets the mirrored code.
    # Codes outside N/H/A leave the loser's location missing.
    winner_loc = games['WLoc']
    loser_loc = winner_loc.map(_OPPOSITE_LOC)
    games['Team1Loc'] = winner_loc.where(team1_won, loser_loc)
    games['Team2Loc'] = loser_loc.where(team1_won, winner_loc)
    return games


def get_train_test_data(year = 2017):
    """Load the train/test game files for `year` and build model inputs.

    Reads '../ouput/train_<year>.csv' and '../ouput/test_<year>.csv'
    (NOTE(review): 'ouput' looks like a typo for 'output' - kept as is because
    it must match the directory on disk; confirm). Both files need the columns
    index, DayNum, NumOT, Season, WTeamID, LTeamID, WScore, LScore, WLoc,
    WTseed and LTseed.

    Each game is rewritten as a Team1 (lower id) vs Team2 (higher id) matchup.
    Per-season average score difference, score and score share are computed
    from the training games only, separately for a team's appearances as Team1
    and as Team2, and joined onto both sets.

    Returns:
        (X_train, y_train, X_test, y_test), where the y series hold `Pred`:
        1.0 when Team1 won the game, else 0.0.
    """
    train_data = _add_matchup_features(pd.read_csv('../ouput/train_'+str(year)+'.csv'))

    # Score features exist only for training games; they feed the season averages below.
    team1_won = train_data['Pred'] == 1.
    winner_margin = train_data['WScore'] - train_data['LScore']
    train_data['ScoreDiff'] = np.where(team1_won, winner_margin, -winner_margin)
    train_data['Team1Score'] = np.where(team1_won, train_data['WScore'], train_data['LScore'])
    train_data['Team2Score'] = np.where(team1_won, train_data['LScore'], train_data['WScore'])
    total_score = train_data['Team1Score'] + train_data['Team2Score']
    train_data['Team1ScoreRatio'] = train_data['Team1Score'] / total_score
    train_data['Team2ScoreRatio'] = train_data['Team2Score'] / total_score

    train_data = train_data.drop(['index','DayNum','WScore','LScore','WTseed','LTseed','WTeamID','LTeamID'],axis=1)

    team1d_score_spread = (
        train_data.groupby(['Season', 'Team1'])[['ScoreDiff', 'Team1Score','Team1ScoreRatio']].mean()
        .reset_index().set_index('Season')
        .rename(columns = {'ScoreDiff' : 'ScoreDiff1','Team1ScoreRatio':'Team1ScoreRatio_avg','Team1Score':'Team1Score_avg'})
    )
    team2d_score_spread = (
        train_data.groupby(['Season', 'Team2'])[['ScoreDiff', 'Team2Score','Team2ScoreRatio']].mean()
        .reset_index().set_index('Season')
        .rename(columns = {'ScoreDiff' : 'ScoreDiff2','Team2ScoreRatio':'Team2ScoreRatio_avg','Team2Score':'Team2Score_avg'})
    )
    # Joining on the non-unique Season index pairs every Team1 row with every
    # Team2 row of the same season, giving one stats row per possible matchup.
    score_spread = team1d_score_spread.join(team2d_score_spread).reset_index()

    X_train = pd.merge(train_data,score_spread,on=['Team1','Team2','Season'])
    y_train = X_train['Pred']
    X_train = X_train.drop(['WLoc','NumOT','Pred','Team1Score','Team2Score','Team1ScoreRatio','Team2ScoreRatio','ScoreDiff'],axis=1)

    test_data = pd.read_csv('../ouput/test_'+str(year)+'.csv')
    test_data = test_data.drop(['index','DayNum','NumOT'],axis=1)
    test_data = _add_matchup_features(test_data)
    test_data = test_data.drop(['WScore','LScore','WTseed','LTseed','WTeamID','LTeamID'],axis=1)

    # Inner merge: test games whose teams have no training-season averages are dropped.
    X_test = pd.merge(test_data,score_spread,on=['Team1','Team2','Season'])
    y_test = X_test['Pred']
    X_test = X_test.drop(['Pred',"WLoc"],axis=1)

    return X_train,y_train,X_test,y_test


In [93]:
# Build the train/test feature frames and targets for the default season.
(X_train, y_train,
 X_test, y_test) = get_train_test_data()

In [94]:
def train_process1(df):
    """Prepare the training feature frame for the model.

    Drops identifier columns, one-hot encodes `Lsea` and `Wsea` (prefixed
    `Lsea_` / `Wsea_`), and recodes the location letters in `Team1Loc` and
    `Team2Loc` as integer categories: 'A' -> 0, 'H' -> 2, 'N' -> 1.

    The letter -> code mapping is explicit, so it no longer depends on all
    three letters being present. It matches the previous positional
    `[0, 2, 1]` over the sorted categories ['A', 'H', 'N'].

    Returns a new frame; `df` itself is not modified.
    """
    loc_codes = {"A": 0, "H": 2, "N": 1}

    df = df.drop(['Unnamed: 0',"Season","ID","IDTeams","Team1","Team2","IDTeam1","IDTeam2"],axis=1)
    season_dummies = [pd.get_dummies(df[col], prefix=col) for col in ("Lsea", "Wsea")]
    df = pd.concat([df] + season_dummies, axis=1)
    for loc_col in ("Team1Loc", "Team2Loc"):
        # Dict-style rename ignores letters that are absent from the data.
        df[loc_col] = df[loc_col].astype("category").cat.rename_categories(loc_codes)
    return df.drop(["Lsea","Wsea"],axis=1)

In [95]:
def test_process2(df):
    df =df.drop(['Unnamed: 0',"Season","ID","IDTeams","Team1","Team2","IDTeam1","IDTeam2"],axis=1)
    Lsea=pd.get_dummies(df["Lsea"],prefix="Lsea")
    Wsea=pd.get_dummies(df["Wsea"],prefix="Wsea")
    df=pd.concat([df,Lsea,Wsea],axis=1)
    df["Team1Loc"]=df["Team1Loc"].astype("category")
    df["Team2Loc"]=df["Team2Loc"].astype("category")
    df.Team1Loc=df.Team1Loc.cat.rename_categories([1])
    df.Team2Loc=df.Team2Loc.cat.rename_categories([1])
    df=df.drop(["Lsea","Wsea"],axis=1)
    return df

In [193]:
def logloss(y,y_hat,eps=1e-15):
    """Mean binary cross-entropy of predicted class probabilities.

    Parameters:
        y: array-like of 0/1 labels, length n.
        y_hat: array of shape (n, 2); column 1 is the predicted probability of
            class 1 (the layout returned by `predict_proba`).
        eps: probabilities are clipped to [eps, 1 - eps] so that a prediction
            of exactly 0 or 1 cannot produce log(0).

    Returns:
        -mean(y * log(p1) + (1 - y) * log(1 - p1))
    """
    labels = np.asarray(y, dtype=float)
    p1 = np.clip(np.asarray(y_hat, dtype=float)[:, 1], eps, 1 - eps)
    # The negative-class term must use P(class 0) = 1 - p1. The earlier
    # `1 - y_hat[:, 0]` is P(class 1), which rewarded wrong confident predictions.
    per_sample = labels * np.log(p1) + (1 - labels) * np.log(1 - p1)
    return -np.sum(per_sample) / len(labels)

In [96]:
# Encode the training features. The helper defined in this notebook is
# `train_process1`; `cat_process1` does not exist and raised NameError.
# Re-running this cell fails because the dropped columns are already gone.
X_train = train_process1(X_train)

In [97]:
# Encode the test features. The helper defined in this notebook is
# `test_process2`; `cat_process2` does not exist and raised NameError.
# Re-running this cell fails because the dropped columns are already gone.
X_test = test_process2(X_test)

# Baseline Model

In [146]:
#Logistic Regression & Feature selection

In [154]:
# Baseline classifier with library-default hyperparameters.
lr = LogisticRegression()

In [167]:
# Recursive feature elimination, one feature per step, scored by
# 5-fold cross-validated negative log loss.
Find_features = RFECV(
    lr,
    step=1,
    cv=5,
    scoring='neg_log_loss',
)

In [168]:
# Run the feature search on the encoded training set; the fitted selector is the cell output.
Find_features.fit(
    X_train,
    y_train,
)

RFECV(cv=5,
   estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
   n_jobs=1, scoring='neg_log_loss', step=1, verbose=0)

In [169]:
# Names of the features the fitted selector kept. `support_` is a boolean mask
# aligned with X_train's columns. The selector is `Find_features`;
# `F_features` was never defined and raised NameError.
key = [name for name, kept in zip(X_train.keys(), Find_features.support_) if kept]
key

['SeedDiff',
 'Team1Loc',
 'Team2Loc',
 'ScoreDiff1',
 'Team1ScoreRatio_avg',
 'ScoreDiff2',
 'Lsea_W',
 'Lsea_Y',
 'Lsea_Z',
 'Wsea_X',
 'Wsea_Y',
 'Wsea_Z']

In [186]:
# Class probabilities for the test games from the fitted selector.
# The selector is `Find_features`; `F_features` was never defined and raised NameError.
y_hat = Find_features.predict_proba(X_test)

In [195]:
# Held-out log loss of the baseline model.
logloss_ = logloss(
    y_test,
    y_hat,
)