In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


# Seed NumPy's global RNG so that steps drawing from it repeat across runs.
np.random.seed(15)

# Each table is read from its CSV and then passed through pd.DataFrame to give
# the `*_df` name used by later cells; both names are kept.

# --- reference tables ---
teams = pd.read_csv('Teams.csv')
teams_df = pd.DataFrame(teams)
seasons = pd.read_csv('Seasons.csv')
seasons_df = pd.DataFrame(seasons)

# --- tournament tables ---
tourneyCompactResults = pd.read_csv('TourneyCompactResults.csv')
tourneyCompact_df = pd.DataFrame(tourneyCompactResults)
tourneyDetailedResults = pd.read_csv('TourneyDetailedResults.csv')
tourneyDetailed_df = pd.DataFrame(tourneyDetailedResults)
tourneySeeds = pd.read_csv('TourneySeeds_Thru2017.csv')
tourneySeeds_df = pd.DataFrame(tourneySeeds)
tourneySlots = pd.read_csv('TourneySlots_Thru2017.csv')
tourneySlots_df = pd.DataFrame(tourneySlots)

# --- regular-season tables ---
regularSeasonCompactResults = pd.read_csv('RegularSeasonCompactResults.csv')
regSeasonCompact_df = pd.DataFrame(regularSeasonCompactResults)
regularSeasonDetailedResults = pd.read_csv('RegularSeasonDetailedResults.csv')
regSeasonDetailed_df = pd.DataFrame(regularSeasonDetailedResults)

# --- sample submission (its `id` column is parsed further down) ---
submission_df = pd.read_csv('SampleSubmission_5050Benchmark.csv')

Jayant's Code help

In [None]:
# Attach seed labels to every tournament game: one left join on (Season, Wteam)
# for the winner and one on (Season, Lteam) for the loser. After each join the
# incoming Seed/Team columns are renamed so the second join does not collide
# with the first. Unneeded columns are dropped in a later cell, not here.
tourney_merged_df = (
    tourneyCompact_df
    .merge(tourneySeeds_df, how='left',
           left_on=['Season', 'Wteam'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Wseed', 'Team': 'W_team'})
    .merge(tourneySeeds_df, how='left',
           left_on=['Season', 'Lteam'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Lseed', 'Team': 'L_team'})
)

In [None]:
def _seed_number(seed_label):
    """Return characters 1-2 of a seed label as an int.

    Presumably labels are a region letter followed by a two-digit seed
    (optionally with a trailing suffix) -- verify against the seeds file.
    """
    return int(seed_label[1:3])


# Keep only the identifying columns and the two seed labels.
kept_columns = ['Season', 'Wteam', 'Lteam', 'Wseed', 'Lseed']
tourney_sanitized_df = tourney_merged_df[kept_columns].copy()

# Convert both seed labels to plain integers.
for seed_col in ('Wseed', 'Lseed'):
    tourney_sanitized_df[seed_col] = tourney_sanitized_df[seed_col].map(_seed_number)

# Winner's seed minus loser's seed.
winner_seed = tourney_sanitized_df['Wseed']
loser_seed = tourney_sanitized_df['Lseed']
tourney_sanitized_df['Seed_diff'] = winner_seed - loser_seed


In [None]:
# Every game becomes two training rows so both classes are represented:
#   * winner listed first, Seed_diff as computed, result = 1
#   * loser listed first, Seed_diff negated,     result = 0
training_columns = ['Team1', 'Team2', 'Seed_diff', 'result']

tourney_winning_df = pd.DataFrame(
    {
        'Team1': tourney_sanitized_df['Wteam'],
        'Team2': tourney_sanitized_df['Lteam'],
        'Seed_diff': tourney_sanitized_df['Seed_diff'],
        'result': 1,
    },
    columns=training_columns,
)

tourney_losing_df = pd.DataFrame(
    {
        'Team1': tourney_sanitized_df['Lteam'],
        'Team2': tourney_sanitized_df['Wteam'],
        'Seed_diff': -tourney_sanitized_df['Seed_diff'],
        'result': 0,
    },
    columns=training_columns,
)

# Stack the two views; the original row index is kept, so each label appears twice.
tourney_train_final_df = pd.concat([tourney_winning_df, tourney_losing_df])

# Integer code per team id, numbered in order of first appearance in Team1.
known_teams = tourney_train_final_df['Team1'].unique()
team_dict = dict(zip(known_teams, range(len(known_teams))))

In [None]:
# Feature matrix: Team1/Team2 replaced by their integer codes from team_dict,
# plus the signed seed difference.
X_train = tourney_train_final_df[['Team1', 'Team2', 'Seed_diff']].copy()
X_train['Team1'] = X_train['Team1'].map(team_dict)
X_train['Team2'] = X_train['Team2'].map(team_dict)

# Series.map leaves NaN for a key that is missing from the dict, so check that
# every team was actually encoded instead of letting NaNs reach the model.
assert not X_train[['Team1', 'Team2']].isnull().values.any(), \
    'some Team1/Team2 ids are missing from team_dict'

# Seed_diff is already a one-dimensional numeric column, so it is used as is.
# The previous `.values.reshape(-1, 1)` round-trip wrote an (n, 1) array back
# into a single column and changed nothing.

# Target vector: 1 when Team1 won the game, 0 when Team1 lost.
Y_train = tourney_train_final_df.result.values

In [None]:
%%time
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_train, Y_train, test_size=0.2, random_state=5)

In [None]:
# Fit a logistic regression with the SAG solver on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
# NOTE(review): scikit-learn documents that 'sag' converges quickly only when
# features share a similar scale; the team codes and Seed_diff are passed in
# unscaled -- confirm convergence or scale the features first.
model = LogisticRegression(solver='sag').fit(X_train, y_train)

# Mean accuracy on the rows the model was fitted on (not a held-out score).
model.score(X_train, y_train)

In [None]:
# Hard class predictions for the held-out rows produced by the train/test split.
predicted = model.predict(X_test)


In [None]:
# Per-class probabilities for the held-out rows; columns follow model.classes_.
prob = model.predict_proba(X_test)
# Parenthesised so the cell parses on Python 3 as well as Python 2
# (a single-argument print(...) prints the same thing on both).
print(prob)

In [None]:
# Held-out evaluation: accuracy, confusion matrix and the per-class report.
# Each print is parenthesised so the cell parses on Python 3 as well as
# Python 2; with a single argument the output is the same on both.
print(metrics.accuracy_score(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

In [None]:
# Each submission id is split on "_" and its first three pieces are read as
# integers: Season, Team1, Team2. Every id is parsed once and the frame is
# built in a single step, keeping the submission's row index.
id_fields = [
    [int(piece) for piece in game_id.split("_")[:3]]
    for game_id in submission_df['id']
]
test_et_df = pd.DataFrame(
    id_fields,
    columns=['Season', 'Team1', 'Team2'],
    index=submission_df.index,
)

In [None]:
# Look up the seed label of both teams in every submission pair: one left join
# on (Season, Team1) and one on (Season, Team2), renaming the joined Seed/Team
# columns after each so the second join does not collide with the first.
test_et_merged_df = (
    test_et_df
    .merge(tourneySeeds_df, how='left',
           left_on=['Season', 'Team1'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Seed1', 'Team': 'team_1'})
    .merge(tourneySeeds_df, how='left',
           left_on=['Season', 'Team2'], right_on=['Season', 'Team'])
    .rename(columns={'Seed': 'Seed2', 'Team': 'team_2'})
)

# Keep the identifying columns plus both seed labels.
submission_columns = ['Season', 'Team1', 'Team2', 'Seed1', 'Seed2']
test_et_sanitized_df = test_et_merged_df[submission_columns].copy()

# Characters 1-2 of each seed label are read as the integer seed.
for seed_col in ('Seed1', 'Seed2'):
    test_et_sanitized_df[seed_col] = test_et_sanitized_df[seed_col].map(
        lambda seed_label: int(seed_label[1:3]))

# Team1's seed minus Team2's seed.
first_seed = test_et_sanitized_df['Seed1']
second_seed = test_et_sanitized_df['Seed2']
test_et_sanitized_df['Seed_diff'] = first_seed - second_seed

In [35]:
# Feature matrix for the submission pairs, with the same columns and team
# encoding as the training matrix.
X_et_test = test_et_sanitized_df[['Team1', 'Team2', 'Seed_diff']].copy()

# team_dict only contains the teams present when it was built. Report every
# unknown team at once rather than failing on the first bare key inside a
# per-row lookup.
unseen_teams = (set(X_et_test['Team1']) | set(X_et_test['Team2'])) - set(team_dict)
if unseen_teams:
    raise KeyError('Teams missing from team_dict: %s' % sorted(unseen_teams))

X_et_test['Team1'] = X_et_test['Team1'].map(team_dict)
X_et_test['Team2'] = X_et_test['Team2'].map(team_dict)

# Seed_diff is already a one-dimensional numeric column, so it is used as is.
# The previous `.values.reshape(-1, 1)` round-trip wrote an (n, 1) array back
# into a single column and changed nothing.

In [36]:
# Keep column 1 of predict_proba, i.e. the probability of the second entry in
# model.classes_. The training labels are 0 and 1, so this is presumably
# P(result == 1), the probability that Team1 wins -- verify against model.classes_.
test_et_pred_prob = model.predict_proba(X_et_test)[:,1]

In [37]:
# Pair every submission id with the probability predicted for that row.
final_et_output = pd.DataFrame(
    {'id': submission_df['id'], 'pred': test_et_pred_prob},
    columns=['id', 'pred'],
)
#final_et_output.to_csv('logisticRegression2.csv', index=False)
final_et_output

Unnamed: 0,id,pred
0,2013_1103_1107,0.569383
1,2013_1103_1112,0.172159
2,2013_1103_1125,0.565835
3,2013_1103_1129,0.539784
4,2013_1103_1137,0.398694
5,2013_1103_1139,0.239970
6,2013_1103_1143,0.337961
7,2013_1103_1153,0.297719
8,2013_1103_1160,0.328421
9,2013_1103_1161,0.225086
