In [139]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import math

In [140]:
# Taken from the Fragile Families assignment.
def factorize(df):
    """Replace every object-dtype column of ``df`` with integer codes.

    Each column whose dtype is ``object`` (e.g. strings) is passed through
    ``pandas.factorize`` and overwritten with the codes it returns.  Columns
    of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Codes are computed from the values present in ``df`` on each call, so
    frames encoded in separate calls do not share a value-to-code mapping.
    """
    for col in df.columns:
        if df[col].dtype == object:
            codes, _ = pd.factorize(df[col])
            # Replace the column via ``df[col]`` instead of ``df.loc[:, col]``:
            # on recent pandas the ``.loc`` form writes the integers into the
            # existing object array, leaving the column with dtype ``object``.
            df[col] = codes
    return df

In [141]:
# Read both splits, then encode their object columns in a single factorize()
# call. Encoding the two frames separately would number the values of each
# frame independently, so the same string could receive different integer
# codes in train and in test.
# NOTE(review): assumes both CSVs have the same columns — confirm.
train_raw = pd.read_csv('train_2010_2017.csv')
test_raw = pd.read_csv('test_2018.csv')

# The outer 'train'/'test' key lets the combined frame be split back apart
# while each part keeps its original row index.
combined = factorize(pd.concat([train_raw, test_raw], keys=['train', 'test']))
train = combined.loc['train']
test = combined.loc['test']

In [142]:
# Columns listed in `external_data` are removed from both splits before
# modelling: first the team's own columns, then the 'Opp'-prefixed
# counterparts for the opponent.
external_data = [
    'PomeroyRank', 'Conf', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'Luck',
    'SOSAdjEM', 'OppO', 'OppD', 'NCSOSAdjEM',
    'MooreRank', 'MooreSOS', 'MoorePR',
    'OppPomeroyRank', 'OppConf', 'OppAdjEM', 'OppAdjO', 'OppAdjD', 'OppAdjT',
    'OppLuck', 'OppSOSAdjEM', 'OppOppO', 'OppOppD', 'OppNCSOSAdjEM',
    'OppMooreRank', 'OppMooreSOS', 'OppMoorePR',
]
train, test = (
    frame.drop(labels=external_data, axis=1) for frame in (train, test)
)

In [143]:
# Separate the 'Outcome' label column from the remaining feature columns
# for each split.
train_X, train_Y = train.drop('Outcome', axis=1), train['Outcome']
test_X, test_Y = test.drop('Outcome', axis=1), test['Outcome']

In [144]:
# Pin the solver instead of relying on the library default. The model repr
# recorded for this cell shows solver='liblinear'; newer scikit-learn releases
# use a different default solver, which would change the fitted coefficients
# and the results reported in the cells that follow.
lr = LogisticRegression(solver='liblinear')
lr.fit(train_X, train_Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [145]:
# binary prediction on double set
predicted_Y = lr.predict(test_X)

# Call print() with one parenthesised argument: this produces the same output
# on Python 2 and is valid syntax on Python 3, unlike the bare print statement.
print(metrics.accuracy_score(test_Y, predicted_Y))
print(metrics.recall_score(test_Y, predicted_Y))

0.8125
0.8125


In [146]:
# check coefficients
# Exponentiate each coefficient of the fitted model's first (only) row.
coefs = lr.coef_
adj = [math.exp(c) for c in coefs[0]]

# Pair every feature column name with its exponentiated coefficient, order
# the table from largest to smallest weight, and write it to disk.
feature_weights = pd.DataFrame(
    {'Feature': list(test_X), 'Weight': adj},
    columns=['Feature', 'Weight'],
)
feature_weights = feature_weights.sort_values(by='Weight', ascending=False)
feature_weights.to_csv('feature_weights.csv', index=False)

In [147]:
# use probabilities
# Number of rows taken from the top of the test set as "Team1" rows; the rows
# that follow are paired with them by position.
# NOTE(review): assumes row i and row i + N_MATCHUPS describe the same game
# from the two teams' perspectives — confirm against how test_2018.csv is built.
N_MATCHUPS = 32

teams = test_X[['TeamID', 'OppTeamID']]
win_probs = pd.DataFrame(data=lr.predict_proba(test_X), columns=['Loss', 'Win'])
results = pd.concat([teams, win_probs], axis=1)

# Pair the first N_MATCHUPS rows with the following rows by position.  The
# second block is re-indexed from 0 so an index-on-index left join lines the
# rows up; no columns are written onto the slices themselves, which avoids
# pandas' SettingWithCopy warning.
results_1 = results.iloc[:N_MATCHUPS]
results_2 = results.iloc[N_MATCHUPS:].reset_index(drop=True)
results_concat = results_1.join(results_2, lsuffix='1', rsuffix='2')

# Keep the first block's team ids plus each side's own win probability.
# rename() returns a new frame, so the column assignments below are safe.
results_concat = results_concat[['TeamID1', 'OppTeamID1', 'Win1', 'Win2']].rename(
    columns={'TeamID1': 'Team1', 'OppTeamID1': 'Team2'}
)

# The two win probabilities come from separate rows and need not sum to 1,
# so rescale them by their sum.
results_concat['Sum'] = results_concat['Win1'] + results_concat['Win2']
results_concat['Win1Adj'] = results_concat['Win1'] / results_concat['Sum']
results_concat['Win2Adj'] = results_concat['Win2'] / results_concat['Sum']

# Predict a Team1 win (1) when its rescaled probability is the larger one.
results_concat['Team1Win - Pred'] = np.where(
    results_concat['Win1Adj'] > results_concat['Win2Adj'], 1, 0
)

# Actual outcomes for the same first-block rows (aligned on the row index).
adj_results = test_Y.iloc[:N_MATCHUPS]
results_concat['Team1Win - Act'] = adj_results

# print() with one argument is valid on both Python 2 and Python 3.
print(results_concat)

    Team1  Team2      Win1      Win2       Sum   Win1Adj   Win2Adj  \
0    1104   1439  0.843036  0.190929  1.033965  0.815343  0.184657   
1    1120   1158  0.868842  0.185735  1.054577  0.823878  0.176122   
2    1153   1209  0.955898  0.047331  1.003229  0.952822  0.047178   
3    1155   1308  0.564341  0.420155  0.984496  0.573229  0.426771   
4    1181   1233  0.969765  0.032446  1.002211  0.967626  0.032374   
5    1196   1382  0.669243  0.358882  1.028125  0.650936  0.349064   
6    1199   1281  0.684601  0.363677  1.048278  0.653072  0.346928   
7    1211   1422  0.933853  0.092945  1.026798  0.909481  0.090519   
8    1222   1361  0.772993  0.262455  1.035447  0.746530  0.253470   
9    1242   1335  0.951018  0.050953  1.001972  0.949147  0.050853   
10   1243   1166  0.804151  0.239328  1.043479  0.770644  0.229356   
11   1260   1274  0.361012  0.703893  1.064904  0.339009  0.660991   
12   1267   1455  0.190946  0.849873  1.040819  0.183457  0.816543   
13   1276   1285  0.

In [148]:
# Score the per-matchup predictions against the actual outcomes of the same
# rows.
adj_predictions = results_concat['Team1Win - Pred'].values

# print() with one parenthesised argument produces the same output on
# Python 2 and is valid syntax on Python 3, unlike the bare print statement.
print(metrics.accuracy_score(adj_results, adj_predictions))
print(metrics.recall_score(adj_results, adj_predictions))

0.8125
0.8125


In [None]:
# Write the matchup table to disk without the row index.
results_concat.to_csv(path_or_buf='logistic_round_1.csv', index=False)