In [1]:
from helper_functions import validate_results
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss,accuracy_score

# can test this on our subset of 10,000 matches as well as all matches in the database:
df = pd.read_csv('../my_data/elo_pbp_with_surface_10_2.csv')
del df['Unnamed: 0']

# currently looking at 2014 tour-level matches, excluding Davis Cup
df = df[df['match_year'].isin([2010,2011,2012,2013,2014])].reset_index(drop=True)
df = df[df['tny_name']!='Davis Cup'].reset_index(drop=True)
df.head(2)

Unnamed: 0,tny_id,tny_name,surface,tny_date,match_year,match_month,p0_name,p1_name,p0_elo,p1_elo,...,match_prob_sf_kls_JS,match_prob_adj_kls,match_prob_adj_kls_JS,elo_prob,elo_prob_538,sf_elo_prob,sf_elo_prob_538,s_total,p0_s_kls_elo,p1_s_kls_elo
0,2010-375,Montpellier,Hard,2010-10-25,2010,10,Romain Jouan,Taylor Dent,1514.919312,1659.920646,...,0.580319,0.213863,0.211482,0.302653,0.209638,0.337195,0.269816,1.346929,0.65209,0.694839
1,2010-375,Montpellier,Hard,2010-10-25,2010,10,Julian Reister,Richard Gasquet,1557.42105,1886.196622,...,0.243984,0.28259,0.28565,0.13095,0.147258,0.130129,0.043,1.353852,0.629991,0.723861


In [2]:
match_d = dict(zip(list(set(df['match_id'])),[1]*len(df)))
train_d = dict(zip(list(set(df[df['match_year']<=2013]['match_id'])),[1]*len(df)))
test_d = dict(zip(list(set(df[df['match_year']==2014]['match_id'])),[1]*len(df)))
col_d = {'Clay':0,'Hard':1,'Grass':2}

In [3]:
df2 = pd.read_csv('../my_data/feature_df_pbp3_10_2.csv')
df2 = df2.loc[np.array([m_id in match_d for m_id in df2['match_id']])]
df2['current_set'] = df2['sets_0'] + df2['sets_1'] + 1
df2['final_set'] = df2['current_set']==3
df2['surface_num'] = [col_d[surface] for surface in df2['surface']]
df2['set_diff'] = df2['sets_0'] - df2['sets_1']
df2['point_diff'] = df2['points_0'] - df2['points_1']

In [4]:
cols = ['match_id','final_set','surface_num','elo_diff','set_diff','break_adv','point_diff','winner']

train_df = df2.loc[np.array([m_id in train_d for m_id in df2['match_id']])][cols]
test_df = df2.loc[np.array([m_id in test_d for m_id in df2['match_id']])][cols]

In [46]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from helper_functions import *
# try with smaller df first
val_df = train_df

hyper_params = {'max_features':[2,3,6],'min_samples_split':[100,500,1000]}
clf = GridSearchCV(RandomForestClassifier(),hyper_params,scoring='neg_log_loss',cv=5)
clf.fit(val_df[cols[:-1]],val_df[cols[-1]])

#cross_validate(val_df,clf,cols=cols[:-1],target=cols[-1:],hyper_parameters=hyper_params,n_splits=5)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [2, 3, 6], 'min_samples_split': [100, 500, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [47]:
best_idx = np.argmin(clf.cv_results_['rank_test_score'])
best = clf.cv_results_['params'][best_idx]
print best

{'max_features': 3, 'min_samples_split': 1000}


In [48]:
clf.cv_results_['mean_test_score']

array([-1.00743419, -0.62372245, -0.61753299, -1.20548113, -0.70073222,
       -0.60705003, -2.11536779, -0.95272265, -0.74734569])

5

In [None]:
# for m in m_features:
#     for t_node in t_node_sizes:
#         RF = RandomForestRegressor(n_estimators=300,max_features=m,min_samples_split=t_node)

In [None]:
# now, train a random forest model on 2010-2013 match data





In [9]:
def cross_validate(val_df,clf,cols,target,hyper_parameters,n_splits):
    print 'searching for hyperparams...'
    ids = list(set(val_df['match_id']))
    vfunc = np.vectorize(in_dict)
    kfold = KFold(n_splits=n_splits,shuffle=True)
    key = hyper_parameters.keys()[0]
    scores = np.array()
    
    for train_index,____ in kfold.split(ids):
        train_dict = dict(zip(train_index,[1]*len(train_index)))
        train_ind = vfunc(np.array(val_df['match_id']),train_dict)
        test_ind = (1 - train_ind)==1
        Xtrain, ytrain = val_df[cols][train_ind], np.array(val_df[target][train_ind]).reshape([(sum(train_ind),)])
        Xtest, ytest = val_df[cols][test_ind], np.array(val_df[target][test_ind]).reshape([(sum(test_ind),)])
        
        # retrieve classification score for every hyper_parameter fed into this function
        # LOOP THROUGH ALL KEYS here if you want to test multiple hyper_params
        for j in xrange(len(hyper_parameters[key])):
            setattr(clf,key,hyper_parameters[key][j])
            clf.fit(Xtrain,ytrain)
            score = clf.score(Xtest,ytest)
            scores[j].append(score)
    for i in range(len(scores)):
        print hyper_parameters[key][i],': ',np.mean(scores[i])
    best_ind = np.argmax([np.mean(a) for a in scores])
    print 'best: ',{key:hyper_parameters[key][best_ind]}
    return {key:hyper_parameters[key][best_ind]}
