In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import preprocessing
from sklearn.model_selection import learning_curve
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import mean_squared_error, log_loss
import matplotlib.pyplot as plt

In [2]:
def match_date_to_rating_date(df, rating_dates):
    pos = rating_dates.searchsorted(df.date)-1
    df['rating_date'] = pos.clip(min=0)

In [3]:
def player_id_to_nearest_rating(df, dictionary):
    player_attrs = pd.read_csv('Player_Attributes.csv')
    player_attrs['date'] = pd.to_datetime(player_attrs['date'])
    player_attrs = player_attrs.sort_values('date', ascending = True)
    player_attrs['date'] = player_attrs['date'].map(dictionary)
    player_attrs = player_attrs.set_index(['player_api_id', 'date'])['overall_rating']
    positions = ['home_player_1',
                 'home_player_2',
                 'home_player_3',
                 'home_player_4',
                 'home_player_5',
                 'home_player_6',
                 'home_player_7',
                 'home_player_8',
                 'home_player_9',
                 'home_player_10',
                 'home_player_11',
                 'away_player_1',
                 'away_player_2',
                 'away_player_3',
                 'away_player_4',
                 'away_player_5',
                 'away_player_6',
                 'away_player_7',
                 'away_player_8',
                 'away_player_9',
                 'away_player_10',
                 'away_player_11']
    for pos in positions:
        multiindex = list(zip(df[pos], df.rating_date))
        print(player_attrs.index)
        print(player_attrs.loc[( 39902,   0)])
        for mult in multiindex:
            print(player_attrs.loc[mult])
        df[pos] = player_attrs.loc[multiindex]

In [None]:
df = pd.read_csv('Match.csv')
df['date'] = pd.to_datetime(df['date'])

In [None]:
date = pd.to_datetime(df.loc[122,'date'])
print(date)
date1 = pd.to_datetime(df.loc[1,'date'])
print(date1)
print(date < date1)

In [None]:
p = pd.read_csv('Player_Attributes.csv')['date']
p = pd.to_datetime(p)
p = p.sort_values(ascending = True)
p = pd.unique(p)
match_date_to_rating_date(df, pd.unique(p))

In [None]:
dictionary = dict()
for i, date in enumerate(p):
    dictionary[date] = i
player_id_to_nearest_rating(df, dictionary)
df.home_player_1

In [82]:
def load_in_data():
    x_tr = pd.read_csv('x_train.csv', index_col = 'match_api_id')
    y_tr = pd.read_csv('y_train.csv', index_col = 'match_api_id').target_binary
    x_cv = pd.read_csv('x_val.csv', index_col = 'match_api_id')
    y_cv = pd.read_csv('y_val.csv', index_col = 'match_api_id').target_binary
    x_te = pd.read_csv('x_test.csv', index_col = 'match_api_id')
    y_te = pd.read_csv('y_test.csv', index_col = 'match_api_id').target_binary
    return x_tr, y_tr, x_cv, y_cv, x_te, y_te

In [83]:
x_tr, y_tr, x_cv, y_cv, x_te, y_te = load_in_data()

In [205]:
y_tr[y_tr == -1] = 0
y_cv[y_cv == -1] = 0
y_te[y_te == -1] = 0

In [None]:
def add_historical_data()

In [39]:
x_tr.columns[[0,4,8,10,11,12,14,15,17,19,21,22,23,25,26]]

Index(['home_player_1_overall_rating', 'home_player_1_short_passing',
       'home_player_1_long_passing', 'home_player_1_acceleration',
       'home_player_1_sprint_speed', 'home_player_1_reactions',
       'home_player_1_shot_power', 'home_player_1_strength',
       'home_player_1_aggression', 'home_player_1_positioning',
       'home_player_1_penalties', 'home_player_1_marking',
       'home_player_1_standing_tackle', 'home_player_1_gk_diving',
       'home_player_1_gk_handling'],
      dtype='object')

In [206]:
columns_keep = ['overall_rating']
positions = ['home_player_1',
             'home_player_2',
             'home_player_3',
             'home_player_4',
             'home_player_5',
             'home_player_6',
             'home_player_7',
             'home_player_8',
             'home_player_9',
             'home_player_10',
             'home_player_11',
             'away_player_1',
             'away_player_2',
             'away_player_3',
             'away_player_4',
             'away_player_5',
             'away_player_6',
             'away_player_7',
             'away_player_8',
             'away_player_9',
             'away_player_10',
             'away_player_11']
columns = []
for pos in positions:
    arr = [pos + '_' + column_name for column_name in columns_keep]
    columns = columns + arr
x_tr = x_tr[columns]
x_cv = x_cv[columns]
x_te = x_te[columns]

In [84]:
min_max_scaler = preprocessing.MinMaxScaler()
x_tr = pd.DataFrame(min_max_scaler.fit_transform(x_tr), index = x_tr.index, columns = x_tr.columns)
x_cv = pd.DataFrame(min_max_scaler.fit_transform(x_cv), index = x_cv.index, columns = x_cv.columns)
x_te = pd.DataFrame(min_max_scaler.fit_transform(x_te), index = x_te.index, columns = x_te.columns)

In [85]:
x_tr.fillna(x_tr.mean(), inplace=True)
x_cv.fillna(x_cv.mean(), inplace=True)
x_te.fillna(x_te.mean(), inplace=True)

In [12]:
def plot_learning_curve(estimator, x_tr, y_tr):
    train_sizes, train_scores, test_scores = \
        learning_curve(estimator, x_tr, y_tr, train_sizes = np.linspace(0.2, 1, 20))
    
    plt.plot(train_sizes, -train_scores.mean(1), 'o-', label = str(estimator.__class__.__name__) + ' train')
    plt.plot(train_sizes, -test_scores.mean(1), 'o-', label = str(estimator.__class__.__name__) + ' test')
    plt.legend(loc="best")

In [13]:
def do_stuff(model, x_tr, y_tr, x_cv, y_cv):
    clf = OneVsRestClassifier(model).fit(x_tr, y_tr)
    predict_tr = clf.predict(x_tr).reshape((-1, 1))
    predict_cv = clf.predict(x_cv).reshape((-1, 1))
    print(np.mean(predict_tr == y_tr))
    print(np.mean(predict_cv == y_cv))
    #plot_learning_curve(clf, x_tr, y_tr)
    return clf

In [89]:
def logistic_model(x_tr, y_tr, x_cv, y_cv):
    log = linear_model.LogisticRegression(max_iter = 200, C = 0.1, l1_ratio = 0.5, penalty = 'elasticnet', solver = 'saga')
    clf = log.fit(x_tr, y_tr)
    return clf

def svm_model(x_tr, y_tr, x_cv, y_cv):
    svc = svm.SVC(kernel = 'rbf', max_iter = 1000, verbose = 5)
    return do_stuff(svc, x_tr, y_tr, x_cv, y_cv)
    
def rf_model(x_tr, y_tr, x_cv, y_cv):
    rf = ensemble.RandomForestClassifier(n_estimators = 100, max_features = 'auto', max_depth = 6)
    clf = rf.fit(x_tr, y_tr)
    return clf

In [90]:
clf_log = logistic_model(x_tr, y_tr, x_cv, y_cv)

In [91]:
clf_rf = rf_model(x_tr, y_tr, x_cv, y_cv)

In [92]:
print(mean_squared_error(y_tr, clf_log.predict(x_tr).reshape((-1, 1))))
print(mean_squared_error(y_cv, clf_log.predict(x_cv).reshape((-1, 1))))
print(mean_squared_error(y_tr, clf_rf.predict(x_tr).reshape((-1, 1))))
print(mean_squared_error(y_cv, clf_rf.predict(x_cv).reshape((-1, 1))))

0.35361171106311495
0.35091083413231067
0.31725461963042956
0.3576222435282838


In [93]:
print(log_loss(y_tr, clf_log.predict_proba(x_tr)))
print(log_loss(y_cv, clf_log.predict_proba(x_cv)))
print(log_loss(y_tr, clf_rf.predict_proba(x_tr)))
print(log_loss(y_cv, clf_rf.predict_proba(x_cv)))

0.619194183799871
0.6193223346263451
0.5869469409768365
0.6187522993536477


In [66]:
coefs = clf_log.coef_
coefs[0].reshape((-1,14))
np.sum(coefs[0].reshape((-1, 14)), axis=0)

array([ 0.06795003, -0.2396197 ,  0.1660052 ,  0.08840537,  0.5680267 ,
        0.75287973, -0.51416432, -0.42405566, -0.24202101,  0.53834616,
       -0.29261512, -0.21489304,  0.10548235, -0.00081843])

In [71]:
p = clf_log.predict(x_tr)
print(len(p))
print(len(p[p == 1]))
print(len(p[p == 0]))
print(len(p[p == -1]))

16673
11984
137
4552


In [67]:
x_tr.columns[[0,1,2,4,5,6,7,8,9,10,11,12]]


Index(['home_player_1_overall_rating', 'home_player_2_overall_rating',
       'home_player_3_overall_rating', 'home_player_1_potential',
       'home_player_2_potential', 'home_player_3_potential',
       'home_player_4_potential', 'home_player_1_crossing',
       'home_player_2_crossing', 'home_player_3_crossing',
       'home_player_4_crossing', 'home_player_1_finishing'],
      dtype='object')

In [95]:
df = pd.read_csv('Match.csv')

In [96]:
df = df.set_index('match_api_id')

In [97]:
col = 'B365'
cols = [col + 'H', col + 'D', col + 'A']

In [98]:
df = df[cols]

In [99]:
df = np.reciprocal(df)
df = df.div(df.sum(axis=1), axis=0)

In [100]:
b_tr = 1 - df.loc[y_tr.index][col + 'H'].fillna(0.5)
b_cv = 1 - df.loc[y_cv.index][col + 'H'].fillna(0.5)
b_te = 1 - df.loc[y_te.index][col + 'H'].fillna(0.5)

In [101]:
b_te

match_api_id
1989891    0.402207
2030467    0.338871
2030536    0.461078
1989960    0.863636
2060469    0.670921
             ...   
1988783    0.466907
1988787    0.575211
1988790    0.267026
1988794    0.625115
1988799    0.521768
Name: B365H, Length: 1046, dtype: float64

In [122]:
a = 0.3
pred_tr_log = a * b_tr + (1-a) * clf_log.predict_proba(x_tr)[:,1]
pred_cv_log = a * b_cv + (1-a) * clf_log.predict_proba(x_cv)[:,1]

a = 0.3
pred_tr_rf = a * b_tr + (1-a) * clf_rf.predict_proba(x_tr)[:,1]
pred_cv_rf = a * b_cv + (1-a) * clf_rf.predict_proba(x_cv)[:,1]

In [123]:
print(log_loss(y_tr, pred_tr_log))
print(log_loss(y_cv, pred_cv_log))
print(log_loss(y_tr, pred_tr_rf))
print(log_loss(y_cv, pred_cv_rf))

0.6196670895056657
0.6190025844139819
0.5967842731359135
0.618452673736341


In [177]:
b_tr

match_api_id
1260188    0.562008
674537     0.707714
1989891    0.597793
1474945    0.261146
1506175    0.505973
             ...   
1709842    0.780312
1223991    0.751067
1239571    0.614186
704398     0.458695
1019336    0.358441
Name: B365H, Length: 16673, dtype: float64

In [263]:
np.sum(x_te.index - y_te.index)

0