In [1]:
import numpy as np
import pandas as pd
import lxml.etree as etree
import sqlite3 as sql
from process import *
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_recall_fscore_support, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [3]:
# Read the match table from disk, using the CSV's 'Unnamed: 0' column as the
# row index, then preview the first rows.
data = pd.read_csv(
    '../data/prem_data_elo.csv',
    index_col='Unnamed: 0',
)
data.head()

Unnamed: 0,match_api_id,league_id,date,home_team_api_id,home_team_name,away_team_api_id,away_team_name,home_team_goal,away_team_goal,home_team_foul_reason_from_behind,...,away_team_card_reason_pushing,home_team_card_reason_diving,away_team_card_reason_diving,home_team_card_reason_hands,away_team_card_reason_hands,home_team_card_reason_Removing Shirt,away_team_card_reason_Removing Shirt,result,home_elo,away_elo
0,489044,1729,2008-08-16 00:00:00,8472,Sunderland,8650,Liverpool,0,2,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,L,1600.0,1600.0
1,489045,1729,2008-08-16 00:00:00,8654,West Ham United,8528,Wigan Athletic,2,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,L,1600.0,1600.0
2,489047,1729,2008-08-16 00:00:00,8668,Everton,8655,Blackburn Rovers,2,12,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,L,1600.0,1600.0
3,489048,1729,2008-08-16 00:00:00,8549,Middlesbrough,8586,Tottenham Hotspur,2,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,L,1600.0,1600.0
4,489049,1729,2008-08-16 00:00:00,8559,Bolton Wanderers,10194,Stoke City,3,4,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,L,1600.0,1600.0


In [38]:
def transform_target(X, y, class_labels = ['W', 'D', 'L'], weight = True, random_state = None):
    """Restrict ``(X, y)`` to the requested classes and optionally balance them.

    Rows are selected by index *label*, so ``X`` and ``y`` must share the same
    (unique) index labels; the index does not have to be ``0..n-1``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Class label for each row of ``X``.
    class_labels : sequence
        Labels to keep. Rows carrying any other label are dropped.
    weight : bool
        If True, every class is randomly down-sampled *without replacement*
        to the size of the rarest requested class and a class-weight mapping
        is returned. If False, all rows of the requested classes are
        returned, grouped in ``class_labels`` order.
    random_state : int or None
        Seed for the down-sampling. ``None`` draws from NumPy's global
        random state (so ``np.random.seed`` still applies).

    Returns
    -------
    tuple
        ``(X_selected, y_selected, weights)``. ``weights`` maps each label to
        ``original_class_count / sampled_class_count`` when ``weight`` is
        True, and is ``None`` otherwise.

    Raises
    ------
    ValueError
        If ``class_labels`` is empty, or if ``weight`` is True and one of the
        requested labels has no rows in ``y``.
    """
    if weight:
        # Index labels of the rows belonging to each requested class.
        rows_by_class = [y[y == label].index.values for label in class_labels]

        missing = [label for label, rows in zip(class_labels, rows_by_class) if len(rows) == 0]
        if missing:
            raise ValueError('no rows found for class label(s): {}'.format(missing))

        # Every class is cut down to the size of the rarest one.
        size = min(len(rows) for rows in rows_by_class)

        rng = np.random if random_state is None else np.random.RandomState(random_state)
        # replace=False: a down-sample must not contain the same row twice,
        # otherwise duplicates can end up on both sides of a later train/test split.
        sampled = [rng.choice(rows, size = size, replace = False) for rows in rows_by_class]
        down_idx = np.concatenate(sampled)

        weights = {
            label: len(rows) / len(picked)
            for label, rows, picked in zip(class_labels, rows_by_class, sampled)
        }

        # down_idx holds index labels, so both frames are addressed with .loc.
        return X.loc[down_idx], y.loc[down_idx], weights
    else:
        y_ = pd.concat([y[y == label] for label in class_labels])
        return X.loc[y_.index], y_, None

# Two candidate models; each standardises the features before classifying.
RFpipe = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier()),
])
Logitpipe = Pipeline([
    ('scaler', StandardScaler()),
    ('logit', LogisticRegression()),
])

In [29]:
# Features: every column from position 9 onward, minus the target and the
# two Elo columns. Target: the 'result' column.
X = data.loc[:, data.columns[9:]].drop(columns=['result', 'home_elo', 'away_elo'])
y = data.loc[:, 'result']

In [53]:
# Unweighted run: keep every row of the default classes (weight=False returns
# no class weights), then hold out 20% with a stratified split.
X_ds, y_ds, weights = transform_target(X, y, weight = False)
X_train, X_test, y_train, y_test = train_test_split(X_ds, y_ds, test_size = 0.2, stratify = y_ds)

print('>>>Preprocess done', weights)

# set_params updates RFpipe itself and returns it, so clf and RFpipe are the
# same pipeline object.
clf = RFpipe.set_params(rf__class_weight = weights)
clf = clf.fit(X_train, y_train)
# Print the full parameter dict (as the weighted run does) to confirm which
# class_weight is actually in effect.
print(RFpipe.get_params())
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

>>>Preprocess done None
{'memory': None, 'steps': [('scaler', StandardScaler()), ('rf', RandomForestClassifier())], 'verbose': False, 'scaler': StandardScaler(), 'rf': RandomForestClassifier(), 'scaler__copy': True, 'scaler__with_mean': True, 'scaler__with_std': True, 'rf__bootstrap': True, 'rf__ccp_alpha': 0.0, 'rf__class_weight': None, 'rf__criterion': 'gini', 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__max_leaf_nodes': None, 'rf__max_samples': None, 'rf__min_impurity_decrease': 0.0, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__min_weight_fraction_leaf': 0.0, 'rf__n_estimators': 100, 'rf__n_jobs': None, 'rf__oob_score': False, 'rf__random_state': None, 'rf__verbose': 0, 'rf__warm_start': False}
0.9943429068755439
0.5982608695652174


In [54]:
# Weighted run: each class is randomly sampled down to the size of the rarest
# class, and the returned per-class weights are handed to the random forest.
X_ds, y_ds, weights = transform_target(X, y, weight=True)
X_train, X_test, y_train, y_test = train_test_split(
    X_ds,
    y_ds,
    test_size=0.2,
    stratify=y_ds,
)

print('>>>Preprocess done', weights)

# Show the pipeline configuration before fitting, then train/test accuracy.
clf = RFpipe.set_params(rf__class_weight=weights)
print(RFpipe.get_params())
clf = clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

>>>Preprocess done {'W': 2.5789473684210527, 'D': 1.0, 'L': 5.315789473684211}
{'memory': None, 'steps': [('scaler', StandardScaler()), ('rf', RandomForestClassifier(class_weight={'D': 1.0, 'L': 5.315789473684211,
                                     'W': 2.5789473684210527}))], 'verbose': False, 'scaler': StandardScaler(), 'rf': RandomForestClassifier(class_weight={'D': 1.0, 'L': 5.315789473684211,
                                     'W': 2.5789473684210527}), 'scaler__copy': True, 'scaler__with_mean': True, 'scaler__with_std': True, 'rf__bootstrap': True, 'rf__ccp_alpha': 0.0, 'rf__class_weight': {'W': 2.5789473684210527, 'D': 1.0, 'L': 5.315789473684211}, 'rf__criterion': 'gini', 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__max_leaf_nodes': None, 'rf__max_samples': None, 'rf__min_impurity_decrease': 0.0, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__min_weight_fraction_leaf': 0.0, 'rf__n_estimators': 100, 'rf__n_jobs': None, 'rf__oob_score': False, 'rf__ran