# Prepare Workbook

## Install python dependencies

In [364]:
!pip install -q -r ./dependencies/requirements.txt

You should consider upgrading via the '/home/jakob/dev/fm-analytics/src/notebooks/venv/bin/python3 -m pip install --upgrade pip' command.[0m


## Load python libraries

In [365]:
import pandas as pd
from tqdm import tqdm
import numpy as np

from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics as metrics

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm

from scipy.optimize import curve_fit
from math import sqrt

import researchpy as rp

import autosklearn.regression

# Load Data

In [366]:
df = pd.read_csv('./data/final_scores.csv')

# Preprocess Data

## Add prev_score coloumn

Calculated after research in **smoothing_comparison.ipynb**.

In [367]:
df_prev_score = pd.DataFrame()

for player, df_player in df.groupby(['name']):
    
    df_player = df_player.sort_values('matchday')
    df_player.set_index('matchday')
    df_player['prev_score'] = df_player.final_score.ewm(alpha=0.5, adjust=False).mean().map(lambda x: int(x)).shift(periods=1, fill_value=0)

    df_prev_score = df_prev_score.append(df_player, ignore_index=True)

df = df_prev_score

## Min-Max-Scaling

In [368]:
scaled_columns = ['prev_score' , 'odds_win', 'odds_draw', 'odds_lose']
df_scaled = pd.DataFrame(MinMaxScaler().fit_transform(df[scaled_columns]), columns=scaled_columns)
df = df.drop(scaled_columns, axis='columns').join(df_scaled)

## One-Hot-Encoding

In [369]:
one_hot_columns = ['club_id', 'position']
df = pd.get_dummies(data=df, columns=one_hot_columns)

# Calculate accuracies for different models

## Calculate price function for statistical test

In [370]:
# Values from SPITCH
df_price = pd.DataFrame()
df_price['percentage_above_threshhold'] = [0.15,0.14,0.13,0.12,0.11,0.10,0.09,0.08,0.07,0.06,0.05,0.04,0.03,0.02,0.01]
df_price['price_in_%'] = [15, 7, 4.5, 3.3, 2.6, 2.2, 1.8, 1.6, 1.4, 1.3,1.1,1.1,0.89,0.89,0.89]

def exponenial_func(x, a, b, c):
    return a*np.exp(-b*x)+c

# Calculate best parameters p for exponential function 
popt, pcov = curve_fit(exponenial_func, df_price['percentage_above_threshhold'], df_price['price_in_%'])

df_price['price_in_%_calc'] = exponenial_func(df_price['percentage_above_threshhold'], *popt)

def calculate_price(threshhold, percentage, stake=2, participants=1000):
    price = float(stake * -1)

    percentage_above_threshhold = percentage - threshhold
    percentage_above_threshhold = 0.15 if percentage_above_threshhold > 0.15 else percentage_above_threshhold

    total_price = participants * stake
    percentage_of_price = exponenial_func(percentage_above_threshhold, *popt) / 100

    if percentage_above_threshhold >= 0:
        price = (total_price * percentage_of_price) + price

    return price

calculate_price(0.75, 0.76)

23.92812087022581

## Load helper methods

In [371]:
def calculate_best_lineup(df, score_column):
    possible_lineups = [[3,4,3], [3,5,2], [4,2,4], [4,3,3], [4,4,2], [4,5,1], [5,3,2], [5,4,1], [5,2,3], [3,3,4]]

    best_lineup = pd.DataFrame({score_column: 0}, index=[0])

    for number_of_defender, number_of_midfielder, number_of_attacker in possible_lineups:
        df_goalkeeper = df.loc[df['position_goalkeeper'] == True].nlargest(1, score_column, keep='first')
        df_defender = df.loc[df['position_defender'] == True].nlargest(number_of_defender, score_column, keep='first')
        df_midfielder = df.loc[df['position_midfielder'] == True].nlargest(number_of_midfielder, score_column, keep='first')
        df_attacker = df.loc[df['position_attacker'] == True].nlargest(number_of_attacker, score_column, keep='first')

        df_lineup = pd.concat([df_goalkeeper, df_defender, df_midfielder, df_attacker])

        captain_id = df_lineup['final_score'].idxmax()
        captain_score = df_lineup.at[captain_id, 'final_score']
        df_lineup.at[captain_id, 'final_score'] = captain_score * 2    

        if df_lineup[score_column].sum() > best_lineup[score_column].sum():
            best_lineup = df_lineup

    return best_lineup

def calculate_lineup_accuracies(df):
    df_results = pd.DataFrame()

    for matchday, df_matchday in df.groupby('matchday'):
        df_predicted_lineup = calculate_best_lineup(df_matchday, 'predicted_score')
        df_best_lineup = calculate_best_lineup(df_matchday, 'final_score')

        predicted_lineup_total_score = df_predicted_lineup['final_score'].sum()
        best_lineup_total_score = df_best_lineup['final_score'].sum()

        df_results = df_results.append({'Matchday': matchday, 'Predicted': predicted_lineup_total_score, 'Best': best_lineup_total_score }, ignore_index=True)

    df_results = df_results[['Matchday', 'Predicted', 'Best']]

    df_results['Difference'] = df_results['Best'] - df_results['Predicted']
    df_results['points_in_%'] = round(df_results['Predicted'] / df_results['Best'],2)

    threshhold = 0.75

    df_results['is_over_threshhold'] = df_results['points_in_%'] >= threshhold
    df_results['price_money'] = df_results['points_in_%'].apply(lambda x: calculate_price(threshhold, x))

    return {'MAE_Lineup': df_results['Difference'].mean(), 'Std_Lineup': df_results['Difference'].std(), 'Mean_%_from_Best_Lineup': df_results['points_in_%'].mean(), 'Std_%_from_Best_Lineup': df_results['points_in_%'].std(), 'percentages_from_best_lineup': df_results['points_in_%'], 'differences_from_best_lineup': df_results['Difference'], 'Price_Money': round(df_results['price_money'].sum(),2), 'Mean_Price_Money': round(df_results['price_money'].mean(),2), 'Std_Price_Money': round(df_results['price_money'].std(),2), 'Count_Price_Won': df_results['is_over_threshhold'].sum()}

def calculate_regression_accuracies(y, yhat):
    mae = metrics.mean_absolute_error(y, yhat)
    mse = metrics.mean_squared_error(y, yhat)
    rmse = np.sqrt(mse) 
    r2 = metrics.r2_score(y, yhat)
    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}

def get_models():
    linr = LinearRegression() # linear regression model
    logr = LogisticRegression() # logistic regression model
    dt = DecisionTreeRegressor() # decision tree model
    rf = RandomForestRegressor() # random forest model
    rf_tuned = RandomForestRegressor(bootstrap=True, max_depth=10, max_features='sqrt', min_samples_leaf=5, min_samples_split=2, n_estimators=1635) # random forest model
    kn = KNeighborsRegressor() # k-nearest neighbours model
    sv = svm.SVC() # support vector machine model
    return [linr, logr, dt, rf,rf_tuned, kn, sv]

## Calculation

In [372]:
df_model_accuracies = pd.DataFrame() 

designs = ['treatment', 'baseline']
models = get_models()

for design in designs:
    
    # drop betting odds for baseline models
    if design=='baseline':
        odds_columns = ['odds_win', 'odds_draw', 'odds_lose']
        df = df.drop(odds_columns, axis=1)

    # split into train and test data set
    df_train = df[df['matchday'] <= 28]
    df_test = df[df['matchday'] > 28]

    # drop irrelevant features
    df_train = df_train.drop(['name', 'matchday'], axis=1)

    # split test data into feature and label
    X_train = df_train.drop(['final_score'], axis='columns')
    y_train = df_train['final_score']

    for model in tqdm(models):
        model.fit(X_train, y_train)
        model_inputs = df_test.drop(['name', 'matchday', 'final_score'], axis=1)
        predicted_scores = model.predict(model_inputs)

        df_final = df_test.copy(deep=False)
        df_final['predicted_score'] = predicted_scores

        regression_accuracies = calculate_regression_accuracies(df_final['final_score'], df_final['predicted_score'])
        lineup_accuracies = calculate_lineup_accuracies(df_final)

        accuracies = {**regression_accuracies, **lineup_accuracies}
        accuracies['model'] = model
        accuracies['design'] = design

        df_model_accuracies = df_model_accuracies.append(accuracies, ignore_index=True)

100%|██████████| 7/7 [00:29<00:00,  4.23s/it]
100%|██████████| 7/7 [00:26<00:00,  3.82s/it]


## Compare model accuracies

In [373]:
df_model_accuracies[['design', 'model', 'Mean_%_from_Best_Lineup', 'Std_%_from_Best_Lineup', 'MAE_Lineup', 'Std_Lineup', 'MAE', 'R2', 'Price_Money', 'Mean_Price_Money', 'Std_Price_Money', 'Count_Price_Won']].sort_values('Mean_%_from_Best_Lineup', ascending=False)

Unnamed: 0,design,model,Mean_%_from_Best_Lineup,Std_%_from_Best_Lineup,MAE_Lineup,Std_Lineup,MAE,R2,Price_Money,Mean_Price_Money,Std_Price_Money,Count_Price_Won
0,treatment,LinearRegression(),0.675,0.10895,1483.666667,442.2491,72.594094,0.211413,40.45,6.74,13.54,2.0
7,baseline,LinearRegression(),0.638333,0.081833,1671.666667,365.769691,73.115933,0.200557,13.96,2.33,10.6,1.0
4,treatment,"(DecisionTreeRegressor(max_depth=10, max_featu...",0.625,0.120457,1743.0,602.137526,71.819449,0.21685,13.96,2.33,10.6,1.0
3,treatment,"(DecisionTreeRegressor(max_features='auto', ra...",0.608333,0.087958,1830.666667,509.688009,73.820577,0.169581,-12.0,-2.0,0.0,0.0
11,baseline,"(DecisionTreeRegressor(max_depth=10, max_featu...",0.588333,0.065243,1913.833333,361.920663,72.173631,0.204637,-12.0,-2.0,0.0,0.0
2,treatment,DecisionTreeRegressor(),0.563333,0.099933,2034.5,563.912671,99.665262,-0.519691,-12.0,-2.0,0.0,0.0
10,baseline,"(DecisionTreeRegressor(max_features='auto', ra...",0.563333,0.074744,2044.166667,458.073538,80.895114,0.015363,-12.0,-2.0,0.0,0.0
5,treatment,KNeighborsRegressor(),0.555,0.077136,2070.833333,417.719244,79.589731,0.048406,-12.0,-2.0,0.0,0.0
8,baseline,LogisticRegression(),0.553333,0.117757,2063.0,527.585822,89.476773,-0.260619,-12.0,-2.0,0.0,0.0
12,baseline,KNeighborsRegressor(),0.543333,0.080166,2109.666667,373.172704,78.254279,0.058488,-12.0,-2.0,0.0,0.0


# Hyperparameter Tuning for Linear Regression

In [374]:
# model = LinearRegression()
# parameters = {'fit_intercept':[True,False], 'normalize':[True,False]}
# grid = GridSearchCV(model,parameters, cv=StratifiedKFold(shuffle=True, n_splits=5))

# df_train = df[df['matchday'] <= 28]
# df_test = df[df['matchday'] > 28]

# # drop irrelevant features
# df_train = df_train.drop(['name', 'matchday'], axis=1)

# # split test data into feature and label
# X_train = df_train.drop(['final_score'], axis='columns')
# y_train = df_train['final_score']

# grid.fit(X_train, y_train)

# print(grid.best_score_) # 0.23431358547187112
# print(grid.best_params_) # {'fit_intercept': False, 'normalize': True} --> default Linear Regression

# Hyperparameter Tuning for Random Forest

In [375]:
# # Create RandomForestRegressor
# rf = RandomForestRegressor()

# # Load Data
# df_train = df[df['matchday'] <= 28]
# df_test = df[df['matchday'] > 28]

# # drop irrelevant features
# df_train = df_train.drop(['name', 'matchday'], axis=1)

# # split test data into feature and label
# X_train = df_train.drop(['final_score'], axis='columns')
# y_train = df_train['final_score']



# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 1, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]

# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv=StratifiedKFold(shuffle=True, n_splits=5), verbose=2, random_state=42, n_jobs = -1)

# rf_random.fit(X_train, y_train)

# print(rf_random.best_score_) # 0.25267645010175976
# print(rf_random.best_params_) # {'n_estimators': 1555, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True} 

In [376]:
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [8, 9, 10, 11, 12],
#     'max_features': ['sqrt'],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [2, 3, 4],
#     'n_estimators': [1550, 1575, 1600, 1625, 1650]
# }

# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv=StratifiedKFold(shuffle=True, n_splits=5), n_jobs = -1, verbose=2)

# grid_search.fit(X_train, y_train)

# print(grid_search.best_score_) # 0.25773000067499996
# print(grid_search.best_params_) # {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 1625}

# Statistical Tests

## Cohens d and p-value

In [377]:
treatment_row = df_model_accuracies.loc[(df_model_accuracies['design'] == 'treatment') & (df_model_accuracies['model'] == models[0])]
baseline_row = df_model_accuracies.loc[(df_model_accuracies['design'] == 'baseline') & (df_model_accuracies['model'] == models[0])]

differences_treatment = treatment_row['differences_from_best_lineup'].iloc[0]
differences_baseline = baseline_row['differences_from_best_lineup'].iloc[0]

rp.ttest(group1= differences_treatment, group1_name= "differences_treatment", group2= differences_baseline, group2_name= "differences_baseline")

(                Variable     N         Mean          SD          SE  \
 0  differences_treatment   6.0  1483.666667  442.249100  180.547439   
 1   differences_baseline   6.0  1671.666667  365.769691  149.324851   
 2               combined  12.0  1577.666667  399.191304  115.236603   
 
      95% Conf.     Interval  
 0  1019.554699  1947.778634  
 1  1287.814917  2055.518416  
 2  1324.032613  1831.300720  ,
                                   Independent t-test   results
 0  Difference (differences_treatment - difference... -188.0000
 1                              Degrees of freedom =    10.0000
 2                                               t =    -0.8024
 3                           Two side test p value =     0.4410
 4                          Difference < 0 p value =     0.2205
 5                          Difference > 0 p value =     0.7795
 6                                       Cohen's d =    -0.4633
 7                                       Hedge's g =    -0.4276
 8       