# Prepare Workbook

## Install python dependencies

In [1]:
!pip install -q -r ./dependencies/requirements.txt

## Load python libraries

In [2]:
import pandas as pd
from tqdm import tqdm
import numpy as np

from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics as metrics

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm

import random

# Load Data

In [3]:
# Read the per-player, per-matchday score table that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer='./data/final_score.csv')

# Preprocess Data

## Add prev_score column

The smoothing settings used below were chosen after the research in **smoothing_comparison.ipynb**.

In [4]:
# Build the lagged-score features player by player. The per-player frames are
# collected in a list and concatenated once: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop is quadratic.
player_frames = []

for _, df_player in df.groupby('name'):

    # Order each player's rows by matchday so shift(1) means "previous matchday".
    df_player = df_player.sort_values('matchday')

    # Exponentially smoothed score (alpha=0.5), truncated to int, then lagged by
    # one row so a matchday only sees earlier results; the first row gets 0.
    df_player['prev_score_smoothed'] = (
        df_player['final_score']
        .ewm(alpha=0.5, adjust=False)
        .mean()
        .map(int)
        .shift(periods=1, fill_value=0)
    )
    # Plain lag: the score of the player's previous row, 0 for the first row.
    df_player['prev_score'] = df_player['final_score'].shift(periods=1, fill_value=0)

    player_frames.append(df_player)

df_prev_score = pd.concat(player_frames, ignore_index=True)

df = df_prev_score

# Random baseline: a uniformly drawn integer between the smallest and largest
# observed final_score. The bounds are computed once (not once per row) and a
# dedicated, seeded generator keeps the baseline reproducible across runs.
guess_low = int(df['final_score'].min())
guess_high = int(df['final_score'].max())
guess_rng = random.Random(42)
df['guessed_score'] = [guess_rng.randint(guess_low, guess_high) for _ in df.index]

## Min-Max-Scaling

In [5]:
# Columns rescaled to the [0, 1] range.
scaled_columns = ['prev_score', 'prev_score_smoothed', 'guessed_score', 'odds_win', 'odds_draw', 'odds_lose']

# NOTE(review): the scaler is fitted on every row of df, including the matchdays
# that are later used as the test set - confirm this is acceptable.
# fit_transform returns a bare array, so the frame is rebuilt with df's own index;
# the join below aligns on index labels and would otherwise misalign silently
# whenever df does not carry a default 0..n-1 index.
df_scaled = pd.DataFrame(
    MinMaxScaler().fit_transform(df[scaled_columns]),
    columns=scaled_columns,
    index=df.index,
)

# Swap the raw columns for their scaled versions (they end up as the last columns).
df = df.drop(columns=scaled_columns).join(df_scaled)

## One-Hot-Encoding

In [6]:
# Categorical columns that are expanded into one indicator column per value.
# The helper functions further down read columns such as 'position_goalkeeper',
# which presumably come from this expansion of 'position' - verify against the data.
one_hot_columns = ['club_id', 'position']
df = pd.get_dummies(df, columns=one_hot_columns)

# Calculate accuracies for different models

## Load helper methods

In [8]:
def calculate_best_lineup(df, score_column):
    """Return the eleven-player lineup with the highest total of ``score_column``.

    Every formation in ``possible_lineups`` (defenders, midfielders, attackers)
    is filled with one goalkeeper plus the top-ranked players per position
    according to ``score_column``. The formation with the largest
    ``score_column`` total wins; on ties the earlier formation is kept.

    In the returned frame the ``final_score`` of the captain (the player with
    the highest ``final_score`` in that lineup) is doubled. The input frame is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate players. Must contain ``final_score``, ``score_column`` and
        the indicator columns ``position_goalkeeper``, ``position_defender``,
        ``position_midfielder`` and ``position_attacker``.
    score_column : str
        Column used to rank players and to compare formations.

    Returns
    -------
    pandas.DataFrame
        Rows of the winning lineup. A real lineup is always returned, even when
        every formation totals zero or less; the previous placeholder frame had
        no ``final_score`` column and broke callers in that case.
    """
    possible_lineups = [[3,4,3], [3,5,2], [4,2,4], [4,3,3], [4,4,2], [4,5,1], [5,3,2], [5,4,1], [5,2,3], [3,3,4]]

    # The goalkeeper pick does not depend on the formation, so compute it once.
    # `== True` is kept so the filter works for both boolean and 0/1 indicators.
    df_goalkeeper = df.loc[df['position_goalkeeper'] == True].nlargest(1, score_column, keep='first')

    best_lineup = None
    best_total = None

    for number_of_defender, number_of_midfielder, number_of_attacker in possible_lineups:
        df_defender = df.loc[df['position_defender'] == True].nlargest(number_of_defender, score_column, keep='first')
        df_midfielder = df.loc[df['position_midfielder'] == True].nlargest(number_of_midfielder, score_column, keep='first')
        df_attacker = df.loc[df['position_attacker'] == True].nlargest(number_of_attacker, score_column, keep='first')

        # concat builds a new frame, so doubling the captain below leaves `df`
        # and the hoisted goalkeeper frame untouched.
        df_lineup = pd.concat([df_goalkeeper, df_defender, df_midfielder, df_attacker])

        # NOTE(review): the captain is chosen by the actual final_score even when
        # the lineup was ranked by a prediction column - confirm this is intended.
        captain_id = df_lineup['final_score'].idxmax()
        df_lineup.at[captain_id, 'final_score'] = df_lineup.at[captain_id, 'final_score'] * 2

        lineup_total = df_lineup[score_column].sum()
        # Accept the first formation unconditionally; afterwards only a strictly
        # better total replaces the current best.
        if best_total is None or lineup_total > best_total:
            best_lineup = df_lineup
            best_total = lineup_total

    return best_lineup

def calculate_lineup_accuracies(df, column):
    """Compare lineups picked by ``column`` with the best lineup per matchday.

    For every matchday in ``df`` two lineups are built with
    ``calculate_best_lineup``: one ranked by ``column`` and one ranked by
    ``final_score``. Both are then valued by the sum of their ``final_score``.

    Parameters
    ----------
    df : pandas.DataFrame
        Player rows with a ``matchday`` column plus everything
        ``calculate_best_lineup`` needs.
    column : str
        Column holding the score used to pick the predicted lineup.

    Returns
    -------
    dict
        ``MAE_Lineup`` / ``Std_Lineup``: mean and standard deviation of
        (best - predicted) lineup points per matchday.
        ``Mean_%_from_Best_Lineup`` / ``Std_%_from_Best_Lineup``: mean and
        standard deviation of predicted / best, rounded to two decimals per
        matchday before aggregating.
    """
    # Collect one record per matchday and build the frame once; DataFrame.append
    # was removed in pandas 2.0 and growing a frame row by row is quadratic.
    records = []

    for matchday, df_matchday in df.groupby('matchday'):
        df_predicted_lineup = calculate_best_lineup(df_matchday, column)
        df_best_lineup = calculate_best_lineup(df_matchday, 'final_score')

        records.append({
            'Matchday': matchday,
            'Predicted': df_predicted_lineup['final_score'].sum(),
            'Best': df_best_lineup['final_score'].sum(),
        })

    df_results = pd.DataFrame(records, columns=['Matchday', 'Predicted', 'Best'])

    df_results['Difference'] = df_results['Best'] - df_results['Predicted']
    df_results['points_in_%'] = round(df_results['Predicted'] / df_results['Best'], 2)

    return {
        'MAE_Lineup': df_results['Difference'].mean(),
        'Std_Lineup': df_results['Difference'].std(),
        'Mean_%_from_Best_Lineup': df_results['points_in_%'].mean(),
        'Std_%_from_Best_Lineup': df_results['points_in_%'].std(),
    }


def calculate_regression_accuracies(y, yhat):
    """Score predictions ``yhat`` against the true values ``y``.

    Returns a dict with the keys ``MAE`` (mean absolute error), ``MSE`` (mean
    squared error), ``RMSE`` (square root of the MSE) and ``R2`` (coefficient
    of determination), all computed with ``sklearn.metrics``.
    """
    squared_error = metrics.mean_squared_error(y, yhat)
    return {
        'MAE': metrics.mean_absolute_error(y, yhat),
        'MSE': squared_error,
        'RMSE': np.sqrt(squared_error),
        'R2': metrics.r2_score(y, yhat),
    }

def get_models():
    """Return a list of freshly constructed estimators with default settings.

    Order: linear regression, logistic regression, decision tree, random
    forest, k-nearest neighbours, support vector machine.

    NOTE(review): LogisticRegression and svm.SVC are classifiers while the
    other four are regressors - confirm they are meant to be in this list.
    """
    return [
        LinearRegression(),       # linear regression model
        LogisticRegression(),     # logistic regression model
        DecisionTreeRegressor(),  # decision tree model
        RandomForestRegressor(),  # random forest model
        KNeighborsRegressor(),    # k-nearest neighbours model
        svm.SVC(),                # support vector machine model
    ]

In [9]:
# Hold-out set: every matchday after matchday 28.
df_test = df[df['matchday'] > 28]

# Baseline "models": each is simply a column of df that is used as the prediction.
baseline_columns = ['prev_score', 'prev_score_smoothed', 'guessed_score']

# NOTE(review): these baseline columns were min-max scaled in the scaling cell
# while final_score was not, so the regression metrics (MAE, MSE, RMSE, R2)
# compare values on different scales - confirm before interpreting them.
# The lineup metrics only use the columns for ranking and are unaffected.
baseline_records = []

for baseline_column in baseline_columns:
    regression_accuracies = calculate_regression_accuracies(df_test['final_score'], df_test[baseline_column])
    lineup_accuracies = calculate_lineup_accuracies(df_test, baseline_column)
    accuracies = {**regression_accuracies, **lineup_accuracies}
    accuracies['model'] = baseline_column
    baseline_records.append(accuracies)

# One row per baseline, built in a single step (DataFrame.append was removed in
# pandas 2.0); the row order follows baseline_columns.
df_model_accuracies = pd.DataFrame(baseline_records)

## Compare model accuracies

In [10]:
# Show the key metrics per baseline, best share of the optimal lineup's points first.
(
    df_model_accuracies[[
        'model',
        'Mean_%_from_Best_Lineup',
        'Std_%_from_Best_Lineup',
        'MAE_Lineup',
        'Std_Lineup',
        'MAE',
        'R2',
    ]]
    .sort_values('Mean_%_from_Best_Lineup', ascending=False)
)

Unnamed: 0,model,Mean_%_from_Best_Lineup,Std_%_from_Best_Lineup,MAE_Lineup,Std_Lineup,MAE,R2
1,prev_score_smoothed,0.511667,0.131972,3391.833333,868.321235,183.378721,-1.455029
0,prev_score,0.453333,0.151745,3872.166667,1289.940993,183.424827,-1.456041
2,guessed_score,0.35,0.111355,4525.833333,962.586602,183.177352,-1.451708
