**Install Python dependencies**

In [2]:
!pip install -q -r ./dependencies/requirements.txt

**Load Python libraries**

In [3]:
import pandas as pd
from tqdm import tqdm
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_val_predict, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn import tree

import matplotlib.pyplot as plt
import seaborn as sns

**Load Data**

In [26]:
# Goal of the feature set built in this cell:
#   prev_score, is_home, club (one hot), odds, position (one hot) ---> magic ---> final_score
df = pd.read_csv('./data/final_scores.csv')
odds_columns = ['odds_win', 'odds_draw', 'odds_lose']

# Build `prev_score` per player: an exponentially weighted mean (alpha=0.5) of the
# player's final_score over matchdays, truncated to int and shifted by one matchday so
# that a row only sees scores from earlier matchdays. The first matchday gets 0.
#
# The per-player frames are collected in a list and concatenated once:
# `DataFrame.append` no longer exists in pandas >= 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
player_frames = []
for _, df_player in df.groupby('name'):
    # `matchday` becomes the index here and is discarded by `ignore_index=True` below,
    # so it does not end up among the feature columns.
    df_player = df_player.sort_values('matchday').set_index('matchday')
    df_player['prev_score'] = (
        df_player.final_score
        .ewm(alpha=0.5, adjust=False)
        .mean()
        .map(int)
        .shift(periods=1, fill_value=0)
    )
    player_frames.append(df_player)

df_prev_score = pd.concat(player_frames, ignore_index=True)

# One-hot encode the categorical columns and drop the player name, which is only
# needed for the grouping above.
df = df_prev_score
df = pd.get_dummies(data=df, columns=['club_id', 'position'])
df = df.drop('name', axis=1)
# df = df.drop(odds_columns, axis=1)  # optional: leave the betting odds out of the features

# Target is the final score; every remaining column is a feature.
y = df['final_score']
X = df.drop('final_score', axis='columns')

X

Unnamed: 0,is_home,odds_win,odds_draw,odds_lose,prev_score,club_id_45bfb2cb-63e8-583a-b80f-0e0ab042076c,club_id_4b2b1b8b-a72c-5732-b3b0-fb9ef6459716,club_id_4f9c979f-4120-5d67-b1fa-b0e90aeeaba8,club_id_57a1fdd6-7909-5651-bab2-8f7a70219d06,club_id_5df62268-f904-5f9b-8de4-1eb87d07d4e2,...,club_id_cf742159-06c0-554e-8479-a1a5700445bb,club_id_d80f9ee5-1188-5302-a790-e61352fc45d4,club_id_d9470353-f1e4-5c01-a882-c69422a604b6,club_id_e6c49a1f-35e4-5c77-b845-c71eb4dea7ce,club_id_f00a785b-acfa-5dc4-aecb-93f365778e10,club_id_fd8cfc66-dd84-575a-af56-a5d987ca63e4,position_attacker,position_defender,position_goalkeeper,position_midfielder
0,False,3.19,3.25,2.34,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,True,6.99,4.82,1.43,220,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,False,3.66,3.56,2.02,202,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,False,5.07,4.18,1.64,246,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,True,2.03,3.62,3.61,224,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7305,True,4.29,3.61,1.85,214,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
7306,False,3.88,3.61,1.94,262,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
7307,True,7.52,4.79,1.42,258,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
7308,False,2.92,3.16,2.59,155,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# Hold out 30% of the rows as a test set; the fixed seed makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [None]:
# Annotated heatmap of the pairwise correlations between the training features.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    X_train.corr(),
    annot=True,
    cmap=plt.cm.Reds,
    ax=ax,
)
plt.show()

In [None]:
# Share of training rows for each distinct value of the target (y_train holds final_score).
y_train.value_counts(normalize=True)

In [None]:
# Baseline: cross-validate an unconstrained decision-tree regressor on the training split.
# Both the tree and the shuffled CV splitter are seeded (same seed as the train/test
# split) so the printed scores are identical on every Restart & Run All.
# NOTE(review): StratifiedKFold stratifies on y, i.e. it treats every distinct
# final_score as a class. For a regression target a plain KFold may be what is
# intended - confirm before relying on these numbers.
dt = tree.DecisionTreeRegressor(random_state=42)
dt_cv_score = cross_val_score(
    dt,
    X_train,
    y_train,
    cv=StratifiedKFold(shuffle=True, random_state=42),
)
print(dt_cv_score)         # one score per fold (the estimator's default scorer)
print(dt_cv_score.mean())  # average over the folds