**Install python dependencies**

In [None]:
!pip install -q -r ./dependencies/requirements.txt

**Load python libraries**

In [None]:
# To get a clearly laid out notebook
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, r2_score
from sklearn.dummy import DummyClassifier
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import matplotlib.pyplot as plt
import seaborn as sns

**Load Data**

In [None]:
# Read the input features and the matching labels from the local data folder.
X = pd.read_csv(filepath_or_buffer='./data/inputs_important.csv')
y = pd.read_csv(filepath_or_buffer='./data/label_important.csv')

**Drop Betting Odds and Position**

In [None]:
# Remove the three odds columns and the normalized position column from the inputs.
# Not idempotent: re-running this cell without reloading X raises a KeyError,
# because the columns are already gone.
X = X.drop(
    columns=['odds_win', 'odds_draw', 'odds_lose', 'position_normalized'],
)

**Split data into train and test sets**

In [None]:
# Hold out 30 % of the rows as a test set; the fixed seed keeps the shuffled
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

**Descriptive Analytics**

In [None]:
# Overview of the training features: column names, non-null counts and dtypes.
X_train.info()

In [None]:
# Relative frequency of each `is_home` value in the training set.
print(X_train['is_home'].value_counts(normalize=True))

# Absolute counts as a bar chart. Count on the column itself: the `subset`
# argument of DataFrame.value_counts expects column labels, not a Series of values.
X_train['is_home'].value_counts().plot.bar(title='Rows per is_home value', ylabel='count')

In [None]:
# Summary statistics of the `weighted_occurences` column in the training set.
X_train.weighted_occurences.describe()

In [None]:
# Histogram of `weighted_occurences` with hand-picked, non-uniform bin edges.
# Values outside the outermost edges (below 0 or above 20) are not drawn.
X_train.hist(
    column=['weighted_occurences'],
    bins=[0, 1, 5, 10, 15, 20],
    figsize=(15, 5),
)
plt.show()

In [None]:
# Number of training rows per `team_name_normalized` value, ordered by the value itself.
# Count on the column directly: the `subset` argument of DataFrame.value_counts
# expects column labels, not a Series of values.
X_train['team_name_normalized'].value_counts().sort_index().plot.bar(
    title='Rows per team_name_normalized value',
    ylabel='count',
)

In [None]:
# Number of training rows per `event_type_normalized` value, ordered by the value itself.
# Count on the column directly: the `subset` argument of DataFrame.value_counts
# expects column labels, not a Series of values.
X_train['event_type_normalized'].value_counts().sort_index().plot.bar(
    title='Rows per event_type_normalized value',
    ylabel='count',
)

In [None]:
# Pairwise correlation between the training features, drawn as an annotated heatmap.
plt.figure(figsize=(12, 10))
sns.heatmap(
    data=X_train.corr(),
    cmap=plt.cm.Reds,
    annot=True,
)
plt.show()

In [None]:
# Relative frequency of each label value in the training targets.
y_train.value_counts(normalize=True)

**Create Dummy Classifier**

In [None]:
# Baseline: cross-validate a dummy model that always predicts the most frequent
# label. Any real model has to beat the scores printed here.
dummy = DummyClassifier(strategy='most_frequent')

# The folds are shuffled, so seed them; without random_state the fold
# assignment (and therefore the printed scores) changes on every run.
dm_cv_score = cross_val_score(
    dummy,
    X_train,
    y_train,
    cv=StratifiedKFold(shuffle=True, random_state=42),
)
print(dm_cv_score)         # score of each individual fold
print(dm_cv_score.mean())  # average over all folds

**Fit Decision Tree and predict on the test set**

In [None]:
# Fit a regression tree on the training split and predict the held-out test split.
# NOTE(review): the baseline in this notebook is a classifier, while this model is a
# regressor scored with R² - confirm that regression is the intended setup.

# Seed the tree: scikit-learn permutes the features at every split, so ties between
# equally good splits are otherwise broken differently from run to run.
dt = tree.DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Round to the nearest integer before casting. A bare astype(int) truncates
# toward zero, so a prediction of 1.9 would become 1 instead of 2.
y_pred = dt.predict(X_test).round().astype(int)

y_pred

In [None]:
# Scores noted by hand for different feature sets
# (presumably R² values from earlier runs of this cell - verify before relying on them):
#   0.38  without position and odds
#   0.24  without odds
#   0.13  without position
#   0.10  with all features

r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Linear regression on the final-score table.
# NOTE: this cell rebinds X, y, X_train, X_test, y_train, y_test and y_pred, so
# every cell below works on the final-score data, not on the inputs loaded earlier.
df = pd.read_csv('./data/final_scores.csv')

# One-hot encode the categorical columns. `columns=` is passed explicitly because
# get_dummies otherwise only converts object/string/category columns and would let a
# numeric column (e.g. an integer club_id) through unchanged as an ordinary number.
columns_to_encode = ['club_id', 'name', 'position']
df_one_hot = pd.get_dummies(df[columns_to_encode], columns=columns_to_encode)

# Swap the raw categorical columns for their indicator columns.
df = df.drop(columns_to_encode, axis=1).join(df_one_hot)

# Target is the final score; every remaining column is used as a feature.
y = df['final_score']
X = df.drop('final_score', axis='columns')

# Same 70/30 seeded split as used for the first data set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

# R² of the raw (continuous) predictions on the held-out split.
r2_score(y_test, y_pred)

In [None]:
# Re-score the linear model with its predictions snapped to whole numbers.
# Round first: astype(int) on its own truncates toward zero (2.9 -> 2), which
# shifts every prediction towards zero and distorts the score.
y_pred = lr.predict(X_test).round().astype(int)

r2_score(y_test, y_pred)

**Auto SKLearn**

In [None]:
import sklearn
from autosklearn.experimental.askl2 import AutoSklearn2Classifier

# Automated model search with an overall budget of 600 and a per-run limit of 130
# (presumably seconds - confirm against the auto-sklearn documentation).
automl = AutoSklearn2Classifier(
    time_left_for_this_task=600,
    per_run_time_limit=130,
)

# NOTE(review): X_train / y_train are whatever the most recently executed split cell
# produced. Run top to bottom, that is the final-score data - confirm a classifier
# is intended for that target.
automl.fit(X_train, y_train)
y_hat = automl.predict(X_test)

print("Accuracy score", sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_hat))

# Show the models in the final ensemble together with their weights.
automl.get_models_with_weights()