In [None]:
import sys
sys.path.append('../..')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from ai import get_team_code
from functools import  partial

## Load and format data

In [None]:
# Load the processed EPL results and keep only the columns used below,
# renamed to short snake_case names.
X = pd.read_csv('../data/processed/epl.csv')
X = X[["HomeTeam", "AwayTeam", "FTHG", "FTAG", "Date", "Time"]]
X = X.rename(columns={"HomeTeam": "home", "AwayTeam": "away", "FTHG": "home_goals", "FTAG": "away_goals", "Date": "date", "Time": "time"})
X['date'] = pd.to_datetime(X['date'])
X = X.dropna(how='all')  # Drop rows with all null values

# Fill missing times from the next available row, then from the previous one
# for any trailing gaps. Series.bfill()/ffill() replace the deprecated
# fillna(method=...) spelling and fill identically.
X['time'] = X['time'].bfill().ffill()
# Parse "HH:MM" strings and keep only the time-of-day component.
X['time'] = pd.to_datetime(X['time'], format='%H:%M').dt.time
X = X.sort_values(by=['date'])


# Bind the league so team names can be mapped with a single-argument callable.
team_code = partial(get_team_code, 'epl')
X['home'] = X['home'].apply(team_code)
X['away'] = X['away'].apply(team_code)

In [None]:
# Preview the first rows of the formatted match table.
X.head()

In [None]:
# Check the dtype of every column of X.
X.dtypes

In [None]:
# Print the column list with non-null counts and dtypes for X.
X.info()


In [None]:
# Read the team-ranking table, indexed by (date, ranking), letting pandas
# parse the index as dates.
team_rank_df = pd.read_csv(
    '../data/processed/epl_team_ranking.csv',
    index_col=['date', 'ranking'],
    parse_dates=True,
)
# Pass every column label through team_code so the labels use the same
# encoding as X['home'] / X['away'].
team_rank_df.columns = list(map(team_code, team_rank_df.columns))
# 'date' is an index level here; sort_values accepts index level names.
team_rank_df = team_rank_df.sort_values(by='date')

In [None]:
# Preview the first rows of the team-ranking table.
team_rank_df.head()

In [None]:
# Work on a copy so the experiments below leave team_rank_df untouched.
team_rank_cp = team_rank_df.copy()
# Disabled experiment: round-trip the index through reset_index / set_index.
# team_rank_cp = team_rank_cp.reset_index()
# team_rank_cp = team_rank_cp.set_index(['date', 'ranking'])

In [None]:
# Preview the first rows of the working copy.
team_rank_cp.head()

In [None]:
# Attempt an as-of join of the matches in X to the team-ranking table, keyed on
# `date` and grouped by `home`. The result is only displayed, not assigned.
# NOTE(review): team_rank_cp is indexed by ('date', 'ranking') and its column
# labels are the team_code-mapped headers, so it does not appear to have
# `date` or `home` columns. This call likely raises as written; confirm, and
# reshape the ranking table before relying on the join.
pd.merge_asof(X, team_rank_cp,on='date', by='home')

In [None]:
# Descriptive summary statistics for X.
X.describe()

In [None]:
# Print the column list with non-null counts and dtypes for X.
X.info()

In [None]:
# Check the dtype of every column of X.
X.dtypes

In [None]:
# (rows, columns) of the match table.
X.shape

In [None]:
# Preview the first rows of X.
X.head()

In [None]:
# Target label: the sign of (home_goals - away_goals), i.e. 1 when the home
# side scored more, 0 when level, -1 when the away side scored more.
y = np.sign(X['home_goals'].sub(X['away_goals']))

In [None]:
# Preview the first target labels.
y.head()

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle, and therefore every downstream score, reproducible across
# Restart & Run All; without it each run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Preview the first rows of the training split.
X_train.head()

In [None]:
# Preview a one-hot encoding of the home/away team columns (result is only
# displayed, not assigned). get_dummies joins prefix and value with
# prefix_sep='_' by default, so the prefixes must not carry their own trailing
# underscore; otherwise the columns come out as "home__<code>".
pd.get_dummies(X, columns=['home', 'away'], prefix=['home', 'away']).head()

## KNN Model Analysis

In [None]:
# Disabled experiment: fit KNeighborsClassifier for n_neighbors = 1..10 and
# plot training vs. test accuracy against n_neighbors.
# NOTE(review): X_train still carries the raw date/time columns, and the
# get_dummies preview earlier in the notebook is not assigned, so fitting
# presumably fails until numeric features are built. Confirm before re-enabling.
# from sklearn.neighbors import KNeighborsClassifier
# import matplotlib.pyplot as plt

# training_accuracy = []
# test_accuracy = []

# # try neighbors from 1 to 10
# neighbors_list = range(1,11)

# for n_neighbors in neighbors_list:
#     # build model
#     clf = KNeighborsClassifier(n_neighbors=n_neighbors)
#     clf.fit(X_train, y_train)
#     # Record training set accuracy
#     training_accuracy.append(clf.score(X_train, y_train))
#     # record generalized accuracy
#     test_accuracy.append(clf.score(X_test, y_test))

# plt.plot(neighbors_list, training_accuracy, label="training accuracy")
# plt.plot(neighbors_list, test_accuracy, label="test accuracy")
# plt.ylabel("Accuracy")
# plt.xlabel('n_neighbors')
# plt.legend()
# plt.show()

In [None]:
# Load two raw season files (1993-1994 and 2021-2022) with default read_csv
# settings so their columns can be compared.
raw_df, raw_df2 = map(
    pd.read_csv,
    (
        '../data/raw/epl/season_1993-1994.csv',
        '../data/raw/epl/season_2021-2022.csv',
    ),
)

In [None]:
# Stack the two season frames row-wise. Row labels are kept as-is
# (ignore_index is not set), and columns are aligned by name.
season_frames = [raw_df, raw_df2]
join_df = pd.concat(season_frames, axis="index")

In [None]:
# (rows, columns) of the combined frame.
join_df.shape

In [None]:
# Inspect the Time column of the combined frame.
join_df['Time']

In [None]:
# Count the Time values still missing after a backward fill, i.e. rows with no
# later non-null Time to copy from. Series.bfill() replaces the deprecated
# fillna(method='bfill') spelling and fills identically.
join_df['Time'].bfill().isna().sum()