In [135]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [136]:
def find_unique_most_recent(df, id_name):
    """Return the most recent row for every distinct value of ``id_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'date'`` column parseable by ``pd.to_datetime`` and
        the column named by ``id_name``. The frame is NOT modified.
    id_name : str
        Name of the column identifying the entity (e.g. a team id).

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``id_name`` value, ordered from newest to oldest
        date. The ``'date'`` column is datetime-typed and the index holds each
        row's position in the date-descending ordering of the input.
    """
    # assign() builds a new frame, so the caller's 'date' column is left as-is
    # (the previous in-place assignment silently rewrote the input).
    newest_first = (
        df.assign(date=pd.to_datetime(df['date']))
          .sort_values('date', ascending=False)
          .reset_index(drop=True)
    )
    # After the descending sort, the first occurrence of an id is its newest row.
    return newest_first.drop_duplicates(subset=id_name, keep='first')

In [137]:
def filter_into_pl_and_non_pl(path='Match.csv', country_id=1729):
    """Load the matches table and split it by country.

    Parameters
    ----------
    path : str, optional
        CSV file holding the matches; must contain a ``country_id`` column.
        Defaults to ``'Match.csv'`` in the working directory.
    country_id : int, optional
        Country whose matches form the first returned frame. The default,
        1729, is the id this notebook treats as "PL".

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(matches_pl, matches_non_pl)``: the rows whose ``country_id`` equals
        ``country_id`` and all remaining rows, in that order.
    """
    matches = pd.read_csv(path)
    is_pl = matches['country_id'] == country_id
    return matches[is_pl], matches[~is_pl]

In [138]:
def construct_input(matches, teams_attr):
    """Build the feature matrix and outcome labels for a set of matches.

    Parameters
    ----------
    matches : pandas.DataFrame
        Needs the columns ``match_api_id``, ``home_team_api_id``,
        ``away_team_api_id``, ``home_team_goal`` and ``away_team_goal``.
    teams_attr : pandas.DataFrame
        One row per team: a ``team_api_id`` column plus numeric attribute
        columns. ``team_api_id`` may be at any column position.

    Returns
    -------
    x : pandas.DataFrame
        Indexed by ``match_api_id``. Columns are the home team's attributes
        (prefixed ``home_``) followed by the away team's (prefixed ``away_``);
        ``team_api_id`` itself is not a feature.
    y : pandas.Series
        Sign of ``home_team_goal - away_team_goal``: 1, 0 or -1, carrying the
        index of ``matches``.

    Raises
    ------
    ValueError
        If a team appearing in ``matches`` has no row, or more than one row,
        in ``teams_attr``.
    """
    match_ids = matches['match_api_id']
    home_teams = matches['home_team_api_id']
    away_teams = matches['away_team_api_id']
    # Shape printout kept from the original implementation.
    print(home_teams.shape)
    print(away_teams.shape)

    # Collapse the goal difference to its sign: home win / draw / away win.
    y = np.sign(matches['home_team_goal'].sub(matches['away_team_goal']))

    # Select the id column by name instead of assuming it is column 0.
    feature_cols = [col for col in teams_attr.columns if col != 'team_api_id']
    attrs = teams_attr.set_index('team_api_id')[feature_cols]

    # Only teams that actually play in `matches` have to be well-formed.
    wanted = pd.Index(home_teams).append(pd.Index(away_teams)).unique()
    attrs = attrs[attrs.index.isin(wanted)]
    if attrs.index.has_duplicates:
        dupes = attrs.index[attrs.index.duplicated()].unique().tolist()
        raise ValueError(f'teams_attr has several rows for team ids: {dupes}')
    missing = wanted.difference(attrs.index)
    if len(missing) > 0:
        raise ValueError(f'teams_attr has no row for team ids: {missing.tolist()}')

    # One vectorised lookup per side replaces the per-match scan of teams_attr.
    home_values = attrs.reindex(home_teams.to_numpy()).to_numpy(dtype=float)
    away_values = attrs.reindex(away_teams.to_numpy()).to_numpy(dtype=float)

    columns = (['home_' + col for col in feature_cols]
               + ['away_' + col for col in feature_cols])
    x = pd.DataFrame(np.hstack([home_values, away_values]),
                     index=match_ids, columns=columns)
    return x, y

In [139]:
def split_matches_data(m_pl, m_npl, random_state=None):
    """Split matches into training, cross-validation and test sets.

    ``m_pl`` is divided 20% / 40% / 40% into train / validation / test
    portions. The training set is every row of ``m_npl`` plus the 20% train
    portion of ``m_pl``, shuffled together. Rows are removed from ``m_pl`` by
    index label, so its index is expected to be unique.

    Parameters
    ----------
    m_pl : pandas.DataFrame
        Matches to be divided across all three sets.
    m_npl : pandas.DataFrame
        Matches used for training only.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for the sampling. ``None`` (default) keeps the
        previous unseeded behaviour.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        ``(m_tr, m_pl_cv, m_pl_te)``.
    """
    pl_tr = 0.2
    pl_cv = 0.4
    pl_te = 0.4

    # One generator shared by every draw makes the whole split reproducible
    # from a single integer seed.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    m_pl_tr = m_pl.sample(frac = pl_tr, random_state = rng)
    remaining = m_pl.drop(m_pl_tr.index)
    # The validation share is expressed relative to what is left after
    # removing the training rows.
    m_pl_cv = remaining.sample(frac = (pl_cv / (pl_cv + pl_te)), random_state = rng)
    m_pl_te = remaining.drop(m_pl_cv.index)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    m_tr = pd.concat([m_npl, m_pl_tr]).sample(frac = 1, random_state = rng)

    return m_tr, m_pl_cv, m_pl_te
    

In [140]:
def get_teams_attr():
    """Load the latest attribute record of every team.

    Reads ``Team_Attributes.csv``, keeps the most recent row per
    ``team_api_id`` (via ``find_unique_most_recent``) and returns only the id
    column followed by the build-up, chance-creation and defence columns
    listed below.
    """
    raw_attributes = pd.read_csv('Team_Attributes.csv')
    latest = find_unique_most_recent(raw_attributes, 'team_api_id')

    build_up = ['buildUpPlaySpeed',
                'buildUpPlayDribbling',
                'buildUpPlayPassing']
    chance_creation = ['chanceCreationPassing',
                       'chanceCreationCrossing',
                       'chanceCreationShooting']
    defence = ['defencePressure',
               'defenceAggression',
               'defenceTeamWidth']

    return latest[['team_api_id'] + build_up + chance_creation + defence]

In [141]:
def load_data_all():
    """Load the data and return scaled train / validation / test sets.

    Matches are loaded and split with ``filter_into_pl_and_non_pl`` and
    ``split_matches_data``; features and labels are built with
    ``construct_input`` from the attributes returned by ``get_teams_attr``.

    Returns
    -------
    tuple
        ``(x_tr, y_tr, x_cv, y_cv, x_te, y_te)``. Each ``x_*`` is a DataFrame
        of min-max scaled features with default integer row and column
        labels; each ``y_*`` is the label Series from ``construct_input``.
    """
    matches_pl, matches_non_pl = filter_into_pl_and_non_pl()
    teams_attr = get_teams_attr()
    m_tr, m_cv, m_te = split_matches_data(matches_pl, matches_non_pl)
    x_tr, y_tr = construct_input(m_tr, teams_attr)
    x_cv, y_cv = construct_input(m_cv, teams_attr)
    x_te, y_te = construct_input(m_te, teams_attr)

    # Fit the scaler on the training data ONLY and reuse its min/max for the
    # other sets. Re-fitting per split (as before) maps the same raw value to
    # different scaled values in each set and leaks held-out statistics.
    min_max_scaler = preprocessing.MinMaxScaler()
    x_tr = pd.DataFrame(min_max_scaler.fit_transform(x_tr))
    x_cv = pd.DataFrame(min_max_scaler.transform(x_cv))
    x_te = pd.DataFrame(min_max_scaler.transform(x_te))

    return x_tr, y_tr, x_cv, y_cv, x_te, y_te

In [142]:
# Build the train / validation / test features and labels. The shapes shown in
# the cell output are printed by construct_input (two lines per split).
x_tr, y_tr, x_cv, y_cv, x_te, y_te = load_data_all()

(23197,)
(23197,)
(1216,)
(1216,)
(1216,)
(1216,)


In [130]:
# Write every split to its own CSV file in the working directory.
# index_label = False leaves the header field for the index column out.
for split_data, csv_name in (
    (x_tr, 'x_train.csv'),
    (y_tr, 'y_train.csv'),
    (x_cv, 'x_val.csv'),
    (y_cv, 'y_val.csv'),
    (x_te, 'x_test.csv'),
    (y_te, 'y_test.csv'),
):
    split_data.to_csv(csv_name, index_label = False)