In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
def find_unique_most_recent(df, id_name):
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date', ascending = False)
    df = df.reset_index(drop = True)

    u, indices = np.unique(df[id_name].values, return_index = True) # we are trying to take the most recent rating
    indices.sort()
    mask_unique = df.index.isin(indices)
    df = df[mask_unique]
    return df

In [3]:
def filter_into_pl_and_non_pl():
    matches = pd.read_csv('Match.csv')
    matches['date'] = pd.to_datetime(matches['date'])
    mask = matches['country_id'] == 1729 # only PL
    matches_pl = matches[mask]
    matches_non_pl = matches[~mask]
    del matches
    return matches_pl, matches_non_pl
    '''
    teams = pd.read_csv('Team.csv')
    pl_teams = np.unique(matches_pl[['home_team_api_id', 'away_team_api_id']].values)
    mask = teams['team_api_id'].isin(pl_teams)
    teams_pl = teams[mask]
    teams_non_pl = teams[~mask]
    
    teams_attr = pd.read_csv('Team_Attributes.csv')
    pl_teams_attr = teams_pl['team_api_id']
    mask = teams_attr['team_api_id'].isin(pl_teams_attr)
    teams_attr_pl = teams_attr[mask]
    teams_attr_non_pl = teams_attr[~mask]
    ta_pl = find_unique_most_recent(teams_attr_pl, 'team_api_id')
    ta_non_pl = find_unique_most_recent(teams_attr_non_pl, 'team_api_id')
    
    players = pd.read_csv('Player.csv')
    '''

In [4]:
def construct_labels(player_attr, match_dates, player_ids):
    print('going')
    labels = []
    for j, date in enumerate(match_dates):
        if (j % 1000 == 0):
            print(j)
        player_id = player_ids.iloc[j]
        player_data = player_attr.loc[player_id]
        a = player_data.index.get_loc(date, method = 'nearest')
        label = player_data.iloc[a].name
        labels.append((player_id, label))
    return labels

In [5]:
def construct_input(matches, player_attr, columns_keep):
    match_ids = matches['match_api_id']
    match_dates = matches['date']
    goal_dif = matches['home_team_goal'].sub(matches['away_team_goal'])
    goal_dif[goal_dif < 0] = -1
    goal_dif[goal_dif > 0] = 1
    positions = ['home_player_1',
                 'home_player_2',
                 'home_player_3',
                 'home_player_4',
                 'home_player_5',
                 'home_player_6',
                 'home_player_7',
                 'home_player_8',
                 'home_player_9',
                 'home_player_10',
                 'home_player_11',
                 'away_player_1',
                 'away_player_2',
                 'away_player_3',
                 'away_player_4',
                 'away_player_5',
                 'away_player_6',
                 'away_player_7',
                 'away_player_8',
                 'away_player_9',
                 'away_player_10',
                 'away_player_11']
    columns = []
    for pos in positions:
        arr = [pos + '_' + column_name for column_name in columns_keep[2:]]
        columns.append(arr)
    
    player_series = [matches[pos] for pos in positions]
    columns_len = len(positions) * (len(player_attr.columns) - 1)
    column_attrs = []
    print('%s columns and %s rows' % (len(positions), len(match_dates)))
    
    from multiprocess import Pool
    pool = Pool()
    for i in range(0, 20, 4):
        print(i)
        task1 = pool.apply_async(construct_labels, [player_attr, match_dates, player_series[i]])
        task2 = pool.apply_async(construct_labels, [player_attr, match_dates, player_series[i+1]])
        task3 = pool.apply_async(construct_labels, [player_attr, match_dates, player_series[i+2]])
        task4 = pool.apply_async(construct_labels, [player_attr, match_dates, player_series[i+3]])
        print('yeet')
        labels1 = task1.get()
        labels2 = task2.get()
        labels3 = task3.get()
        labels4 = task4.get()
        attr1 = player_attr.loc[labels1].reset_index(drop = True)
        attr2 = player_attr.loc[labels2].reset_index(drop = True)
        attr3 = player_attr.loc[labels3].reset_index(drop = True)
        attr4 = player_attr.loc[labels4].reset_index(drop = True)
        for j, column in enumerate(attr1):
            column_attrs.append(attr1[column].rename(columns[i][j]))
            column_attrs.append(attr2[column].rename(columns[i+1][j]))
            column_attrs.append(attr3[column].rename(columns[i+2][j]))
            column_attrs.append(attr4[column].rename(columns[i+3][j]))
            
    labels1 = construct_labels(player_attr, match_dates, player_series[20])
    labels2 = construct_labels(player_attr, match_dates, player_series[21])
    attr1 = player_attr.loc[labels1].reset_index(drop = True)
    attr2 = player_attr.loc[labels2].reset_index(drop = True)
    for j, column in enumerate(attr1):
        column_attrs.append(attr1[column].rename(columns[20][j]))
        column_attrs.append(attr2[column].rename(columns[21][j]))
    
    x = pd.concat(column_attrs, axis = 1).set_index(match_ids)
    y = goal_dif
    return x, y

In [6]:
def split_matches_data(m_pl, m_npl):
    pl_tr = 0.2
    pl_cv = 0.4
    pl_te = 0.4
    m_pl_tr = m_pl.sample(frac = pl_tr)
    m_pl = m_pl.drop(m_pl_tr.index)
    m_pl_cv = m_pl.sample(frac = (pl_cv / (pl_cv + pl_te)))
    m_pl = m_pl.drop(m_pl_cv.index)
    m_pl_te = m_pl
    
    m_tr = m_npl.append(m_pl_tr).sample(frac = 1)
    
    return m_tr, m_pl_cv, m_pl_te

In [7]:
def get_player_attr():
    df = pd.read_csv('Player_Attributes.csv')
    columns_keep = ['player_api_id',
                    'date',
                    'overall_rating',
                    'potential',
                    'crossing',
                    'finishing',
                    'heading_accuracy',
                    'short_passing',
                    'volleys',
                    'dribbling',
                    'curve',
                    'free_kick_accuracy',
                    'long_passing',
                    'ball_control',
                    'acceleration',
                    'sprint_speed',
                    'agility',
                    'reactions',
                    'balance',
                    'shot_power',
                    'jumping',
                    'stamina',
                    'strength',
                    'long_shots',
                    'aggression',
                    'interceptions',
                    'positioning',
                    'vision',
                    'penalties',
                    'marking',
                    'standing_tackle',
                    'sliding_tackle',
                    'gk_diving',
                    'gk_handling',
                    'gk_kicking',
                    'gk_positioning',
                    'gk_reflexes']
    df = df[columns_keep]
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date', ascending = False)
    df = df.set_index(['player_api_id', 'date'])
    df = df.dropna()
    return df, columns_keep

In [8]:
def load_data_all():
    matches_pl, matches_non_pl = filter_into_pl_and_non_pl()
    player_attr, columns_keep = get_player_attr()
    m_tr, m_cv, m_te = split_matches_data(matches_pl, matches_non_pl)
    x_tr, y_tr = construct_input(m_tr, player_attr, columns_keep)
    x_cv, y_cv = construct_input(m_cv, player_attr, columns_keep)
    x_te, y_te = construct_input(m_te, player_attr, columns_keep)
    
    '''
    min_max_scaler = preprocessing.MinMaxScaler()
    x_tr = pd.DataFrame(min_max_scaler.fit_transform(x_tr), index = m_tr['match_api_id'], columns = x_tr.columns)
    x_cv = pd.DataFrame(min_max_scaler.fit_transform(x_cv), index = m_cv['match_api_id'], columns = x_cv.columns)
    x_te = pd.DataFrame(min_max_scaler.fit_transform(x_te), index = m_te['match_api_id'], columns = x_te.columns)
    '''
    
    return x_tr, y_tr, x_cv, y_cv, x_te, y_te

In [9]:
x_tr, y_tr, x_cv, y_cv, x_te, y_te = load_data_all()

22 columns and 16673 rows
0
yeet
4
yeet
8
yeet
12
yeet
16
yeet
going
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
going
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
22 columns and 1094 rows
0
yeet
4
yeet
8
yeet
12
yeet
16
yeet
going
0
1000
going
0
1000
22 columns and 1093 rows
0
yeet
4
yeet
8
yeet
12
yeet
16
yeet
going
0
1000
going
0
1000


In [11]:
x_tr.to_csv('x_train.csv')
y_tr.to_csv('y_train.csv')
x_cv.to_csv('x_val.csv')
y_cv.to_csv('y_val.csv')
x_te.to_csv('x_test.csv')
y_te.to_csv('y_test.csv')

In [None]:
from multiprocess import Pool
with Pool() as pool:
    a = pool.apply_async(hey, (1, 2))
    print('2')
    r = a.get()
r

In [None]:
def hey(a,b):
    return 'hey'

In [12]:
x_cv

Unnamed: 0_level_0,home_player_1_overall_rating,home_player_2_overall_rating,home_player_3_overall_rating,home_player_4_overall_rating,home_player_1_potential,home_player_2_potential,home_player_3_potential,home_player_4_potential,home_player_1_crossing,home_player_2_crossing,...,away_player_10_gk_diving,away_player_11_gk_diving,away_player_10_gk_handling,away_player_11_gk_handling,away_player_10_gk_kicking,away_player_11_gk_kicking,away_player_10_gk_positioning,away_player_11_gk_positioning,away_player_10_gk_reflexes,away_player_11_gk_reflexes
match_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1024484,84.0,76.0,80.0,80.0,86.0,82.0,80.0,83.0,11.0,76.0,...,8.0,7.0,7.0,12.0,7.0,5.0,7.0,8.0,5.0,8.0
1724330,85.0,74.0,78.0,82.0,85.0,84.0,78.0,82.0,13.0,67.0,...,1.0,13.0,1.0,15.0,1.0,6.0,1.0,11.0,1.0,14.0
1025473,84.0,83.0,80.0,79.0,86.0,84.0,86.0,83.0,19.0,68.0,...,10.0,8.0,11.0,9.0,13.0,5.0,8.0,11.0,7.0,15.0
1229186,75.0,67.0,76.0,68.0,79.0,75.0,78.0,77.0,15.0,57.0,...,12.0,10.0,13.0,11.0,12.0,6.0,14.0,9.0,12.0,15.0
1229488,81.0,76.0,78.0,75.0,83.0,76.0,80.0,77.0,12.0,79.0,...,5.0,13.0,9.0,15.0,13.0,6.0,15.0,11.0,8.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475066,82.0,79.0,81.0,81.0,83.0,79.0,81.0,85.0,25.0,72.0,...,5.0,9.0,9.0,11.0,13.0,13.0,15.0,15.0,8.0,15.0
1474327,80.0,80.0,81.0,81.0,83.0,80.0,82.0,81.0,25.0,73.0,...,7.0,27.0,15.0,25.0,9.0,31.0,5.0,33.0,13.0,37.0
840060,80.0,76.0,77.0,77.0,84.0,78.0,83.0,79.0,14.0,76.0,...,6.0,9.0,10.0,10.0,8.0,5.0,15.0,7.0,15.0,8.0
1988907,76.0,75.0,72.0,74.0,76.0,78.0,74.0,74.0,15.0,74.0,...,15.0,13.0,12.0,15.0,12.0,6.0,15.0,11.0,9.0,14.0
