In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import matplotlib.pyplot as plt
import torchvision as T
import torch
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from torch.utils.data import Dataset, DataLoader
import math
import itertools
import os
import sys
root_path = os.path.abspath(os.path.join('../..')) # <- adjust such that root_path always points at the root project dir (i.e. if current file is two folders deep, use '../..'). 
if root_path not in sys.path:
    sys.path.append(root_path)
import database_server.db_utilities as dbu 
import pickle as pkl
from sklearn.decomposition import PCA
from Help_functions import preprocess, game_dict, inputs, inputs_2seas, club_dict, points_and_co, points_and_co_oppon, data_to_lstm, Sport_pred_2LSTM_1, predict
import random

In [2]:
# Seed PyTorch, the stdlib `random` module and NumPy (in that order) with one
# shared value so that repeated runs of the notebook draw the same numbers.
SEED = 69420
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(SEED)

In [3]:
# One row per `matchstats` record, enriched with the schedule columns of the
# matching `matches` row and the wage columns of the matching `teamwages` row
# (joined on team id AND season). LEFT JOINs keep match-stat rows that have no
# schedule or wage entry; their joined columns are NULL in that case.
# `annual_wage_player_avg` is computed by the database as annual wages / n_players.
# NOTE(review): if both wage columns are integers, some SQL dialects truncate
# that division, and n_players = 0 would fail or yield NULL — confirm against the schema.
# Rows come back newest first (schedule date, then time, descending).
query_str = """
SELECT ms.*, 
       m.schedule_date, m.schedule_time, m.schedule_round, m.schedule_day,
       w.annual_wages_eur AS annual_wage_team, 
       w.weekly_wages_eur AS weekly_wages_eur,
       w.annual_wages_eur/w.n_players AS annual_wage_player_avg
FROM matchstats ms 
LEFT JOIN matches m ON ms.match_id = m.id
LEFT JOIN teamwages w ON ms.team_id = w.team_id
AND     ms.season_str = w.season_str
ORDER BY m.schedule_date DESC, m.schedule_time DESC; 
"""

# Execute the query through the project's database helper; later cells treat the
# result as the raw input for `preprocess`.
df_allinfo = dbu.select_query(query_str)

In [4]:
# Run the project's `preprocess` helper on the raw query result. Later cells read
# `.data_frame` and call `.return_dicts(...)` on the returned object.
new_data_test = preprocess(df_allinfo)

In [5]:
# Work on a copy so the preprocessed frame inside `new_data_test` stays untouched.
scale_df = new_data_test.data_frame.copy()

# Look the venue codes up once instead of on every loop iteration
# (assumes `return_dicts("venue")` returns the same mapping on every call).
venue_codes = new_data_test.return_dicts("venue")
is_home_row = scale_df.venue == venue_codes["Home"]
is_away_row = scale_df.venue == venue_codes["Away"]

# "stad_capac" = attendance of a match divided by the largest attendance ever
# recorded at a home match of the hosting team. It is filled in for the host's
# own row (venue == Home) and for the visitor's row of the same fixture
# (venue == Away and opponent_id == host). Rows matching neither stay NaN.
for team in scale_df.team_id.unique():
    home_rows = is_home_row & (scale_df.team_id == team)
    visitor_rows = is_away_row & (scale_df.opponent_id == team)

    # Series.max() skips NaN and returns NaN for an empty selection, whereas the
    # builtin max() gives an order-dependent answer when NaNs are present and
    # raises ValueError when the team has no home match at all.
    max_attend = scale_df.loc[home_rows, "attendance"].max()

    scale_df.loc[home_rows, "stad_capac"] = scale_df.loc[home_rows, "attendance"] / max_attend
    scale_df.loc[visitor_rows, "stad_capac"] = scale_df.loc[visitor_rows, "attendance"] / max_attend

In [6]:
# Columns that must NOT be standardised: identifiers, categorical codes, schedule
# information and the raw result / goal columns.
liste = ['schedule_time', 'schedule_round', 'schedule_day', 'result', 'gf', 'ga', 'xg', 'xga', 'formation', 
         'referee', 'season_str', 'league_id', 'team_id', 'opponent_id', 'match_id', 'id', 'fbref_id', 
         'home_team_id', 'away_team_id', 'schedule_date', 'venue', 'captain',]

# Keep the frame's own column order. The previous set difference produced a
# hash-dependent order, so the feature order stored inside the fitted scaler
# changed from one kernel session to the next.
_not_scaled = set(liste)
cols_to_scale = [col for col in scale_df.columns if col not in _not_scaled]

# NOTE(review): the scaler is fitted on every row, i.e. before `game_dict` splits
# the data into training and validation games further down.
object_ = StandardScaler()

scale_df.loc[:,cols_to_scale] = object_.fit_transform(scale_df.loc[:,cols_to_scale])
scale_df.head()

Unnamed: 0,venue,result,gf,ga,xg,xga,attendance,captain,formation,referee,...,misc_aerialduels_lost,misc_aerialduels_won_perc,schedule_date,schedule_time,schedule_round,schedule_day,annual_wage_team,weekly_wages_eur,annual_wage_player_avg,stad_capac
0,0,2,3,3,1.8,0.6,-0.840228,1027,2,1,...,-0.984829,1.958156,2023-06-04,1,38,0,-0.809407,-0.809407,-0.547866,-1.513961
1,1,1,2,3,1.4,1.1,-0.216427,63,2,2,...,-1.837538,2.666886,2023-06-04,1,38,0,-0.875627,-0.875628,-0.649831,0.900104
2,1,0,2,1,0.7,1.2,-0.348198,162,2,3,...,-0.13212,-2.33153,2023-06-04,1,38,0,-0.520861,-0.520861,0.119906,0.991026
3,0,1,2,5,1.2,4.0,-0.561956,314,0,4,...,-0.984829,0.466091,2023-06-04,1,38,0,-0.3895,-0.3895,-0.097483,0.701977
4,0,2,1,1,2.5,1.6,-0.712704,171,3,5,...,-0.416357,-0.186687,2023-06-04,1,38,0,-0.751744,-0.751744,-0.441241,-0.962527


In [7]:
def _one_hot_frame(encoder, source, prefixes):
    """
    Fit ``encoder`` on ``source`` and return its one-hot codes as an int DataFrame.

    Parameters
    ----------
    encoder : an unfitted ``OneHotEncoder``; it is fitted in place.
    source : DataFrame holding the categorical columns to encode.
    prefixes : list of str, one prefix per column of ``source``, used to build
        the output column names.

    Returns
    -------
    DataFrame that shares ``source``'s index, so a column-wise concat with the
    originating frame lines up row by row whatever index that frame carries.
    """
    codes = encoder.fit_transform(source).toarray()
    # `get_feature_names` was removed from newer scikit-learn releases in favour
    # of `get_feature_names_out`; use whichever the installed version offers.
    if hasattr(encoder, "get_feature_names_out"):
        feature_names = encoder.get_feature_names_out(prefixes)
    else:
        feature_names = encoder.get_feature_names(prefixes)
    return pd.DataFrame(codes, columns = feature_names, index = source.index).astype(int)


# One-hot columns for both team roles, appended to the right of the frame.
# NOTE: re-running this cell appends the same columns a second time.
ohe_team = OneHotEncoder()
to_ohe_team = scale_df.loc[:, ["team_id", "opponent_id"]]
scale_df = pd.concat([scale_df,
                      _one_hot_frame(ohe_team, to_ohe_team, ['team_id', 'opponent_id'])], axis=1)

##########################################
# Same treatment for the league identifier.
ohe_ligue = OneHotEncoder()
to_ohe_ligue = scale_df.loc[:,["league_id"]]
scale_df = pd.concat([scale_df,
                      _one_hot_frame(ohe_ligue, to_ohe_ligue, ['league_id'])], axis=1)


In [8]:
def save_object(obj, save_name):
    """
    Pickle ``obj`` into the file ``<save_name>.pkl``.

    ``save_name`` is used as given (it may contain a directory part); the
    ``.pkl`` extension is appended and an existing file is overwritten.
    No directory is created.
    """
    target_file = f"{save_name}.pkl"
    with open(target_file, 'wb') as handle:
        pkl.dump(obj, handle)

def load_data_prep_object(obj_name):
    """
    Unpickle and return the object stored in ``<obj_name>.pkl``.

    Raises
    ------
    ValueError
        If ``<obj_name>.pkl`` is not an existing file.

    Note: unpickling executes code embedded in the file, so only load pickles
    this project wrote itself.
    """
    pickle_path = f"{obj_name}.pkl"
    if os.path.isfile(pickle_path):
        with open(pickle_path, 'rb') as handle:
            return pkl.load(handle)
    raise ValueError(f"Data prep object file '{obj_name}.pkl' does not exist.")
            
def do_pca(df, perc_var, col_name1 = "shooting_standard_gls", col_name2 = 'misc_aerialduels_won_perc'):
    """
    Replace the column block ``col_name1 .. col_name2`` of ``df`` with principal components.

    The transformed component matrix (not the fitted PCA) is cached in
    ``pca_matchstats.pkl`` in the working directory. A usable cache is taken as
    is, in which case ``perc_var`` has no effect; delete the file to force a
    refit.

    Parameters
    ----------
    df : DataFrame containing the label range ``col_name1:col_name2``.
    perc_var : passed to ``PCA(n_components=...)`` when the PCA has to be fitted.
    col_name1, col_name2 : first and last column (inclusive) of the block to replace.

    Returns
    -------
    A new DataFrame without the original block. The components are appended as
    new columns that reuse the first ``k - 1`` original column names plus the
    last one (``k`` = number of components), so a ``col_name1:col_name2`` label
    slice still selects exactly the component block.
    """
    try:
        pcs_matchstat = load_data_prep_object("./pca_matchstats")
    except (ValueError, OSError, EOFError, pkl.UnpicklingError):
        # Missing or unreadable cache: fall through and recompute. A bare
        # `except:` here would also swallow KeyboardInterrupt and real bugs.
        pcs_matchstat = None

    # A cache written for a different number of rows cannot be written back
    # into `df`, so treat it as stale.
    cached_rows = getattr(pcs_matchstat, "shape", (None,))[0]
    if pcs_matchstat is None or cached_rows != len(df):
        x = df.loc[:,col_name1:col_name2].fillna(0)
        pca_matchstat = PCA(n_components = perc_var)
        pcs_matchstat = pca_matchstat.fit_transform(x)
        save_object(pcs_matchstat, "pca_matchstats")

    principal_ms_df = pd.DataFrame(data = pcs_matchstat, columns = [f"feature_{p}" for p in range(pcs_matchstat.shape[1])])
    num_pcs = principal_ms_df.shape[1]
    print(num_pcs)
    columns_to_overwrite = list(df.loc[:, col_name1:col_name2].columns)
    df = df.drop(labels = columns_to_overwrite, axis = "columns")
    # Keep the first and the last original name so label slices still work.
    new_cols = columns_to_overwrite[:num_pcs-1] + columns_to_overwrite[-1:]
    print(len(new_cols))
    df.loc[:, new_cols] = principal_ms_df.values
    return df

In [9]:
# Swap the match-stat column block for principal components. 0.95 is handed to
# `do_pca` as `perc_var`, which it forwards to PCA's `n_components` when it has
# to fit. The two numbers printed below come from the prints inside `do_pca`.
scale_df_pca = do_pca(scale_df, 0.95)

66
66


In [10]:
# Build `clubs` from the PCA-reduced frame with the project's `club_dict` helper
# (presumably one entry per club — confirm in Help_functions).
clubs = club_dict(scale_df_pca)

In [11]:
# Mapping used for the "result" column; later code reads the keys "D" and "L" from it.
result_dict = new_data_test.return_dicts("result")
# Let the project helper update `clubs` using that mapping (presumably adds
# points-related features — confirm in Help_functions).
clubs = points_and_co(clubs, result_dict)

In [12]:
# Second helper pass over `clubs` with the same result mapping (presumably the
# opponent-side counterpart of `points_and_co` — confirm in Help_functions).
clubs = points_and_co_oppon(clubs, result_dict)

In [13]:
# Split the games of the PCA-reduced frame into a training and a validation
# collection; how the split is made is decided inside `game_dict`.
game_train, game_valid = game_dict(scale_df_pca)

In [14]:
# Split the column names of `scale_df` right after "annual_wage_player_avg":
# `abc` is everything up to and including that column, `defg` everything that
# follows it.
abcdefg = scale_df.columns.tolist()
abc = abcdefg[: abcdefg.index("annual_wage_player_avg") + 1]
defg = abcdefg[len(abc):]

# Desired feature order handed to `inputs_2seas` below.
rearange_list = ['result', 'gf', 'ga', 'goal_diff', 'xg', 'xga', 'shooting_standard_gls',
       'shooting_standard_sh', 'shooting_standard_sot',
       'shooting_standard_sot_perc', 'shooting_standard_g_per_sh',
       'shooting_standard_g_per_sot', 'shooting_standard_dist',
       'shooting_standard_fk', 'shooting_standard_pk',
       'shooting_standard_pkatt', 'shooting_expected_npxg',
       'shooting_expected_npxg_per_sh', 'shooting_expected_g_minus_xg',
       'shooting_expected_npg_minus_xg', 'keeper_performance_sota',
       'keeper_performance_saves', 'keeper_performance_save_perc',
       'keeper_performance_cs', 'keeper_performance_psxg',
       'keeper_performance_psxg_plus_minus', 'keeper_penaltykicks_pkatt',
       'keeper_penaltykicks_pka', 'keeper_penaltykicks_pksv',
       'keeper_penaltykicks_pkm', 'keeper_launched_cmp', 'keeper_launched_att',
       'keeper_launched_cmp_perc', 'keeper_passes_att', 'keeper_passes_thr',
       'keeper_passes_launch_perc', 'keeper_passes_avglen',
       'keeper_goalkicks_att', 'keeper_goalkicks_launch_perc',
       'keeper_goalkicks_avglen', 'keeper_crosses_opp', 'keeper_crosses_stp',
       'keeper_crosses_stp_perc', 'keeper_sweeper_number_opa',
       'keeper_sweeper_avgdist', 'passing_total_cmp', 'passing_total_att',
       'passing_total_cmp_perc', 'passing_total_totdist',
       'passing_total_prgdist', 'passing_short_cmp', 'passing_short_att',
       'passing_short_cmp_perc', 'passing_medium_cmp', 'passing_medium_att',
       'passing_medium_cmp_perc', 'passing_long_cmp', 'passing_long_att',
       'passing_long_cmp_perc', 'passing_attacking_ast',
       'passing_attacking_xag', 'passing_attacking_xa', 'passing_attacking_kp',
       'passing_attacking_1_per_3', 'passing_attacking_ppa',
       'passing_attacking_crspa', 'passing_attacking_prgp',
       'passing_types_passtypes_live', 'passing_types_passtypes_dead',
       'passing_types_passtypes_fk', 'passing_types_passtypes_tb',
       'misc_aerialduels_won_perc','attendance', 'points', 'mean_points',
       'weekly_wages_eur', 'season_str',  'league_id', 'venue', 'team_id', 'oppon_wages',
       'opponent_id', 'last_results', 'oppon_points', 'oppon_mean_points', 'schedule_round',
        'captain', 'formation', 'referee',  'match_id', 'schedule_date', 'schedule_time',
        'schedule_day', 'annual_wage_team', 'annual_wage_player_avg',]

# Splice every name in `defg` in at the position of "team_id" (which itself is
# dropped by the replacement), then remove the first "opponent_id" entry.
rearange_list = [name
                 for item in rearange_list
                 for name in (defg if item == "team_id" else [item])]
rearange_list.remove("opponent_id")
rearange_list

['result',
 'gf',
 'ga',
 'goal_diff',
 'xg',
 'xga',
 'shooting_standard_gls',
 'shooting_standard_sh',
 'shooting_standard_sot',
 'shooting_standard_sot_perc',
 'shooting_standard_g_per_sh',
 'shooting_standard_g_per_sot',
 'shooting_standard_dist',
 'shooting_standard_fk',
 'shooting_standard_pk',
 'shooting_standard_pkatt',
 'shooting_expected_npxg',
 'shooting_expected_npxg_per_sh',
 'shooting_expected_g_minus_xg',
 'shooting_expected_npg_minus_xg',
 'keeper_performance_sota',
 'keeper_performance_saves',
 'keeper_performance_save_perc',
 'keeper_performance_cs',
 'keeper_performance_psxg',
 'keeper_performance_psxg_plus_minus',
 'keeper_penaltykicks_pkatt',
 'keeper_penaltykicks_pka',
 'keeper_penaltykicks_pksv',
 'keeper_penaltykicks_pkm',
 'keeper_launched_cmp',
 'keeper_launched_att',
 'keeper_launched_cmp_perc',
 'keeper_passes_att',
 'keeper_passes_thr',
 'keeper_passes_launch_perc',
 'keeper_passes_avglen',
 'keeper_goalkicks_att',
 'keeper_goalkicks_launch_perc',
 'keeper_

In [15]:
# Build the model inputs for the training and the validation games with the
# same club data, feature order and frame. The counters printed below (7..37,
# once per call) appear to be progress output from inside `inputs_2seas`.
train_inputs = inputs_2seas(game_train, clubs, rearange_list, scale_df_pca)
valid_inputs = inputs_2seas(game_valid, clubs, rearange_list, scale_df_pca)

7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


In [16]:
# Wrap both input collections with the project's `data_to_lstm`; the results are
# handed straight to `DataLoader` in the next cell.
train_for_loader = data_to_lstm(train_inputs)
valid_for_loader = data_to_lstm(valid_inputs)

In [17]:
# Batches of 32; `drop_last = True` discards a final incomplete batch. No
# `shuffle` argument is given, so both loaders use DataLoader's default order.
train_loader = torch.utils.data.DataLoader(train_for_loader, batch_size = 32, drop_last = True)
# Despite its name, `test_loader` serves the validation data.
test_loader = torch.utils.data.DataLoader(valid_for_loader, batch_size = 32, drop_last = True)

In [18]:
# --- Model, loss and optimiser ------------------------------------------------
len_input = train_inputs[0][0].shape[1]
lr = 2e-5
net = Sport_pred_2LSTM_1(len_input, len_input, 3, 3)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr = lr)


def _evaluate_last_step(model, loader, loss_fn = None):
    """
    Score ``model`` on ``loader`` using only the last time step of each sequence.

    Every batch is ``((input1, input2), (result1, result2))``; both members of
    the pair are scored independently with arg-max over the last axis.

    Parameters
    ----------
    model : network called as ``model(batch)``; its output is indexed ``[:, -1, :]``.
    loader : iterable of batches in the layout above.
    loss_fn : optional criterion; when given, its value on the last time step is
        summed over all scored batches.

    Returns
    -------
    (loss_sum, accuracy_percent); ``accuracy_percent`` is NaN for an empty loader.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    # No gradients are needed for scoring; skipping them saves memory and time.
    with torch.no_grad():
        for (input1, input2), (result1, result2) in loader:
            for batch_inputs, batch_targets in ((input1, result1), (input2, result2)):
                pred = model(batch_inputs)[:,-1,:]
                target = batch_targets[:,-1,:]
                if loss_fn is not None:
                    loss_sum += loss_fn(pred, target).item()
                hits = torch.argmax(pred, dim = 1).eq(torch.argmax(target, dim = 1))
                n_correct += hits.sum().item()
                n_seen += target.shape[0]
    if n_seen == 0:
        return loss_sum, float("nan")
    return loss_sum, 100 * (n_correct / n_seen)


# --- Baseline: accuracy of the untrained network on the validation loader -----
_, accuracy = _evaluate_last_step(net, test_loader)
print(f"Validation accuracy before training: {accuracy}%")

# --- Training -----------------------------------------------------------------
# The histories are created once, outside the loop, so they keep one entry per epoch.
losses_val = []
accuracies = []

for epoch in range(5):
    net.train()
    print(f"Epoch: {epoch}")
    for (input1, input2), (result1, result2) in train_loader:
        # One optimiser step per member of the pair; the loss covers every
        # time step of the sequence, not only the last one.
        for batch_inputs, batch_targets in ((input1, result1), (input2, result2)):
            net.zero_grad()
            loss = criterion(net(batch_inputs).float(), batch_targets.float())
            loss.backward()
            optimizer.step()

    # `loss` is the SUM of the per-batch validation losses, not their mean.
    loss, accuracy = _evaluate_last_step(net, test_loader, criterion)
    losses_val.append(loss)
    accuracies.append(accuracy)

    # One checkpoint per epoch, named after the validation accuracy it reached.
    checkpoint_path = f"./models/sequence_model_pca_2seas/LSTM/{lr}/accur_{round(accuracy, 2)}"
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok = True)
    torch.save(net.state_dict(), checkpoint_path)
    print(f"Loss {loss} accuracy {accuracy}%")
print("done")


Loss accuracy training: 37.362637362637365%
Epoch: 0
Loss 38.20027278363705 accuracy 46.35989010989011%
Epoch: 1
Loss 38.656844317913055 accuracy 46.46291208791209%
Epoch: 2
Loss 40.29108799248934 accuracy 47.25274725274725%
Epoch: 3
Loss 49.67716025561094 accuracy 45.32967032967033%
Epoch: 4
Loss 58.64155311882496 accuracy 44.36813186813187%
done


In [19]:
# --- Model, loss and optimiser ------------------------------------------------
len_input = train_inputs[0][0].shape[1]
lr = 2e-5
net = Sport_pred_2LSTM_1(len_input, len_input, 3, 3)
criterion = torch.nn.MSELoss()
# Use the `lr` variable so the optimiser and the checkpoint path cannot drift apart.
optimizer = torch.optim.Adam(net.parameters(), lr = lr)

# --- Baseline: accuracy of the untrained network on the validation loader -----
# Here both outputs of a pair are combined into ONE prediction by `predict`
# (whichever `predict` is currently bound in the kernel) and compared with the
# last-step label of the first member.
net.eval()
accur = 0
n_seen = 0
with torch.no_grad():  # scoring only, no gradients needed
    for (input1, input2), (result1, result2) in test_loader:
        pred1 = net(input1)
        pred2 = net(input2)
        result = torch.argmax(result1[:,-1,:], dim = 1)
        predicted = predict(pred1, pred2, result_dict)
        accur += predicted.eq(result).sum().item()
        n_seen += result.shape[0]
print(f"Validation accuracy before training: {100 * accur / max(n_seen, 1)}%")

# --- Training -----------------------------------------------------------------
# The histories are created once, outside the loop, so they keep one entry per epoch.
losses_val = []
accuracies = []

for epoch in range(3):
    net.train()
    print(f"Epoch: {epoch}")
    for (input1, input2), (result1, result2) in train_loader:
        # One optimiser step per member of the pair, loss over all time steps.
        for batch_inputs, batch_targets in ((input1, result1), (input2, result2)):
            net.zero_grad()
            loss = criterion(net(batch_inputs).float(), batch_targets.float())
            loss.backward()
            optimizer.step()

    net.eval()
    loss = 0
    accur = 0
    n_seen = 0
    with torch.no_grad():
        for (input1, input2), (result1, result2) in test_loader:
            pred1 = net(input1)
            pred2 = net(input2)
            last1 = result1[:,-1,:]
            last2 = result2[:,-1,:]
            # Sum of the last-step losses of both members. Previously `loss`
            # was never updated here, so 0 was printed and stored every epoch.
            loss += criterion(pred1[:,-1,:].float(), last1.float()).item()
            loss += criterion(pred2[:,-1,:].float(), last2.float()).item()
            result = torch.argmax(last1, dim = 1)
            predicted = predict(pred1, pred2, result_dict)
            accur += predicted.eq(result).sum().item()
            n_seen += result.shape[0]
    losses_val.append(loss)
    accuracy = 100 * (accur / max(n_seen, 1))
    accuracies.append(accuracy)

    # NOTE(review): this path puts `{lr}` before `LSTM`, the previous training
    # cell puts it after — confirm which layout is intended.
    checkpoint_path = f"./models/sequence_model_pca_2seas/{lr}/LSTM/accur_{round(accuracy, 2)}"
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok = True)
    torch.save(net.state_dict(), checkpoint_path)
    print(f"Loss {loss} accuracy {accuracy}%")
print("done")


Loss accuracy training: 30.563186813186814%
Epoch: 0


KeyboardInterrupt: 

In [None]:
def predict(tensor1, tensor2, result_dict):
    """
    Combine the two network outputs of a fixture into one predicted class per sample.

    Note: running this cell rebinds the name `predict` that the import cell
    takes from `Help_functions`.

    Parameters
    ----------
    tensor1, tensor2 : 3-D tensors indexed as ``[:, -1, :]``; only the last
        time step is used.
    result_dict : mapping that provides the class codes under the keys "D" and "L".

    Returns
    -------
    1-D integer tensor with one entry per sample:

    * the arg-max index of ``tensor1`` when the first output is more decisive
      (its top score is farther from its two lowest scores) than the second,
    * ``result_dict["D"]`` when it is not more decisive and the two
      decisiveness values differ by less than 0.02,
    * ``result_dict["L"]`` otherwise.

    Rows whose scores sum to zero produce NaN/inf after normalisation.
    """
    # Only the final time step of each sequence is scored.
    tens1 = tensor1[:,-1,:]
    tens2 = tensor2[:,-1,:]

    # Normalise each row by ITS OWN sum. The second output used to be divided
    # by the sums of the first one (copy-paste slip).
    output1 = tens1 / torch.sum(tens1, dim = 1, keepdim = True)
    output2 = tens2 / torch.sum(tens2, dim = 1, keepdim = True)

    vals1, idx1 = torch.max(output1, dim = 1)
    vals2, _ = torch.max(output2, dim = 1)

    # The two lowest normalised scores of every row.
    small_val1, _ = torch.topk(output1, k = 2, dim = 1, largest = False)
    small_val2, _ = torch.topk(output2, k = 2, dim = 1, largest = False)

    # Decisiveness: squared distance between the top score and the two lowest.
    diff1 = torch.sum(torch.square(vals1.unsqueeze(1) - small_val1), dim = 1)
    diff2 = torch.sum(torch.square(vals2.unsqueeze(1) - small_val2), dim = 1)

    # Explicit tensors instead of Python scalars keep dtype and device aligned
    # with `idx1` on every PyTorch version.
    draw = torch.full_like(idx1, result_dict["D"])
    loss = torch.full_like(idx1, result_dict["L"])
    fallback = torch.where(torch.abs(diff1 - diff2) < 0.02, draw, loss)

    return torch.where(diff1 > diff2, idx1, fallback)

In [None]:
def custom_replace(tensor):
    """
    Return a copy of ``tensor`` in which every 0 became 1 and every 1 became 0.

    Both masks are taken from the untouched input, so the second replacement
    cannot undo the first. All other values are kept; the input is not modified.
    """
    was_zero = tensor == 0
    was_one = tensor == 1
    return tensor.masked_fill(was_zero, 1).masked_fill(was_one, 0)

In [None]:
def simil_metric(tens1):
    """
    Pairwise similarity of the entries within each row of ``tens1``.

    For a ``(B, N)`` input the result has shape ``(B, N, N)`` with
    ``res[b, i, j] = 1 - |tens1[b, i] - tens1[b, j]| / max(tens1[b])``.

    NOTE(review): a 2-D ``(batch, scores)`` input is assumed — confirm with the caller.
    """
    count = torch.abs(tens1.unsqueeze(1) - tens1.unsqueeze(2))
    # Keep the reduced axis and add one more so the per-row maximum has shape
    # (B, 1, 1) and broadcasts over the pairwise matrix of its own row. A plain
    # (B,) divisor is lined up with the LAST axis instead, which either fails
    # or silently divides by the maximum of a different row.
    div, _ = torch.max(tens1, dim = 1, keepdim = True)
    res = 1 - (count / div.unsqueeze(1))
    return res

In [None]:
# Scratch: copy the network output `pred1` (left in the kernel by the
# validation loop), zero every entry above 0.5 and show the resulting shape.
test = pred1.clone()
test[pred1 > 0.5] = 0
test.shape


In [None]:
def compute_distance(tensor):
    """
    Relative gap between column 0 and each of columns 1 and 2.

    For every row the result holds
    ``|c0 - c1| / max(c0, c1)`` and ``|c0 - c2| / max(c0, c2)``,
    stacked along dim 1 (so a ``(B, >=3)`` input gives a ``(B, 2)`` output).
    """
    reference = tensor[:, 0]
    ratios = []
    for column in (1, 2):
        other = tensor[:, column]
        ratios.append(torch.abs(reference - other) / torch.max(reference, other))
    return torch.stack(ratios, dim=1)

In [None]:
# Scratch: add up the two relative gaps returned by `compute_distance` per row.
# NOTE(review): `compute_distance` indexes `[:, k]`, so on a 3-D `pred1` it
# picks time steps rather than class scores — confirm the intended input.
torch.sum(compute_distance(pred1), dim =1 )

In [None]:
# Scratch: display the network output `pred1` left in the kernel by the validation loop.
pred1

In [None]:
# Turn the raw scores in `pred1` into probabilities over the last axis (the axis
# the evaluation loops take their arg-max over after selecting the last time
# step). `torch.nn.Softmax(pred1)` only constructs a module whose `dim` is the
# tensor itself; it never applies softmax to the data.
torch.nn.functional.softmax(pred1, dim = -1)

In [None]:
# Scratch: `result` takes the last time step of the labels but is not used by
# the next line.
result = result1[:,-1,:]
# `result1[:,:2]` slices the SECOND axis and `.sum()` reduces everything to one
# scalar, so this yields a single 0/1 value.
# NOTE(review): possibly meant `result[:, :2].sum(dim = 1) == 1` — confirm.
torch.where(result1[:,:2].sum() == 1, 1,  0)

In [None]:
# Scratch: show the labels of the last time step for the batch left in `result1`.
result1[:,-1,:]

In [None]:
def change_result(tensor):
    """
    Re-encode 3-column targets as 2 columns.

    Every ``[batch, step]`` position whose last column equals 1 has all of its
    values set to 0.5; afterwards only the first two columns are returned.

    Parameters
    ----------
    tensor : 3-D tensor indexed as ``[batch, step, column]``.

    Returns
    -------
    A new float32 tensor of shape ``(batch, step, 2)``; the input is left unchanged.
    """
    # `.float()` hands back the SAME tensor when the input is already float32,
    # so clone first — otherwise the in-place write below alters the caller's data.
    result = tensor.float().clone()
    is_last_column_one = result[:,:,-1] == 1
    result[is_last_column_one] = 0.5
    return result[:,:,:2]

In [None]:
# Scratch version of `change_result`, applied directly to `result1`.
# NOTE: this rebinds and then modifies `result1` in place, so running the cell
# twice (or running later cells afterwards) sees the altered labels.
result1 = result1.float()
# Boolean mask over [batch, step]: True where the last column equals 1.
is_last_column_one = result1[:,:,-1] == 1
result1[is_last_column_one] = 0.5 #torch.full((308, 3), 0.5)
result1[:,-1,:]

In [None]:
# Scratch: overwrite a single element of `result1` in place and read it back.
result1[0,0,0] = 1
result1[0,0,0]

In [None]:
# Scratch: show the rows of `result1` selected by the mask built in an earlier cell.
result1[is_last_column_one]

In [None]:
# Scratch: show the last-time-step labels again after the in-place edits above.
result1[:,-1,:]

In [None]:
# Scratch: run `change_result` on `result1` and show its last time step.
change_result(result1)[:,-1,:]

In [None]:
def change_back(tensor):
    """
    Turn the last-time-step scores of ``tensor`` into three 0/1 indicator columns.

    Columns 0 and 1 are 1 where the corresponding score exceeds 0.6; the third
    column is 1 only where neither of them is set. If both scores exceed 0.6
    the row reads ``[1, 1, 0]``.

    Returns a ``(batch, 3)`` tensor.
    """
    last_step = tensor[:,-1,:]
    col1 = torch.where(last_step[:,0] > 0.6, 1., 0.)
    col2 = torch.where(last_step[:,1] > 0.6, 1., 0.)
    # Third indicator: both flags are 0, i.e. their sum equals the zero vector.
    new = torch.where(torch.zeros(len(last_step)) == col1 + col2, 1., 0.)
    return torch.stack((col1, col2, new), dim = -1)

In [None]:
# Scratch: apply `change_back` to the network output left in `pred1`.
change_back(pred1)

In [None]:
# Scratch: show the last-time-step scores of `pred1`.
pred1[:,-1,:]

In [None]:
# Scratch: take the last-time-step scores and pull out the first two columns.
tensor = pred1[:,-1,:]
col1 = tensor[:,0]
col2 = tensor[:,1]
# This bare expression is not the last line of the cell, so it is not displayed.
col1
# 0/1 indicator of column 1 with a 0.7 cut-off (`change_back` uses 0.6).
torch.where(col2 > 0.7, 1., 0.)

In [None]:
# Scratch: display the first score column extracted in the previous cell.
col1