In [None]:
import pandas as pd
import numpy as np
import tqdm

# Lift pandas' column-count display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

In [None]:
# Read the per-player match records and add a whitespace-free "<name>_<team>" key
# that later cells use to label feature columns.
df = pd.read_csv('../data/player_data_kevat.csv')
df['nameTeamCombined'] = (df['name'] + '_' + df['team']).str.replace(' ', '')
df

In [None]:
# Show every record for this player.
df.loc[df['name'].eq('Toivonen Jaakko')]

In [None]:
# One row per distinct player/team key, kept in order of first appearance.
unique_combos = (
    df.drop_duplicates(subset='nameTeamCombined')[['nameTeamCombined']]
    .reset_index(drop=True)
)
unique_combos

In [None]:
# Each player/team key gets two feature labels: "<key>_attack" and "<key>_defense".
attack_cols = unique_combos['nameTeamCombined'].map(lambda key: f"{key}_attack").rename(None)
defense_cols = unique_combos['nameTeamCombined'].map(lambda key: f"{key}_defense").rename(None)

# Layout matters downstream: all attack labels first, then all defense labels in the same order.
all_cols = [*attack_cols.tolist(), *defense_cols.tolist()]

# Empty frame whose columns define the feature space.
input_df = pd.DataFrame(columns=all_cols)
input_df

In [None]:
# Placeholder frame for the regression target (goals scored); populated by the match loop.
target_df = pd.DataFrame(columns=['goals_scored'])
target_df

In [None]:
# Distinct match identifiers; the displayed length is the number of matches in the data.
match_ids = df['match_id'].unique()
len(match_ids)

In [None]:
# Encode every match as two training rows:
#   1) home side attacking / away side defending -> target is home_goals
#   2) away side attacking / home side defending -> target is away_goals
#
# Rows are collected in plain lists and the frames are built once at the end.
# Appending with `.loc[len(df)] = ...` copies the frame on every iteration and,
# worse, re-running the cell would silently append every match a second time.
feature_cols = list(input_df.columns)
col_pos = {col: pos for pos, col in enumerate(feature_cols)}


def _encode_side(attackers, defenders):
    """Return a 0/1 indicator row over `feature_cols`.

    The `<player>_attack` column is set for each attacker and the
    `<player>_defense` column for each defender; everything else stays 0.
    Players without a matching column are ignored.
    """
    row = np.zeros(len(feature_cols), dtype=int)
    for player in attackers:
        pos = col_pos.get(player + '_attack')
        if pos is not None:
            row[pos] = 1
    for player in defenders:
        pos = col_pos.get(player + '_defense')
        if pos is not None:
            row[pos] = 1
    return row


feature_rows = []
goal_rows = []

for match_id in tqdm.tqdm(match_ids):

    match_df = df[df['match_id'] == match_id]
    # Match-level fields are repeated on every player row, so the first row is enough.
    home_team = match_df['home_team'].iloc[0]
    away_team = match_df['away_team'].iloc[0]

    home_players = set(match_df.loc[match_df['team'] == home_team, 'nameTeamCombined'])
    away_players = set(match_df.loc[match_df['team'] == away_team, 'nameTeamCombined'])

    feature_rows.append(_encode_side(home_players, away_players))
    goal_rows.append(match_df['home_goals'].iloc[0])

    feature_rows.append(_encode_side(away_players, home_players))
    goal_rows.append(match_df['away_goals'].iloc[0])

# Rebuild both frames from scratch so the cell is idempotent.
input_df = pd.DataFrame(feature_rows, columns=feature_cols)
target_df = pd.DataFrame({'goals_scored': goal_rows})

In [None]:
# Inspect the assembled design matrix (two rows per match, one column per player role).
input_df

In [None]:
# Inspect the regression target (goals scored by the attacking side of each row).
target_df

### Ridge regression for player impact

In [None]:
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Ridge (L2-penalised) linear regression; alpha=100.0 is the penalty strength and
# fit_intercept=False fits the model without an intercept term.
clf = Ridge(alpha=100.0, fit_intercept=False,)

In [None]:
# Fit on a 1-D target. Passing the one-column DataFrame makes scikit-learn treat the
# problem as multi-output and return `coef_` with shape (1, n_features); the later
# cells index `clf.coef_` as a flat vector (`clf.coef_[i]`, `clf.coef_[:k]`), which
# is only valid for the 1-D shape (n_features,).
# The float casts make the dtypes explicit in case the frames hold object-typed values.
clf.fit(input_df.astype(float), target_df['goals_scored'].astype(float))

In [None]:
# Raw fitted coefficients, in the same order as input_df.columns.
clf.coef_

In [None]:
# Distribution of every fitted coefficient (attack and defense together).
plt.hist(np.ravel(clf.coef_), bins=50);

In [None]:
# Feature columns are laid out as [all attack | all defense], so the first half of
# the coefficient vector is the attack block. Flatten first so the slice is taken
# over features even if `coef_` is 2-D, and derive the split point from the data
# instead of hard-coding the player count.
coefs = np.ravel(clf.coef_)
n_players = coefs.size // 2
plt.hist(coefs[:n_players], bins=50)
plt.title('Attack coefficients');

In [None]:
# Feature columns are laid out as [all attack | all defense], so the second half of
# the coefficient vector is the defense block. Flatten first so the slice is taken
# over features even if `coef_` is 2-D, and derive the split point from the data
# instead of hard-coding the player count.
coefs = np.ravel(clf.coef_)
n_players = coefs.size // 2
plt.hist(coefs[n_players:], bins=50)
plt.title('Defense coefficients');

In [None]:
# Feature with the largest coefficient overall, followed by the coefficient itself.
largest_pos = np.argmax(clf.coef_)
print(input_df.columns[largest_pos])
print(clf.coef_.max())

In [None]:
# Feature with the smallest coefficient overall, followed by the coefficient itself.
smallest_pos = np.argmin(clf.coef_)
print(input_df.columns[smallest_pos])
print(clf.coef_.min())

In [None]:
# Show every record for this player.
df.loc[df['name'].eq('Keskinen Antti')]

In [None]:
# Show every record for this player.
df.loc[df['name'].eq('Kazmi Muhammad Mujtaba')]

In [None]:
# Best attacker: largest coefficient within the attack block only.
# The attack block is the first half of the flattened coefficient vector
# (columns are [all attack | all defense]); the split is derived, not hard-coded.
coefs = np.ravel(clf.coef_)
n_players = coefs.size // 2
attack_coefs = coefs[:n_players]
print(input_df.columns[np.argmax(attack_coefs)])
print(np.max(attack_coefs))

In [None]:
# Show every record for this player.
df.loc[df['name'].eq('Chekurov Alexander')]

In [None]:
# Rank players inside each block of the flattened coefficient vector
# (columns are [all attack | all defense]; the split point is derived from the data).
coefs = np.ravel(clf.coef_)
n_players = coefs.size // 2

# Attack: the 20 largest coefficients, best first.
top_indices_attack = np.argsort(coefs[:n_players])[-20:][::-1]
# Defense: the 20 smallest coefficients, lowest first. Indices are relative to the
# start of the defense block, so add the block offset before indexing columns.
top_indices_defense = np.argsort(coefs[n_players:])[:20]

In [None]:
# Column labels of the highest-ranked attack coefficients, best first.
print("Top 20 attack players:")
print(input_df.columns[top_indices_attack])

In [None]:
# `top_indices_defense` is relative to the defense block, which starts halfway
# through the columns ([all attack | all defense]); derive that offset from the
# frame instead of hard-coding the player count.
print("Top 20 defense players:")
defense_offset = input_df.shape[1] // 2
display(input_df.columns[top_indices_defense + defense_offset])

In [None]:
# Show every record for this player.
df.loc[df['name'].eq('Eiskop Hindrek')]

In [None]:
# Show every record for this player.
df.loc[df['name'].eq('Vikström Valtteri')]

In [None]:
# Show every record for this player.
df.loc[df['name'].eq('Hurme Tommi')]

In [None]:
# Values of the 20 lowest defense coefficients, lowest first.
# The defense block is the second half of the flattened coefficient vector;
# the offset is derived from the data rather than hard-coded.
coefs = np.ravel(clf.coef_)
n_players = coefs.size // 2
coefs[np.argsort(coefs[n_players:])[:20] + n_players]

In [None]:
# Scratch arithmetic. NOTE(review): what 22 and 8 stand for is not recorded in the
# notebook — add a sentence of context or delete this cell.
22/8

In [None]:
# Scratch arithmetic. NOTE(review): what 7 and 4 stand for is not recorded in the
# notebook — add a sentence of context or delete this cell.
7/4

In [None]:
# Positions of every feature column whose label contains this player key
# (plain substring test, so any column containing the text is included).
substring = 'ToivonenJaakko'
matches = [i for i, col in enumerate(input_df.columns) if substring in col]
matches

In [None]:
# Report the matched coefficients, labelled by each column's own suffix.
# Taking matches[0] as attack and matches[1] as defense is only right when exactly
# two columns match; a player key that appears under several teams yields
# [attack, attack, ..., defense, defense, ...] and would be mislabelled.
# Flattening also keeps scalar indexing valid if `coef_` is 2-D.
coefs = np.ravel(clf.coef_)
show_column = len(matches) > 2  # disambiguate only when more than one team matched

print(f"{substring}")
for idx in matches:
    column = input_df.columns[idx]
    tag = f" ({column})" if show_column else ""
    if column.endswith('_attack'):
        print(f"Attacking coefficient{tag}: {coefs[idx]}")
    elif column.endswith('_defense'):
        print(f"Defending coefficient{tag}: {coefs[idx]}")

In [None]:
# Positions of every feature column whose label contains this team name
# (plain substring test over the "<name>_<team>_<role>" labels).
substring = 'Trikiinit'
matches = [i for i, col in enumerate(input_df.columns) if substring in col]

In [None]:
# Each player contributes one attack and one defense column, so half the number of
# matched columns is the number of matched players (shown as a float because of `/`).
len(matches) / 2

In [None]:
# Split the matched columns into attack and defense by their label suffix rather
# than by position in `matches`, and read coefficients from a flattened vector so
# each value is a scalar (required by the `:.2f` format) even if `coef_` is 2-D.
coefs = np.ravel(clf.coef_)
matched = [(input_df.columns[idx], coefs[idx]) for idx in matches]

# (column label, coefficient) pairs, in column order.
attack_list = [(label, value) for label, value in matched if label.endswith('_attack')]
defense_list = [(label, value) for label, value in matched if label.endswith('_defense')]

print(f"{substring}")
print()
print(f"Attacking coefficients")
for label, value in attack_list:
    print(f"{label}: {value:.2f}")

print()
print(f"Defensive coefficients")
for label, value in defense_list:
    print(f"{label}: {value:.2f}")

In [None]:
# One row per matched player with both coefficients side by side.
# Labels look like "<name>_<team>_attack"; splitting twice from the right recovers
# the name for any team, instead of slicing off a fixed 17 characters (which only
# equals len("_Trikiinit_attack")).
# NOTE(review): assumes team names contain no underscore — confirm against the data.
player_names = [label.rsplit('_', 2)[0] for label, _coef in attack_list]

aux_df = pd.DataFrame({
    'name': player_names,
    'team': [substring] * len(player_names),
    'attack_coef': [coefficient for _label, coefficient in attack_list],
    # Relies on defense_list being in the same player order as attack_list.
    'defense_coef': [coefficient for _label, coefficient in defense_list],
})
aux_df

In [None]:
# Attack vs. defense coefficient for the selected team, labelled by player name.
# The y axis is reversed, so lower defense coefficients are drawn higher up.
fig = px.scatter(
    aux_df,
    x='attack_coef',
    y='defense_coef',
    text='name',
).update_yaxes(autorange="reversed")
fig

In [None]:
# Feature labels, in the same order as the fitted coefficients.
input_df.columns

In [None]:
# Split each attack label "<name>_<team>_attack" into its parts.
# Attack labels are the first half of the columns ([all attack | all defense]);
# the count is derived from the frame instead of hard-coding the number of players.
n_players = input_df.shape[1] // 2
aux_list = input_df.columns[:n_players].str.split('_').to_list()
aux_list

In [None]:
# Per-player table: name, team, attack coefficient, defense coefficient, and their
# difference (attack minus defense).
# Coefficients are read from a flattened vector so slicing is over features even if
# `coef_` is 2-D. Row i of `aux_list` is attack column i; the same player's defense
# column sits `defense_start` positions later because columns are
# [all attack | all defense]. The offset is derived rather than hard-coded.
coefs = np.ravel(clf.coef_)
defense_start = coefs.size // 2
n_rows = len(aux_list)

names = [parts[0] for parts in aux_list]
teams = [parts[1] for parts in aux_list]

coeff_df = pd.DataFrame({
    'name': names,
    'team': teams,
    'attack_coef': coefs[:n_rows],
    'defense_coef': coefs[defense_start:defense_start + n_rows],
})
coeff_df['coef_diff'] = coeff_df['attack_coef'] - coeff_df['defense_coef']
coeff_df

In [None]:
# Two-level label used to colour the scatter plot: the highlighted team vs. everyone else.
is_highlighted = coeff_df['team'] == 'Trikiinit'
coeff_df['color_name'] = np.where(is_highlighted, 'Trikiinit', 'other')

In [None]:
# Attack vs. defense coefficient for every player, coloured by `color_name`.
# The y axis is reversed, so lower defense coefficients are drawn higher up.
league_fig = px.scatter(
    coeff_df,
    x='attack_coef',
    y='defense_coef',
    hover_name='name',
    color='color_name',
)
league_fig.update_yaxes(autorange="reversed")

In [None]:
# Side-by-side coefficients for two specific players.
coeff_df[coeff_df['name'].isin(['ToivonenJaakko', 'WinbergMatias'])]

In [None]:
# Show every record for this player.
df.loc[df['name'].eq('Vattulainen Taisto')]

In [None]:
# Players of the selected team, ordered by attack-minus-defense difference, largest first.
coeff_df.loc[coeff_df['team'].eq('Trikiinit')].sort_values('coef_diff', ascending=False)

In [None]:
# The twenty rows with the largest attack-minus-defense difference across all teams.
coeff_df.sort_values(by='coef_diff', ascending=False).head(20)

In [None]:
# Show every record for this player.
df.loc[df['name'].eq('Kolovrat Denial')]

In [None]:
# Sum of the attack coefficients over the selected team's players.
coeff_df.loc[coeff_df['team'].eq('Trikiinit'), 'attack_coef'].sum()

In [None]:
# Sum of the defense coefficients over the selected team's players.
coeff_df.loc[coeff_df['team'].eq('Trikiinit'), 'defense_coef'].sum()