In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import pickle
import re


import plotly.graph_objects as go
import plotly.express as px

## 2. Regression

***2.1*** First delete all players where the value below the 25% quartile and then drop all players that are not playing for a national team ('nation_position').

In [None]:
df1 = df[['short_name','nation_position','value_eur']]
value_25 = df1['value_eur'].quantile(.25)
print("25% quartile of value_euro is ",value_25 )
player= df1[df1.value_eur>=value_25]
player = df1[df1['nation_position'].notna()]
player

25% quartile of value_euro is  300000.0


Unnamed: 0,short_name,nation_position,value_eur
0,L. Messi,RW,67500000
1,Cristiano Ronaldo,LS,46000000
2,J. Oblak,GK,75000000
5,K. De Bruyne,RCM,87000000
6,K. Mbappé,RM,105500000
...,...,...,...
16663,S. Rabotov,SUB,0
17061,O. Patla,SUB,0
17085,C. Palan,LCM,0
17086,D. Singhal,SUB,0


***2.2*** Regress the logarithmic player value on the overall strength ('overall') and potential ('potential') of the player.  Which player has the highest positive residual?

In [None]:
df = df[df['value_eur']>0]
df['log_value'] = np.log(df['value_eur'])
X = df[['overall', 'potential']]
y = df['log_value']
model = LinearRegression()
model.fit(X, y)
predicted_values = model.predict(X)
residuals = y - predicted_values


highest_residual = df.loc[residuals.idxmax()]
print(highest_residual)

sofifa_id                                                           257697
player_url               https://sofifa.com/player/257697/zitong-chen/2...
short_name                                                     Chen Zitong
long_name                                                      Zitong Chen
age                                                                     23
                                               ...                        
rb                                                                    47+2
weekday_born                                                      Thursday
BMI                                                              23.124061
market_value_millions                                                 0.04
log_value                                                        10.596635
Name: 18942, Length: 110, dtype: object




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



***2.3*** Create a single column for every position in 'team_position', which is one if a player plays on that position a zero in all other cases. Regress the logarithmic player value on age, the squared age, and the team position dummies.

In [None]:
player_plays = pd.get_dummies(df['team_position'])
print(player_plays)
df['squared_age']= df['age']**2
X = pd.concat([df[['age', 'squared_age']], player_plays], axis=1)
y= df['log_value']

model = sm.OLS(y, X).fit()
print(model.summary())

       CAM  CB  CDM  CF  CM  GK  LAM  LB  LCB  LCM  LDM  LF  LM  LS  LW  LWB  \
0        1   0    0   0   0   0    0   0    0    0    0   0   0   0   0    0   
1        0   0    0   0   0   0    0   0    0    0    0   0   0   1   0    0   
2        0   0    0   0   0   1    0   0    0    0    0   0   0   0   0    0   
3        0   0    0   0   0   0    0   0    0    0    0   0   0   0   0    0   
4        0   0    0   0   0   0    0   0    0    0    0   0   0   0   1    0   
...    ...  ..  ...  ..  ..  ..  ...  ..  ...  ...  ...  ..  ..  ..  ..  ...   
18939    0   0    0   0   0   0    0   0    0    0    0   0   0   0   0    0   
18940    0   0    0   0   0   0    0   0    0    0    0   0   0   0   0    0   
18941    0   0    0   0   0   0    0   0    0    0    0   0   0   0   0    0   
18942    0   0    0   0   0   0    0   0    0    0    0   0   0   0   0    0   
18943    0   0    0   0   0   0    0   0    0    0    0   0   0   0   0    0   

       RAM  RB  RCB  RCM  RDM  RES  RF 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
y_pred= model.predict(X)
r2_scr= r2_score(y,y_pred)
r2_scr

0.2176586137254417

***2.4*** Use the same model as in the last task. Now estimate all possible models in which you omit one explanatory variable (every position dummy count as single explanatory variables). Which has the highest and lowest influence on the R Squared? (Use a loop!)

In [None]:
r2_dict={}
explanatory_variables = list(player_plays)
for column_to_omit in explanatory_variables:
    X_omitted = X.drop(columns=[column_to_omit])

    model = sm.OLS(y, X_omitted).fit()

    y_pred = model.predict(X_omitted)
    r_squared = r2_score(y, y_pred)
    r2_dict[column_to_omit] = r_squared


sorted_r2_series = pd.Series(r2_dict).sort_values(ascending = False)
print(sorted_r2_series)

RF     0.214896
RES    0.214853
LWB    0.214538
RWB    0.214392
CF     0.214327
CM     0.214283
LF     0.214134
SUB    0.213933
CB     0.213676
LAM    0.213629
LCB    0.212837
RAM    0.212797
RCB    0.212651
LB     0.212399
RCM    0.212289
RB     0.212266
RDM    0.212183
RS     0.212146
LCM    0.212068
GK     0.212051
LDM    0.212007
LS     0.211798
RM     0.211789
LM     0.211673
CDM    0.211639
ST     0.211393
RW     0.210989
LW     0.210764
CAM    0.210757
dtype: float64


***2.5*** Estimate a logit model that predicts whether a Player plays one the four biggest leagues ("Spain Primera Division", "German 1. Bundesliga", "English Premier League", and "French Ligue 1") or not. Use the same explanatory variables as in Task 2.3. Could the accuracy of the model be improved if the market value is added to the explanatory variables?

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
df1 = df
df1['target'] = df1['league_name'].apply(lambda x: 1 if x in ["Spanish Primera Division", "German 1. Bundesliga", "English Premier League", "French Ligue 1"] else 0)

#copied X from task2.3
player_plays = pd.get_dummies(df1['team_position'])
df1['squared_age']= df1['age']**2
X = pd.concat([df1[['age', 'squared_age']], player_plays], axis=1)

y= df1['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

threshold = 0.5
y_pred_binary = (y_pred > threshold).astype(int)
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Accuracy of the model 1 : {accuracy:.4f}")


Accuracy of the model 1 : 0.8971




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
#model2
y2= df1['target']
X2 = pd.concat([df1[['age', 'squared_age', 'value_eur']], player_plays], axis=1)

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2)

model2 = LinearRegression()
model2.fit(X2_train, y2_train)
y2_pred = model2.predict(X2_test)

threshold = 0.5
y2_pred_binary = (y2_pred > threshold).astype(int)
accur = accuracy_score(y2_test, y2_pred_binary)
print(f"Accuracy of the model 2 : {accur:.4f}")

Accuracy of the model 2 : 0.9062
