In [2]:
import pandas as pd
import altair as alt
import numpy as np
import math
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler

#### Dependencies:

This project was run in a locally hosted Jupyter notebook. The Python version and the versions of the third-party libraries (those not available by default) are listed below:<br>

Python 3.10.4<br>

altair==5.0.0<br>
pandas==2.0.1<br>
numpy==1.24.3<br>
statsmodels==0.14.0<br>
scikit-learn==1.2.2<br>

In [11]:
# Load the nb_black notebook extension. It is not in the dependency list above,
# so it must be installed separately; presumably it auto-formats code cells
# (TODO confirm). NOTE(review): this cell's execution count (11) is out of
# sequence with the cells that follow it.
%load_ext nb_black

<IPython.core.display.Javascript object>

In [5]:
# Load the ELO export and collapse it to one row per country, keeping the
# highest `elo` value found for each `country_name`.
elo_rating = (
    pd.read_csv('OUTPUTFILE/top_elo_data.csv')
    .groupby('country_name', as_index=False)['elo']
    .max()
)
elo_rating.head(5)

Unnamed: 0,country_name,elo
0,Albania,1284
1,Algeria,1537
2,Angola,1512
3,Argentina,1709
4,Armenia,1267


In [6]:
# Load the per-country domestic-club market values (described by the author as
# the average over each country's top 25 clubs by market value) and discard the
# unnamed first CSV column (`Unnamed: 0`, presumably a saved index).
top_mv_domestic_club = (
    pd.read_csv('OUTPUTFILE/avg_league_val_by_country.csv')
    .drop(columns=['Unnamed: 0'])
)
top_mv_domestic_club.head(5)

Unnamed: 0,country_name,market_val
0,Afghanistan,400.0
1,Albania,2179400.0
2,Algeria,3690200.0
3,American Samoa,0.0
4,American Virgin Islands,3750.0


In [7]:
# Load the World Cup results (2018 and 2022, per the author's note) and discard
# the unnamed first CSV column (`Unnamed: 0`, presumably a saved index).
wc_results = (
    pd.read_csv('OUTPUTFILE/world_cup_result.csv')
    .drop(columns=['Unnamed: 0'])
)
wc_results.head(5)

Unnamed: 0,country_name,power_rating
0,Argentina,0.75
1,Australia,0.4375
2,Belgium,0.65625
3,Brazil,0.828125
4,Cameroon,0.21875


In [8]:
# Load the national-team aggregates that split market value (MV) and player
# counts between players at home (`*_dom`) and players abroad (`*_int`), going
# by the column names and the author's note.
foreign_domestic = (
    pd.read_csv('OUTPUTFILE/foreign_player_national_teams_agg.csv')
    .drop(columns=['Unnamed: 0'])
)

# Per-player average MV of each group. A NaN quotient (e.g. 0/0) is replaced
# by 0 through nan_to_num.
avg_mv_dom = np.nan_to_num(
    foreign_domestic['mv_play_dom'] / foreign_domestic['num_play_dom'], nan=0
)
avg_mv_int = np.nan_to_num(
    foreign_domestic['mv_play_int'] / foreign_domestic['num_play_int'], nan=0
)

# Difference between the domestic and the international per-player average.
foreign_domestic['avg_mv_diff'] = avg_mv_dom - avg_mv_int

# Keep only the rows whose domestic/international MV ratio is not 0.
has_nonzero_ratio = foreign_domestic['dom_int_mv_ratio'] != 0
foreign_domestic = foreign_domestic[has_nonzero_ratio]
foreign_domestic.head(5)

Unnamed: 0,country_name,mv_play_dom,num_play_dom,mv_play_int,num_play_int,dom_int_mv_ratio,avg_mv_diff
0,Australia,6550000.0,8.0,35350000.0,17.0,0.394,-1260662.0
1,Belgium,11800000.0,2.0,446500000.0,21.0,0.277,-15361900.0
2,Brazil,80000000.0,7.0,777000000.0,16.0,0.235,-37133930.0
4,Canada,26800000.0,15.0,214125000.0,38.0,0.317,-3848202.0
5,Colombia,14200000.0,6.0,120350000.0,18.0,0.354,-4319444.0


In [9]:
# Build the modelling table: start from the World Cup results and left-join the
# squad MV aggregates, the domestic-club market values and the ELO ratings on
# `country_name`.
wc_results_m = (
    wc_results
    .merge(foreign_domestic, on=['country_name'], how='left')
    .merge(top_mv_domestic_club, on=['country_name'], how='left')
    .merge(elo_rating, on=['country_name'], how='left')
)

# Log-transform the domestic-club market value. The source table contains
# market values of 0, and log(0) is -inf, which dropna() would not remove.
# Non-positive values are therefore masked to NaN *before* taking the log,
# so the dropna() below discards them together with the unmatched countries.
market_val_raw = wc_results_m['market_val'].astype(float)
wc_results_m['market_val'] = np.log(market_val_raw.where(market_val_raw > 0))

# Drop every country that is missing any input (no match in one of the joins,
# or no usable market value).
wc_results_m = wc_results_m.dropna()

wc_results_m.head(5)

Unnamed: 0,country_name,power_rating,mv_play_dom,num_play_dom,mv_play_int,num_play_int,dom_int_mv_ratio,avg_mv_diff,market_val,elo
1,Australia,0.4375,6550000.0,8.0,35350000.0,17.0,0.394,-1260662.0,15.130094,1424
2,Belgium,0.65625,11800000.0,2.0,446500000.0,21.0,0.277,-15361900.0,17.437149,1619
3,Brazil,0.828125,80000000.0,7.0,777000000.0,16.0,0.235,-37133930.0,17.895296,1823
5,Canada,0.03125,26800000.0,15.0,214125000.0,38.0,0.317,-3848202.0,15.532942,1379
6,Colombia,0.375,14200000.0,6.0,120350000.0,18.0,0.354,-4319444.0,16.09368,1675


In [10]:
# Standardise the numeric columns and fit an OLS regression of `power_rating`
# on `market_val`, `elo` and `dom_int_mv_ratio`.
# `smf` and `StandardScaler` are imported in the notebook's first cell, so they
# are not re-imported here.
scaler = StandardScaler()

# Keep only the numeric modelling columns. `drop` returns a new frame, so
# `wc_results_m` is left untouched without an explicit copy.
wc_results_m_num = wc_results_m.drop(
    columns=['country_name', 'mv_play_dom', 'num_play_dom', 'mv_play_int', 'num_play_int']
)

# Scale every remaining column (the response included) and wrap the result in
# a DataFrame that reuses the original column names; the new frame gets a
# fresh default row index.
wc_results_m_num_norm = pd.DataFrame(
    scaler.fit_transform(wc_results_m_num),
    columns=wc_results_m_num.columns,
)

reg = smf.ols(
    formula='power_rating ~ market_val + elo + dom_int_mv_ratio',
    data=wc_results_m_num_norm,
).fit()

print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:           power_rating   R-squared:                       0.430
Model:                            OLS   Adj. R-squared:                  0.364
Method:                 Least Squares   F-statistic:                     6.535
Date:                Tue, 13 Jun 2023   Prob (F-statistic):            0.00193
Time:                        17:57:40   Log-Likelihood:                -34.139
No. Observations:                  30   AIC:                             76.28
Df Residuals:                      26   BIC:                             81.88
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept        -1.838e-17      0.148  