In [12]:
import pandas as pd
import altair as alt
import numpy as np
import math
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler

<IPython.core.display.Javascript object>

# Regression

This notebook aggregates all of our metrics and fits a regression on them. Afterwards, we create a visualization that summarizes the results.

#### Dependencies:

This project was developed in a locally hosted Jupyter notebook, using the Python version and third-party library versions (those not available by default) listed below:<br>

Python 3.10.4<br>

altair==5.0.0<br>
pandas==2.0.1<br>
numpy==1.24.3<br>
statsmodels==0.14.0<br>
scikit-learn==1.2.2<br>

In [13]:
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [14]:
# Load the ELO data and reduce it to one row per country holding that
# country's highest ELO rating
elo_rating = (
    pd.read_csv("OUTPUTFILE/top_elo_data.csv")
    .groupby("country_name", as_index=False)["elo"]
    .max()
)
elo_rating.head()

Unnamed: 0,country_name,elo
0,Albania,1284
1,Algeria,1537
2,Angola,1512
3,Argentina,1709
4,Armenia,1267


<IPython.core.display.Javascript object>

In [15]:
# Load the per-country average market value (MV) of the top 25 domestic clubs.
# The "Unnamed: 0" column (presumably an index saved with the CSV) is discarded.
top_mv_domestic_club = pd.read_csv(
    "OUTPUTFILE/avg_league_val_by_country.csv"
).drop(columns=["Unnamed: 0"])
top_mv_domestic_club.head()

Unnamed: 0,country_name,market_val
0,Afghanistan,400.0
1,Albania,2174800.0
2,Algeria,3694200.0
3,American Samoa,0.0
4,American Virgin Islands,3750.0


<IPython.core.display.Javascript object>

In [16]:
# Load the 2018 and 2022 World Cup results (one power rating per country).
# The "Unnamed: 0" column (presumably an index saved with the CSV) is discarded.
wc_results = pd.read_csv("OUTPUTFILE/world_cup_result.csv").drop(
    columns=["Unnamed: 0"]
)
wc_results.head()

Unnamed: 0,country_name,power_rating
0,Argentina,0.75
1,Australia,0.4375
2,Belgium,0.65625
3,Brazil,0.828125
4,Cameroon,0.21875


<IPython.core.display.Javascript object>

In [17]:
# Load the national-team market values (MV) split between players who play at
# home and players who play abroad (presumably the "dom" / "int" columns)
foreign_domestic = pd.read_csv(
    "OUTPUTFILE/foreign_player_national_teams_agg.csv"
).drop(columns=["Unnamed: 0"])

# Average MV per player in each group. A NaN quotient (e.g. 0 / 0 when a group
# has no players) is counted as an average of 0.
avg_mv_dom = np.nan_to_num(
    foreign_domestic["mv_play_dom"] / foreign_domestic["num_play_dom"], nan=0
)
avg_mv_int = np.nan_to_num(
    foreign_domestic["mv_play_int"] / foreign_domestic["num_play_int"], nan=0
)
foreign_domestic["avg_mv_diff"] = avg_mv_dom - avg_mv_int

# Keep only the countries whose domestic/international MV ratio is non-zero
has_nonzero_ratio = foreign_domestic["dom_int_mv_ratio"] != 0
foreign_domestic = foreign_domestic.loc[has_nonzero_ratio]
foreign_domestic.head()

Unnamed: 0,country_name,mv_play_dom,num_play_dom,mv_play_int,num_play_int,dom_int_mv_ratio,avg_mv_diff
0,Australia,6300000.0,8.0,32950000.0,15.0,0.358498,-1409167.0
1,Belgium,33500000.0,5.0,334200000.0,19.0,0.38091,-10889470.0
2,Brazil,73000000.0,6.0,837500000.0,17.0,0.246965,-37098040.0
4,Canada,7250000.0,4.0,31825000.0,19.0,1.08209,137500.0
5,Colombia,7700000.0,5.0,221000000.0,19.0,0.132398,-10091580.0


<IPython.core.display.Javascript object>

In [18]:
# Load the aggregated secondary indicators per country. The mv_usd and
# power_rating columns are left out here; power_rating is already provided by
# wc_results, so keeping it would produce a duplicate column when merging.
secondary_ind = pd.read_csv("OUTPUTFILE/SECONDARY_INDICATORS_AGG.csv").drop(
    columns=["mv_usd", "power_rating"]
)

secondary_ind

Unnamed: 0,country_name,gdp_usd,bmi,life_exp,qol
0,Afghanistan,1.478686e+10,22.3,61.982000,37
1,Albania,1.825579e+10,27.0,76.463000,54
2,Algeria,1.630444e+11,24.7,76.377000,51
3,American Samoa,7.090000e+08,0.0,0.000000,0
4,Andorra,3.330282e+09,27.6,0.000000,0
...,...,...,...,...,...
219,Virgin Islands (U.S.),0.000000e+00,0.0,80.068293,0
220,West Bank and Gaza,1.803680e+10,0.0,73.473000,0
221,"Yemen, Rep.",0.000000e+00,0.0,63.753000,0
222,Zambia,2.214763e+10,21.0,61.223000,46


<IPython.core.display.Javascript object>

In [20]:
# Attach every metric to the World Cup results, keyed on country name. Left
# joins keep all World Cup countries at this stage; incomplete rows are
# removed right after.
metric_frames = [foreign_domestic, top_mv_domestic_club, elo_rating, secondary_ind]

wc_results_m = wc_results
for metric_frame in metric_frames:
    wc_results_m = wc_results_m.merge(metric_frame, on=["country_name"], how="left")

wc_results_m = (
    # Countries missing from any source have NaN after the left joins
    wc_results_m.dropna()
    # Log-scale the market value to reduce skew
    .assign(market_val=lambda df: np.log(df["market_val"].astype(float)))
    # Drop rows whose quality-of-life score is 0 (this is what removes Panama)
    .loc[lambda df: df["qol"] != 0]
)

wc_results_m.head(10)

Unnamed: 0,country_name,power_rating,mv_play_dom,num_play_dom,mv_play_int,num_play_int,dom_int_mv_ratio,avg_mv_diff,market_val,elo,gdp_usd,bmi,life_exp,qol
1,Australia,0.4375,6300000.0,8.0,32950000.0,15.0,0.358498,-1409167.0,15.129019,1424,1552667000000.0,27.8,83.3,75.0
2,Belgium,0.65625,33500000.0,5.0,334200000.0,19.0,0.38091,-10889470.0,17.483616,1619,594104200000.0,27.0,81.890244,63.0
3,Brazil,0.828125,73000000.0,6.0,837500000.0,17.0,0.246965,-37098040.0,17.89485,1823,1608981000000.0,26.3,72.75,48.0
5,Canada,0.03125,7250000.0,4.0,31825000.0,19.0,1.08209,137500.0,15.533839,1379,1988336000000.0,27.6,82.59661,68.0
6,Colombia,0.375,7700000.0,5.0,221000000.0,19.0,0.132398,-10091580.0,16.091957,1675,314464100000.0,25.8,72.83,45.0
7,Costa Rica,0.15625,4000000.0,16.0,6750000.0,7.0,0.259259,-714285.7,14.447846,1534,64282440000.0,26.8,77.023,57.0
8,Croatia,0.921875,49800000.0,5.0,216800000.0,17.0,0.780996,-2792941.0,16.367266,1663,68955080000.0,28.4,76.42439,64.0
9,Denmark,0.4375,9000000.0,3.0,379700000.0,22.0,0.173821,-14259090.0,16.292385,1574,398303300000.0,26.5,81.404878,67.0
10,Ecuador,0.25,5500000.0,5.0,161800000.0,19.0,0.129172,-7415789.0,15.736982,1689,106165900000.0,26.5,73.67,53.0
13,France,1.0,204000000.0,3.0,944000000.0,20.0,1.440678,20800000.0,18.744863,1794,2957880000000.0,26.3,82.32439,68.0


<IPython.core.display.Javascript object>

In [24]:
# Fit an OLS regression of World Cup power rating on the collected metrics.
# `smf` (statsmodels.formula.api) and `StandardScaler` come from the import
# cell at the top of the notebook, so they are not imported again here.

# Response and predictors, kept as data so the formula is defined in one place
RESPONSE = "power_rating"
PREDICTORS = [
    "market_val",
    "mv_play_int",
    "mv_play_dom",
    "elo",
    "qol",
    "gdp_usd",
    "life_exp",
]

# Numeric columns only: the country name is an identifier, not a feature.
# `drop` already returns a new frame, so wc_results_m is left untouched.
wc_results_m_num = wc_results_m.drop(columns=["country_name"])

# Standardize every column, the response included, so that the fitted
# coefficients are on a common scale and can be compared with each other.
scaler = StandardScaler()
wc_results_m_num_norm = pd.DataFrame(
    scaler.fit_transform(wc_results_m_num), columns=wc_results_m_num.columns
)

# Resulting formula:
# "power_rating ~ market_val + mv_play_int + mv_play_dom + elo + qol + gdp_usd + life_exp"
reg = smf.ols(
    formula=f"{RESPONSE} ~ {' + '.join(PREDICTORS)}",
    data=wc_results_m_num_norm,
).fit()

print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:           power_rating   R-squared:                       0.437
Model:                            OLS   Adj. R-squared:                  0.360
Method:                 Least Squares   F-statistic:                     5.681
Date:                Tue, 20 Jun 2023   Prob (F-statistic):            0.00489
Time:                        09:42:29   Log-Likelihood:                -29.436
No. Observations:                  26   AIC:                             66.87
Df Residuals:                      22   BIC:                             71.90
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept    5.546e-17      0.160   3.47e-16      

<IPython.core.display.Javascript object>

In [22]:
# Collect each regression term's coefficient and p-value side by side; the two
# Series share the same term-name index, which becomes the coef_name column
reg_wide = (
    pd.DataFrame({"coef_value": reg.params, "coef_pvalue": reg.pvalues})
    .rename_axis("coef_name")
    .reset_index()
)

# Long format for plotting: one row per (term, kind of value) pair
reg_df = reg_wide.melt(id_vars=["coef_name"], var_name="value_name")

reg_df.head()

Unnamed: 0,coef_name,value_name,value
0,Intercept,coef_value,5.545885e-17
1,market_val,coef_value,0.7252902
2,mv_play_int,coef_value,0.245586
3,mv_play_dom,coef_value,-0.7123638
4,elo,coef_value,0.3529987


<IPython.core.display.Javascript object>

In [23]:

# Small-multiples bar chart: one panel per regression term, each holding one
# bar for the coefficient value and one for its p-value
chart_title = alt.Title(
    "Regression: Coefficients & P-values for metrics",
    subtitle=" ",
)

# x separates the two kinds of value; the axis title is blanked out
x_enc = alt.X(
    "value_name:N",
    axis=alt.Axis(ticks=False, labelPadding=10),
).title("")

# y is the plotted value, on a fixed [-1, 1] scale shared by every panel
y_enc = alt.Y(
    "sum(value):Q",
    axis=alt.Axis(ticks=False, labels=True),
    scale=alt.Scale(domain=[-1, 1]),
).title("")

chart = (
    alt.Chart(reg_df, title=chart_title)
    .mark_bar(size=11)
    .encode(
        x=x_enc,
        y=y_enc,
        color=alt.Color("value_name:N", legend=None),
        column=alt.Column("coef_name:N", title=None),
    )
    .properties(width=50, height=100)
)

# The configured chart is the cell's output; it is not assigned back to `chart`
chart.configure_title(fontSize=14, anchor="middle").configure_axis(
    grid=True, domain=False
)
    


<IPython.core.display.Javascript object>