In [1]:
import math
import time

import altair as alt
import geopandas as gpd
import numpy as np
import pandas as pd
import regex as re
import requests
import statsmodels.formula.api as smf
from bs4 import BeautifulSoup
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Elo file and collapse it to one row per country holding that
# country's highest recorded `elo` value.
elo_rating = (
    pd.read_csv('elo_data.csv')
    .groupby('country_name', as_index=False)['elo']
    .max()
)
elo_rating.head(5)

Unnamed: 0,country_name,elo
0,Albania,1284
1,Algeria,1537
2,Angola,1512
3,Argentina,1709
4,Armenia,1267


In [3]:
# Per-country league market values; the 'Unnamed: 0' column (a leftover index
# written into the CSV) carries no information and is discarded on load.
top_mv_domestic_club = (
    pd.read_csv('avg_league_val_by_country.csv')
    .drop(columns=['Unnamed: 0'])
)
top_mv_domestic_club.head(5)

Unnamed: 0,country_name,market_val
0,Afghanistan,400.0
1,Albania,2179400.0
2,Algeria,3690200.0
3,American Samoa,0.0
4,American Virgin Islands,3750.0


In [5]:
# World Cup outcome per country (`power_rating`); discard the stray
# 'Unnamed: 0' column that came along in the CSV.
wc_results = (
    pd.read_csv('world_cup_result.csv')
    .drop(columns=['Unnamed: 0'])
)
wc_results.head(5)

Unnamed: 0,country_name,power_rating
0,Argentina,0.75
1,Australia,0.4375
2,Belgium,0.65625
3,Brazil,0.828125
4,Cameroon,0.21875


In [7]:
# Squad aggregates per national team (presumably domestic-based vs.
# abroad-based players -- confirm against the script that built the CSV).
#
# avg_mv_diff = (mv_play_dom / num_play_dom) - (mv_play_int / num_play_int)
# np.nan_to_num turns a NaN average (e.g. 0 / 0 when a group is empty) into 0;
# it also maps +/-inf to very large finite numbers.
#
# Finally, rows whose dom_int_mv_ratio equals 0 are removed.
foreign_domestic = (
    pd.read_csv('foreign_player_national_teams_agg.csv')
    .drop(columns=['Unnamed: 0'])
    .assign(
        avg_mv_diff=lambda squads: (
            np.nan_to_num(squads['mv_play_dom'] / squads['num_play_dom'], nan=0)
            - np.nan_to_num(squads['mv_play_int'] / squads['num_play_int'], nan=0)
        )
    )
    .loc[lambda squads: squads.dom_int_mv_ratio != 0]
)
foreign_domestic.head(5)

Unnamed: 0,country_name,mv_play_dom,num_play_dom,mv_play_int,num_play_int,dom_int_mv_ratio,avg_mv_diff
0,Australia,6300000.0,8.0,32950000.0,15.0,0.358498,-1409167.0
1,Belgium,33500000.0,5.0,431700000.0,19.0,0.294881,-16021050.0
2,Brazil,80000000.0,7.0,777000000.0,16.0,0.235337,-37133930.0
4,Canada,6000000.0,2.0,187900000.0,21.0,0.335285,-5947619.0
5,Colombia,7700000.0,5.0,226500000.0,19.0,0.129183,-10381050.0


In [8]:
# Build the modelling table:
#   1. left-join each feature table onto the World Cup results by country_name;
#   2. keep only rows with no missing value in any column;
#   3. replace market_val with its natural log.
# NOTE: np.log maps a market_val of 0 to -inf, so zero values are not safe here.
wc_results_m = (
    wc_results
    .merge(foreign_domestic, how='left', on=['country_name'])
    .merge(top_mv_domestic_club, how='left', on=['country_name'])
    .merge(elo_rating, how='left', on=['country_name'])
    .dropna()
    .assign(market_val=lambda merged: np.log(merged['market_val'].astype(float)))
)

wc_results_m.head(5)

Unnamed: 0,country_name,power_rating,mv_play_dom,num_play_dom,mv_play_int,num_play_int,dom_int_mv_ratio,avg_mv_diff,market_val,elo
1,Australia,0.4375,6300000.0,8.0,32950000.0,15.0,0.358498,-1409167.0,15.130094,1424
2,Belgium,0.65625,33500000.0,5.0,431700000.0,19.0,0.294881,-16021050.0,17.437149,1619
3,Brazil,0.828125,80000000.0,7.0,777000000.0,16.0,0.235337,-37133930.0,17.895296,1823
5,Canada,0.03125,6000000.0,2.0,187900000.0,21.0,0.335285,-5947619.0,15.532942,1379
6,Colombia,0.375,7700000.0,5.0,226500000.0,19.0,0.129183,-10381050.0,16.09368,1675


In [9]:
# Standardise every numeric column to z-scores so the fitted OLS coefficients
# are comparable across predictors that live on very different scales.
scaler = StandardScaler()

# Keep only numeric model inputs: drop the country label and the raw
# mv_*/num_* columns. `drop` already returns a new frame, so wc_results_m is
# left untouched without an explicit copy.
wc_results_m_num = wc_results_m.drop(
    columns=['country_name', 'mv_play_dom', 'num_play_dom', 'mv_play_int', 'num_play_int']
)

# fit_transform returns a bare ndarray, so re-attach the column names.
# The resulting frame has a fresh 0..n-1 index rather than wc_results_m's index.
wc_results_m_num_norm = pd.DataFrame(
    scaler.fit_transform(wc_results_m_num),
    columns=wc_results_m_num.columns,
)

# The response (power_rating) is standardised along with the predictors, so
# the fitted intercept is ~0 by construction.
reg = smf.ols(
    formula='power_rating ~ market_val + elo + dom_int_mv_ratio',
    data=wc_results_m_num_norm,
).fit()

print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:           power_rating   R-squared:                       0.464
Model:                            OLS   Adj. R-squared:                  0.397
Method:                 Least Squares   F-statistic:                     6.923
Date:                Sun, 11 Jun 2023   Prob (F-statistic):            0.00161
Time:                        14:40:58   Log-Likelihood:                -31.002
No. Observations:                  28   AIC:                             70.00
Df Residuals:                      24   BIC:                             75.33
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept         9.714e-17      0.149  