In [1]:
import polars as pl
import statsmodels.formula.api as smf
import numpy as np
from stargazer.stargazer import Stargazer

In [2]:
# Load the combined country-year table with polars, then convert it to a
# pandas frame, which is what the regression and plotting cells use.
df = pl.read_parquet("../data/combined-2022-xgboost-synthetic.parquet")
pd_df = df.to_pandas()

# Human-readable labels for regression terms; passed to
# Stargazer.rename_covariates when the result tables are rendered.
# Keys are the term names as they appear in the fitted models (the
# "a:b" key is the interaction term).
prod_columns = {
    "math": "PISA Math",
    "in_math99": "PISA Math in global P99",
    "imo_total_score": "IMO score per log population",
    "arwu_ranked_num": "ARWU institutions",
    "arwu_ranked_num:gdp_pc": "ARWU institutions x GDP PC",
    "gdp_pc": "GDP per capita",
    "primary_completion": "Primary School Completion Rate",
    "lower_sec_completion": "Lower Sec. Completion Rate",
    "upper_sec_completion": "Upper Sec. Completion Rate",
    "democracy_eiu": "Democracy Rating",
    "population": "Population",
}

In [3]:
# Peek at the first rows of the freshly loaded frame.
pd_df.head()

Unnamed: 0,country,year,gdp_pc,gdp_pc_growth,primary_completion,lower_sec_completion,upper_sec_completion,population,arwu_ranked_num,arwu_sum_score,math,read,science,in_math99,imo_total_score,democracy_eiu,xg_primary_completion,xg_lower_sec_completion,xg_upper_sec_completion
0,AFG,2003,199.643228,92.702913,20.042681,15.858682,11.536274,22645130.0,0.0,0.0,,,,,0.0,,20.042681,15.858682,11.536274
1,AFG,2004,221.830531,-249.725464,11.35412,21.78886,10.474828,23553551.0,0.0,0.0,,,,,0.0,,11.35412,21.78886,10.474828
2,AFG,2005,254.115274,732.187384,23.180153,8.637574,8.668537,24411191.0,0.0,0.0,,,,,0.0,,23.180153,8.637574,8.668537
3,AFG,2006,274.015394,108.49882,25.465551,13.990375,12.341593,25442944.0,0.0,0.0,,,,,0.0,3.06,25.465551,13.990375,12.341593
4,AFG,2007,376.318296,1180.338266,26.941549,9.942389,8.840587,25903301.0,0.0,0.0,,,,,0.0,,26.941549,9.942389,8.840587


In [4]:
# Disabled rescaling steps: every line in this cell is commented out, so
# running it leaves pd_df exactly as loaded.
# NOTE(review): prod_columns labels imo_total_score as "IMO score per log
# population", which matches the second line below -- presumably that scaling
# is already applied in the parquet file; confirm before re-enabling anything.
# pd_df["arwu_ranked_num"] = pd_df["arwu_ranked_num"] / pd_df["population"] * 1_000_000
# pd_df["imo_total_score"] = pd_df["imo_total_score"] / np.log(pd_df["population"])
# pd_df["arwu_ranked_num"] = np.where(pd_df["year"] < 2017, pd_df["arwu_ranked_num"], pd_df["arwu_ranked_num"] / 2)
# # pd_df["arwu_ranked_num"] = np.maximum(pd_df["arwu_ranked_num"], pd_df["arwu_sum_score"]) / pd_df["population"] * 1_000_000
# pd_df["gdp_pc_growth"] = 100 * pd_df["gdp_pc_growth"]
# pd_df["gdp_pc"] = np.log(pd_df["gdp_pc"])
# pd_df["population"] = np.log(pd_df["population"])

In [5]:
# Second look at the frame. The rescaling cell before this one is entirely
# commented out, so this shows the same rows as the first head() call.
pd_df.head()

Unnamed: 0,country,year,gdp_pc,gdp_pc_growth,primary_completion,lower_sec_completion,upper_sec_completion,population,arwu_ranked_num,arwu_sum_score,math,read,science,in_math99,imo_total_score,democracy_eiu,xg_primary_completion,xg_lower_sec_completion,xg_upper_sec_completion
0,AFG,2003,199.643228,92.702913,20.042681,15.858682,11.536274,22645130.0,0.0,0.0,,,,,0.0,,20.042681,15.858682,11.536274
1,AFG,2004,221.830531,-249.725464,11.35412,21.78886,10.474828,23553551.0,0.0,0.0,,,,,0.0,,11.35412,21.78886,10.474828
2,AFG,2005,254.115274,732.187384,23.180153,8.637574,8.668537,24411191.0,0.0,0.0,,,,,0.0,,23.180153,8.637574,8.668537
3,AFG,2006,274.015394,108.49882,25.465551,13.990375,12.341593,25442944.0,0.0,0.0,,,,,0.0,3.06,25.465551,13.990375,12.341593
4,AFG,2007,376.318296,1180.338266,26.941549,9.942389,8.840587,25903301.0,0.0,0.0,,,,,0.0,,26.941549,9.942389,8.840587


In [6]:
# pd_df.groupby("year")["arwu_ranked_num"].sum()

In [7]:
# math = smf.ols("math ~ gdp_pc + imo_total_score + in_math99 + arwu_ranked_num*gdp_pc + primary_completion + lower_sec_completion + upper_sec_completion + population + democracy_eiu", pd_df[pd_df["year"].isin([2003, 2006, 2009, 2012, 2015, 2018])]).fit()
# math.summary()

In [8]:
# Restrict to the listed survey years and to rows that actually carry a
# math score; this is the sample the PISA regressions are fitted on.
pisa_sample_years = [2003, 2006, 2009, 2012, 2015, 2018, 2022]
in_sample_year = pd_df["year"].isin(pisa_sample_years)
has_math_score = pd_df["math"].notnull()
pisa_df = pd_df[in_sample_year & has_math_score]

In [9]:
# OLS growth regressions on pisa_df. All four share the same dependent
# variable and control terms; they differ only in whether the math score
# terms, the C(year) term and the country term are included.
pisa_lhs = "gdp_pc_growth ~ gdp_pc + imo_total_score + "
pisa_scores = "math + in_math99 + "
pisa_controls = (
    "arwu_ranked_num*gdp_pc + primary_completion + lower_sec_completion"
    " + upper_sec_completion + population + democracy_eiu"
)

# Without math / in_math99, with C(year).
pisa_no_score = smf.ols(pisa_lhs + pisa_controls + " + C(year) ", pisa_df).fit()
# With math / in_math99, without C(year) or country.
pisa_no_fix = smf.ols(pisa_lhs + pisa_scores + pisa_controls, pisa_df).fit()
# With math / in_math99 and C(year).
pisa = smf.ols(pisa_lhs + pisa_scores + pisa_controls + " + C(year) ", pisa_df).fit()
# With math / in_math99, country and C(year).
pisa_country = smf.ols(pisa_lhs + pisa_scores + pisa_controls + " + country + C(year)", pisa_df).fit()

In [68]:
# OLS growth regressions that leave out the math score terms and are fitted
# on pd_df rather than pisa_df. The formulas share a head and a tail and
# differ only in the ARWU term and the optional country term.
survey_years = [2003, 2006, 2009, 2012, 2015, 2018, 2022]
growth_head = "gdp_pc_growth ~ gdp_pc + democracy_eiu + imo_total_score + "
growth_tail = " + primary_completion + lower_sec_completion + upper_sec_completion + population + C(year) "
with_interaction = growth_head + "arwu_ranked_num*gdp_pc" + growth_tail
without_interaction = growth_head + "arwu_ranked_num" + growth_tail

# Same years as the PISA sample, but no requirement that math is present.
non_pisa_pyears = smf.ols(with_interaction, pd_df[pd_df["year"].isin(survey_years)]).fit()
# All years.
non_pisa = smf.ols(with_interaction, pd_df).fit()
# All years, with the country term added.
non_pisa_country = smf.ols(with_interaction + "+ country", pd_df).fit()
# All years, ARWU count entered without the gdp_pc interaction.
non_pisa_no_interac = smf.ols(without_interaction, pd_df).fit()

In [69]:
# Column order of the PISA table: models (1) to (4).
pisa_table_models = [pisa_no_score, pisa_no_fix, pisa, pisa_country]
pisa_sg = Stargazer(pisa_table_models)

In [70]:
# Covariate rows of the PISA table, in display order.
pisa_row_order = [
    "in_math99",
    "imo_total_score",
    "arwu_ranked_num",
    "arwu_ranked_num:gdp_pc",
    "math",
    "gdp_pc",
    "primary_completion",
    "lower_sec_completion",
    "upper_sec_completion",
    "democracy_eiu",
]
pisa_sg.covariate_order(pisa_row_order)

# Extra descriptive rows; each list has one entry per model column.
# The entity counts are entered by hand, not computed from the fits.
pisa_extra_rows = [
    ("Time Effects", ["Yes", "No", "Yes", "Yes"]),
    ("Fixed Effects", ["No", "No", "No", "Yes"]),
    ("Entities", [49, 49, 49, 49]),
]
for row_label, row_values in pisa_extra_rows:
    pisa_sg.add_line(row_label, row_values)

pisa_sg.rename_covariates(prod_columns)
# print(pisa_sg.render_latex())
pisa_sg

0,1,2,3,4
,,,,
,Dependent variable: gdp_pc_growth,Dependent variable: gdp_pc_growth,Dependent variable: gdp_pc_growth,Dependent variable: gdp_pc_growth
,,,,
,(1),(2),(3),(4)
,,,,
PISA Math in global P99,,-37.136**,-41.042**,-37.273
,,(17.118),(16.291),(33.163)
IMO score per log population,1.456,3.726,-0.013,9.249
,(5.668),(6.361),(6.071),(14.100)
ARWU insitutions,-570.653***,-550.662***,-638.324***,-302.520


In [71]:
# Comparison table: the PISA model with C(year) next to the three
# specifications fitted without the math score terms.
non_pisa_table_models = [pisa, non_pisa_pyears, non_pisa, non_pisa_country]
non_pisa_sg = Stargazer(non_pisa_table_models)

# Covariate rows, in display order.
non_pisa_row_order = [
    "imo_total_score",
    "arwu_ranked_num",
    "arwu_ranked_num:gdp_pc",
    "gdp_pc",
    "primary_completion",
    "lower_sec_completion",
    "upper_sec_completion",
    "democracy_eiu",
    "population",
]
non_pisa_sg.covariate_order(non_pisa_row_order)

# Extra descriptive rows; each list has one entry per model column.
# The entity counts are entered by hand, not computed from the fits.
non_pisa_extra_rows = [
    ("Time Effects", ["Yes", "Yes", "Yes", "Yes"]),
    ("Fixed Effects", ["No", "No", "No", "Yes"]),
    ("Entities", [49, 103, 165, 165]),
]
for row_label, row_values in non_pisa_extra_rows:
    non_pisa_sg.add_line(row_label, row_values)

# One custom header per model column, replacing the automatic (1)..(4) numbering.
non_pisa_headers = [
    "Model 3 (PISA)",
    "Model 5 (PISA years)",
    "Model 6 (All years)",
    "Model 7 (All years, FE)",
]
non_pisa_sg.custom_columns(non_pisa_headers, [1, 1, 1, 1])
non_pisa_sg.show_model_numbers(False)

non_pisa_sg.rename_covariates(prod_columns)
# print(non_pisa_sg.render_latex())
non_pisa_sg

0,1,2,3,4
,,,,
,Dependent variable: gdp_pc_growth,Dependent variable: gdp_pc_growth,Dependent variable: gdp_pc_growth,Dependent variable: gdp_pc_growth
,,,,
,Model 3 (PISA),Model 5 (PISA years),Model 6 (All years),"Model 7 (All years, FE)"
,,,,
IMO score per log population,-0.013,-3.524,9.904**,-8.894
,(6.071),(8.148),(4.230),(10.540)
ARWU insitutions,-638.324***,-570.262**,-495.470***,128.753
,(147.004),(231.567),(123.355),(232.965)
ARWU insitutions x GDP PC,0.008***,0.006,0.006***,-0.005


In [72]:
# Keep only rows where every one of these columns is non-null, then report
# the resulting (rows, columns) shape.
group1_required = [
    "arwu_ranked_num",
    "math",
    "imo_total_score",
    "primary_completion",
    "lower_sec_completion",
    "upper_sec_completion",
    "democracy_eiu",
]
group1 = pd_df.dropna(subset=group1_required)
group1.shape

(323, 19)

In [73]:
# Keep only rows where every one of these columns is non-null (no math
# requirement here), then report the resulting (rows, columns) shape.
group2_required = [
    "gdp_pc",
    "arwu_ranked_num",
    "imo_total_score",
    "primary_completion",
    "lower_sec_completion",
    "upper_sec_completion",
    "democracy_eiu",
]
group2 = pd_df.dropna(subset=group2_required)
group2.shape

(2450, 19)

In [74]:
# Summary statistics for group2 (rows with gdp_pc and the non-PISA
# regressors present).
group2.describe()

Unnamed: 0,year,gdp_pc,gdp_pc_growth,primary_completion,lower_sec_completion,upper_sec_completion,population,arwu_ranked_num,arwu_sum_score,math,read,science,in_math99,imo_total_score,democracy_eiu,xg_primary_completion,xg_lower_sec_completion,xg_upper_sec_completion
count,2450.0,2450.0,2437.0,2450.0,2450.0,2450.0,2450.0,2450.0,2450.0,323.0,247.0,249.0,324.0,2450.0,2450.0,2450.0,2450.0,2450.0
mean,2014.760816,14200.820543,200.785325,72.078527,58.633074,43.858983,44505030.0,0.100828,22.389388,457.379171,461.807007,467.471015,0.882929,2.658117,5.508649,72.036354,58.586185,43.661362
std,4.648082,20443.814396,536.912902,24.005121,26.120641,25.000397,152626600.0,0.231087,160.965386,55.257372,50.479093,49.549635,1.487725,3.322624,2.193694,23.569048,25.283541,24.048134
min,2006.0,166.276245,-4789.998601,10.85229,4.36065,2.0158,303782.0,0.0,0.0,315.963154,290.918937,326.428759,0.0,0.0,0.32,13.936201,8.000157,5.138081
25%,2011.0,1591.145776,24.487374,53.774658,37.549344,22.164979,4011939.0,0.0,0.0,414.152142,422.823276,425.346767,0.049328,0.0,3.52,54.045876,37.967611,22.118862
50%,2015.0,5192.665938,214.546339,78.177551,58.273027,41.179806,10280500.0,0.0,0.0,473.142717,477.3861,482.03066,0.473346,0.359663,5.81,78.441765,58.999876,41.229095
75%,2019.0,16873.540641,431.434596,93.529528,82.290218,66.323612,32385970.0,0.029428,0.0,497.408018,500.113705,504.054081,1.122524,5.250164,7.24,92.568352,81.242363,64.48554
max,2022.0,133711.794436,9695.64199,102.338989,102.15873,97.399788,1417173000.0,1.456028,2190.4,574.66382,555.079856,563.748407,14.639147,12.723634,9.93,102.346764,102.15873,97.831635


In [75]:
# Summary statistics for group1 (rows that additionally have a math score).
group1.describe()

Unnamed: 0,year,gdp_pc,gdp_pc_growth,primary_completion,lower_sec_completion,upper_sec_completion,population,arwu_ranked_num,arwu_sum_score,math,read,science,in_math99,imo_total_score,democracy_eiu,xg_primary_completion,xg_lower_sec_completion,xg_upper_sec_completion
count,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,323.0,247.0,249.0,323.0,323.0,323.0,323.0,323.0,323.0
mean,2015.201238,28224.681762,284.233721,90.313211,78.3402,62.857463,35995430.0,0.244822,55.427554,457.379171,461.807007,467.471015,0.885663,4.907074,7.088978,89.984215,78.138931,62.66729
std,5.335315,24525.625813,341.698294,10.401118,17.002472,18.634168,59867590.0,0.313454,251.730465,55.257372,50.479093,49.549635,1.489219,3.541352,1.783192,10.511139,16.205027,17.710205
min,2006.0,543.110702,-658.290445,49.49374,31.7194,22.409929,303782.0,0.0,0.0,315.963154,290.918937,326.428759,0.0,0.0,1.93,47.905609,33.907448,20.282097
25%,2012.0,8884.024918,113.216863,83.652916,63.912451,47.315454,4897921.0,0.0,0.0,414.152142,422.823276,425.346767,0.049926,1.519785,6.385,82.427689,64.54369,48.216541
50%,2015.0,19186.359592,235.777247,95.212746,83.20343,66.713837,10175210.0,0.092051,0.0,473.142717,477.3861,482.03066,0.473588,5.194906,7.37,94.965492,83.205673,65.073914
75%,2018.0,44449.622554,431.367894,98.540371,92.418171,79.085361,43585110.0,0.399057,25.35,497.408018,500.113705,504.054081,1.122677,7.616614,8.21,98.176998,92.092751,78.18803
max,2022.0,116786.511655,3303.048777,102.338989,102.15873,97.399788,333287600.0,1.417551,2190.4,574.66382,555.079856,563.748407,14.639147,11.786918,9.93,102.346764,102.15873,97.831635


In [44]:
import plotly.express as px

In [25]:
# gdp_pc_growth against year for the group1 rows, coloured by gdp_pc.
fig = px.scatter(
    group1,
    x="year",
    y="gdp_pc_growth",
    color="gdp_pc",
    hover_data=["country", "year"],
)
fig.show()

In [26]:
# gdp_pc_growth against year for the group2 rows, coloured by imo_total_score.
fig = px.scatter(
    group2,
    x="year",
    y="gdp_pc_growth",
    color="imo_total_score",
    hover_data=["country", "year"],
)
fig.show()

In [27]:
# gdp_pc_growth against year for the group2 rows, coloured by arwu_ranked_num.
fig = px.scatter(
    group2,
    x="year",
    y="gdp_pc_growth",
    color="arwu_ranked_num",
    hover_data=["country", "year"],
)
fig.show()

In [28]:
# math score against year over the full frame, coloured by arwu_ranked_num.
fig = px.scatter(
    pd_df,
    x="year",
    y="math",
    color="arwu_ranked_num",
    hover_data=["country", "year"],
)
fig.show()

In [38]:
# Disabled auxiliary regression with imo_total_score as the dependent
# variable. Both lines are commented out, so running this cell produces
# nothing; the OLS summary stored with the cell comes from an earlier run
# and will not be regenerated.
# prim = smf.ols("imo_total_score ~ math + gdp_pc + democracy_eiu + arwu_ranked_num*gdp_pc + population", pd_df).fit()
# prim.summary()

0,1,2,3
Dep. Variable:,imo_total_score,R-squared:,0.305
Model:,OLS,Adj. R-squared:,0.288
Method:,Least Squares,F-statistic:,17.71
Date:,"Tue, 02 Apr 2024",Prob (F-statistic):,5.38e-17
Time:,15:14:20,Log-Likelihood:,-601.48
No. Observations:,249,AIC:,1217.0
Df Residuals:,242,BIC:,1242.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-8.8498,1.862,-4.754,0.000,-12.517,-5.183
math,0.0348,0.004,7.897,0.000,0.026,0.043
gdp_pc,-4.674e-05,1.15e-05,-4.055,0.000,-6.94e-05,-2.4e-05
democracy_eiu,-0.2889,0.132,-2.191,0.029,-0.549,-0.029
arwu_ranked_num,-0.4396,1.466,-0.300,0.764,-3.327,2.447
arwu_ranked_num:gdp_pc,1.174e-05,2.7e-05,0.435,0.664,-4.15e-05,6.49e-05
population,1.744e-08,2.93e-09,5.961,0.000,1.17e-08,2.32e-08

0,1,2,3
Omnibus:,5.773,Durbin-Watson:,0.947
Prob(Omnibus):,0.056,Jarque-Bera (JB):,3.646
Skew:,-0.108,Prob(JB):,0.162
Kurtosis:,2.448,Cond. No.,807000000.0
