In [1]:
import pandas as pd
import warnings
from sklearn.linear_model import Ridge

In [2]:
# Load the three input tables from the local `fetched_data/` directory,
# in the order: air quality, Paris agreement, greenhouse-gas emissions.
air_quality_dat, paris_agreement_dat, greenhouse_gas_dat = map(
    pd.read_csv,
    (
        "fetched_data/AIR_QUALITY_DATA.csv",
        "fetched_data/PARIS_AGREEMENT_DATA.csv",
        "fetched_data/GHG_DATA.csv",
    ),
)

In [41]:
# Left-join the Paris-agreement table (on "Country_Name") and then the
# greenhouse-gas table ("Country_Name" == "Country") onto the air-quality
# table, so every air-quality row is kept even when it has no match.
merged_data = pd.merge(
    pd.merge(air_quality_dat, paris_agreement_dat, how="left", on="Country_Name"),
    greenhouse_gas_dat,
    how="left",
    left_on=["Country_Name"],
    right_on=["Country"],
)


In [5]:
# Keep only the four columns used downstream: the country key, the NO2 and
# GHG measurements, and the merge-suffixed "Unnamed: 0_x" column.
simplified_data = merged_data.loc[
    :, ["Country_Name", "NO2", "GHG_Emissions", "Unnamed: 0_x"]
]

In [6]:
# Give the merge-suffixed column a readable name.
# NOTE(review): the "_x" suffix is assigned by pandas to the left-hand frame
# of a merge; presumably this column comes from the Paris-agreement table --
# confirm, since the name alone does not show which input it came from.
simplified_data = simplified_data.rename(
    {"Unnamed: 0_x": "Paris_Index"}, axis="columns"
)

In [16]:
# Turn the merged column into a 0/1 flag: 1 where the left join produced a
# value, 0 where it is missing.
#
# The flag is derived from the untouched `merged_data` column and assigned in
# a single statement, for two reasons:
#   * chained indexing (`df[col].loc[mask] = value`) writes to a temporary
#     object under pandas Copy-on-Write and only appeared to work while all
#     warnings were silenced, so the blanket warnings filter is dropped too;
#   * reading from `merged_data` keeps this cell idempotent -- re-running the
#     previous in-place version turned every row into 1, because no nulls
#     were left after the first run.
# `simplified_data` is a column subset of `merged_data`, so both share the
# same row index and the assignment aligns row by row.
simplified_data["Paris_Index"] = merged_data["Unnamed: 0_x"].notna().astype(int)

## Initially I thought that only small cities would be missing values;
# however, larger cities are missing NO2/GHG values as well, so missing
# entries are imputed with the median of the same country.
# Limitations of this method should be noted: a country with no observed
# value at all has no median, so its rows stay NaN.
for value_col in ("NO2", "GHG_Emissions"):
    country_median = simplified_data.groupby("Country_Name")[value_col].transform("median")
    simplified_data[value_col] = simplified_data[value_col].fillna(country_median)

In [23]:
# Group the rows by the 'Country_Name' column
simplified_data_grouped = simplified_data.groupby('Country_Name')

# Keep only the countries that have at least one non-null NO2 value; a
# country whose NO2 values are all null is dropped with all of its rows.
simplified_data_minimal = simplified_data_grouped.filter(
    lambda country_rows: not country_rows['NO2'].isna().all()
)

In [25]:
# Collapse to one row per (country, GHG emissions, Paris flag) combination,
# summing the NO2 values of the rows in each group. Rows with a missing
# grouping key are dropped by groupby's default `dropna=True`.
# NOTE(review): a sum grows with the number of rows a country contributes;
# if NO2 is a per-location concentration, a mean or median may be the
# intended statistic -- confirm before interpreting the regression.
aggregated_data = simplified_data_minimal.groupby(
    ['Country_Name', "GHG_Emissions", "Paris_Index"], as_index=False
)["NO2"].sum()

In [42]:
# Display the aggregated per-country table as the cell output.
aggregated_data

Unnamed: 0,Country_Name,GHG_Emissions,Paris_Index,NO2
0,Argentina ...,8.46501,1,17.0
1,Australia ...,24.130798,1,30.5
2,Austria ...,9.466312,1,2011.84
3,Bangladesh ...,1.676438,1,263.0
4,Belgium ...,10.862293,1,1197.6
5,Bosnia and Herzegovina ...,9.133521,1,194.5
6,Brazil ...,6.04619,1,670.63
7,Bulgaria ...,8.8247,1,488.1
8,Canada ...,21.314508,1,2056.5
9,Costa Rica ...,3.349035,1,205.0


In [40]:

# No earlier cell binds `sm`, so on a fresh kernel this cell raised NameError.
# The import is kept next to its only use so the cell runs on its own.
import statsmodels.api as sm

# Predictors: GHG emissions and the 0/1 Paris flag. statsmodels' OLS does not
# add an intercept on its own, so one is added explicitly. `add_constant`
# defaults to has_constant="skip": if a column is already constant, nothing
# is added and the design matrix is unchanged.
# NOTE(review): the summary shown below reports "Df Model: 1" for two
# regressors, which suggests Paris_Index was constant in that run and acted
# as the intercept -- confirm that the flag actually varies across countries.
X = sm.add_constant(aggregated_data[["GHG_Emissions", "Paris_Index"]])
y = aggregated_data[["NO2"]]

result = sm.OLS(y, X).fit()

# printing the summary table
print(result.summary())


                            OLS Regression Results                            
Dep. Variable:                    NO2   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                 -0.006
Method:                 Least Squares   F-statistic:                    0.6701
Date:                Mon, 18 Mar 2024   Prob (F-statistic):              0.417
Time:                        09:57:06   Log-Likelihood:                -466.07
No. Observations:                  53   AIC:                             936.1
Df Residuals:                      51   BIC:                             940.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
GHG_Emissions   -17.3327     21.173     -0.819