In [1]:
import pandas as pd
import warnings
from sklearn.linear_model import Ridge

In [2]:
## Load the three raw input tables from the local fetched_data/ directory.
# City-level air-quality measurements (supplies the NO2 column used below).
air_quality_dat = pd.read_csv(filepath_or_buffer="fetched_data/AIR_QUALITY_DATA.csv")

# Per-country Paris Agreement records (joined on Country_Name below).
paris_agreement_dat = pd.read_csv(filepath_or_buffer="fetched_data/PARIS_AGREEMENT_DATA.csv")

# Per-country greenhouse-gas records (joined on its Country column below).
greenhouse_gas_dat = pd.read_csv(filepath_or_buffer="fetched_data/GHG_DATA.csv")

In [14]:
# Build the single analysis table. Both joins are left joins, so every
# air-quality row is kept even when a country has no Paris Agreement or
# greenhouse-gas record (those columns are then NaN for that row).
merged_data = (
    air_quality_dat
    .merge(paris_agreement_dat, on="Country_Name", how="left")
    # The GHG table names its key column "Country" rather than "Country_Name".
    .merge(greenhouse_gas_dat, left_on="Country_Name", right_on="Country", how="left")
)


In [26]:
# Keep only the columns the analysis uses, and expose the Paris Agreement
# column under the name the later cells refer to (Paris_Status).
simplified_data = (
    merged_data
    .loc[:, ["Country_Name", "NO2", "GHG_Emissions", "Date_Joined"]]
    .rename(columns={"Date_Joined": "Paris_Status"})
)

In [27]:
### Missing-value handling for NO2 and GHG_Emissions.
### NOTE: Paris_Status still holds the raw status codes (such as a, AA and R)
### after this cell; it has to be one-hot encoded before it is used as a regressor.

## Initially I thought that only small cities would be missing values,
# however there seem to be larger cities with missing NO2/GHG values as well,
# so we go with median imputation: a missing value is replaced by the median
# of the same column within the same country. Limitations of this method
# should be noted. Countries with no observed value at all stay NaN.

# Silence warnings only for the duration of the imputation. A bare
# warnings.filterwarnings('ignore') would stay active for the rest of the
# session and hide diagnostics raised by every later cell as well.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    country_groups = simplified_data.groupby("Country_Name")
    # transform("median") broadcasts each country's median back onto its rows,
    # so fillna can align it with the original column row by row.
    simplified_data = simplified_data.assign(
        NO2=simplified_data["NO2"].fillna(country_groups["NO2"].transform("median")),
        GHG_Emissions=simplified_data["GHG_Emissions"].fillna(
            country_groups["GHG_Emissions"].transform("median")
        ),
    )

In [28]:
# Split the table into one group per country (Country_Name column).
simplified_data_grouped = simplified_data.groupby(by='Country_Name')


def _country_has_no2(country_rows):
    """Return True when at least one row of this country has a non-null NO2 value."""
    return country_rows['NO2'].notna().any()


# Keep only the countries that have some NO2 information; a country whose
# NO2 values are all null is removed together with all of its rows.
simplified_data_minimal = simplified_data_grouped.filter(_country_has_no2)

In [29]:
# Collapse the table to one row per (country, GHG emissions, Paris status)
# combination, adding up the remaining value column (NO2) within each group.
# NOTE(review): groupby drops rows whose key is NaN by default, so rows still
# missing GHG_Emissions or Paris_Status do not appear in the result - confirm
# that this is intended.
# NOTE(review): NO2 is summed, not averaged, so the total grows with the number
# of rows a country has - confirm that a sum is the intended statistic.
aggregated_data = (
    simplified_data_minimal
    .groupby(["Country_Name", "GHG_Emissions", "Paris_Status"], as_index=False)
    .sum()
)

In [None]:
import statsmodels.regression.linear_model as sm
from statsmodels.tools import add_constant

# Design matrix for the regression of NO2 on GHG emissions and Paris status.
# Paris_Status is categorical (status codes, not numbers), so it is expanded
# into 0/1 indicator columns; OLS cannot fit on a raw text column.
# drop_first=True keeps one status as the reference level so the indicators
# are not perfectly collinear with the intercept added below.
X = pd.get_dummies(
    aggregated_data[["GHG_Emissions", "Paris_Status"]],
    columns=["Paris_Status"],
    drop_first=True,
    dtype=float,
)
# statsmodels' OLS does not add an intercept by itself; without this column
# the fitted line would be forced through the origin.
X = add_constant(X)
y = aggregated_data[["NO2"]]

result = sm.OLS(y, X).fit()

# printing the summary table
print(result.summary())
