In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from functools import partial
import plotly.subplots as sp
import scipy.stats as stats
from tqdm.notebook import tnrange 
from time import sleep
from IPython.display import clear_output

In [37]:
# Read the combined table, discard the "Unnamed: 0" column and expose the
# "Value" column under the name "Emission" used by the rest of the notebook.
df = (
    pd.read_csv("cleaned-data/comb_df.csv")
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"Value": "Emission"})
)
df

Unnamed: 0,Province,Pollutant,Year,Emission,Per Capita,By Capita,Death,Death per Capita
0,AB,NH3,2000,1.399578e+05,2996031.75,0.046714,17745,0.005923
1,AB,CO,2000,1.677588e+06,2996031.75,0.559937,17745,0.005923
2,AB,SOX,2000,4.907949e+05,2996031.75,0.163815,17745,0.005923
3,AB,NOX,2000,8.018365e+05,2996031.75,0.267633,17745,0.005923
4,AB,VOC,2000,6.306122e+05,2996031.75,0.210482,17745,0.005923
...,...,...,...,...,...,...,...,...
1045,PE,NH3,2020,3.203449e+03,160633.75,0.019943,23255,0.144770
1046,PE,CO,2020,2.030025e+04,160633.75,0.126376,23255,0.144770
1047,PE,SOX,2020,2.709045e+02,160633.75,0.001686,23255,0.144770
1048,PE,NOX,2020,3.301916e+03,160633.75,0.020556,23255,0.144770


In [61]:
# Collapse the per-province rows into one row per (Pollutant, Year, Death) by
# summing "Emission" and "Per Capita".
# NOTE: the columns are selected with a *list* (double brackets). Selecting
# with a bare tuple (["Emission", "Per Capita"]) triggers a FutureWarning on
# older pandas and raises on pandas >= 2.0.
# "Death" is part of the grouping key, so it is carried through unchanged
# instead of being summed (presumably it is already a yearly total repeated on
# every province row - TODO confirm against the data source).
df_modified = (
    df.groupby(["Pollutant", "Year", "Death"])[["Emission", "Per Capita"]]
    .sum()
    # After summing, "Per Capita" is used as the population denominator below.
    .rename(columns={"Per Capita": "Population"})
)
df_modified["Emission by Population"] = df_modified["Emission"] / df_modified["Population"]
# Turn the group keys back into ordinary columns so "Death" can be used in
# later cells.
df_modified = df_modified.reset_index()
df_modified

  df_modified = df.groupby(["Pollutant", "Year", "Death"])["Emission", "Per Capita"].sum()


Unnamed: 0,Pollutant,Year,Death,Emission,Population,Emission by Population
0,CO,2000,17745,9.971515e+06,30549078.00,0.326410
1,CO,2001,17585,9.148745e+06,30872411.50,0.296340
2,CO,2002,17761,8.831071e+06,31208319.75,0.282972
3,CO,2003,18472,8.690271e+06,31500928.75,0.275873
4,CO,2004,19607,8.203394e+06,31796829.50,0.257994
...,...,...,...,...,...,...
100,VOC,2016,23727,1.606948e+06,35932687.75,0.044721
101,VOC,2017,25867,1.623493e+06,36372984.25,0.044635
102,VOC,2018,27465,1.668528e+06,36879684.25,0.045242
103,VOC,2019,25580,1.637399e+06,37415331.50,0.043763


In [63]:
# Add the ratio of "Death" to "Population" for every row of the aggregated frame.
df_modified["Death by Population"] = df_modified["Death"].div(df_modified["Population"])
df_modified

Unnamed: 0,Pollutant,Year,Death,Emission,Population,Emission by Population,Death by Population
0,CO,2000,17745,9.971515e+06,30549078.00,0.326410,0.000581
1,CO,2001,17585,9.148745e+06,30872411.50,0.296340,0.000570
2,CO,2002,17761,8.831071e+06,31208319.75,0.282972,0.000569
3,CO,2003,18472,8.690271e+06,31500928.75,0.275873,0.000586
4,CO,2004,19607,8.203394e+06,31796829.50,0.257994,0.000617
...,...,...,...,...,...,...,...
100,VOC,2016,23727,1.606948e+06,35932687.75,0.044721,0.000660
101,VOC,2017,25867,1.623493e+06,36372984.25,0.044635,0.000711
102,VOC,2018,27465,1.668528e+06,36879684.25,0.045242,0.000745
103,VOC,2019,25580,1.637399e+06,37415331.50,0.043763,0.000684


In [57]:
# Inspect the "Death" column of the aggregated frame on its own.
df_modified.loc[:, "Death"]

0      17745
1      17585
2      17761
3      18472
4      19607
       ...  
100    23727
101    25867
102    27465
103    25580
104    23255
Name: Death, Length: 105, dtype: int64

In [68]:
# Display the aggregated frame again as a check before the correlation step.
df_modified

Unnamed: 0,Pollutant,Year,Death,Emission,Population,Emission by Population,Death by Population
0,CO,2000,17745,9.971515e+06,30549078.00,0.326410,0.000581
1,CO,2001,17585,9.148745e+06,30872411.50,0.296340,0.000570
2,CO,2002,17761,8.831071e+06,31208319.75,0.282972,0.000569
3,CO,2003,18472,8.690271e+06,31500928.75,0.275873,0.000586
4,CO,2004,19607,8.203394e+06,31796829.50,0.257994,0.000617
...,...,...,...,...,...,...,...
100,VOC,2016,23727,1.606948e+06,35932687.75,0.044721,0.000660
101,VOC,2017,25867,1.623493e+06,36372984.25,0.044635,0.000711
102,VOC,2018,27465,1.668528e+06,36879684.25,0.045242,0.000745
103,VOC,2019,25580,1.637399e+06,37415331.50,0.043763,0.000684


In [80]:
# Distinct pollutant names, in order of first appearance in df_modified.
pollutants = df_modified["Pollutant"].drop_duplicates().tolist()

def calculate_corr(anylist, data=None):
    """Print the Pearson correlation between emissions and deaths per pollutant.

    For every pollutant in ``anylist`` the rows of ``data`` whose "Pollutant"
    value equals it are selected, and the Pearson correlation coefficient
    between their "Emission by Population" and "Death by Population" columns
    is printed on its own line.

    Parameters
    ----------
    anylist : iterable of str
        Pollutant names to report on. Only these pollutants are processed.
    data : pandas.DataFrame, optional
        Frame providing the "Pollutant", "Emission by Population" and
        "Death by Population" columns. When omitted, the notebook-level
        ``df_modified`` is used.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if data is None:
        # Looked up at call time so the function sees the current notebook frame.
        data = df_modified

    # Iterate the argument rather than a notebook global, so callers can
    # restrict the report to a subset of pollutants.
    for pollutant in anylist:
        subset = data.loc[data["Pollutant"] == pollutant]

        # pearsonr also returns a p-value; only the coefficient is reported.
        corr, _pval = stats.pearsonr(subset["Emission by Population"], subset["Death by Population"])
        print("Correlation coefficient for " + pollutant + " emission and total Canadian respiratory deaths: " + str(corr))
        
# Print one correlation line for each entry of the `pollutants` list.
calculate_corr(pollutants)

Correlation coefficient for CO emission and total Canadian respiratory deaths: -0.7731187990986208
Correlation coefficient for NH3 emission and total Canadian respiratory deaths: -0.7560241031863244
Correlation coefficient for NOX emission and total Canadian respiratory deaths: -0.7838520365215957
Correlation coefficient for SOX emission and total Canadian respiratory deaths: -0.7730472476410772
Correlation coefficient for VOC emission and total Canadian respiratory deaths: -0.758517211116605
