In [1]:
import pandas as pd

In [2]:
# Load the two cleaned inputs. read_csv already returns a brand-new DataFrame,
# so a defensive .copy() would only duplicate the data in memory.
gbd_health = pd.read_csv("../1_datasets/cleaned_datasets/clean_gbd_health.csv")
sdi_pm25 = pd.read_csv("../1_datasets/cleaned_datasets/merged_sdi_pm25_data.csv")

In [3]:
# Preview the first five rows of the GBD health table.
gbd_health.head(5)

Unnamed: 0,measure,Country,cause,Risk-Exposure-Impact,metric,Year,measure Val,upper,lower
0,DALYs (Disability-Adjusted Life Years),China,Stroke,Ambient particulate matter pollution,Percent,2010,0.221882,0.296283,0.128691
1,DALYs (Disability-Adjusted Life Years),China,Stroke,Ambient particulate matter pollution,Rate,2010,793.957477,1068.976845,455.632535
2,DALYs (Disability-Adjusted Life Years),China,Stroke,Ambient particulate matter pollution,Percent,2011,0.234283,0.30752,0.138086
3,DALYs (Disability-Adjusted Life Years),China,Stroke,Ambient particulate matter pollution,Rate,2011,835.920051,1110.785446,492.084815
4,DALYs (Disability-Adjusted Life Years),China,Stroke,Ambient particulate matter pollution,Percent,2012,0.249782,0.321437,0.151426


In [4]:
# Count the distinct values in each column of the GBD health table.
gbd_health.nunique()

measure                    1
Country                   25
cause                      3
Risk-Exposure-Impact       1
metric                     2
Year                      10
measure Val             1500
upper                   1500
lower                   1500
dtype: int64

In [5]:
# Preview the first five rows of the SDI / PM25 table.
sdi_pm25.head(5)

Unnamed: 0,Country,Year,PM25 concentration,PM25 lower bound,PM25 upper bound,SDI mean value
0,Germany,2019,10.73,10.56,10.93,0.899703
1,Japan,2019,10.84,10.02,11.55,0.867148
2,Brazil,2019,10.94,9.37,13.01,0.645298
3,Kenya,2019,12.52,7.8,17.78,0.508004
4,Romania,2019,13.3,12.55,14.1,0.760284


In [6]:
# Report the (rows, columns) shape of both tables.
print(f"GBD Shape: {gbd_health.shape}")
print(f"SDI PM25 Shape: {sdi_pm25.shape}")

GBD Shape: (1500, 9)
SDI PM25 Shape: (250, 6)


In [7]:
# Count the distinct values in each column of the SDI / PM25 table.
sdi_pm25.nunique()

Country                25
Year                   10
PM25 concentration    242
PM25 lower bound      234
PM25 upper bound      243
SDI mean value        250
dtype: int64

In [8]:
# List the distinct Year values in each table; the year column is one of the
# keys the tables are merged on further down.
print(f"GBD Health Years: {gbd_health.Year.unique()}")
print(f"SDI PM25 Years: {sdi_pm25.Year.unique()}")

GBD Health Years: [2010 2011 2012 2013 2014 2015 2016 2017 2018 2019]
SDI PM25 Years: [2019 2018 2017 2016 2015 2014 2013 2012 2011 2010]


In [9]:
# List the distinct Country values in each table; the country column is one of
# the keys the tables are merged on further down.
print(f"GBD Health Countries: {gbd_health.Country.unique()}")
print(f"SDI PM25 Countries: {sdi_pm25.Country.unique()}")

GBD Health Countries: ['China' 'Indonesia' 'Fiji' 'Romania' 'Japan' 'Russian Federation'
 'Republic of Korea' 'Australia' 'Italy' 'Germany' 'Spain' 'Canada'
 'United States of America' 'Chile' 'Mexico' 'Saudi Arabia' 'Brazil'
 'Egypt' 'India' 'Afghanistan' 'Ethiopia' 'Bangladesh' 'Kenya'
 'South Africa' 'Nigeria']
SDI PM25 Countries: ['Germany' 'Japan' 'Brazil' 'Kenya' 'Romania' 'Italy' 'Mexico' 'Indonesia'
 'South Africa' 'Chile' 'Ethiopia' 'Republic of Korea' 'China'
 'Bangladesh' 'India' 'Nigeria' 'Saudi Arabia' 'Canada' 'Afghanistan'
 'Egypt' 'United States of America' 'Fiji' 'Russian Federation'
 'Australia' 'Spain']


In [10]:
# Helper function to check mismatches
def check_mismatch(df1, df2, col):
    """Print the values of ``col`` that appear in only one of two DataFrames.

    The comparison runs in both directions, so a value that exists only in
    ``df2`` is reported as well as one that exists only in ``df1``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain a column named ``col``.
    col : str
        Name of the column whose distinct values are compared.

    Returns
    -------
    None
        The result is printed rather than returned.
    """
    unique_1 = set(df1[col].unique())
    unique_2 = set(df2[col].unique())
    only_in_df1 = unique_1 - unique_2
    only_in_df2 = unique_2 - unique_1

    if only_in_df1:
        print(f"Values in {col} from df1 not in df2:", only_in_df1)
    if only_in_df2:
        # Without this direction a value missing from df1 would go unnoticed
        # and later be dropped silently by an inner merge.
        print(f"Values in {col} from df2 not in df1:", only_in_df2)
    if not only_in_df1 and not only_in_df2:
        print(f"No mismatch in '{col}'")

In [11]:
# Compare the Country values of the two tables before they are merged.
check_mismatch(gbd_health, sdi_pm25, "Country")

No mismatch in 'Country'


In [12]:
# Compare the Year values of the two tables before they are merged.
check_mismatch(gbd_health, sdi_pm25, "Year")

No mismatch in 'Year'


In [13]:
# Normalise the column labels of both tables the same way (lower-case, then
# trim surrounding whitespace) so the merge keys are spelled identically.
for frame in (gbd_health, sdi_pm25):
    frame.columns = frame.columns.str.lower().str.strip()

In [14]:
# Show the GBD health column labels after lower-casing and stripping.
gbd_health.columns

Index(['measure', 'country', 'cause', 'risk-exposure-impact', 'metric', 'year',
       'measure val', 'upper', 'lower'],
      dtype='object')

In [15]:
# Show the SDI / PM25 column labels after lower-casing and stripping.
sdi_pm25.columns

Index(['country', 'year', 'pm25 concentration', 'pm25 lower bound',
       'pm25 upper bound', 'sdi mean value'],
      dtype='object')

In [16]:
# Merge the two DataFrames
# Several health rows share one (country, year) pair, and each pair should match
# exactly one row of sdi_pm25. validate="many_to_one" makes pandas raise a
# MergeError if the right-hand keys are duplicated, instead of silently
# multiplying rows.
merged_df = pd.merge(
    gbd_health,
    sdi_pm25,
    on=["country", "year"],
    how="inner",
    validate="many_to_one",
)

# An inner join drops rows whose key has no partner; fail loudly if that happens.
assert len(merged_df) == len(gbd_health), (
    "inner merge dropped gbd_health rows that have no matching (country, year) "
    "in sdi_pm25"
)

In [17]:
# Expect the same number of rows as gbd_health if every (country, year) key
# found exactly one match in sdi_pm25.
print(merged_df.shape)

(1500, 13)


In [19]:
# Preview the merged table: the health columns followed by the pm25 / sdi columns.
merged_df.head()

Unnamed: 0,measure,country,cause,risk-exposure-impact,metric,year,measure val,upper,lower,pm25 concentration,pm25 lower bound,pm25 upper bound,sdi mean value
0,DALYs (Disability-Adjusted Life Years),China,Stroke,Ambient particulate matter pollution,Percent,2010,0.221882,0.296283,0.128691,47.18,44.59,49.38,0.641521
1,DALYs (Disability-Adjusted Life Years),China,Stroke,Ambient particulate matter pollution,Rate,2010,793.957477,1068.976845,455.632535,47.18,44.59,49.38,0.641521
2,DALYs (Disability-Adjusted Life Years),China,Stroke,Ambient particulate matter pollution,Percent,2011,0.234283,0.30752,0.138086,51.92,45.85,59.45,0.651448
3,DALYs (Disability-Adjusted Life Years),China,Stroke,Ambient particulate matter pollution,Rate,2011,835.920051,1110.785446,492.084815,51.92,45.85,59.45,0.651448
4,DALYs (Disability-Adjusted Life Years),China,Stroke,Ambient particulate matter pollution,Percent,2012,0.249782,0.321437,0.151426,53.43,46.71,59.15,0.657144


In [20]:
# Persist the merged table. index=False leaves the row index out of the CSV.
# The path is relative, so the file lands in the current working directory.
merged_df.to_csv("final_gbd_pm25_sdi.csv", index=False)