In [5]:
import numpy as np
import pandas as pd
import openpyxl as xl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [11]:
wpp = pd.read_excel("data/WPP2022_GEN_F01_DEMOGRAPHIC_INDICATORS_COMPACT_REV1.xlsx", skiprows=16)
print(wpp.columns.tolist())

['Index', 'Variant', 'Region, subregion, country or area *', 'Notes', 'Location code', 'ISO3 Alpha-code', 'ISO2 Alpha-code', 'SDMX code**', 'Type', 'Parent code', 'Year', 'Total Population, as of 1 January (thousands)', 'Total Population, as of 1 July (thousands)', 'Male Population, as of 1 July (thousands)', 'Female Population, as of 1 July (thousands)', 'Population Density, as of 1 July (persons per square km)', 'Population Sex Ratio, as of 1 July (males per 100 females)', 'Median Age, as of 1 July (years)', 'Natural Change, Births minus Deaths (thousands)', 'Rate of Natural Change (per 1,000 population)', 'Population Change (thousands)', 'Population Growth Rate (percentage)', 'Population Annual Doubling Time (years)', 'Births (thousands)', 'Births by women aged 15 to 19 (thousands)', 'Crude Birth Rate (births per 1,000 population)', 'Total Fertility Rate (live births per woman)', 'Net Reproduction Rate (surviving daughters per woman)', 'Mean Age Childbearing (years)', 'Sex Ratio at 

In [35]:
wpp_originalColNames = pd.read_excel("data/WPP2022_GEN_F01_DEMOGRAPHIC_INDICATORS_COMPACT_REV1.xlsx", skiprows=16, usecols=['Region, subregion, country or area *', 'Year', 'Total Population, as of 1 July (thousands)', 'Crude Death Rate (deaths per 1,000 population)'])
print(wpp.columns.tolist())

['Country', 'Year', 'Population', 'CDR']


In [36]:
wpp = wpp_originalColNames.rename(columns={
    'Region, subregion, country or area *': 'Country',
    'Total Population, as of 1 July (thousands)': 'Population',
    'Crude Death Rate (deaths per 1,000 population)': 'CDR'
})
print(wpp.columns.tolist())

['Country', 'Year', 'Population', 'CDR']


In [37]:
print(wpp.iloc[0])

Country             WORLD
Year               1950.0
Population    2499322.157
CDR                19.518
Name: 0, dtype: object


In [38]:
# 'Year' is blank cell in row between each country
# prepare to restore Year to 4-digit form
wpp.dropna(subset=['Year'], inplace=True)

In [39]:
wpp['Year'] = wpp['Year'].astype(int)
print(wpp.iloc[0]['Year'])

1950


In [43]:
# Filter USA and Uganda
wpp_country_filter = wpp[(wpp['Country'] == 'United States of America') | (wpp['Country'] == 'Uganda')]
print(wpp_country_filter)

                        Country  Year  Population     CDR
2884                     Uganda  1950    5750.637  25.067
2885                     Uganda  1951    5909.819  24.859
2886                     Uganda  1952    6073.833   24.36
2887                     Uganda  1953    6243.883  23.845
2888                     Uganda  1954    6419.882  23.318
...                         ...   ...         ...     ...
18575  United States of America  2017  329791.231   8.424
18576  United States of America  2018  332140.037   8.387
18577  United States of America  2019  334319.671   8.325
18578  United States of America  2020  335942.003   9.651
18579  United States of America  2021  336997.624   9.743

[144 rows x 4 columns]


In [46]:
# Filter year 2019
WPP_2019 = wpp_country_filter[wpp_country_filter['Year'] == 2019]
print(WPP_2019)

                        Country  Year  Population    CDR
2953                     Uganda  2019    42949.08  5.823
18577  United States of America  2019  334319.671  8.325


In [49]:
COPD_original = pd.read_csv("data/COPD_age-specific_USA_Uganda_2019.csv", skiprows=1, skipfooter=2, engine='python')
print(COPD_original)

   Age group (years)  Death rate, United States, 2019  \
0                0-4                             0.04   
1                5-9                             0.02   
2              10-14                             0.02   
3              15-19                             0.02   
4              20-24                             0.06   
5              25-29                             0.11   
6              30-34                             0.29   
7              35-39                             0.56   
8              40-44                             1.42   
9              45-49                             4.00   
10             50-54                            14.13   
11             55-59                            37.22   
12             60-64                            66.48   
13             65-69                           108.66   
14             70-74                           213.10   
15             75-79                           333.06   
16             80-84           

In [50]:
# harmonise age column name with WHO Standard Popualtion Distribution
# COPD = COPD_original.copy()
COPD = COPD_original.rename(columns={
    'Age group (years)': 'Age group'
})
print(COPD)

   Age group  Death rate, United States, 2019  Death rate, Uganda, 2019
0        0-4                             0.04                      0.40
1        5-9                             0.02                      0.17
2      10-14                             0.02                      0.07
3      15-19                             0.02                      0.23
4      20-24                             0.06                      0.38
5      25-29                             0.11                      0.40
6      30-34                             0.29                      0.75
7      35-39                             0.56                      1.11
8      40-44                             1.42                      2.04
9      45-49                             4.00                      5.51
10     50-54                            14.13                     13.26
11     55-59                            37.22                     33.25
12     60-64                            66.48                   

In [55]:
WHO_standard = pd.read_csv("data/WHO_World_Standard_from_NIH_NCI.csv", usecols=['Age Group', 'WHO World Standard (%)'], skiprows=1, skipfooter=2, engine='python')
WHO_standard = WHO_standard.rename(columns={
    'Age Group': 'Age group'
})
print(WHO_standard)

   Age group  WHO World Standard (%)
0        0-4                   8.860
1        5-9                   8.690
2      10-14                   8.600
3      15-19                   8.470
4      20-24                   8.220
5      25-29                   7.930
6      30-34                   7.610
7      35-39                   7.150
8      40-44                   6.590
9      45-49                   6.040
10     50-54                   5.370
11     55-59                   4.550
12     60-64                   3.720
13     65-69                   2.960
14     70-74                   2.210
15     75-79                   1.520
16     80-84                   0.910
17     85-89                   0.440
18     90-94                   0.150
19     95-99                   0.040
20      100+                   0.005


In [56]:
# COPD data's final row is age 85+
# so for compatibility merge older cohorts in WHO_standard
sum_percentage = WHO_standard.iloc[-4:]['WHO World Standard (%)'].sum()
print(sum_percentage)

# check: matches percentage in Ahmad et al paper's table 1 (0.63%)

0.635


In [57]:
# delete last 4 rows
WHO_standard.drop(WHO_standard.index[-4:], inplace=True)
new_row = pd.DataFrame({'Age group': ['85+'], 'WHO World Standard (%)': [sum_percentage]})
WHO_standard = pd.concat([WHO_standard, new_row], ignore_index=True)
# WHO_standard = pd.concat([WHO_standard.iloc[:-4], new_row], ignore_index=True)
print(WHO_standard)

   Age group  WHO World Standard (%)
0        0-4                   8.860
1        5-9                   8.690
2      10-14                   8.600
3      15-19                   8.470
4      20-24                   8.220
5      25-29                   7.930
6      30-34                   7.610
7      35-39                   7.150
8      40-44                   6.590
9      45-49                   6.040
10     50-54                   5.370
11     55-59                   4.550
12     60-64                   3.720
13     65-69                   2.960
14     70-74                   2.210
15     75-79                   1.520
16     80-84                   0.910
17       85+                   0.635


In [None]:
# Now data components are ready, begin calculations.
# COPD crude death rate
