In [1]:
# Import pandas for data cleaning
import pandas as pd

In [2]:
# Read in the WHO infectious and parasitic disease mortality data.
# The path uses forward slashes so it resolves on Windows, macOS and Linux alike
# (a backslash is only a path separator on Windows), matching the style of the
# export path used at the end of the notebook.
all_infectious_disease_data = pd.read_csv(
    "Data/WHOMortalityDatabase_Deaths_Infectious and parasitic diseases_6th May 2024 20_00.csv",
    skiprows=6,       # skip the first six lines of the file; the header row follows them
    index_col=False,  # never use the first column as the index
)
# read_csv already returns a DataFrame, so no extra DataFrame(...) wrap is needed;
# copy it so the cleaning steps below work on their own frame and the freshly
# loaded data stays available unchanged.
all_infectious_disease_df = all_infectious_disease_data.copy()
all_infectious_disease_df.head()

Unnamed: 0,Region Code,Region Name,Country Code,Country Name,Year,Sex,Age group code,Age Group,Number,Percentage of cause-specific deaths out of total deaths,Age-standardized death rate per 100 000 standard population,Death rate per 100 000 population
0,EU,Europe,ALB,Albania,1987,All,Age_unknown,[Unknown],0.0,,,
1,EU,Europe,ALB,Albania,1987,All,Age85_over,[85+],14.0,0.628931,,140.0
2,EU,Europe,ALB,Albania,1987,All,Age80_84,[80-84],6.0,0.366077,,36.144578
3,EU,Europe,ALB,Albania,1987,All,Age75_79,[75-79],15.0,0.70922,,48.701299
4,EU,Europe,ALB,Albania,1987,All,Age70_74,[70-74],13.0,0.719823,,31.862745


In [3]:
# Keep only the rows of interest: nothing from before 2010, the combined
# 'Age_all' age group, and the combined 'All' sex category.
is_before_2010 = all_infectious_disease_df['Year'] < 2010
is_all_ages = all_infectious_disease_df['Age group code'] == 'Age_all'
is_all_sexes = all_infectious_disease_df['Sex'] == 'All'
all_infectious_disease_df = all_infectious_disease_df.loc[
    ~is_before_2010 & is_all_ages & is_all_sexes
]

In [4]:
# Discard every row that has a missing value in any column.
all_infectious_disease_df = all_infectious_disease_df.dropna(axis='index', how='any')

In [5]:
# Print the frame's summary (number of entries, columns, non-null counts, dtypes)
# to check whether enough data remains for the project after the filtering and
# null-dropping steps above.
all_infectious_disease_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1082 entries, 1323 to 309519
Data columns (total 12 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Region Code                                                  1082 non-null   object 
 1   Region Name                                                  1082 non-null   object 
 2   Country Code                                                 1082 non-null   object 
 3   Country Name                                                 1082 non-null   object 
 4   Year                                                         1082 non-null   int64  
 5   Sex                                                          1082 non-null   object 
 6   Age group code                                               1082 non-null   object 
 7   Age Group                                                    1082 non-null   o

In [6]:
# Strip the surrounding square brackets from the Age Group labels
# (for example '[85+]' becomes '85+') and store the result back as strings.
age_group_labels = all_infectious_disease_df['Age Group'].str.strip('[]')
all_infectious_disease_df['Age Group'] = age_group_labels.astype(str)

In [7]:
# Display the first five rows to eyeball the result of the cleaning steps so far
# (filtered rows, no nulls, brackets removed from Age Group).
all_infectious_disease_df.head()

Unnamed: 0,Region Code,Region Name,Country Code,Country Name,Year,Sex,Age group code,Age Group,Number,Percentage of cause-specific deaths out of total deaths,Age-standardized death rate per 100 000 standard population,Death rate per 100 000 population
1323,EU,Europe,ALB,Albania,2010,All,Age_all,All,40.0,0.329516,1.289031,1.373141
3841,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,2012,All,Age_all,All,35.0,7.099391,37.839414,38.71296
3904,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,2013,All,Age_all,All,28.0,5.944798,29.327673,30.595743
3967,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,2014,All,Age_all,All,40.0,6.896552,40.583518,43.214278
6700,CSA,Central and South America,ARG,Argentina,2010,All,Age_all,All,14684.0,4.642457,29.704645,35.90593


In [8]:
# Count the distinct country names currently in the data, to get a sense of
# how many countries we can afford to drop.
country_count = all_infectious_disease_df['Country Name'].nunique(dropna=True)
print(country_count)

115


In [9]:
# Count the non-null Year entries for each country and sort ascending, so the
# countries with the fewest years of data (including those with fewer than 10)
# come first. Only the 43 smallest counts are printed.
year_counts = (
    all_infectious_disease_df
    .groupby('Country Name')['Year']
    .count()
    .sort_values()
)
print(year_counts.iloc[:43])

Country Name
Albania                                1
Syrian Arab Republic                   1
Cabo Verde                             1
Tajikistan                             2
Saudi Arabia                           2
Montenegro                             2
Iraq                                   2
Fiji                                   2
United Arab Emirates                   3
Trinidad and Tobago                    3
Lebanon                                3
Barbados                               4
Iran (Islamic Republic of)             5
Suriname                               5
Bahrain                                5
Jamaica                                5
Belarus                                5
Mayotte                                5
Martinique                             6
Mongolia                               6
Bahamas                                6
Turkmenistan                           6
New Zealand                            7
Norway                                 7
Phi

In [40]:
# Create a data frame that attaches each country's year count to every one of
# its rows by joining on the country name. Both inputs carry a 'Year' column,
# so the merged frame exposes them as 'Year_x' (from the data frame) and
# 'Year_y' (from year_counts). Assuming year_counts was computed from this same
# frame, every row finds a match and no data is lost.
output_df = all_infectious_disease_df.merge(
    right=year_counts,
    how="inner",
    on="Country Name",
    suffixes=("_x", "_y"),
)

In [41]:
# Check that the merge worked: the first five rows should now show both the
# 'Year_x' column and the added 'Year_y' column.
output_df.head()

Unnamed: 0,Region Code,Region Name,Country Code,Country Name,Year_x,Sex,Age group code,Age Group,Number,Percentage of cause-specific deaths out of total deaths,Age-standardized death rate per 100 000 standard population,Death rate per 100 000 population,Year_y
0,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,2012,All,Age_all,All,35.0,7.099391,37.839414,38.71296,9
1,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,2013,All,Age_all,All,28.0,5.944798,29.327673,30.595743,9
2,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,2014,All,Age_all,All,40.0,6.896552,40.583518,43.214278,9
3,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,2020,All,Age_all,All,26.0,4.529617,24.842839,26.549847,9
4,NAC,North America and the Caribbean,ATG,Antigua and Barbuda,2019,All,Age_all,All,24.0,3.883495,22.73087,24.712206,9


In [42]:
# Remove every row whose country has a year count below 10; the surviving rows
# become the working data frame again.
has_short_history = output_df['Year_y'] < 10
short_history_rows = output_df.loc[has_short_history].index
all_infectious_disease_df = output_df.drop(index=short_history_rows)

In [43]:
# The per-country count has served its purpose as a filter, so remove the
# Year_y column again.
all_infectious_disease_df = all_infectious_disease_df.drop(columns=['Year_y'])

In [44]:
# Check that the filter and column removal worked: the first five rows should
# no longer include a 'Year_y' column.
all_infectious_disease_df.head()

Unnamed: 0,Region Code,Region Name,Country Code,Country Name,Year_x,Sex,Age group code,Age Group,Number,Percentage of cause-specific deaths out of total deaths,Age-standardized death rate per 100 000 standard population,Death rate per 100 000 population
9,CSA,Central and South America,ARG,Argentina,2011,All,Age_all,All,14572.0,4.600982,28.98691,35.265788
10,CSA,Central and South America,ARG,Argentina,2012,All,Age_all,All,13986.0,4.409984,27.442447,33.495233
11,CSA,Central and South America,ARG,Argentina,2013,All,Age_all,All,14654.0,4.532254,28.102552,34.728386
12,CSA,Central and South America,ARG,Argentina,2014,All,Age_all,All,14816.0,4.600742,27.769779,34.748745
13,CSA,Central and South America,ARG,Argentina,2020,All,Age_all,All,13267.0,3.526403,22.155131,29.354514


In [45]:
# Give the merge-suffixed Year_x column its plain 'Year' name back.
all_infectious_disease_df = all_infectious_disease_df.rename(
    {'Year_x': 'Year'},
    axis='columns',
)

In [46]:
# Verify the filter: recount the non-null Year entries per country and sort
# ascending, so the smallest remaining counts are printed first (the first 43
# entries are shown).
year_counts = (
    all_infectious_disease_df
    .groupby('Country Name')['Year']
    .count()
    .sort_values()
)
print(year_counts.iloc[:43])

Country Name
Kyrgyzstan                                              10
Thailand                                                10
T?rkiye                                                 10
Slovakia                                                10
Russian Federation                                      10
Romania                                                 10
Portugal                                                10
Maldives                                                10
Uruguay                                                 10
Kuwait                                                  10
Guyana                                                  10
Egypt                                                   10
Uzbekistan                                              10
Brunei Darussalam                                       10
Malta                                                   11
United Kingdom of Great Britain and Northern Ireland    11
Slovenia                                   

In [47]:
# Print the frame's summary again (number of entries, columns, non-null counts,
# dtypes) to check that enough data is left after removing countries with a
# year count below 10.
all_infectious_disease_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 841 entries, 9 to 849
Data columns (total 12 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Region Code                                                  841 non-null    object 
 1   Region Name                                                  841 non-null    object 
 2   Country Code                                                 841 non-null    object 
 3   Country Name                                                 841 non-null    object 
 4   Year                                                         841 non-null    int64  
 5   Sex                                                          841 non-null    object 
 6   Age group code                                               841 non-null    object 
 7   Age Group                                                    841 non-null    obj

In [50]:
# Export the cleaned data to CSV. The frame's index is written as the first
# (unnamed) column of the file, which is pandas' default behaviour.
all_infectious_disease_df.to_csv(
    path_or_buf='Data/all_infectious_disease.csv',
    index=True,
)