# Checking out the data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)


In [9]:
tb_df = pd.read_csv('data/tuberculosis.csv')
gdp_df = pd.read_csv('data/P_Popular_Indicators/gdp.csv')
# gdp_metadata = pd.read_csv('data/P_Popular_Indicators/gdp_metadata.csv')

## Cleaning the tuberculosis data

In [4]:
tb_df.head()

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,IsLatestYear,Dim1 type,Dim1,Dim1ValueCode,Dim2 type,Dim2,Dim2ValueCode,Dim3 type,Dim3,Dim3ValueCode,DataSourceDimValueCode,DataSource,FactValueNumericPrefix,FactValueNumeric,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,MDG_0000000020,Incidence of tuberculosis (per 100 000 populat...,text,EUR,Europe,Country,SMR,San Marino,Year,2023,True,,,,,,,,,,,,,0.0,,,0.0,,0.0,0 [0-0],,,EN,2024-10-28T07:00:00.000Z
1,MDG_0000000020,Incidence of tuberculosis (per 100 000 populat...,text,EMR,Eastern Mediterranean,Country,PSE,"occupied Palestinian territory, including east...",Year,2023,True,,,,,,,,,,,,,0.35,,,0.27,,0.44,0.35 [0.27-0.44],,,EN,2024-10-28T07:00:00.000Z
2,MDG_0000000020,Incidence of tuberculosis (per 100 000 populat...,text,AMR,Americas,Country,PRI,Puerto Rico,Year,2023,True,,,,,,,,,,,,,0.71,,,0.61,,0.82,0.71 [0.61-0.82],,,EN,2024-10-28T07:00:00.000Z
3,MDG_0000000020,Incidence of tuberculosis (per 100 000 populat...,text,EMR,Eastern Mediterranean,Country,ARE,United Arab Emirates,Year,2023,True,,,,,,,,,,,,,0.8,,,0.68,,0.92,0.8 [0.68-0.92],,,EN,2024-10-28T07:00:00.000Z
4,MDG_0000000020,Incidence of tuberculosis (per 100 000 populat...,text,AMR,Americas,Country,BRB,Barbados,Year,2023,True,,,,,,,,,,,,,0.81,,,0.7,,0.94,0.81 [0.7-0.94],,,EN,2024-10-28T07:00:00.000Z


In [None]:
# Select desired columns
tb_clean = tb_df[['ParentLocationCode', 'ParentLocation', 'SpatialDimValueCode', 'Location', 'Period', 'FactValueNumeric','FactValueNumericLow', 'FactValueNumericHigh', 'Value']]

# Rename
tb_clean = tb_clean.rename(columns={
    'ParentLocationCode': 'ContinentCode', 
    'ParentLocation': 'Continent', 
    'SpatialDimValueCode': 'CountryCode', 
    'Location': 'CountryName', 
    'Period': 'Year', 
    'FactValueNumeric': 'Incidence',
    'FactValueNumericLow': 'IncidenceLow', 
    'FactValueNumericHigh': 'IncidenceHigh', 
    'Value': 'IncidenceRange'
    })   
    
tb_clean.head()


Unnamed: 0,ContinentCode,Continent,CountryCode,CountryName,Year,Incidence,IncidenceLow,IncidenceHigh,IncidenceRange
0,EUR,Europe,SMR,San Marino,2023,0.0,0.0,0.0,0 [0-0]
1,EMR,Eastern Mediterranean,PSE,"occupied Palestinian territory, including east...",2023,0.35,0.27,0.44,0.35 [0.27-0.44]
2,AMR,Americas,PRI,Puerto Rico,2023,0.71,0.61,0.82,0.71 [0.61-0.82]
3,EMR,Eastern Mediterranean,ARE,United Arab Emirates,2023,0.8,0.68,0.92,0.8 [0.68-0.92]
4,AMR,Americas,BRB,Barbados,2023,0.81,0.7,0.94,0.81 [0.7-0.94]


In [11]:
tb_clean.describe()

Unnamed: 0,Year,Incidence,IncidenceLow,IncidenceHigh
count,4729.0,4729.0,4729.0,4729.0
mean,2011.53986,130.282041,77.103772,204.833694
std,6.913056,188.602665,99.134271,368.390021
min,2000.0,0.0,0.0,0.0
25%,2006.0,13.0,11.0,16.0
50%,2012.0,52.0,37.0,66.0
75%,2018.0,179.0,105.0,263.0
max,2023.0,1590.0,907.0,5510.0


In [12]:
tb_clean.isna().sum()

ContinentCode     0
Continent         0
CountryCode       0
CountryName       0
Year              0
Incidence         0
IncidenceLow      0
IncidenceHigh     0
IncidenceRange    0
dtype: int64

## Clean the GDP data

In [16]:
gdp_df['Series Name'].unique()

array(['Population, total', 'Population growth (annual %)',
       'Surface area (sq. km)',
       'Poverty headcount ratio at national poverty lines (% of population)',
       'GNI, Atlas method (current US$)',
       'GNI per capita, Atlas method (current US$)',
       'GNI, PPP (current international $)',
       'GNI per capita, PPP (current international $)',
       'Income share held by lowest 20%',
       'Life expectancy at birth, total (years)',
       'Fertility rate, total (births per woman)',
       'Adolescent fertility rate (births per 1,000 women ages 15-19)',
       'Contraceptive prevalence, any method (% of married women ages 15-49)',
       'Births attended by skilled health staff (% of total)',
       'Mortality rate, under-5 (per 1,000 live births)',
       'Prevalence of underweight, weight for age (% of children under 5)',
       'Immunization, measles (% of children ages 12-23 months)',
       'Primary completion rate, total (% of relevant age group)',
       'Sc

In [26]:
gdp_df.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,"Population, total",SP.POP.TOTL,Afghanistan,AFG,20130327,20284307,21378117,22733049,23560654,24404567,25424094,25909852,26482622,27466101,28284089,29347708,30560034,31622704,32792523,33831764,34700612,35688935,36743039,37856121,39068979,40000412,40578842,41454761
1,"Population, total",SP.POP.TOTL,Albania,ALB,3089027,3060173,3051010,3039616,3026939,3011487,2992547,2970017,2947314,2927519,2913021,2905195,2900401,2895092,2889104,2880703,2876101,2873457,2866376,2854191,2837849,2811666,2777689,2745972
2,"Population, total",SP.POP.TOTL,Algeria,DZA,30903893,31331221,31750835,32175818,32628286,33109249,33623506,34189416,34816961,35490445,36188236,36903376,37646166,38414171,39205031,40019529,40850721,41689299,42505035,43294546,44042091,44761099,45477389,46164219
3,"Population, total",SP.POP.TOTL,American Samoa,ASM,56855,57053,57062,56971,56818,56617,56374,56113,55828,55528,55228,54895,54489,54006,53466,52878,52245,51586,50908,50209,49761,49225,48342,47521
4,"Population, total",SP.POP.TOTL,Andorra,AND,65685,65852,66506,69486,74325,77421,79585,81877,83495,83888,80706,77783,76834,75194,73737,72174,72181,73763,75162,76474,77380,78364,79705,80856


In [34]:
gdp_df.columns[5:].astype(str).split()


AttributeError: 'Index' object has no attribute 'split'

In [37]:
# gdp_df = gdp_df.rename(columns={
#     'Country Name': 'CountryName', 
#     'Country Code': 'CountryCode', 
#     'Series Name': 'SeriesName',
#     'Year': 'Year', 
# })


for col in gdp_df.columns[4:]:
    gdp_df = gdp_df.rename(columns={col: col.split(' ')[0]})

gdp_df.columns
    

Index(['Series Name', 'Series Code', 'Country Name', 'Country Code', '2000',
       '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018',
       '2019', '2020', '2021', '2022', '2023'],
      dtype='object')

In [27]:
gdp_df.columns

Index(['Series Name', 'Series Code', 'Country Name', 'Country Code',
       '2000 [YR2000]', '2001 [YR2001]', '2002 [YR2002]', '2003 [YR2003]',
       '2004 [YR2004]', '2005 [YR2005]', '2006 [YR2006]', '2007 [YR2007]',
       '2008 [YR2008]', '2009 [YR2009]', '2010 [YR2010]', '2011 [YR2011]',
       '2012 [YR2012]', '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]',
       '2016 [YR2016]', '2017 [YR2017]', '2018 [YR2018]', '2019 [YR2019]',
       '2020 [YR2020]', '2021 [YR2021]', '2022 [YR2022]', '2023 [YR2023]'],
      dtype='object')

In [None]:
type(gdp_df.columns)



pandas.core.indexes.base.Index