file tb_2022-09-30.csv:

In [30]:
import pandas as pd
# Load the dataset
tb_data = pd.read_csv('tb_2022-09-30.csv')

# Step 1: Drop columns with more than 50% missing values.
# `thresh` is the minimum number of non-null entries a column needs in order
# to be kept; leaving it as a float keeps the 50% cut-off exact when the row
# count is odd.
tb_data_cleaned = tb_data.dropna(thresh=tb_data.shape[0] * 0.5, axis=1)

# Step 2: Remove rows with missing critical values
tb_data_cleaned = tb_data_cleaned.dropna(subset=['iso3', 'country', 'year'])

# Step 3: Fill missing numeric values with interpolation (time-based for years)
numeric_columns = tb_data_cleaned.select_dtypes(include=['float64', 'int64']).columns

# Interpolate missing values within each country, walking its rows in year
# order. transform() returns a result carrying the original row labels, so the
# assignment (which aligns on the index) puts every value back on the row it
# was computed from. Renumbering the groupby result with reset_index() would
# only line up if the frame's index were still a contiguous 0..n-1 range
# already ordered by country, which stops being true as soon as Step 2 drops
# a row.
tb_data_cleaned[numeric_columns] = (
    tb_data_cleaned.sort_values(['country', 'year'])
    .groupby('country')[numeric_columns]
    .transform(lambda series: series.interpolate())
)

# Handle zero values in numeric columns by replacing them with the column mean.
# The mean is taken before the replacement, so the zeros themselves still
# count toward it (NaN entries are skipped by mean()).
for col in numeric_columns:
    col_mean = tb_data_cleaned[col].mean()
    tb_data_cleaned[col] = tb_data_cleaned[col].replace(0, col_mean)

# Step 4: Normalize metrics (e.g., TB cases per 100,000 people)
tb_data_cleaned['tb_cases_per_100k'] = tb_data_cleaned['newinc'] / tb_data_cleaned['pop'] * 100000

# Save the preprocessed file
tb_data_cleaned.to_csv('tb_preprocessfile_1.csv', index=False)


In [36]:
# Display the column labels of the cleaned TB frame (including the derived
# 'tb_cases_per_100k' column added during preprocessing).
tb_data_cleaned.columns

Index(['iso3', 'year', 'country', 'iso2', 'iso.numeric', 'g.whoregion',
       'new.ep', 'tot.newrel', 'c.newunk', 'c.newinc', 'c.ret', 'c.notified',
       'c.new.014', 'e.pop.m04', 'e.pop.m514', 'e.pop.m014', 'e.pop.m1524',
       'e.pop.m2534', 'e.pop.m3544', 'e.pop.m4554', 'e.pop.m5564', 'e.pop.m65',
       'e.pop.m15plus', 'e.pop.f04', 'e.pop.f514', 'e.pop.f014', 'e.pop.f1524',
       'e.pop.f2534', 'e.pop.f3544', 'e.pop.f4554', 'e.pop.f5564', 'e.pop.f65',
       'e.pop.f15plus', 'e.pop.15plus', 'e.pop.num', 'pop', 'newinc',
       'c.new.tsr', 'ch', 'conf', 'ep', 'tb_cases_per_100k'],
      dtype='object')

file tx_2022-08-29.csv:

In [35]:
# Read the raw export into a DataFrame.
tx_data = pd.read_csv('tx_2022-08-29.csv')

# Step 1: keep only the columns that are populated in at least half the rows.
min_non_null = tx_data.shape[0] * 0.5
tx_data_cleaned = tx_data.dropna(axis=1, thresh=min_non_null)

# Step 2: discard any row that lacks one of the identifying fields.
required_fields = ['country', 'iso3', 'year']
tx_data_cleaned = tx_data_cleaned.dropna(subset=required_fields)

# Step 3: plug the remaining gaps in numeric columns with each column's median.
numeric_columns = tx_data_cleaned.select_dtypes(include=['float64', 'int64']).columns
column_medians = tx_data_cleaned[numeric_columns].median()
tx_data_cleaned[numeric_columns] = tx_data_cleaned[numeric_columns].fillna(column_medians)

# Step 4: write the cleaned table to disk without the index column.
tx_data_cleaned.to_csv('tb_preprocessfile_2.csv', index=False)



In [37]:
# Display the column labels that remain in the cleaned frame.
tx_data_cleaned.columns

Index(['country', 'year', 'iso2', 'iso3', 'iso.numeric', 'g.whoregion',
       'rep.meth', 'new.sp.coh', 'new.sp.cur', 'new.sp.cmplt', 'new.sp.died',
       'new.sp.def', 'c.new.sp.neval', 'c.new.sp.tsr', 'c.tsr', 'c.ret.tsr',
       'c.new.tsr'],
      dtype='object')