In [1]:
# import necessary libraries
import os
import re
from collections import Counter
from datetime import datetime

import janitor
import matplotlib.pyplot as plt
import missingno as msn
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil import parser
from dotenv import load_dotenv

# Install local package using "pip install -e . --config-setting editable_mode=compat"
import src
from src.clean import load_data_and_dict

In [2]:
# Load the West Africa TB workbook (sheet "R4KA_Gambia").
# The workbook location is read from the TBDATA_FILE_PATH environment variable,
# which load_dotenv() can populate from a local .env file.
load_dotenv()

tb_data_path = os.getenv('TBDATA_FILE_PATH')
if not tb_data_path:
    # Without this check pd.read_excel(None) fails with an opaque pandas error.
    raise RuntimeError(
        "TBDATA_FILE_PATH is not set; define it in the environment or in a .env file."
    )

df = pd.read_excel(tb_data_path, sheet_name='R4KA_Gambia')

# NOTE(review): the rendered preview shows patient-level fields (initials, dates of
# birth); clear the cell output before sharing the notebook.
df.head()

Unnamed: 0,SerialNo,Date_1,StudyID_2,Adultindex_3,Patientsin_4,DOB_5,Ageifupto5_6,Ageifover5_7,Sex_8,Interviewe_9,...,HIVserolog_110,Urine_118,Diagnosis_119,IfEPTBspec_120,IfnotTBspe_121,Isthepatie_124,Ifyesplace_125,Datetreatm_126,TreatmentI_127,Otherplan_128
0,173727,2017-08-01,G17-001,,SJ,2010-05-03,,,2 - Male,SK,...,0.0,1 - Yes,3.0,,VIRAL RTI,0 - No,,NaT,,"FOR FOLLOW UP AT MONTHS 1,3 & 6"
1,173722,2017-08-01,G17-002,,AM,2013-01-21,,,1 - Female,SK,...,0.0,1 - Yes,3.0,,REACTIVE AIRWAY DISEASE,0 - No,,NaT,,"FOLLOW UP AT MONTHS 1,3 & 6"
2,173748,2017-08-08,G17-003,,EN,2012-10-20,,,2 - Male,SK,...,0.0,1 - Yes,3.0,,REACTIVE AIRWAY DISEASE,0 - No,,NaT,,FOR SPUTUM INDUCTION AND REVIEW WITH RESULTS. ...
3,173749,2017-08-08,G17-004,,MT,2006-06-15,,,2 - Male,LJ,...,0.0,1 - Yes,3.0,,URTI,0 - No,,NaT,,TO COLLECT SPONTANEOUS SPUTUM (x2 SAMPLES) FOR...
4,173734,2017-08-10,G17-005,,ADD,2016-10-21,,,2 - Male,SK,...,0.0,1 - Yes,3.0,,VIRAL RTI,0 - No,,NaT,,"FOR FOLLOW UP AT 1,3 AND 6 MONTHS"


In [3]:
# Parse the visit date in place (the column is kept); unparseable entries become NaT.
df["Date_1"] = pd.to_datetime(df["Date_1"], errors="coerce")
birth_dates = pd.to_datetime(df["DOB_5"], errors="coerce")

# Age in years at the visit date, rounded to one decimal.
# The result is NaN wherever either date is missing.
age_from_dates = ((df["Date_1"] - birth_dates).dt.days / 365.25).round(1)

# Unified Age, in priority order:
#   1. age computed from the two dates
#   2. the "age if up to 5" column
#   3. the "age if over 5" column
# Rows with none of the three stay NaN.
# NOTE(review): the two reported-age columns are assumed to be in years like the
# computed age — confirm against the data dictionary.
df["Age"] = (
    age_from_dates
    .fillna(df["Ageifupto5_6"])
    .fillna(df["Ageifover5_7"])
)

# The three source columns are now folded into Age, so remove them.
df = df.drop(columns=["DOB_5", "Ageifupto5_6", "Ageifover5_7"])


In [4]:
# Tally the values of Ifothermyc_106, counting NaN as its own category.
df['Ifothermyc_106'].value_counts(dropna=False)

Ifothermyc_106
NaN    307
Name: count, dtype: int64

In [5]:
# List every column name of df as a plain Python list.
df.columns.tolist()

['SerialNo',
 'Date_1',
 'StudyID_2',
 'Adultindex_3',
 'Patientsin_4',
 'Sex_8',
 'Interviewe_9',
 'Cough_10',
 'Productive_11',
 'Chestpain_12',
 'Wheeze_13',
 'FeverChill_14',
 'Lossfailur_15',
 'Difficulty_16',
 'Nightsweat_17',
 'Sidepain_18',
 'MalaiseFat_19',
 'Lossofappe_20',
 'Haemoptysi_21',
 'Vomiting_22',
 'Previoushi_23',
 'Ifyeswhen_24',
 'Pallor_25',
 'Splenomega_26',
 'Jaundice_27',
 'Hepatomega_28',
 'Wasting_29',
 'Oedema_30',
 'Clubbing_31',
 'Irritabili_32',
 'Dyspnoea_33',
 'Deformityo_34',
 'Lethargy_35',
 'Abdominald_36',
 'Lymphadeno_37',
 'Stifforwea_38',
 'Temperatur_39',
 'WeightKg_40',
 'Heightcm_41',
 'Occipitofr_42',
 'Midupperar_43',
 'Pulserateb_44',
 'Otherclini_45',
 'Respirator_46',
 'Subcostalr_47',
 'Dullness_48',
 'Crepitatio_49',
 'Bronchialb_50',
 'Rhonchi_51',
 'Reducedbre_52',
 'Otherabnor_53',
 'Ifotherabn_54',
 'ChestXrayd_55',
 'ChestXrayd_56',
 'ChestXrayq_57',
 'ChestXrayn_58',
 'Lobarconso_59',
 'Bronchopne_60',
 'Perihilara_61',
 'Hyperi

In [6]:
# Tally the values of Ifotherabn_54, counting NaN as its own category.
df['Ifotherabn_54'].value_counts(dropna=False)

Ifotherabn_54
0.0    254
NaN     49
2.0      2
1.0      1
3.0      1
Name: count, dtype: int64

In [7]:
# Raw column name (key) -> target column name (value).
# Only the columns listed here are kept when the mapping is applied to df.
# "Age" is the derived column created earlier in the notebook, not a raw column.
existing_mapping = {
    "SerialNo": "patient_id",
    "Date_1": "date_particip",
    "Sex_8": "gender",
    "Age": "age",
    "Cough_10": "cough",
    "Productive_11": "productive",
    "Chestpain_12": "chest_pain",
    "Wheeze_13": "wheeze",
    "FeverChill_14": "fever_chills",
    "Lossfailur_15": "loss_weight",
    "Difficulty_16": "difficulty_breathing",
    "Nightsweat_17": "night_sweats",
    "Sidepain_18": "side_pain",
    "MalaiseFat_19": "malaise_fatigue",
    "Lossofappe_20": "loss_appetite",
    "Haemoptysi_21": "haemoptysis",
    "Vomiting_22": "vomiting",
    "Previoushi_23": "pretb_history",
    "Ifyeswhen_24": "date_previous_tb",
    "Pallor_25": "pallor",
    "Splenomega_26": "splenomegaly",
    "Jaundice_27": "jaundice",
    "Hepatomega_28": "hepatomegaly",
    "Wasting_29": "wasting",
    "Dyspnoea_33": "dyspnoea",
    "Lethargy_35": "lethargy",
    "Abdominald_36": "abd_distension_mass",
    "Lymphadeno_37": "neck_lymph",
    "Temperatur_39": "temperature",
    "WeightKg_40": "weight",
    "Heightcm_41": "height",
    "Midupperar_43": "mua_circumference",
    "Otherclini_45": "other_clinical_sign",
    "Respirator_46": "respiratory_rate",
    "Subcostalr_47": "subscostal_rec",  # NOTE(review): looks like a typo for "subcostal" — confirm against the target schema before changing
    "Dullness_48": "dullness",
    "Crepitatio_49": "crepitation",
    "Bronchialb_50": "bronchial_breathing",
    "Rhonchi_51": "rhonchi",
    "Reducedbre_52": "reduced_breath_sounds",
    "Otherabnor_53": "other_abnorm",
    "Ifotherabn_54": "abnorm_site",
    "Assessment_95": "xray_assessment",
    "OtherXrayd_96": "other_xray",
    "Typeofspec_97": "specimen_type",
    "Ifsputumty_99": "sputum_type",
    "GeneXpertr_104": "genexpert_result",
    "MGITcultur_105": "mgit_result",
    "HIVserolog_110": "hiv_serology",
    "Diagnosis_119": "diagnosis",
    "IfEPTBspec_120": "eptb_site",
    "IfnotTBspe_121": "nottb_diagnosis",
}

# Keep only the mapped columns, in mapping order, then switch to the target names.
# A raw column missing from df raises KeyError here.
mapped_columns = list(existing_mapping)
df = df.loc[:, mapped_columns].rename(columns=existing_mapping)


In [8]:
# Cleaning function

# Placeholder strings that stand for "no value" (matched case-insensitively).
_MISSING_VALUE_RE = re.compile(r"^\s*$|^_+$|^(n/a|na|null|none|unknown|\?)$", re.IGNORECASE)

# A bare integer or decimal such as "3", "12" or "3.0".
_BARE_NUMBER_RE = re.compile(r"[+-]?\d+(\.\d+)?")

# Two reference dates that differ in year, month and day. dateutil fills every
# date part missing from the text with the matching part of `default`, so a text
# that yields the same calendar date against both references must have supplied
# year, month and day itself.
_DATE_PROBES = (datetime(2000, 1, 1), datetime(2001, 2, 2))


def _parse_complete_date(text):
    """Return `text` as 'YYYY-MM-DD' if it spells out a full date, else None.

    Partial dates ("2017", "May", "Aug 2017", "sat", "12:30") are rejected:
    dateutil would otherwise complete them from today's date, fabricating
    precision and making the result depend on the day the notebook is run.
    """
    try:
        first, second = [
            parser.parse(text, fuzzy=False, default=probe) for probe in _DATE_PROBES
        ]
    except (ValueError, TypeError, OverflowError):
        # ParserError is a ValueError; OverflowError is raised for huge numbers.
        return None
    if first.date() != second.date():
        return None
    return first.strftime('%Y-%m-%d')


def clean_column(val):
    """Normalise a single cell value of a text (object) column.

    Steps, in order:
      1. Missing values (anything `pd.isna` accepts) are returned unchanged.
      2. The value is converted to `str` and stripped of surrounding whitespace.
      3. Placeholder strings (empty, only underscores, "n/a", "na", "null",
         "none", "unknown", "?"; case-insensitive) become `pd.NA`.
      4. Bare numbers ("3", "3.0") are returned as the stripped string. They are
         treated as codes or measurements, never as dates, and keep their
         decimal point.
      5. A text that spells out a complete year-month-day date is returned as
         'YYYY-MM-DD'.
      6. Anything else is lower-cased, whitespace runs become "_", and every
         character other than letters, digits and "_" is removed.

    NOTE(review): ambiguous numeric dates such as "01/08/2017" are read
    month-first by dateutil's default settings — confirm this matches the
    convention used in the source data.

    Returns:
        The original missing value, `pd.NA`, or a cleaned `str`.
    """
    if pd.isna(val):
        return val
    text = str(val).strip()

    # Placeholders never parse as dates, so checking them first gives the same
    # result and avoids needless parser calls.
    if _MISSING_VALUE_RE.match(text):
        return pd.NA

    if _BARE_NUMBER_RE.fullmatch(text):
        return text

    iso_date = _parse_complete_date(text)
    if iso_date is not None:
        return iso_date

    # Normalize text values
    text = re.sub(r"\s+", "_", text.lower())
    return re.sub(r"[^a-zA-Z0-9_]", "", text)

# Cleaning pipeline
def _clean_if_object(column):
    """Apply clean_column to every value of an object-dtype column; pass others through."""
    if column.dtype == "object":
        return column.map(clean_column)
    return column


# 1. standardise the column labels (pyjanitor's clean_names)
# 2. clean the values of every object-dtype column
# 3. keep only the first row for each patient_id
labels_cleaned = df.clean_names()
values_cleaned = labels_cleaned.apply(_clean_if_object)
df = values_cleaned.drop_duplicates(subset='patient_id')

display(df.head())

Unnamed: 0,patient_id,date_particip,gender,age,cough,productive,chest_pain,wheeze,fever_chills,loss_weight,...,xray_assessment,other_xray,specimen_type,sputum_type,genexpert_result,mgit_result,hiv_serology,diagnosis,eptb_site,nottb_diagnosis
0,173727,2017-08-01,2__male,7.2,0__no_cough_or_cough__2__cough__3__unknown_wee...,0__no,1__yes,0__no,1__yes,1__yes,...,2,,1.0,2.0,0,0,0.0,3.0,,viral_rti
1,173722,2017-08-01,1__female,4.5,2__cough__3__unknown_weeks,0__no,1__yes,0__no,1__yes,1__yes,...,2,,1.0,2.0,0,0,0.0,3.0,,reactive_airway_disease
2,173748,2017-08-08,2__male,4.8,2__cough__3__unknown_weeks,0__no,1__yes,1__yes,1__yes,1__yes,...,2,,1.0,2.0,0,0,0.0,3.0,,reactive_airway_disease
3,173749,2017-08-08,2__male,11.1,1__cough__2__cough__3__unknown_weeks_weeks,1__yes,0__no,0__no,0__no,0__no,...,1,,1.0,1.0,0,3,0.0,3.0,,urti
4,173734,2017-08-10,2__male,0.8,0__no_cough_or_cough__2__cough__3__unknown_wee...,1__yes,1__yes,1__yes,1__yes,1__yes,...,2,,1.0,2.0,0,0,0.0,3.0,,viral_rti


In [11]:
# Lift pandas' display limits so the whole Series prints below.
# NOTE(review): set_option changes these settings for every later cell in the
# session; pd.option_context would confine them to this cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Show every value of the 'cough' column.
df['cough']

0      0__no_cough_or_cough__2__cough__3__unknown_wee...
1                             2__cough__3__unknown_weeks
2                             2__cough__3__unknown_weeks
3             1__cough__2__cough__3__unknown_weeks_weeks
4      0__no_cough_or_cough__2__cough__3__unknown_wee...
5             1__cough__2__cough__3__unknown_weeks_weeks
6             1__cough__2__cough__3__unknown_weeks_weeks
7      0__no_cough_or_cough__2__cough__3__unknown_wee...
8             1__cough__2__cough__3__unknown_weeks_weeks
9                             2__cough__3__unknown_weeks
10            1__cough__2__cough__3__unknown_weeks_weeks
11            1__cough__2__cough__3__unknown_weeks_weeks
12                            2__cough__3__unknown_weeks
13            1__cough__2__cough__3__unknown_weeks_weeks
14            1__cough__2__cough__3__unknown_weeks_weeks
15            1__cough__2__cough__3__unknown_weeks_weeks
16            1__cough__2__cough__3__unknown_weeks_weeks
17            1__cough__2__coug

In [9]:
# Show the current column labels of df.
df.columns

Index(['patient_id', 'date_particip', 'gender', 'age', 'cough', 'productive',
       'chest_pain', 'wheeze', 'fever_chills', 'loss_weight',
       'difficulty_breathing', 'night_sweats', 'side_pain', 'malaise_fatigue',
       'loss_appetite', 'haemoptysis', 'vomiting', 'pretb_history',
       'date_previous_tb', 'pallor', 'splenomegaly', 'jaundice',
       'hepatomegaly', 'wasting', 'dyspnoea', 'lethargy',
       'abd_distension_mass', 'neck_lymph', 'temperature', 'weight', 'height',
       'mua_circumference', 'other_clinical_sign', 'respiratory_rate',
       'subscostal_rec', 'dullness', 'crepitation', 'bronchial_breathing',
       'rhonchi', 'reduced_breath_sounds', 'other_abnorm', 'abnorm_site',
       'xray_assessment', 'other_xray', 'specimen_type', 'sputum_type',
       'genexpert_result', 'mgit_result', 'hiv_serology', 'diagnosis',
       'eptb_site', 'nottb_diagnosis'],
      dtype='object')

`There are 5 columns that don't match: "spec_result", "other_mycobactera", "histology_result", "antitb_treatment", "country"`


In [12]:
# (rows, columns) of df.
df.shape

(307, 52)