## Cleaning the UNAIDS dataset

In [41]:
import pandas as pd


# --- Configuration for this cell -------------------------------------------
# Path to the raw workbook and the layout assumptions used when flattening it.
RAW_UNAIDS_PATH = '../../data/raw_data/UNAIDS.xlsx'
ID_COLUMNS = ['Year', 'Country Code', 'Country']  # names given to the first three columns
HEADER_SCAN_ROWS = 5       # number of leading rows searched for indicator titles
COLUMNS_PER_INDICATOR = 3  # each indicator group is Estimate, Low, High; only Estimate is kept
# NOTE(review): the first 7 data rows are discarded after loading; presumably
# they are header/sub-header rows -- confirm against the workbook.
LEADING_ROWS_TO_DROP = 7

# Fallback indicator names, used when no title row can be detected in the sheet.
MANUAL_INDICATORS = [
    "Adults (15-49) prevalence (%)",
    "Young women (15-24) prevalence (%)",
    "Young men (15-24) prevalence (%)",
    "AIDS-related deaths among adults and children",
    "AIDS-related deaths among children (0-14)",
    "AIDS-related deaths among adults (15+)",
    "Estimated children (0-14) living with HIV",
    "Estimated women (15+) living with HIV",
    "Estimated adults (15+) living with HIV",
    "Estimated adults and children living with HIV",
    "Adults (15-49) incidence (per 1000 uninfected population)",
    "All ages incidence (per 1000 uninfected population)",
    "Pregnant women needing antiretrovirals for preventing mother-to-child transmission",
    "Children (0-14) newly infected with HIV",
    "Adults (15+) newly infected with HIV",
    "Adults and children newly infected with HIV",
]

# --- Step 1: look for a row of indicator titles in the raw sheet ------------
df_raw = pd.read_excel(RAW_UNAIDS_PATH, header=None)

found_indicators = False
indicator_row = None

# min() keeps the scan inside the sheet when it has fewer rows than the window.
for row_idx in range(min(HEADER_SCAN_ROWS, len(df_raw))):
    row_content = df_raw.iloc[row_idx].dropna().tolist()
    if any('prevalence' in str(cell).lower() for cell in row_content):
        indicator_row = row_idx
        found_indicators = True
        print(f"Found indicators in row {row_idx + 1}")
        break

# --- Step 2: decide which indicator names to use -----------------------------
if found_indicators:
    # Every non-blank string cell in the detected row is treated as an indicator title.
    indicators = [
        cell
        for cell in df_raw.iloc[indicator_row].tolist()
        if isinstance(cell, str) and cell.strip()
    ]
    print(f"Found indicators: {indicators}")
else:
    print("No indicators found in the file, using manual mapping")
    indicators = list(MANUAL_INDICATORS)

# --- Step 3: build the flat table (shared by both branches) ------------------
df = pd.read_excel(RAW_UNAIDS_PATH, header=0)

# Keep the identifier columns and always give them stable names: the cleaning
# cell that follows skips "Country Code" and "Country" by name, so these names
# must be present regardless of how the indicator titles were obtained.
result_df = df.iloc[:, :len(ID_COLUMNS)].copy()
result_df.columns = ID_COLUMNS

# The i-th indicator's Estimate column sits at 3 + 3*i; Low/High are skipped.
for position, indicator in enumerate(indicators):
    estimate_idx = len(ID_COLUMNS) + position * COLUMNS_PER_INDICATOR
    if estimate_idx >= len(df.columns):
        # No more column groups in the sheet; later indicators cannot be mapped.
        break
    result_df[indicator] = df.iloc[:, estimate_idx]

result_df = result_df.iloc[LEADING_ROWS_TO_DROP:].reset_index(drop=True)

print(f"Column names in cleaned dataset: {result_df.columns.tolist()}")

No indicators found in the file, using manual mapping
Column names in cleaned dataset: ['Year', 'Country Code', 'Country', 'Adults (15-49) prevalence (%)', 'Young women (15-24) prevalence (%)', 'Young men (15-24) prevalence (%)', 'AIDS-related deaths among adults and children', 'AIDS-related deaths among children (0-14)', 'AIDS-related deaths among adults (15+)', 'Estimated children (0-14) living with HIV', 'Estimated women (15+) living with HIV', 'Estimated adults (15+) living with HIV', 'Estimated adults and children living with HIV', 'Adults (15-49) incidence (per 1000 uninfected population)', 'All ages incidence (per 1000 uninfected population)', 'Pregnant women needing antiretrovirals for preventing mother-to-child transmission', 'Children (0-14) newly infected with HIV', 'Adults (15+) newly infected with HIV', 'Adults and children newly infected with HIV']


In [42]:
import numpy as np

# Columns that hold text and must not be converted to numbers.
TEXT_COLUMNS = ["Country Code", "Country"]
MAX_EXAMPLES = 10  # cap on how many unparseable cells are listed in the report


def _parse_unaids_value(val):
    """Convert one raw cell into a numeric value.

    Returns a ``(value, ok)`` pair. ``ok`` is False when ``val`` is a string
    that none of the known formats could parse; in that case ``value`` is the
    original string, which the final ``pd.to_numeric`` pass turns into NaN.

    Formats handled, in order:
      * non-strings (already numeric or NaN) are returned unchanged
      * "..." or blank                        -> NaN
      * "<0.1" / "<0.01"                      -> 0
      * contains " m" (e.g. "1.2 m")          -> number * 1,000,000
      * "<N" with N all digits                -> N - 50
      * digits separated by spaces ("34 000") -> float
      * anything else                         -> float(val)
    """
    if not isinstance(val, str):
        return val, True

    if val == "..." or val.strip() == "":
        return np.nan, True

    if val == "<0.1" or val == "<0.01":
        return 0, True

    if " m" in val:
        try:
            return float(val.replace(" m", "")) * 1000000, True
        except ValueError:
            return val, False

    if val.startswith("<"):
        num_part = val[1:]
        if not num_part.isdigit():
            return val, False
        try:
            # NOTE(review): N - 50 is kept exactly as before; it yields a
            # negative number for any "<N" with N < 50 -- confirm intent.
            return int(num_part) - 50, True
        except ValueError:
            # str.isdigit() accepts some characters that int() rejects.
            return val, False

    if " " in val:
        no_spaces = val.replace(" ", "")
        if not no_spaces.isdigit():
            return val, False
        try:
            return float(no_spaces), True
        except ValueError:
            return val, False

    try:
        return float(val), True
    except ValueError:
        return val, False


# Track counts of non-numeric values
non_numeric_count = 0
non_numeric_examples = []

for col in result_df.columns:
    if col in TEXT_COLUMNS:
        continue

    # Parse the whole column, then assign it back in one step.
    parsed_values = []
    for i, val in enumerate(result_df[col]):
        parsed, ok = _parse_unaids_value(val)
        if not ok:
            non_numeric_count += 1
            # Every kind of failure is recorded, up to the cap.
            if len(non_numeric_examples) < MAX_EXAMPLES:
                non_numeric_examples.append(f"'{val}' in column '{col}', row {i}")
        parsed_values.append(parsed)

    parsed_column = pd.Series(parsed_values, index=result_df.index, dtype=object)
    # Ensure the column is numeric; anything left unparsed becomes NaN.
    result_df[col] = pd.to_numeric(parsed_column, errors='coerce')

# Save the cleaned dataset
result_df.to_excel('../../data/clean_data/cleaned_UNAIDS.xlsx', index=False)

print("Data cleaning complete!")
print(f"Remaining non-numeric cells: {non_numeric_count}")
if non_numeric_examples:
    print("Examples of remaining non-numeric values:")
    for example in non_numeric_examples:
        print(f"  - {example}")

Data cleaning complete!
Remaining non-numeric cells: 0


In [43]:
# Quick sanity summary of the cleaned table: how many rows it has and how many
# cells ended up missing across all columns.
row_total = len(result_df)
missing_per_column = result_df.isna().sum()
missing_total = missing_per_column.sum()

print(f"Number of rows in cleaned dataset: {row_total}")
print("\nSample statistics after cleaning:")
print(f"Number of NaN values: {missing_total}")

Number of rows in cleaned dataset: 6153

Sample statistics after cleaning:
Number of NaN values: 24564
