# Cleaning the dataset


Load the dataset:


In [67]:
import pandas as pd

# Location of the raw data, relative to the notebook's working directory.
file_path = "../dataset/dirty.csv"
# The file is parsed with ";" as the field separator instead of the default ",".
df = pd.read_csv(file_path, sep=";")

## Clean Column Names


Standardize column names:


In [68]:
# Normalise the headers: lower-case them, trim surrounding whitespace, then turn
# the separator characters ".", "-" and " " into underscores.
# regex=False forces literal matching. Without it the handling of "." (a regex
# metacharacter) depends on the pandas version's default for `regex`, and older
# pandas 1.x releases emit a FutureWarning about that changing default.
df.columns = (
    df.columns.str.lower()
    .str.strip()
    .str.replace(".", "_", regex=False)
    .str.replace("-", "_", regex=False)
    .str.replace(" ", "_", regex=False)
)

Rename specific columns:


In [69]:
# Normalised header -> final name, for the columns whose header is still
# awkward after normalisation (missing underscore, "%" sign, numeric suffix).
column_renames = {
    "agegroup": "age_group",
    "health_expenditure%gdp": "health_expenditure_gdp",
    "mortality_rate_per_1000": "mortality_rate_per_thousand",
    "poverty_rate_%": "poverty_rate_percent",
}
# Headers that are not keys of the mapping are left unchanged by rename().
df = df.rename(columns=column_renames)

Create variables for column names to be used later:


In [70]:
# Label-like columns. `year` is grouped here as well, so it is handled like a
# category rather than a measurement by the NA-handling steps.
categorical_column_names = [
    "age_group", "cause_of_death", "country",
    "economic_status", "education_level", "year",
]

# Measurement columns (counts, rates, indices).
numerical_column_names = [
    "population", "health_expenditure_gdp", "mortality_rate_per_thousand",
    "life_expectancy", "gdp_per_capita", "healthcare_access_index",
    "poverty_rate_percent", "environmental_pollution_index",
    "healthcare_quality_index",
]

## Clean NA values


Drop the rows that have an NA value in any categorical column:


In [71]:
# A row is discarded as soon as one of the categorical fields is missing.
# how="any" and axis="index" are the pandas defaults, spelled out for clarity.
df = df.dropna(axis="index", how="any", subset=categorical_column_names)

Convert `poverty_rate_percent` to float to prepare for the next step:


In [72]:
# Strip any trailing "%" characters from the text values, then parse the
# remainder as floats. The .str accessor requires the column to still hold
# strings, so this cell cannot be re-run on the already converted column.
poverty_rate_text = df["poverty_rate_percent"].str.rstrip("%")
df["poverty_rate_percent"] = poverty_rate_text.astype(float)

Replace the remaining NA values in the numerical columns with the mean of each column:


In [73]:
# One mean per numerical column; fillna() with a Series matches the means to
# the columns by label, so each gap is filled with its own column's mean.
# NOTE(review): the means are computed before the absolute-value step that is
# applied later to `population` and `healthcare_quality_index`, so any negative
# entries in those columns lower the imputed value -- confirm this is intended.
column_means = df[numerical_column_names].mean()
df[numerical_column_names] = df[numerical_column_names].fillna(column_means)

## Correct column types


Convert the `year` and `population` columns to integers:


In [74]:
# Cast with the explicit "int64" dtype instead of the builtin `int`.
# `int` maps to the platform's default integer, which is 32-bit on Windows with
# NumPy 1.x; that would make the `int64` dtype validation further down fail.
# Fractional values (e.g. an imputed mean population) are truncated by the cast.
df[["year", "population"]] = df[["year", "population"]].astype("int64")

Check the data types and raise an error if any are incorrect:


In [85]:
# Expected dtype of every column after cleaning.
expected_dtypes = {
    "year": "int64",
    "country": "object",
    "age_group": "object",
    "cause_of_death": "object",
    "economic_status": "object",
    "health_expenditure_gdp": "float64",
    "mortality_rate_per_thousand": "float64",
    "life_expectancy": "float64",
    "education_level": "object",
    "population": "int64",
    "gdp_per_capita": "float64",
    "healthcare_access_index": "float64",
    "poverty_rate_percent": "float64",
    "healthcare_quality_index": "float64",
    "environmental_pollution_index": "float64",
}

# Collect every mismatch before raising, so one run reports all offending
# columns instead of only the first one encountered.
dtype_errors = [
    f"Column {column} has incorrect dtype. "
    f"Expected {expected_type}, got {df[column].dtype}"
    for column, expected_type in expected_dtypes.items()
    if df[column].dtype != expected_type
]

# TypeError is a subclass of Exception, so handlers written for the previous
# generic Exception still catch it.
if dtype_errors:
    raise TypeError("\n".join(dtype_errors))

## Correct spelling errors


Clean `economic_status`, `education_level`, `cause_of_death` and `country` columns by correcting the spelling:


In [75]:
# Both level columns share the same vocabulary, so one mapping covers them.
# replace() with a dict substitutes whole cell values that equal a key.
level_columns = ["economic_status", "education_level"]
level_fixes = {
    "high": "High",
    "l0w": "Low",
    "L0w": "Low",
    "low": "Low",
    "midd1e": "Middle",
}
df[level_columns] = df[level_columns].replace(level_fixes)

# Misspelled causes of death, again matched against the whole cell value.
cause_fixes = {
    "Cardi0vascular": "Cardiovascular",
    "Infecti0us diseases": "Infectious diseases",
}
df["cause_of_death"] = df["cause_of_death"].replace(cause_fixes)

# Unlike the replacements above, this is a substring replacement: every "0"
# character inside a country name becomes "o".
df["country"] = df["country"].str.replace("0", "o")

## Use absolute values


Replace the values in the `population` and `healthcare_quality_index` columns with their absolute values:


In [76]:
# Negative entries in these two columns are flipped to positive; values that
# are already non-negative are unchanged.
sign_corrected_columns = ["population", "healthcare_quality_index"]
df[sign_corrected_columns] = df[sign_corrected_columns].abs()

## Check outliers


Check for outliers in the numerical columns and raise an error if any exist:


In [80]:
# Flag values lying more than 1.5 * IQR below the first quartile or above the
# third quartile of their column. All numerical columns are inspected before
# raising, so the error names every affected column instead of stopping at the
# first one with an uninformative message.
outlier_reports = []
for column_name in numerical_column_names:
    q1 = df[column_name].quantile(0.25)
    q3 = df[column_name].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Missing values compare as False on both sides and are never flagged.
    is_outlier = (df[column_name] < lower_bound) | (df[column_name] > upper_bound)
    # .loc selects rows and column in one step rather than chained indexing.
    outliers = df.loc[is_outlier, column_name]

    if not outliers.empty:
        outlier_reports.append(
            f"{column_name}: {len(outliers)} value(s) outside "
            f"[{lower_bound}, {upper_bound}]"
        )

# ValueError is a subclass of Exception, so handlers written for the previous
# generic Exception still catch it.
if outlier_reports:
    msg = "Found outliers in " + "; ".join(outlier_reports)
    raise ValueError(msg)

## Save the cleaned data to a CSV file


In [78]:
# Write the cleaned frame to a new file in the same directory as the raw data.
# index=False keeps the DataFrame's row index out of the CSV.
clean_file_path = "../dataset/clean.csv"
df.to_csv(clean_file_path, index=False)