# Cleaning the dataset


Load the dataset:


In [41]:
import pandas as pd

# Path to the input CSV; its fields are separated by semicolons.
file_path = "../dataset/dirty.csv"
df_clean = pd.read_csv(filepath_or_buffer=file_path, sep=";")

## Clean Column Names


Standardize column names:


In [42]:
# Normalise the headers: lower-case, trim surrounding whitespace, then turn
# ".", "-" and " " into underscores.
# regex=False is explicit so "." is always treated as a literal dot instead of
# depending on the installed pandas version's default for `regex`.
df_clean.columns = (
    df_clean.columns.str.lower()
    .str.strip()
    .str.replace(".", "_", regex=False)
    .str.replace("-", "_", regex=False)
    .str.replace(" ", "_", regex=False)
)

Rename specific columns:


In [43]:
# Old header -> replacement header for the columns that need a clearer name.
column_renames = {
    "agegroup": "age_group",
    "health_expenditure%gdp": "health_expenditure_gdp",
    "mortality_rate_per_1000": "mortality_rate_per_thousand",
    "poverty_rate_%": "poverty_rate_percent",
}
df_clean = df_clean.rename(columns=column_renames)

## Clean NA values


Drop every row that has an NA value in one of the categorical columns or in `year`:


In [44]:
# A row is discarded as soon as any one of these columns is missing.
required_columns = [
    "age_group",
    "cause_of_death",
    "country",
    "economic_status",
    "education_level",
    "year",
]
df_clean = df_clean.dropna(how="any", subset=required_columns)

Convert `poverty_rate_percent` to float to prepare for the next step:


In [45]:
# Strip the trailing "%" and store the column as float.
# The `.str` accessor only works on string data, so the stripping is skipped
# when the column is already numeric (for example when this cell is re-run);
# this keeps the cell idempotent instead of raising AttributeError.
poverty_rate = df_clean["poverty_rate_percent"]
if not pd.api.types.is_numeric_dtype(poverty_rate):
    poverty_rate = poverty_rate.str.rstrip("%")
df_clean["poverty_rate_percent"] = poverty_rate.astype(float)

For the remaining numeric columns, replace NA values with the column mean:


In [46]:
# Columns whose missing entries are imputed with the column mean.
replace_column_names = [
    "population",
    "health_expenditure_gdp",
    "mortality_rate_per_thousand",
    "life_expectancy",
    "gdp_per_capita",
    "healthcare_access_index",
    "poverty_rate_percent",
    "environmental_pollution_index",
    "healthcare_quality_index",
]

# One mean per column; fillna matches each mean to its column by label.
# NOTE(review): these means are taken before the later absolute-value step, so
# any negative entries in `population` / `healthcare_quality_index` are still
# included with their sign here - confirm that is intended.
column_means = df_clean[replace_column_names].mean()
df_clean[replace_column_names] = df_clean[replace_column_names].fillna(column_means)

## Correct column types


Convert the `year` and `population` columns to integer:


In [47]:
# Cast both columns to int in one assignment.
integer_columns = ["year", "population"]
df_clean[integer_columns] = df_clean[integer_columns].astype(int)

## Correct spelling errors


Clean `economic_status`, `education_level`, `cause_of_death` and `country` columns by correcting the spelling:


In [48]:
# Misspelled or differently cased labels -> canonical label. Values that do not
# appear as a key are left untouched.
level_spelling_fixes = {
    "high": "High",
    "l0w": "Low",
    "L0w": "Low",
    "low": "Low",
    "midd1e": "Middle",
}
level_columns = ["economic_status", "education_level"]
df_clean[level_columns] = df_clean[level_columns].replace(level_spelling_fixes)

# Whole-value replacements for the cause labels written with a zero in place
# of the letter "o".
cause_spelling_fixes = {
    "Cardi0vascular": "Cardiovascular",
    "Infecti0us diseases": "Infectious diseases",
}
df_clean["cause_of_death"] = df_clean["cause_of_death"].replace(cause_spelling_fixes)

# Swap every digit "0" inside a country name for the letter "o".
country_names = df_clean["country"]
df_clean["country"] = country_names.str.replace("0", "o")

## Use absolute values


Replace values in the `population` and `healthcare_quality_index` columns with the absolute values:


In [49]:
# Drop the sign from both columns so that negative entries become positive.
sign_corrected_columns = ["population", "healthcare_quality_index"]
df_clean[sign_corrected_columns] = df_clean[sign_corrected_columns].abs()

## Save the cleaned data to a CSV file


In [50]:
# Persist the cleaned frame to a new CSV file; index=False keeps the row index
# out of the output.
df_clean.to_csv(path_or_buf="../dataset/clean.csv", index=False)