In [1]:
import os
import pandas as pd
from tqdm import tqdm as progressbar
from utils.constant import *

# Load every raw CSV in DIR_RAW into `datasets`, one DataFrame per file.
print("Loading Datasets...")
datasets = []

# os.listdir returns entries in arbitrary order, so sort the names to make the
# order of `datasets` reproducible across runs and machines. Filtering before
# the loop also makes the progress bar count only the files actually read.
csv_filenames = sorted(name for name in os.listdir(DIR_RAW) if name.endswith(".csv"))

for filename in progressbar(csv_filenames):
    file_path = os.path.join(DIR_RAW, filename)
    # low_memory=False parses each file in one pass instead of in chunks,
    # which avoids mixed-dtype columns.
    df = pd.read_csv(file_path, low_memory=False)
    datasets.append(df)

print("Datasets loaded.")

Loading Datasets...


100%|██████████| 6/6 [00:08<00:00,  1.45s/it]

Datasets loaded.





In [2]:
def clean_datasets(datasets: list[pd.DataFrame], columns_to_keep: list[str] = None) -> list[pd.DataFrame]:
    """Clean a list of DataFrames: filter rows, select columns, make ``ETA1`` an integer.

    For every frame the function:

    * drops rows where ``SEXISTAT1`` equals 9, ``ETA1`` equals ``'TOTAL'`` or
      ``Territorio`` is not listed in ``ITA_STATE``;
    * keeps only ``columns_to_keep``;
    * converts ``ETA1`` to ``int64`` (``'Y_GE100'`` becomes 100, otherwise every
      ``'Y'`` character is stripped) and drops rows whose label cannot be parsed
      as a number.

    Args:
        datasets: Frames to clean. Each must contain ``Territorio``,
            ``SEXISTAT1``, ``ETA1`` and every column in ``columns_to_keep``.
            The list is updated in place: each entry is replaced by its cleaned
            copy. The original DataFrame objects themselves are not modified.
        columns_to_keep: Columns retained in the output; must include ``ETA1``.
            Defaults to ``["Territorio", "SEXISTAT1", "ETA1", "TIME", "Value"]``.

    Returns:
        The same list object that was passed in, now holding the cleaned frames.

    Raises:
        ValueError: If a parsed ``ETA1`` value is not a whole number.
    """
    columns_to_keep = columns_to_keep or ["Territorio", "SEXISTAT1", "ETA1", "TIME", "Value"]

    # enumerate() has no length, so pass total= to let the bar show n/total.
    for i, dataset in progressbar(enumerate(datasets), desc="Pulizia dei Dataset", total=len(datasets)):
        keep_rows = (
            (dataset['SEXISTAT1'] != 9)
            & (dataset['ETA1'] != 'TOTAL')
            & dataset['Territorio'].isin(ITA_STATE)
        )
        # .copy() yields an independent frame, so the column assignment below
        # never writes into a slice of the caller's data
        # (avoids SettingWithCopyWarning).
        cleaned = dataset.loc[keep_rows, columns_to_keep].copy()

        # astype(str) keeps the .str accessor usable when ETA1 is already
        # numeric, e.g. when this function is re-run on its own output.
        age_labels = (
            cleaned['ETA1']
            .astype(str)
            .replace({'Y_GE100': '100'})
            .str.replace('Y', '', regex=False)
        )
        cleaned['ETA1'] = pd.to_numeric(age_labels, errors='coerce')
        cleaned = cleaned.dropna(subset=['ETA1'])

        # errors='coerce' leaves the column as float64 whenever any label failed
        # to parse, even after those rows are dropped. Validate the surviving
        # values instead of the dtype, then cast explicitly.
        if not (cleaned['ETA1'] % 1 == 0).all():
            raise ValueError("La colonna ETA1 contiene ancora tipi non int")

        datasets[i] = cleaned.astype({'ETA1': 'int64'})

    return datasets



# Run the cleaning step over every loaded frame.
print("Cleaning Datasets...")

# `datasets` is rebound to the returned list; clean_datasets also overwrites the
# entries of the list it receives, so the raw frames are no longer reachable
# through `datasets` after this line.
datasets = clean_datasets(datasets)

print("Datasets cleaned.")

Cleaning Datasets...


Pulizia dei Dataset: 6it [00:01,  3.44it/s]

Datasets cleaned.





In [3]:
# Feature-engineering step: hand the cleaned frames to the project's
# FeatureEngineering helper and keep whatever it returns.
print("Feature Engineering...")

# NOTE(review): import sits mid-notebook; consider moving it to the first
# (imports) cell so dependencies are visible up front.
from utils import FeatureEngineering

fe = FeatureEngineering()
# `apply` is defined in utils and not visible here; presumably it returns a
# list of DataFrames with added columns — confirm against utils.
datasets = fe.apply(datasets)

print("Feature Engineering done.")

Feature Engineering...


Feature Engineering: 6it [00:00, 12.39it/s]


Feature Engineering done.


In [4]:
# NOTE(review): import sits mid-notebook; consider moving it to the first
# (imports) cell so dependencies are visible up front.
from utils import DatasetMerger

# Final step: merge the per-file frames and persist the result.
print("Merging Datasets and Saving...")
merger = DatasetMerger()
# The recorded output of this cell shows a .parquet and a .csv file written
# under ../data/cleaned/ with a numeric suffix in the filename (looks like a
# Unix timestamp — confirm in utils). save_to_csv=True presumably enables the
# CSV copy alongside the parquet file — confirm.
merger.process_and_save(datasets, save_to_csv=True)

Merging Datasets and Saving...
Dataset saved to ../data/cleaned/merged_dataset1725896203.parquet!
Dataset saved to ../data/cleaned/merged_dataset1725896203.csv!
