In [None]:
import os
import pandas as pd
from time import time
from tqdm import tqdm as progressbar
from utils.constant import *

print("Loading Datasets...")
datasets = []

# os.listdir returns entries in arbitrary order: keep only the CSV files and
# sort them so the load order (and the progress-bar total) is reproducible.
csv_filenames = sorted(name for name in os.listdir(DIR_RAW_DATA) if name.endswith(".csv"))

for filename in progressbar(csv_filenames):
    file_path = os.path.join(DIR_RAW_DATA, filename)
    # low_memory=False reads each file in one pass instead of in chunks.
    df = pd.read_csv(file_path, low_memory=False)
    datasets.append(df)

print("Datasets loaded.")

In [None]:
def cleaning_datasets(datasets: list[pd.DataFrame], columns_to_keep: list[str] = ["Territorio", "SEXISTAT1", "ETA1", "TIME", "Value"]) -> list[pd.DataFrame]:
    """Clean every DataFrame in ``datasets`` and return the cleaned frames.

    For each frame the function:

    1. keeps only ``columns_to_keep``;
    2. drops rows where ``SEXISTAT1`` equals 9, where ``ETA1`` equals
       ``'TOTAL'``, or where ``Territorio`` is not a key of ``ITA_STATE``;
    3. turns the ``ETA1`` labels into integers (``'Y_GE100'`` becomes 100,
       any other label loses its ``'Y'`` characters) and drops rows whose
       label cannot be parsed as a number.

    The input list and its frames are left untouched, so the cell calling
    this function can be re-run safely.

    Args:
        datasets: Raw frames; each must contain the columns in
            ``columns_to_keep`` plus the columns used by the filters above.
        columns_to_keep: Columns retained in the output (never mutated here).

    Returns:
        A new list with one cleaned frame per input frame, in the same order.

    Raises:
        ValueError: If a parsed ``ETA1`` value is not a whole number.
    """
    cleaned = []

    for dataset in progressbar(datasets, desc="Pulizia dei Dataset"):
        # Keep only the requested columns.
        dataset = dataset[columns_to_keep]

        # Drop unwanted records; .copy() gives an independent frame so the
        # assignments below do not write into a view of the caller's data.
        keep_rows = ((dataset['SEXISTAT1'] != 9) &
                     (dataset['ETA1'] != 'TOTAL') &
                     (dataset['Territorio'].isin(ITA_STATE.keys())))
        dataset = dataset.loc[keep_rows].copy()

        # Cast to str first so the .str accessor also works when ETA1 was
        # read as (or already converted to) a numeric column.
        age_labels = (dataset['ETA1'].astype(str)
                      .replace({'Y_GE100': '100'})
                      .str.replace('Y', '', regex=False))
        dataset['ETA1'] = pd.to_numeric(age_labels, errors='coerce')
        dataset = dataset.dropna(subset=['ETA1'])

        # Coercion yields float64 as soon as one label was unparsable, even
        # after those rows are dropped: validate the values, then cast.
        if not (dataset['ETA1'] % 1 == 0).all():
            raise ValueError("La colonna ETA1 contiene ancora tipi non int")
        dataset['ETA1'] = dataset['ETA1'].astype('int64')

        cleaned.append(dataset)

    return cleaned



print("Cleaning Datasets...")

# Clean every loaded frame using cleaning_datasets' default column selection.
cleaned_datasets = cleaning_datasets(datasets)

print("Datasets cleaned.")

In [None]:
def merge_datasets(datasets: list[pd.DataFrame]) -> pd.DataFrame:
    """Concatenate the given frames into one sorted, renamed, de-duplicated frame.

    Steps, in order:

    1. concatenate all frames with a fresh integer index;
    2. cast categorical and object columns to ``str`` and interval columns
       to their string form (``None`` for nulls);
    3. coerce ``TIME`` to numeric when it is not numeric already, then sort
       by it in ascending order;
    4. rename ``Territorio``/``SEXISTAT1``/``TIME`` to
       ``Territory``/``Sex``/``Year``;
    5. drop fully duplicated rows.

    Args:
        datasets: Frames to combine; each must contain a ``TIME`` column.

    Returns:
        The combined DataFrame.
    """
    combined = pd.concat(datasets, ignore_index=True)

    # Cast the dtypes this pipeline treats as unsupported by Parquet.
    for column in combined.columns:
        column_dtype = combined[column].dtype
        needs_str = isinstance(column_dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(combined[column])
        if needs_str:
            combined[column] = combined[column].astype(str)
        elif isinstance(column_dtype, pd.IntervalDtype):
            combined[column] = combined[column].apply(lambda value: str(value) if pd.notnull(value) else None)

    # Sorting needs a numeric year; unparsable entries become NaN.
    if not pd.api.types.is_numeric_dtype(combined['TIME']):
        combined['TIME'] = pd.to_numeric(combined['TIME'], errors='coerce')

    readable_names = {
        "Territorio": "Territory",
        "SEXISTAT1": "Sex",
        "TIME": "Year"
    }

    return (
        combined
        .sort_values(by='TIME', ascending=True)
        .rename(columns=readable_names)
        .drop_duplicates()
    )

print("Merging Datasets...")
# Combine all cleaned frames into a single DataFrame.
merged_dataset = merge_datasets(cleaned_datasets)
print("Datasets merged.")

In [None]:
def age_range(dataset: pd.DataFrame, age_bins: list[int] = [0, 9, 19, 29, 39, 49, 59, 69, 79, 89, 99, 109]) -> pd.DataFrame:
    """Replace the per-age ``Value`` rows with one population column per age bin.

    ``ETA1`` is bucketed into the left-closed intervals defined by
    ``age_bins`` (``[0, 9)``, ``[9, 19)``, ... with the defaults) and
    ``Value`` is summed per bucket. The sums are computed per
    ``(Territory, Year, Sex)`` when a ``Territory`` column is present, so
    each territory gets its own figures rather than the sum over all
    territories; without that column the grouping is ``(Year, Sex)``.

    Args:
        dataset: Frame with ``Year``, ``Sex``, ``ETA1`` and ``Value`` columns
            and, optionally, ``Territory``.
        age_bins: Monotonically increasing bin edges passed to ``pd.cut``.
            Ages outside the edges fall in no bin and are not counted.

    Returns:
        ``dataset`` without ``Value`` and ``ETA1``, left-joined with one
        ``Age_<left>_<right>`` column per bin. The row count is unchanged, so
        rows sharing the same grouping key are duplicates of each other.
    """
    group_keys = ['Year', 'Sex']
    if 'Territory' in dataset.columns:
        group_keys.insert(0, 'Territory')

    age_groups = pd.cut(dataset['ETA1'], bins=age_bins, right=False)

    pivot_df = dataset.pivot_table(
        index=group_keys,
        columns=age_groups,
        values='Value',
        aggfunc='sum',
        fill_value=0,
        observed=False  # keep a column for every bin, even an empty one
    )

    # Rename the interval columns to plain strings *before* reset_index, so
    # the key columns are not inserted into a categorical column index.
    format_name = "Age_{}_{}"
    pivot_df.columns = [format_name.format(interval.left, interval.right) for interval in pivot_df.columns]
    pivot_df = pivot_df.reset_index()

    return pd.merge(dataset.drop(columns=['Value', 'ETA1']), pivot_df, on=group_keys, how='left')

def age_bins_percentage(dataset: pd.DataFrame) -> pd.DataFrame:
    """Add the total population and each age bin's percentage share of it.

    Only columns whose name *starts with* ``Age_`` count as age bins, so
    derived columns such as ``Perc_Age_*`` are neither summed into the total
    nor given a percentage of their own if the function is applied again.

    Args:
        dataset: Frame with one numeric ``Age_*`` column per age bin.

    Returns:
        A copy of ``dataset`` with a ``Total_Population`` column (row-wise
        sum of the age bins) and one ``Perc_<bin>`` column per bin. Rows
        whose total is zero get NaN percentages (0 / 0).
    """
    age_columns = [column for column in dataset.columns if str(column).startswith('Age_')]

    # Work on a copy so the caller's frame is not modified as a side effect.
    result = dataset.copy()
    result['Total_Population'] = result[age_columns].sum(axis=1)
    for col in age_columns:
        result['Perc_' + col] = result[col] / result['Total_Population'] * 100
    return result

def population_growth_rate(dataset: pd.DataFrame) -> pd.DataFrame:
    """Add row-over-row percentage growth columns per ``(Territory, Sex)``.

    The frame is sorted by ``Territory``, ``Sex`` and ``Year`` and every NaN
    is replaced by 0. Within each ``(Territory, Sex)`` group the percentage
    change from the previous row is then computed for ``Total_Population``
    and for every column whose name starts with ``Age_``. Missing growth
    values (for example the first row of a group) are set to 0. Finally only
    the first row per ``(Year, Territory, Sex)`` is kept.

    Args:
        dataset: Frame with ``Territory``, ``Sex``, ``Year``,
            ``Total_Population`` and ``Age_*`` columns.

    Returns:
        A new frame with the added ``Growth_Rate_Total_Population`` and
        ``Growth_Rate_Age_*`` columns.
    """
    group_keys = ['Territory', 'Sex']
    ordered = dataset.sort_values(by=group_keys + ['Year']).fillna(0)

    def growth_percent(column_name):
        # Percentage change from the previous row of the same group.
        return ordered.groupby(group_keys)[column_name].pct_change() * 100

    ordered['Growth_Rate_Total_Population'] = growth_percent('Total_Population')

    # Snapshot the age columns first: the loop adds columns to the frame.
    age_columns = [name for name in ordered.columns if name.startswith('Age_')]
    for column_name in age_columns:
        ordered[f'Growth_Rate_{column_name}'] = growth_percent(column_name)

    # Replace the undefined growth values with 0, total first and then each age bin.
    growth_columns = ['Growth_Rate_Total_Population']
    growth_columns += [name for name in ordered.columns if name.startswith('Growth_Rate_Age_')]
    for column_name in growth_columns:
        ordered[column_name] = ordered.groupby(group_keys)[column_name] \
                                      .transform(lambda values: values.fillna(0))

    return ordered.drop_duplicates(subset=['Year', 'Territory', 'Sex'])

def cumulative_population(dataset: pd.DataFrame) -> pd.DataFrame:
    """Add ``Cumulative_Pop``: the running population sum across all age bins.

    Only columns whose name *starts with* ``Age_`` are accumulated. A
    substring match would also pull in derived columns such as
    ``Perc_Age_*`` and ``Growth_Rate_Age_*`` and mix percentages into a
    head count.

    Args:
        dataset: Frame with at least one numeric ``Age_*`` column.

    Returns:
        A copy of ``dataset`` with a ``Cumulative_Pop`` column holding the
        last value of the row-wise cumulative sum over the age bins.
    """
    age_columns = [column for column in dataset.columns if str(column).startswith('Age_')]

    # Work on a copy so the caller's frame is not modified as a side effect.
    result = dataset.copy()
    # The last column of the row-wise cumulative sum is the value after the final bin.
    result['Cumulative_Pop'] = result[age_columns].cumsum(axis=1).iloc[:, -1]
    return result

# Registry of the feature-engineering steps, keyed by function name.
# Insertion order matters: the steps are applied in this order.
features_function = {
    step.__name__: step
    for step in (
        age_range,
        age_bins_percentage,
        population_growth_rate,
        cumulative_population,
    )
}

def apply_feature_engineering(dataset: pd.DataFrame) -> pd.DataFrame:
    """Run every step registered in ``features_function`` on ``dataset``.

    The steps are applied in the registry's insertion order, each one
    receiving the output of the previous one. Fully duplicated rows are
    dropped from the final result.

    Args:
        dataset: Frame to enrich.

    Returns:
        The enriched, de-duplicated DataFrame (a single frame, not a list).
    """
    for feature_function in progressbar(features_function.values()):
        dataset = feature_function(dataset)

    return dataset.drop_duplicates()

print("Feature Engineering...")
# NOTE(review): looks like a debugging snapshot of the merged data, written
# to the working directory with the index included. Confirm whether it is
# still needed.
merged_dataset.to_csv("merged_dataset.csv")
# Must stay enabled: the save cell reads `featured_dataset`, so skipping this
# call makes the notebook fail with a NameError on a fresh kernel.
featured_dataset = apply_feature_engineering(merged_dataset)
print("Feature Engineering done.")

In [None]:
def save_dataset(dataset: pd.DataFrame, output_dir: str = DIR_CLEANED_DATA, output_filename: str = "merged_dataset", save_to_csv: bool = False):
    """Write ``dataset`` to ``output_dir`` as Parquet and, optionally, as CSV.

    Args:
        dataset: Frame to persist; the index is not written.
        output_dir: Target directory, created (with parents) if missing.
        output_filename: File name without extension; ``.parquet`` and
            ``.csv`` are appended.
        save_to_csv: When True, also write a CSV next to the Parquet file.
    """
    # exist_ok=True avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(output_dir, output_filename)
    dataset.to_parquet(file_path + ".parquet", index=False)
    if save_to_csv:
        dataset.to_csv(file_path + ".csv", index=False)


def save_multiple_datasets(dataset: pd.DataFrame, output_dir: str = DIR_CLEANED_DATA, output_filename: str = f"dataset", save_to_csv: bool = False):
    """Save one file set per ``ITA_STATE`` entry, filtered by territory.

    For every ``(territory, code)`` pair in ``ITA_STATE`` the rows whose
    ``Territory`` equals ``territory`` are passed to ``save_dataset`` under
    the name ``<output_filename>_<code>``. A territory with no matching rows
    is still saved, as an empty frame.

    Args:
        dataset: Frame with a ``Territory`` column.
        output_dir: Directory handed to ``save_dataset``.
        output_filename: Prefix of each per-territory file name.
        save_to_csv: Forwarded to ``save_dataset``.
    """
    territories = progressbar(ITA_STATE.items(), desc="Saving Multiple Datasets")
    for territory_name, territory_code in territories:
        in_territory = dataset['Territory'] == territory_name
        target_name = f"{output_filename}_{territory_code}"
        save_dataset(dataset[in_territory], output_dir, target_name, save_to_csv)


def process_and_save(dataset: pd.DataFrame, output_dir: str = DIR_CLEANED_DATA, output_filename: str = "dataset", save_to_csv: bool = False, multiple_datasets: bool = False):
    """Save ``dataset`` as a whole and, optionally, split by territory.

    Saving is best-effort: any exception raised while writing is reported on
    stdout and swallowed, and the function returns None in every case.

    Args:
        dataset: Frame to persist.
        output_dir: Target directory, created (with parents) if missing.
        output_filename: Base file name without extension.
        save_to_csv: When True, write CSV files alongside the Parquet ones.
        multiple_datasets: When True, also save one file set per territory
            through ``save_multiple_datasets``.
    """
    # exist_ok=True avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    try:
        save_dataset(dataset, output_dir, output_filename, save_to_csv)
        if multiple_datasets:
            save_multiple_datasets(dataset, output_dir, output_filename, save_to_csv)
            print("Multiple datasets saved successfully!")
        else:
            print("Dataset saved successfully!")
    except Exception as e:
        # Deliberately broad: report the failure without stopping the notebook.
        print(f"Error during saving: {e}")
        return None


print("Merging Datasets and Saving...")

# Save the full dataset plus one file set per territory, as Parquet and CSV.
# Requires `featured_dataset` to have been defined by an earlier cell.
process_and_save(featured_dataset, save_to_csv=True, multiple_datasets=True)