In [1]:
# Librerie e costanti da usare
import os
from datetime import datetime
import numpy as np
import pandas as pd
import logging
import matplotlib.pyplot as plt

from tqdm import tqdm
from utils.constant import *

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

import joblib 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Settaggio
def setup_logging(log_file):
    """Configure the root logger to write bare messages to ``log_file``.

    Handlers already attached to the root logger are closed and removed
    first, so calling this repeatedly (e.g. re-running a cell) neither
    duplicates log lines nor leaks the previously opened log files.

    Args:
        log_file: Path of the log file; it is truncated on open (mode ``'w'``).

    Returns:
        The root ``logging.Logger``, set to INFO, with a single
        ``FileHandler`` whose format is the message text only.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Close before detaching: clearing the handler list alone would leave
    # the old file streams open.
    for handler in logger.handlers[:]:
        handler.close()
        logger.removeHandler(handler)

    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setFormatter(logging.Formatter('%(message)s'))
    logger.addHandler(file_handler)

    return logger

def close_logger(logger):
    """Close every handler attached to ``logger`` and detach it."""
    # Iterate over a snapshot because removeHandler mutates logger.handlers.
    for attached in list(logger.handlers):
        attached.close()
        logger.removeHandler(attached)

# Data Cleaning - Datasets ISTAT
Riformatto i dataset ISTAT con una procedura di cleaning. Una volta creati i vari dataset, verranno uniti in un unico dataset su cui si procederà con il feature engineering.

### Dataset dagli anni 1952 agli anni 1971

In [2]:
# Lettura del dataset

SOURCE_PATH = RAW_DATA_DEMOGRAPHICS + '1952-1971/'

# Load every CSV file of the source folder (read as latin1).
datasets = [
    pd.read_csv(SOURCE_PATH + filename, encoding='latin1')
    for filename in os.listdir(SOURCE_PATH)
    if filename.endswith('.csv')
]

In [3]:
# Pulizia dei dataset

def convert_eta(eta):
    """Turn an age code into an integer age.

    Codes starting with ``'Y_GE'`` map to 100; any other code is parsed as
    the integer that follows its first character.
    """
    return 100 if eta.startswith('Y_GE') else int(eta[1:])
    
def convert_sesso(sesso):
    """Map a numeric sex code to a letter: 1 -> 'M', 2 -> 'F', anything else -> 'T'."""
    for code, letter in ((1, 'M'), (2, 'F')):
        if sesso == code:
            return letter
    return 'T'

colums_to_keep = ['Territorio', 'ETA1', 'TIME', 'SEXISTAT1', 'Value']

for i, df in enumerate(datasets):
    # Keep the rows of known territories that are not the aggregated 'TOTAL'
    # age row, and only the columns of interest.
    keep_rows = (df['ETA1'] != 'TOTAL') & df['Territorio'].isin(list(MAPPING_REGION.keys()))

    # .copy() detaches the selection from the raw frame, so the column
    # assignments below do not write into a slice (SettingWithCopyWarning).
    df = df.loc[keep_rows, colums_to_keep].copy()

    df['ETA1'] = df['ETA1'].apply(convert_eta)
    df['SEXISTAT1'] = df['SEXISTAT1'].apply(convert_sesso)

    datasets[i] = df

In [4]:
# Strutturazione del dataset

COLUMN_NAMES = {
    'Territorio': 'Area',
    'ETA1': 'Eta',
    'TIME': 'Anno',
    'SEXISTAT1': 'Sesso',
}

def categorize_age(age):
    """Return the AGE_GROUP label whose category contains ``age``, or None if none does."""
    matching_labels = (
        AGE_GROUP["age_labels"][position]
        for position, age_range in enumerate(AGE_GROUP["categories"])
        if age in age_range
    )
    # First match wins; None when the age falls in no category.
    return next(matching_labels, None)
    
for idx, cleaned in enumerate(datasets):
    # Harmonise the column names and tag each row with its age band.
    renamed = cleaned.rename(columns=COLUMN_NAMES)
    renamed['fascia_eta'] = renamed['Eta'].apply(categorize_age)

    # Sum 'Value' per area / sex / year / age band.
    by_band = renamed.groupby(
        ['Area', 'Sesso', 'Anno', 'fascia_eta'], as_index=False
    )['Value'].sum()

    # One row per (area, sex, year), one column per age band.
    wide = by_band.pivot_table(
        index=['Area', 'Sesso', 'Anno'],
        columns='fascia_eta',
        values='Value',
        aggfunc='sum',
        observed=True,
    ).reset_index()

    # Drop the 'fascia_eta' name left on the column axis by the pivot.
    wide = wide.rename_axis(None, axis=1)
    wide['Area'] = wide['Area'].replace(MAPPING_REGION)
    datasets[idx] = wide

In [5]:
# Salvataggio dei dataset

DESTINATION_PATH = PROCESSED_DATA_DEMOGRAPHICS + '1952-1971/'

# Make sure the output folder exists.
os.makedirs(DESTINATION_PATH, exist_ok=True)

# Stack all tables, drop exact duplicate rows and order the result.
final_dataset = (
    pd.concat(datasets, ignore_index=True)
    .drop_duplicates()
    .sort_values(by=['Area', 'Sesso', 'Anno'])
)

# Persist as both Parquet and CSV.
final_dataset.to_parquet(DESTINATION_PATH + 'popolazione.parquet', index=False)
final_dataset.to_csv(DESTINATION_PATH + 'popolazione.csv', index=False)

### Dataset dagli anni 1972 agli anni 1981

In [6]:
# Lettura del dataset

SOURCE_PATH = RAW_DATA_DEMOGRAPHICS + '1972-1981/'

# Load every CSV file of the source folder (read as latin1).
datasets = [
    pd.read_csv(SOURCE_PATH + filename, encoding='latin1')
    for filename in os.listdir(SOURCE_PATH)
    if filename.endswith('.csv')
]

In [7]:
# Pulizia dei dataset

def convert_eta(eta):
    """Turn an age code into an integer age.

    Codes starting with ``'Y_GE'`` map to 100; any other code is parsed as
    the integer that follows its first character.
    """
    if not eta.startswith('Y_GE'):
        return int(eta[1:])
    return 100
    
def convert_sesso(sesso):
    """Map a numeric sex code to a letter: 1 -> 'M', 2 -> 'F', anything else -> 'T'."""
    return 'M' if sesso == 1 else ('F' if sesso == 2 else 'T')

colums_to_keep = ['Territorio', 'ETA1', 'TIME', 'SEXISTAT1', 'Value']

for i, df in enumerate(datasets):
    # Keep the rows of known territories that are not the aggregated 'TOTAL'
    # age row, and only the columns of interest.
    keep_rows = (df['ETA1'] != 'TOTAL') & df['Territorio'].isin(list(MAPPING_REGION.keys()))

    # .copy() detaches the selection from the raw frame, so the column
    # assignments below do not write into a slice (SettingWithCopyWarning).
    df = df.loc[keep_rows, colums_to_keep].copy()

    df['ETA1'] = df['ETA1'].apply(convert_eta)
    df['SEXISTAT1'] = df['SEXISTAT1'].apply(convert_sesso)

    datasets[i] = df

In [8]:
# Strutturazione del dataset

COLUMN_NAMES = {
    'Territorio': 'Area',
    'ETA1': 'Eta',
    'TIME': 'Anno',
    'SEXISTAT1': 'Sesso',
}

def categorize_age(age):
    """Return the AGE_GROUP label whose category contains ``age``, or None if none does."""
    matching_labels = (
        AGE_GROUP["age_labels"][position]
        for position, age_range in enumerate(AGE_GROUP["categories"])
        if age in age_range
    )
    # First match wins; None when the age falls in no category.
    return next(matching_labels, None)
    
for idx, cleaned in enumerate(datasets):
    # Harmonise the column names and tag each row with its age band.
    renamed = cleaned.rename(columns=COLUMN_NAMES)
    renamed['fascia_eta'] = renamed['Eta'].apply(categorize_age)

    # Sum 'Value' per area / sex / year / age band.
    by_band = renamed.groupby(
        ['Area', 'Sesso', 'Anno', 'fascia_eta'], as_index=False
    )['Value'].sum()

    # One row per (area, sex, year), one column per age band.
    wide = by_band.pivot_table(
        index=['Area', 'Sesso', 'Anno'],
        columns='fascia_eta',
        values='Value',
        aggfunc='sum',
        observed=True,
    ).reset_index()

    # Drop the 'fascia_eta' name left on the column axis by the pivot.
    wide = wide.rename_axis(None, axis=1)
    wide['Area'] = wide['Area'].replace(MAPPING_REGION)
    datasets[idx] = wide

In [9]:
# Salvataggio dei dataset

DESTINATION_PATH = PROCESSED_DATA_DEMOGRAPHICS + '1972-1981/'

# Make sure the output folder exists.
os.makedirs(DESTINATION_PATH, exist_ok=True)

# Stack all tables, drop exact duplicate rows and order the result.
final_dataset = (
    pd.concat(datasets, ignore_index=True)
    .drop_duplicates()
    .sort_values(by=['Area', 'Sesso', 'Anno'])
)

# Persist as both Parquet and CSV.
final_dataset.to_parquet(DESTINATION_PATH + 'popolazione.parquet', index=False)
final_dataset.to_csv(DESTINATION_PATH + 'popolazione.csv', index=False)

### Dataset dagli anni 1982 agli anni 1991

In [10]:
# Lettura dei dati

SOURCE_PATH = RAW_DATA_DEMOGRAPHICS + '1982-1991/'

# Load every semicolon-separated CSV file of the source folder (read as latin1).
datasets = [
    pd.read_csv(SOURCE_PATH + filename, sep=';', encoding='latin1')
    for filename in os.listdir(SOURCE_PATH)
    if filename.endswith('.csv')
]

In [11]:
# Pulizia dei dati

for idx, raw in enumerate(datasets):
    # Keep only the "Regione" and "Totale" aggregation levels and discard
    # the rows whose ETA equals 99.
    wanted_level = raw["Tipo aggregazione"].isin(["Regione", "Totale"])
    kept = raw[wanted_level & (raw["ETA"] != 99)]

    # Remove the aggregation metadata and normalise the area name casing.
    datasets[idx] = (
        kept
        .drop(columns=["Tipo aggregazione", "Codice aggregazione"])
        .assign(Aggregazione=lambda frame: frame["Aggregazione"].str.capitalize())
    )

In [12]:
# Strutturazione dei dati

COLUMN_NAMES = {
    "Aggregazione": "Area",
    "Sesso": "Sesso",
    "ETA": "Eta",
    "Anno": "Anno"
}

def categorize_age(age):
    """Return the AGE_GROUP label whose category contains ``age``, or None if none does."""
    matching_labels = (
        AGE_GROUP["age_labels"][position]
        for position, age_range in enumerate(AGE_GROUP["categories"])
        if age in age_range
    )
    # First match wins; None when the age falls in no category.
    return next(matching_labels, None)
        

for idx, wide_years in enumerate(datasets):
    # Unpivot the per-year columns into (Anno, Popolazione) rows.
    long_form = wide_years.melt(
        id_vars=["Aggregazione", "Sesso", "ETA"],
        var_name="Anno",
        value_name="Popolazione",
    )

    # Keep only the digits of the former column name as the year.
    long_form['Anno'] = long_form['Anno'].str.extract(r'(\d+)').astype(int)

    # get_age_group is not defined in this notebook; presumably it comes
    # from the `utils.constant` star import - verify.
    long_form['fascia_eta'] = long_form['ETA'].apply(get_age_group)
    by_band = long_form.groupby(
        ['Aggregazione', 'Sesso', 'Anno', 'fascia_eta'], as_index=False
    )['Popolazione'].sum()

    # One row per (area, sex, year), one column per age band.
    pivoted = by_band.pivot_table(
        index=['Aggregazione', 'Sesso', 'Anno'],
        columns='fascia_eta',
        values='Popolazione',
        aggfunc='sum',
        observed=True,
    ).reset_index()

    # Drop the column-axis name, then apply the shared column names.
    pivoted = pivoted.rename_axis(None, axis=1).rename(columns=COLUMN_NAMES)
    pivoted['Area'] = pivoted['Area'].replace(MAPPING_REGION)
    datasets[idx] = pivoted

In [13]:
# Salvataggio dei dati

DESTINATION_PATH = PROCESSED_DATA_DEMOGRAPHICS + '1982-1991/'

# Make sure the output folder exists.
os.makedirs(DESTINATION_PATH, exist_ok=True)

# Stack all tables, drop exact duplicate rows and order the result.
final_dataset = (
    pd.concat(datasets, ignore_index=True)
    .drop_duplicates()
    .sort_values(by=['Area', 'Anno', 'Sesso'])
)

# Persist as both Parquet and CSV.
final_dataset.to_parquet(DESTINATION_PATH + 'popolazione.parquet', index=False)
final_dataset.to_csv(DESTINATION_PATH + 'popolazione.csv', index=False)

### Dataset dagli anni 1992 agli anni 2001

In [14]:
# Lettura dei dati

SOURCE_PATH = RAW_DATA_DEMOGRAPHICS + '1992-2001/'

# One frame per CSV file, keyed by the file name up to its first dot
# (later cells convert this key to the integer year).
datasets = {
    filename.split('.')[0]: pd.read_csv(SOURCE_PATH + filename, encoding='latin1')
    for filename in os.listdir(SOURCE_PATH)
    if filename.endswith('.csv')
}

In [15]:
# Pulizia dei dati

# Drop the region code column from every yearly table.
for year_key in list(datasets):
    datasets[year_key] = datasets[year_key].drop(columns=["Codice regione"])

In [16]:
# Strutturazione dei dati

COLUMN_NAMES = {
    "Regione": "Area",
    "Sesso": "Sesso",
    "ETA": "Eta",
    "Anno": "Anno"
}

for year_key in list(datasets):
    frame = datasets[year_key]

    # The dictionary key carries the year; normalise sex labels and numeric types.
    frame["Anno"] = int(year_key)
    frame["Sesso"] = frame["Sesso"].replace({"Maschi": "M", "Femmine": "F", "Totale": "T"})
    frame["ETA"] = frame["ETA"].replace({"100 e oltre": 100}).astype(int)
    frame["Popolazione"] = frame["Popolazione"].astype(int)

    # get_age_group is not defined in this notebook; presumably it comes
    # from the `utils.constant` star import - verify.
    frame['fascia_eta'] = frame['ETA'].apply(get_age_group)

    # One row per (region, sex, year), one column per age band.
    pivoted = frame.pivot_table(
        index=['Regione', 'Sesso', 'Anno'],
        columns='fascia_eta',
        values='Popolazione',
        aggfunc='sum',
        observed=True,
    ).reset_index()

    # Drop the column-axis name, then apply the shared column names.
    pivoted = pivoted.rename_axis(None, axis=1).rename(columns=COLUMN_NAMES)
    pivoted['Area'] = pivoted['Area'].replace(MAPPING_REGION)
    datasets[year_key] = pivoted
    

> Visto che i dataset di questo periodo non presentano l'Italia come Area, dobbiamo applicare tecniche di feature engineering per crearla artificialmente. \
Semplicemente, per ogni anno e sesso sommiamo i valori di tutte le regioni e creiamo i dati relativi al paese.

In [17]:
# Add the national territory ("Italia") as the sum of all regions

for k in list(datasets.keys()):
    df = datasets[k]

    # Work on regional rows only, so that re-running this cell does not fold
    # previously added "Italia" rows back into the national totals.
    regional = df[df["Area"] != "Italia"]

    national_rows = []
    for year in regional["Anno"].unique():
        df_year = regional[regional["Anno"] == year]

        # One national row per sex code: male, female and total.
        for sex in ("M", "F", "T"):
            df_sex = df_year[df_year["Sesso"] == sex]
            new_item = {"Area": "Italia", "Anno": year, "Sesso": sex}

            # Add the age-band columns dynamically.
            for label in AGE_GROUP["age_labels"]:
                new_item[label] = df_sex[label].sum()

            national_rows.append(new_item)

    # Write back under the ORIGINAL key. Storing under `year` (a numpy int)
    # added a second entry next to the string key and left the regional-only
    # table in place; it also kept only the last year's rows when a table
    # held more than one year.
    datasets[k] = pd.concat([regional, pd.DataFrame(national_rows)], ignore_index=True)

In [18]:
# Salvataggio dei dati

DESTINATION_PATH = PROCESSED_DATA_DEMOGRAPHICS + '1992-2001/'

# Make sure the output folder exists.
os.makedirs(DESTINATION_PATH, exist_ok=True)

# Stack all tables, drop exact duplicate rows and order the result.
final_dataset = (
    pd.concat(datasets, ignore_index=True)
    .drop_duplicates()
    .sort_values(by=["Area", "Anno", "Sesso"])
)

# Persist as both Parquet and CSV.
final_dataset.to_parquet(DESTINATION_PATH + 'popolazione.parquet', index=False)
final_dataset.to_csv(DESTINATION_PATH + 'popolazione.csv', index=False)

### Dataset dagli anni 2002 agli anni 2018

Il file CSV di questo dataset è formattato in modo non comune e ciò complica la lettura da parte della libreria pandas. \
Per risolvere questo problema, creiamo un codice che dal file originale genera un file CSV ben formattato per ogni anno e sesso.

In [19]:
# Formatazione dei file CSV
# Folder holding the raw 2002-2018 CSV exports.
SOURCE_PATH = RAW_DATA_DEMOGRAPHICS + '2002-2018/'

# Folder receiving the re-formatted per-year / per-gender CSV files.
DESTINATION_PATH = PROCESSED_DATA_DEMOGRAPHICS + '2002-2018/formatted/'

# Lines dropped from the raw files before chunking: the two title lines and
# empty lines. Matching is on the exact line text, newline included.
BLACKLIST = [
    "\"Ricostruzione della popolazione intercensuaria - Popolazione al 1° gennaio per età\"\n", 
    "\n",
    "\"Popolazione per età, vista per territorio - Tutte le regioni\"\n"
]

def chunk_list(lst, n):
    """Split a list into consecutive sub-lists of length n (the last one may be shorter)."""
    chunks = []
    for start in range(0, len(lst), n):
        chunks.append(lst[start:start + n])
    return chunks

# Make sure the output folder exists.
if not os.path.exists(DESTINATION_PATH):
    os.makedirs(DESTINATION_PATH)

# Split every raw CSV into one file per (year, gender).
for filename in os.listdir(SOURCE_PATH):
    if not filename.endswith(".csv"):
        continue
    
    lines = []
    # NOTE(review): the file is opened with the platform default encoding,
    # while the later read of the generated files uses latin1 - confirm the
    # two agree for accented characters.
    with open(SOURCE_PATH + filename, 'r') as f:
        lines = f.readlines()
        # Rewrite the header token, switch the separator from ';' to ','
        # and collapse doubled commas.
        lines = [line.replace('Territorio/Età;', 'Codice regione;Regione') for line in lines]
        lines = [line.replace(';', ',') for line in lines]
        lines = [line.replace(',,', ',') for line in lines]
        # Drop title and empty lines.
        lines = [line for line in lines if line not in BLACKLIST]
    # The remaining lines are processed in fixed-size groups of 70 lines.
    chunked_list = chunk_list(lines, 70)
    
    for chunck in chunked_list:
        # The year is the last space-separated token of the group's first
        # line, minus its final two characters.
        year = int(chunck[0].split(' ')[-1][:-2])
        # Only groups whose first line mentions "Tutte le cittadinanze" are kept.
        if "Tutte le cittadinanze" not in chunck[0]:
            continue
        
        # Skip the first line, then split the rest into groups of 23 lines.
        chunck = chunck[1:]
        gender_chunk = chunk_list(chunck, 23)
        
        for inner_chunck in gender_chunk:
            # Discard the last line of the group.
            inner_chunck = inner_chunck[:-1]
            # The gender label is the last comma-separated field of the
            # group's second line; that line is then removed.
            gender = inner_chunck[1].split(',')[-1].strip()
            del inner_chunck[1]
            
            # Output file name: <year>_<first letter of the gender label>.csv
            with open(DESTINATION_PATH + f"{year}_{gender[0]}.csv", 'w') as f:
                f.writelines(inner_chunck)

In [20]:
# Lettura dei dati

# Read back from the folder the previous cell wrote to (DESTINATION_PATH
# must still hold the 'formatted/' path when this cell runs).
SOURCE_PATH = DESTINATION_PATH

# year (as string) -> {gender letter -> frame}
temp = {str(year): {} for year in range(2002, 2018 + 1)}

for filename in os.listdir(SOURCE_PATH):
    # Only CSV files, excluding those whose name starts with '2019'.
    if filename.endswith('.csv') and not filename.startswith('2019'):
        stem = filename.split('.')[0]
        year, gender = stem.split('_')
        temp[year][gender] = pd.read_csv(SOURCE_PATH + filename, encoding='latin1')


# Tag every frame with its gender and stack the genders of each year.
datasets = {}
for year, per_gender in temp.items():
    for gender, frame in per_gender.items():
        frame["Sesso"] = gender
    datasets[year] = pd.concat(per_gender.values(), ignore_index=True)

In [21]:
# Pulizia dei dati

# Drop the region code column from every yearly table.
for year_key in list(datasets):
    datasets[year_key] = datasets[year_key].drop(columns=["Codice regione"])

In [22]:
# Strutturazione dei dati

COLUMN_NAMES = {
    "Regione": "Area",
    "Sesso": "Sesso",
    "Anno": "Anno"
}

def create_age_bins(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse single-age columns into one column per AGE_GROUP label.

    Args:
        df: Frame whose columns include the single ages named as strings
            ('0', '1', ...).

    Returns:
        A frame with one column per label in ``AGE_GROUP['age_labels']``,
        each holding the row-wise sum of the matching single-age columns.
    """
    age_bins = {}

    for age_range, label in zip(AGE_GROUP['categories'], AGE_GROUP['age_labels']):
        # The last label ignores its category and sums columns '85'..'99'.
        # NOTE(review): a column for ages 100 and over, if the files carry
        # one, is not included - confirm against the data.
        is_last_label = label == AGE_GROUP['age_labels'][-1]
        source_ages = range(85, 100) if is_last_label else age_range
        columns = list(map(str, source_ages))
        age_bins[label] = df[columns].sum(axis=1)

    return pd.DataFrame(age_bins)

for year_key in list(datasets):
    frame = datasets[year_key]
    # The dictionary key carries the year.
    frame["Anno"] = int(year_key)

    # Identifier columns side by side with the aggregated age bands.
    identifiers = frame[['Anno', 'Regione', 'Sesso']]
    restructured = pd.concat(
        [identifiers, create_age_bins(frame)], axis=1
    ).rename(columns=COLUMN_NAMES)

    restructured['Area'] = restructured['Area'].replace(MAPPING_REGION)
    datasets[year_key] = restructured



In [23]:
# Add the national territory ("Italia") as the sum of all regions

for k in list(datasets.keys()):
    df = datasets[k]

    # Work on regional rows only, so that re-running this cell does not fold
    # previously added "Italia" rows back into the national totals.
    regional = df[df["Area"] != "Italia"]

    national_rows = []
    for year in regional["Anno"].unique():
        df_year = regional[regional["Anno"] == year]

        # One national row per sex code: male, female and total.
        for sex in ("M", "F", "T"):
            df_sex = df_year[df_year["Sesso"] == sex]
            new_item = {"Area": "Italia", "Anno": year, "Sesso": sex}

            # Add the age-band columns dynamically.
            for label in AGE_GROUP["age_labels"]:
                new_item[label] = df_sex[label].sum()

            national_rows.append(new_item)

    # Write back under the ORIGINAL key. Storing under `year` (a numpy int)
    # added a second entry next to the string key and left the regional-only
    # table in place; it also kept only the last year's rows when a table
    # held more than one year.
    datasets[k] = pd.concat([regional, pd.DataFrame(national_rows)], ignore_index=True)

In [24]:
# Salvataggio dei dati

DESTINATION_PATH = PROCESSED_DATA_DEMOGRAPHICS + '2002-2018/'

# Make sure the output folder exists.
os.makedirs(DESTINATION_PATH, exist_ok=True)

# Stack all tables, drop exact duplicate rows and order the result.
final_dataset = (
    pd.concat(datasets, ignore_index=True)
    .drop_duplicates()
    .sort_values(by=["Area", "Anno", "Sesso"])
)

# Persist as both Parquet and CSV.
final_dataset.to_parquet(DESTINATION_PATH + 'popolazione.parquet', index=False)
final_dataset.to_csv(DESTINATION_PATH + 'popolazione.csv', index=False)

### Dataset dagli anni 2019 agli anni 2023

In [25]:
# Lettura dei dati

SOURCE_PATH = RAW_DATA_DEMOGRAPHICS + '2019-2024/'

# One frame per CSV file (2024.csv excluded), keyed by the file name up to
# its first dot. The first line of each file is skipped.
datasets = {
    filename.split('.')[0]: pd.read_csv(
        SOURCE_PATH + filename, skiprows=1, encoding='latin1', sep=';'
    )
    for filename in os.listdir(SOURCE_PATH)
    if filename.endswith('.csv') and filename != '2024.csv'
}

In [26]:
# Pulizia dei dati

keep_columns = ["Regione", "ETA", "Totale maschi", "Totale femmine", "Totale"]

for year_key in list(datasets):
    frame = datasets[year_key]
    # Discard the rows whose ETA equals 999.
    frame = frame[frame["ETA"] != 999]
    # Remove every column that is not explicitly kept.
    unwanted = [name for name in frame.columns if name not in keep_columns]
    datasets[year_key] = frame.drop(columns=unwanted)

In [27]:
# Strutturazione dei dati

COLUMN_NAMES = {
    "Regione": "Area",
}

for year_key in list(datasets):
    frame = datasets[year_key]
    # The dictionary key carries the year.
    frame["Anno"] = int(year_key)

    # Build one long table per sex from the three population columns.
    per_sex = []
    for source_column, sex in (("Totale maschi", "M"), ("Totale femmine", "F"), ("Totale", "T")):
        part = frame[["Anno", "Regione", "ETA", source_column]].copy()
        part["Sesso"] = sex
        per_sex.append(part.rename(columns={source_column: "Popolazione"}))

    stacked = pd.concat(per_sex, ignore_index=True)

    # get_age_group is not defined in this notebook; presumably it comes
    # from the `utils.constant` star import - verify.
    stacked['fascia_eta'] = stacked['ETA'].apply(get_age_group)

    by_band = stacked.groupby(
        ['Regione', 'Sesso', 'Anno', 'fascia_eta'], as_index=False
    )['Popolazione'].sum()

    # One row per (region, sex, year), one column per age band.
    pivoted = by_band.pivot_table(
        index=['Regione', 'Sesso', 'Anno'],
        columns='fascia_eta',
        values='Popolazione',
        aggfunc='sum',
        observed=True,
    ).reset_index()

    # Drop the column-axis name, then apply the shared column names.
    pivoted = pivoted.rename_axis(None, axis=1).rename(columns=COLUMN_NAMES)
    pivoted['Area'] = pivoted['Area'].replace(MAPPING_REGION)
    datasets[year_key] = pivoted

In [28]:
# Add the national territory ("Italia") as the sum of all regions

for k in list(datasets.keys()):
    df = datasets[k]

    # Work on regional rows only, so that re-running this cell does not fold
    # previously added "Italia" rows back into the national totals.
    regional = df[df["Area"] != "Italia"]

    national_rows = []
    for year in regional["Anno"].unique():
        df_year = regional[regional["Anno"] == year]

        # One national row per sex code: male, female and total.
        for sex in ("M", "F", "T"):
            df_sex = df_year[df_year["Sesso"] == sex]
            new_item = {"Area": "Italia", "Anno": year, "Sesso": sex}

            # Add the age-band columns dynamically.
            for label in AGE_GROUP["age_labels"]:
                new_item[label] = df_sex[label].sum()

            national_rows.append(new_item)

    # Write back under the ORIGINAL key. Storing under `year` (a numpy int)
    # added a second entry next to the string key and left the regional-only
    # table in place; it also kept only the last year's rows when a table
    # held more than one year.
    datasets[k] = pd.concat([regional, pd.DataFrame(national_rows)], ignore_index=True)
    

In [29]:
# Salvataggio dei dati

DESTINATION_PATH = PROCESSED_DATA_DEMOGRAPHICS + '2019-2023/'

# Make sure the output folder exists.
os.makedirs(DESTINATION_PATH, exist_ok=True)

# Stack all tables, drop exact duplicate rows and order the result.
final_dataset = (
    pd.concat(datasets, ignore_index=True)
    .drop_duplicates()
    .sort_values(by=["Area", "Anno", "Sesso"])
)

# Persist as both Parquet and CSV.
final_dataset.to_parquet(DESTINATION_PATH + 'popolazione.parquet', index=False)
final_dataset.to_csv(DESTINATION_PATH + 'popolazione.csv', index=False)

# Feature Engineering

In [30]:
# Caricamento dei dati

SOURCE_PATH = PROCESSED_DATA_DEMOGRAPHICS

datasets = []

# sorted(): os.listdir returns entries in arbitrary order; a fixed order
# makes the concatenated table reproducible across machines.
for dirname in sorted(os.listdir(SOURCE_PATH)):
    period_dir = os.path.join(SOURCE_PATH, dirname)

    # Skip stray files sitting next to the period folders: calling
    # os.listdir on a regular file raises NotADirectoryError.
    if not os.path.isdir(period_dir):
        continue

    for filename in sorted(os.listdir(period_dir)):
        if filename.endswith('.parquet'):
            datasets.append(pd.read_parquet(os.path.join(period_dir, filename)))

dataset = pd.concat(datasets, ignore_index=True)

In [31]:
# [F] Popolazione totale

# Total population of a row = sum over all of its age-band columns.
age_columns = AGE_GROUP["age_labels"]
age_band_values = dataset[age_columns]
dataset['Popolazione_Totale'] = age_band_values.sum(axis=1)

In [32]:
# [F] Percentuale di popolazione per fascia d'età

# Share (in percent) of each age band over the row's total population.
for col in age_columns:
    share = dataset[col].div(dataset['Popolazione_Totale'])
    dataset[col + '_Perc'] = share.mul(100)

In [33]:
# [F] Crescita della popolazione

dataset = dataset.sort_values(by=['Area', 'Sesso', 'Anno'])

# Deduplicate BEFORE computing growth: with duplicated (year, area, sex) rows
# still present, pct_change would compare a year against its own duplicate,
# and the following year against a row that is dropped afterwards.
dataset = dataset.drop_duplicates(subset=['Anno', 'Area', 'Sesso'])

dataset = dataset.fillna(0)

# Year-over-year growth (in percent) inside each (area, sex) series; the
# first year of every series has no predecessor and is set to 0.
# NOTE(review): growth from a zero base is not finite and is left as
# produced by pct_change - decide whether it should be capped or nulled.
series_keys = ['Area', 'Sesso']
dataset['Crescita_Popolazione_Totale'] = (
    dataset.groupby(series_keys)['Popolazione_Totale'].pct_change().mul(100).fillna(0)
)
for col in age_columns:
    dataset[f'Crescita_{col}'] = (
        dataset.groupby(series_keys)[col].pct_change().mul(100).fillna(0)
    )


In [34]:
# Salvataggio dei dati

DESTINATION_PATH = CLEANED_DATA_DEMOGRAPHICS

# Full table, as both Parquet and CSV.
os.makedirs(DESTINATION_PATH, exist_ok=True)
dataset.to_parquet(os.path.join(DESTINATION_PATH, 'popolazione.parquet'), index=False)
dataset.to_csv(os.path.join(DESTINATION_PATH, 'popolazione.csv'), index=False)

DESTINATION_PATH = CLEANED_DATA_DEMOGRAPHICS + 'by_area/'

# One sub-folder per area, named after the area's code.
for area in dataset['Area'].unique():
    path = os.path.join(DESTINATION_PATH, ITALIAN_REGION_CODE[area])
    os.makedirs(path, exist_ok=True)

    dataset_area = dataset[dataset['Area'] == area].sort_values(by=['Anno', 'Sesso'])
    dataset_area.to_parquet(os.path.join(path, 'popolazione.parquet'), index=False)
    dataset_area.to_csv(os.path.join(path, 'popolazione.csv'), index=False)

# Data Chart

In [2]:
# Caricamento dei dati

# Human-readable description of every age band: "a-b" labels get a
# from/to text, every other label (e.g. "85+") an open-ended one.
age_description_dict = {}
for label in AGE_GROUP["age_labels"]:
    if '-' in label:
        description = f"Fascia di età dai {label.split('-')[0]} anni ai {label.split('-')[1]} anni"
    else:
        description = f"Fascia di età dagli {label.split('+')[0]} anni in su"
    age_description_dict[label] = description

# One cleaned table per area code.
datasets = {}
for code in ITALIAN_REGION_CODE.values():
    path = CLEANED_DATA_DEMOGRAPHICS + f'by_area/{code}/'
    datasets[code] = pd.read_parquet(path + 'popolazione.parquet')



In [None]:
# [C1] Grafico della popolazione per fascia di età e sesso

DESTINATION_PATH = DEMO_YEAR_CHART_PATH

def plot_population_by_age_and_sex(data, year_to_analyze, territory_to_analyze, output_dir):
    """Save a horizontal bar chart of male/female population per age band.

    Args:
        data: Table with 'Anno', 'Area', 'Sesso' and one column per age band.
        year_to_analyze: Year to plot.
        territory_to_analyze: Area name to plot.
        output_dir: Folder receiving ``population_<year>.jpeg`` (created if missing).

    Raises:
        ValueError: If any age-band column is missing from ``data``.
    """
    selected = data[(data['Anno'] == year_to_analyze) & (data['Area'] == territory_to_analyze)]

    age_keys = list(age_description_dict.keys())
    if any(key not in selected.columns for key in age_keys):
        raise ValueError("Le colonne delle fasce di età non sono presenti nel dataframe.")

    # Per-band totals for each sex.
    male_population = selected[selected['Sesso'] == "M"][age_keys].sum().values
    female_population = selected[selected['Sesso'] == "F"][age_keys].sum().values

    bar_width = 0.4
    indices = np.arange(len(age_description_dict))

    plt.figure(figsize=(10, 8))
    # Male and female bars sit side by side around each band position.
    plt.barh(indices - bar_width/2, male_population, bar_width, color='blue', label='Maschi')
    plt.barh(indices + bar_width/2, female_population, bar_width, color='violet', label='Femmine')

    plt.yticks(indices, age_description_dict.values())
    plt.xlabel('Popolazione')
    plt.ylabel('Fascia di Età')
    plt.title(f'Popolazione per Fascia di Età e Sesso in {territory_to_analyze} nel {year_to_analyze}')
    plt.legend(loc="best")
    plt.grid(axis='x', linestyle='--')

    os.makedirs(output_dir, exist_ok=True)
    target_file = os.path.join(output_dir, f'population_{year_to_analyze}.jpeg')
    plt.savefig(target_file)
    plt.close()

os.makedirs(DESTINATION_PATH, exist_ok=True)

# One chart per area and per year
for code, data in tqdm(datasets.items()):
    path = DESTINATION_PATH + code + '/'
    area = ITALIAN_REGIONE_BY_CODE[code]
    for year in data['Anno'].unique():
        plot_population_by_age_and_sex(data, year, area, path)


In [None]:
# [C2] Grafico cartesiano della popolazione - gender specific

DESTINATION_PATH = DEMO_TOTAL_CHART_PATH 

def plot_age_group_growth(data, age_start, age_end, territory, output_dir):
    """Save a line chart of male/female population over the years for one age band.

    NOTE: a later cell defines another function with this same name (the
    growth-rate version); once that cell runs, it shadows this one.

    Args:
        data: Table with 'Anno', 'Area', 'Sesso' and one column per age band.
        age_start: Lower bound of the band.
        age_end: Upper bound of the band; the pair 85/89 selects column "85+".
        territory: Area name to plot.
        output_dir: Folder receiving ``<age_start>_<age_end>.jpeg`` (created if missing).

    Raises:
        Exception: If the age-band column is missing from ``data``.
    """
    format_col_age = f"{age_start}-{age_end}"

    # The open-ended last band is stored as "85+".
    if format_col_age == "85-89":
        format_col_age = "85+"

    if format_col_age not in data.columns:
        raise Exception(f"Colonna {format_col_age} non presente nel dataset.")

    data = data[data['Area'] == territory]
    male_population, female_population = [], []
    years = data['Anno'].unique()

    for year in years:
        male_population.append(int(data[(data["Sesso"] == "M") & (data["Anno"] == year)][format_col_age].values[0]))
        female_population.append(int(data[(data["Sesso"] == "F") & (data["Anno"] == year)][format_col_age].values[0]))


    plt.figure(figsize=(12, 6))
    plt.plot(years, male_population, label='Maschi', color='blue', marker='o')
    plt.plot(years, female_population, label='Femmine', color='violet', marker='o')

    plt.xlabel('Anno')
    plt.ylabel('Popolazione')
    # Use the real column label so the last band is titled "85+", not "85-89".
    plt.title(f'Popolazione per la Fascia di Età {format_col_age} in {territory}')
    plt.legend()
    plt.grid(True)
    
    max_population = max(max(male_population), max(female_population))
    # A step of 0 (max below 12) would make np.arange fail; use at least 1.
    step_size = max(max_population // 12, 1)
    plt.yticks(np.arange(0, max_population + step_size, step_size))
    plt.ylim(bottom=0)

    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, f'{age_start}_{age_end}.jpeg'))
    plt.close()

os.makedirs(DESTINATION_PATH, exist_ok=True)


# For every region
for code, data in tqdm(datasets.items()):
    path = DESTINATION_PATH + f"{code}/gender-specific/ages/"
    area = ITALIAN_REGIONE_BY_CODE[code]
    # Five-year bands: (0, 4), (5, 9), ..., (85, 89).
    for ante in range(0, 90, 5):
        plot_age_group_growth(data, ante, ante + 4, area, path)

In [None]:
# [C2] Grigli di grafici cartesiani della popolazione - gender specific

DESTINATION_PATH = DEMO_TOTAL_CHART_PATH

def plot_age_group_growth_grid(dataset, territory, output_dir):
    """Save a 3x6 grid of male/female population line charts, one per age band.

    Args:
        dataset: Table with 'Anno', 'Area', 'Sesso' and one column per age band.
        territory: Area name to plot.
        output_dir: Folder receiving ``grid_ages.jpeg`` (created if missing).
    """
    dataset = dataset[dataset['Area'] == territory]
    band_labels = AGE_GROUP["age_labels"]

    fig, axs = plt.subplots(3, 6, figsize=(18, 18))
    axs = axs.ravel()

    # All panels share one y scale, derived from the largest band value.
    max_population = dataset[band_labels].max().max()
    step_size = max_population // 12
    shared_ticks = np.arange(0, max_population + step_size * 2, step_size)

    years = dataset['Anno'].unique()

    def yearly_values(sex, column):
        # First value of `column` for the given sex, for every year.
        return [
            int(dataset[(dataset["Sesso"] == sex) & (dataset["Anno"] == year)][column].values[0])
            for year in years
        ]

    for ax, col_name in zip(axs, band_labels):
        ax.plot(years, yearly_values("M", col_name), label='Maschi', color='blue')
        ax.plot(years, yearly_values("F", col_name), label='Femmine', color='violet')
        ax.set_title(col_name, fontsize=10)
        ax.set_xlabel('Anno')
        ax.set_ylabel('Popolazione')
        ax.grid(True)
        ax.set_yticks(shared_ticks)

    # Remove the panels left without an age band.
    for unused_ax in axs[len(band_labels):]:
        fig.delaxes(unused_ax)

    fig.legend(['Maschi', 'Femmine'], loc='upper right', fontsize=12)
    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'grid_ages.jpeg'))
    plt.close()

# For every region
for code, data in tqdm(datasets.items()):
    path = DESTINATION_PATH + code + '/gender-specific/'
    os.makedirs(path, exist_ok=True)

    plot_age_group_growth_grid(data, ITALIAN_REGIONE_BY_CODE[code], path)

In [None]:
# [C2] Grafico cartesiano della popolazione totale

def plot_tot_polutation(data, territory, output_path):
    """Save a line chart of the total population of ``territory`` over the years.

    Only rows with ``Sesso == 'T'`` are used.

    Args:
        data: Table with 'Anno', 'Area', 'Sesso' and 'Popolazione_Totale'.
        territory: Area name to plot.
        output_path: Folder receiving ``popolazione_totale.jpeg`` (created if missing).
    """
    totals = data[data['Area'] == territory]
    totals = totals[totals['Sesso'] == 'T']

    years = totals["Anno"].unique()
    tp = [totals[totals["Anno"] == year]["Popolazione_Totale"].sum() for year in years]

    plt.figure(figsize=(12, 6))
    plt.plot(years, tp, marker='o', color='blue')
    plt.xlabel('Anno')
    plt.ylabel('Popolazione')
    plt.xticks(years, rotation=90)

    # y ticks start one step below the minimum (never below 0) and run
    # past the maximum.
    max_population = max(tp)
    step_size = max_population // 12
    lowest_tick = min(tp) - step_size
    min_population = lowest_tick if lowest_tick > 0 else 0
    plt.yticks(np.arange(min_population, max_population + (2*step_size), step_size))

    plt.title(f'Popolazione Totale in {territory} negli anni')
    plt.grid(True)

    os.makedirs(output_path, exist_ok=True)
    plt.savefig(os.path.join(output_path, 'popolazione_totale.jpeg'))
    plt.close()

# For every region
for code, data in tqdm(datasets.items()):
    path = DEMO_TOTAL_CHART_PATH + code + '/'
    os.makedirs(path, exist_ok=True)

    plot_tot_polutation(data, ITALIAN_REGIONE_BY_CODE[code], path)
        

In [None]:
# [C2] Griglia di grafici cartesiani della popolazione totale

def plot_total_age_group_growth_grid(dataset, territory, output_dir):
    """Save a grid of line charts of the total population, one per age group.

    Only rows with ``Area == territory`` are used. For every label in
    ``AGE_GROUP["age_labels"]`` the ``Sesso == "T"`` value of that column is
    plotted against ``Anno``. All panels share the same y ticks, derived from
    the largest age-group value of the territory. The figure is written to
    ``<output_dir>/griglia_popolazione_totale.jpeg``.

    Args:
        dataset: DataFrame with ``Area``, ``Sesso``, ``Anno`` and one column
            per age label.
        territory: Value of ``Area`` to plot.
        output_dir: Directory receiving the image (created if missing).

    Raises:
        IndexError: If a year has no ``Sesso == "T"`` row, or if there are
            more age labels than the 3x6 grid has panels.
    """
    dataset = dataset[dataset['Area'] == territory]
    # Loop-invariant: only the "T" rows are ever plotted.
    totals = dataset[dataset["Sesso"] == "T"]
    years = dataset['Anno'].unique()

    max_population = dataset[AGE_GROUP["age_labels"]].max().max()
    # Guard against a zero step (max_population < 12), which np.arange cannot use.
    step_size = max(max_population // 12, 1)
    y_ticks = np.arange(0, max_population + step_size * 2, step_size)

    fig, axs = plt.subplots(3, 6, figsize=(18, 18))
    try:
        axs = axs.ravel()

        for i, col_name in enumerate(AGE_GROUP["age_labels"]):
            total_population = [
                int(totals[totals["Anno"] == year][col_name].values[0])
                for year in years
            ]

            ax = axs[i]
            ax.plot(years, total_population, label='Totale', color='red')
            ax.set_title(col_name, fontsize=10)
            ax.set_xlabel('Anno')
            ax.set_ylabel('Popolazione')
            ax.grid(True)
            ax.set_yticks(y_ticks)

        # Remove the panels of the grid that have no age group to show.
        for ax in axs[len(AGE_GROUP["age_labels"]):]:
            fig.delaxes(ax)

        fig.tight_layout()
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, 'griglia_popolazione_totale.jpeg'))
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

# Draw the total-population age-group grid for every region in `datasets`.
for code, data in tqdm(datasets.items()):
    path = DEMO_TOTAL_CHART_PATH + code + '/'

    # exist_ok replaces the separate os.path.exists check (no check-then-create race).
    os.makedirs(path, exist_ok=True)

    plot_total_age_group_growth_grid(data, ITALIAN_REGIONE_BY_CODE[code], path)

In [None]:
# [C3] Grafico cartesiano della crescita della popolazione - gender specific

# Base output directory for the growth charts written by the loop in this cell.
DESTINATION_PATH = DEMO_GROWTH_CHART_PATH

def plot_age_group_growth(data, age_start, age_end, territory, output_dir):
    """Plot male vs. female growth of one age bracket and save it as a JPEG.

    The plotted column is ``Crescita_<age_start>-<age_end>``, except for the
    85-89 bracket, which is read from ``Crescita_85+``. The chart is written
    to ``<output_dir>/<age_start>_<age_end>.jpeg``.

    Args:
        data: DataFrame with ``Area``, ``Sesso``, ``Anno`` and the growth
            column of the requested bracket.
        age_start: Lower bound of the bracket.
        age_end: Upper bound of the bracket.
        territory: Value of ``Area`` to plot; also shown in the chart title.
        output_dir: Directory receiving the image (created if missing).

    Raises:
        Exception: If the growth column is missing from ``data``.
        IndexError: If a year has no "M" or no "F" row.
    """
    age_label = f"{age_start}-{age_end}"
    if age_label == "85-89":
        # The last bracket is stored as the open-ended "85+" column.
        age_label = "85+"
    format_col_age = f"Crescita_{age_label}"

    if format_col_age not in data.columns:
        raise Exception(f"Colonna {format_col_age} non presente nel dataset.")

    data = data[data['Area'] == territory]
    years = data['Anno'].unique()

    # Filter by sex once instead of once per year.
    males = data[data["Sesso"] == "M"]
    females = data[data["Sesso"] == "F"]
    male_population = [males[males["Anno"] == year][format_col_age].values[0] for year in years]
    female_population = [females[females["Anno"] == year][format_col_age].values[0] for year in years]

    fig = plt.figure(figsize=(20, 8))
    try:
        plt.plot(years, male_population, label='Maschi', color='blue', marker='o', alpha=0.7)
        plt.plot(years, female_population, label='Femmine', color='violet', marker='o', alpha=0.7)

        plt.xlabel('Anno')
        plt.ylabel('Popolazione')
        # Use the real bracket label so the last chart is titled "85+" rather than "85-89".
        plt.title(f'Crescita della Popolazione per la Fascia di Età {age_label} in {territory}')
        plt.legend()
        plt.grid(True)

        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, f'{age_start}_{age_end}.jpeg'))
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

os.makedirs(DESTINATION_PATH, exist_ok=True)


# For every region, draw one chart per five-year bracket: 0-4, 5-9, ..., 85-89.
for code, data in tqdm(datasets.items()):
    path = DESTINATION_PATH + f"{code}/gender_specific/ages/"
    # The region name does not depend on the bracket, so look it up once.
    area = ITALIAN_REGIONE_BY_CODE[code]
    for ante in range(0, 90, 5):
        post = ante + 4
        plot_age_group_growth(data, ante, post, area, path)

In [None]:
# [C3] Griglia di grafici cartesiani della crescita della popolazione - gender specific

# Base output directory for the growth-chart grids written by the loop in this cell.
DESTINATION_PATH = DEMO_GROWTH_CHART_PATH

def plot_age_group_growth_grid(dataset, territory, output_dir):
    """Save a grid of male vs. female growth charts, one per age group.

    Only rows with ``Area == territory`` are used. For every label in
    ``AGE_GROUP["age_labels"]`` the column ``Crescita_<label>`` is plotted
    against ``Anno`` for ``Sesso == "M"`` and ``Sesso == "F"``. Every panel
    uses the same fixed y ticks (-10 to 11, step 1). The figure is written to
    ``<output_dir>/grid_ages.jpeg``.

    Note: defining this function replaces any earlier function of the same
    name in the kernel namespace.

    Args:
        dataset: DataFrame with ``Area``, ``Sesso``, ``Anno`` and one
            ``Crescita_<label>`` column per age label.
        territory: Value of ``Area`` to plot.
        output_dir: Directory receiving the image (created if missing).

    Raises:
        IndexError: If a year has no "M" or no "F" row, or if there are more
            age labels than the 3x6 grid has panels.
    """
    dataset = dataset[dataset['Area'] == territory]
    years = dataset['Anno'].unique()

    # Filter by sex once instead of once per year and age group.
    males = dataset[dataset["Sesso"] == "M"]
    females = dataset[dataset["Sesso"] == "F"]

    max_population = 10
    step_size = 1
    y_ticks = np.arange(- max_population, max_population + step_size * 2, step_size)

    fig, axs = plt.subplots(3, 6, figsize=(20, 12))
    try:
        axs = axs.ravel()

        for i, age_label in enumerate(AGE_GROUP["age_labels"]):
            col_name = f'Crescita_{age_label}'
            male_population = [males[males["Anno"] == year][col_name].values[0] for year in years]
            female_population = [females[females["Anno"] == year][col_name].values[0] for year in years]

            ax = axs[i]
            ax.plot(years, male_population, label='Maschi', color='blue')
            ax.plot(years, female_population, label='Femmine', color='violet')
            ax.set_title(col_name, fontsize=10)
            ax.set_xlabel('Anno')
            ax.set_ylabel('Popolazione')
            ax.grid(True)
            ax.set_yticks(y_ticks)

        # Remove the panels of the grid that have no age group to show.
        for ax in axs[len(AGE_GROUP["age_labels"]):]:
            fig.delaxes(ax)

        fig.legend(['Maschi', 'Femmine'], loc='upper right', fontsize=12)
        fig.tight_layout()
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, 'grid_ages.jpeg'))
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

# Render the gender-specific growth grid for every region in `datasets`.
for code, data in tqdm(datasets.items()):
    path = DESTINATION_PATH + code + '/gender_specific/'

    # exist_ok replaces the separate os.path.exists check (no check-then-create race).
    os.makedirs(path, exist_ok=True)

    plot_age_group_growth_grid(data, ITALIAN_REGIONE_BY_CODE[code], path)

In [None]:
# [C3] Grafico cartesiano della crescita della popolazione totale

def plot_tot_polutation(data, territory, output_path):
    """Plot the yearly total-population growth of one territory as a JPEG.

    Rows are restricted to ``Area == territory`` and ``Sesso == 'T'``; the
    ``Crescita_Popolazione_Totale`` values are summed per ``Anno`` and drawn
    as a line chart written to
    ``<output_path>/crescita_popolazione_totale.jpeg``.

    Note: this definition replaces the earlier ``plot_tot_polutation`` (which
    plots ``Popolazione_Totale``) in the kernel namespace. The misspelled
    name is kept so existing calls keep working.

    Args:
        data: DataFrame with the columns ``Area``, ``Sesso``, ``Anno`` and
            ``Crescita_Popolazione_Totale``.
        territory: Value of ``Area`` to plot; also shown in the chart title.
        output_path: Directory receiving the image (created if missing).
    """
    data = data[(data['Area'] == territory) & (data['Sesso'] == 'T')]

    years = data["Anno"].unique()
    tp = [data.loc[data["Anno"] == year, "Crescita_Popolazione_Totale"].sum() for year in years]

    fig = plt.figure(figsize=(12, 6))
    try:
        plt.plot(years, tp, marker='o', color='blue')
        plt.xlabel('Anno')
        plt.ylabel('Popolazione')

        plt.xticks(years, rotation=45)
        # The chart shows growth, so the title says so (it used to read "Popolazione Totale").
        plt.title(f'Crescita della Popolazione Totale in {territory} negli anni')
        plt.grid(True)

        os.makedirs(output_path, exist_ok=True)
        fig.savefig(os.path.join(output_path, 'crescita_popolazione_totale.jpeg'))
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

# Draw the total-population growth chart for every region in `datasets`.
for code, data in tqdm(datasets.items()):
    path = DEMO_GROWTH_CHART_PATH + code + '/'
    # exist_ok replaces the separate os.path.exists check (no check-then-create race).
    os.makedirs(path, exist_ok=True)

    plot_tot_polutation(data, ITALIAN_REGIONE_BY_CODE[code], path)

In [None]:
# [C3] Griglia di grafici della crescita della popolazione totale

def plot_total_age_group_growth_grid(dataset, territory, output_dir):
    """Save a grid of total-population growth charts, one per age group.

    Only rows with ``Area == territory`` are used. For every label in
    ``AGE_GROUP["age_labels"]`` the ``Sesso == "T"`` value of
    ``Crescita_<label>`` is plotted against ``Anno``. Every panel uses the
    same fixed y ticks (-10 to 11, step 1), matching the gender-specific
    growth grid. The figure is written to
    ``<output_dir>/griglia_crescita_popolazione_totale.jpeg``.

    Note: this definition replaces the earlier function of the same name
    (which plots absolute population) in the kernel namespace.

    Args:
        dataset: DataFrame with ``Area``, ``Sesso``, ``Anno`` and one
            ``Crescita_<label>`` column per age label.
        territory: Value of ``Area`` to plot.
        output_dir: Directory receiving the image (created if missing).

    Raises:
        IndexError: If a year has no ``Sesso == "T"`` row, or if there are
            more age labels than the 3x6 grid has panels.
    """
    dataset = dataset[dataset['Area'] == territory]
    # Loop-invariant: only the "T" rows are ever plotted.
    totals = dataset[dataset["Sesso"] == "T"]
    years = dataset['Anno'].unique()

    max_population = 10
    step_size = 1
    # Same tick range as the gender-specific grid; the previous upper bound
    # (exclusive max_population) dropped the +10 tick and was asymmetric.
    y_ticks = np.arange(-max_population, max_population + step_size * 2, step_size)

    fig, axs = plt.subplots(3, 6, figsize=(20, 12))
    try:
        axs = axs.ravel()

        for i, col_name in enumerate(AGE_GROUP["age_labels"]):
            format_col_name = f'Crescita_{col_name}'
            total_population = [
                totals[totals["Anno"] == year][format_col_name].values[0]
                for year in years
            ]

            ax = axs[i]
            ax.plot(years, total_population, label='Totale', color='red')
            ax.set_title(col_name, fontsize=10)
            ax.set_xlabel('Anno')
            ax.set_ylabel('Popolazione')
            ax.grid(True)
            ax.set_yticks(y_ticks)

        # Remove the panels of the grid that have no age group to show.
        for ax in axs[len(AGE_GROUP["age_labels"]):]:
            fig.delaxes(ax)

        fig.tight_layout()
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, 'griglia_crescita_popolazione_totale.jpeg'))
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

# Draw the total-population growth grid for every region in `datasets`.
for code, data in tqdm(datasets.items()):
    path = DEMO_GROWTH_CHART_PATH + code + '/'

    # exist_ok replaces the separate os.path.exists check (no check-then-create race).
    os.makedirs(path, exist_ok=True)

    plot_total_age_group_growth_grid(data, ITALIAN_REGIONE_BY_CODE[code], path)

# Modelling

In [11]:
# Caricamento Dataset

# Maps each area code to its per-sex frames and its train/test split.
datasets = {}

# Fraction of the distinct years assigned to the training split.
cutoff = 0.8

# Only 'ITA' is processed; to cover every region iterate over
# ITALIAN_REGION_CODE.values() instead.
# for code in ITALIAN_REGION_CODE.values():
for code in ['ITA']:
    path = CLEANED_DATA_DEMOGRAPHICS + f'by_area/{code}/'
    dataset = pd.read_parquet(path + 'popolazione.parquet')

    # Split chronologically by year. The frame mixes "M", "F" and "T" rows, so
    # slicing it by a row count derived from the number of distinct years
    # (as was done before) does not cut at `cutoff` of the years.
    years = sorted(dataset["Anno"].unique())
    lenght_cutoff = int(len(years) * cutoff)
    in_train = dataset["Anno"].isin(years[:lenght_cutoff])

    # Key order is kept: downstream cells iterate over item.items().
    item = {
        "path": path,
        "male_data": dataset[dataset["Sesso"] == "M"],
        "female_data": dataset[dataset["Sesso"] == "F"],
        "total_data": dataset[dataset["Sesso"] == "T"],
        "cutoff": cutoff,
        "train_data": dataset[in_train],
        "test_data": dataset[~in_train],
    }

    datasets[code] = item


In [12]:
# [M] Regressione Lineare

# Train one linear regression per (area, age label, sex) on the year alone,
# write a text report per age label and persist every fitted model.
for code, item in datasets.items():
    lr_root = DEMO_LR_MODEL_PATH.format(code)

    for label in AGE_GROUP["age_labels"] + ['all']:
        model_dir = os.path.join(lr_root, f'{label}/')
        log_dir = os.path.join(lr_root, f'{label}/logs/')
        os.makedirs(log_dir, exist_ok=True)

        # Configure logging: one timestamped report file per run.
        run_stamp = datetime.now().strftime("%Y%m%d%H%M%S")
        log_file = os.path.join(log_dir, f'log_{run_stamp}.txt')
        logger = setup_logging(log_file)

        # Report header: timestamp and dataset information.
        logger.info(f"========== Report per {code} [Age: {label}] - Demografico ==========")
        logger.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}")
        logger.info(f"Dataset Path: {item['path']}")
        logger.info(f"Model: Linear Regression\n")

        logger.info(f"Train Cut Off: {item['cutoff']:.2f}")
        logger.info(f"Test Cut Off: {1 - item['cutoff']:.2f}\n")

        try:
            # 'all' targets the total population, any other label its own column.
            target_column = "Popolazione_Totale" if label == 'all' else label

            for dataset_type, data in item.items():
                # Skip the non-frame entries of `item` (path, cutoff, ...).
                if dataset_type not in ('male_data', 'female_data', 'total_data'):
                    continue
                logger.info(f"-------- Dataset Type: {dataset_type} --------")

                # Positional split: the first `cutoff` share of rows trains the model.
                lenght_cutoff = int(len(data["Anno"].unique()) * item["cutoff"])
                train_data, test_data = data[:lenght_cutoff], data[lenght_cutoff:]

                # Prepare features (year) and target.
                x_train = train_data["Anno"].values.reshape(-1, 1)
                y_train = train_data[target_column].values

                x_test = test_data["Anno"].values.reshape(-1, 1)
                y_test = test_data[target_column].values

                # Fit and predict on the held-out rows.
                model = LinearRegression().fit(x_train, y_train)
                y_pred = model.predict(x_test)

                # Performance metrics.
                mae = mean_absolute_error(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                rmse = np.sqrt(mse)
                r2 = r2_score(y_test, y_pred)

                logger.info(f"Metrics:")
                for metric_name, metric_value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse)):
                    logger.info(f"\t{metric_name}: {metric_value:.4f}")
                logger.info(f"\tR2 Score: {r2:.4f}\n")

                # Persist the fitted model.
                joblib.dump(model, model_dir + f'model_{dataset_type}.pkl')

        except Exception as e:
            logger.error(f"Errore durante l'addestramento del modello per il dataset {code}: {str(e)}")

        finally:
            logger.info(f"=====================================\n")
            close_logger(logger)

In [13]:
# [M] Regressione Polinomiale

# Train one polynomial regression per (area, age label, sex) on the year alone,
# write a text report per age label and persist model + feature transformer.

# Degree of the polynomial features; loop-invariant, so it is set once.
degree = 10

for code, item in datasets.items():
    for label in AGE_GROUP["age_labels"] + ['all']:
        model_dir = os.path.join(DEMO_PR_MODEL_PATH.format(code), f'{label}/')
        log_dir = os.path.join(DEMO_PR_MODEL_PATH.format(code), f'{label}/logs/')
        os.makedirs(log_dir, exist_ok=True)

        # Configure logging: one timestamped report file per run.
        log_file = os.path.join(log_dir, f'log_{datetime.now().strftime("%Y%m%d%H%M%S")}.txt')
        logger = setup_logging(log_file)

        # Report header; the age label is included, as in the linear
        # regression report, so per-label reports can be told apart.
        logger.info(f"========== Report per {code} [Age: {label}] - Demografico ==========")
        logger.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}")
        logger.info(f"Dataset Path: {item['path']}")
        logger.info(f"Model: Polynomial Regression\n")

        logger.info(f"Train Cut Off: {item['cutoff']:.2f}")
        logger.info(f"Test Cut Off: {1 - item['cutoff']:.2f}\n")

        try:
            for dataset_type, data in item.items():
                # Skip the non-frame entries of `item` (path, cutoff, ...).
                if dataset_type not in ['male_data', 'female_data', 'total_data']:
                    continue
                logger.info(f"-------- Dataset Type: {dataset_type} --------")

                # Positional split: the first `cutoff` share of rows trains the model.
                lenght_cutoff = int(len(data["Anno"].unique()) * item["cutoff"])
                train_data = data[:lenght_cutoff]
                test_data = data[lenght_cutoff:]

                # Prepare features (year) and target.
                x_train = train_data["Anno"].values.reshape(-1, 1)
                y_train = train_data["Popolazione_Totale"].values if label == 'all' else train_data[label].values

                x_test = test_data["Anno"].values.reshape(-1, 1)
                y_test = test_data["Popolazione_Totale"].values if label == 'all' else test_data[label].values

                # Use the configured `logger` (not the module-level logging
                # functions) so every report line goes through the same object.
                logger.info("Model Info:")
                logger.info(f"\tDegree of Polynomial: {degree}\n")

                # Expand the year into polynomial terms, then fit a linear model on them.
                poly = PolynomialFeatures(degree=degree)
                x_train_poly = poly.fit_transform(x_train)
                x_test_poly = poly.transform(x_test)

                poly_model = LinearRegression()
                poly_model.fit(x_train_poly, y_train)

                y_pred_poly = poly_model.predict(x_test_poly)

                # Performance metrics.
                mae = mean_absolute_error(y_test, y_pred_poly)
                mse = mean_squared_error(y_test, y_pred_poly)
                rmse = np.sqrt(mse)
                r2 = r2_score(y_test, y_pred_poly)

                logger.info(f"Metrics:")
                logger.info(f"\tMAE: {mae:.4f}")
                logger.info(f"\tMSE: {mse:.4f}")
                logger.info(f"\tRMSE: {rmse:.4f}")
                logger.info(f"\tR2 Score: {r2:.4f}\n")

                # Persist the model together with the transformer needed to use it.
                filename = model_dir + f'model_{dataset_type}.pkl'
                model_to_save = {
                    "model": poly_model,
                    "poly_features": poly
                }
                joblib.dump(model_to_save, filename)

        except Exception as e:
            logger.error(f"Errore durante l'addestramento del modello per il dataset {code}: {str(e)}")

        finally:
            logger.info(f"=====================================\n")
            close_logger(logger)


In [14]:
# [M] Random Forest Regressor

# Train one random forest per (area, sex) on the total population, write a
# text report and persist every fitted model under the 'all' label.
for code, item in datasets.items():
    rf_root = DEMO_RF_MODEL_PATH.format(code)
    model_dir = os.path.join(rf_root, 'all/')
    log_dir = os.path.join(rf_root, f'all/logs/')
    os.makedirs(log_dir, exist_ok=True)

    # Configure logging: one timestamped report file per run.
    run_stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    log_file = os.path.join(log_dir, f'log_{run_stamp}.txt')
    logger = setup_logging(log_file)

    # Report header: timestamp and dataset information.
    logger.info(f"========== Report per {code} - Demografico ==========")
    logger.info(f"Timestamp: {datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}")
    logger.info(f"Dataset Path: {item['path']}")
    logger.info(f"Model: RandomForest Regressor\n")

    logger.info(f"Train Cut Off: {item['cutoff']:.2f}")
    logger.info(f"Test Cut Off: {1 - item['cutoff']:.2f}\n")

    try:
        for dataset_type, data in item.items():
            # Skip the non-frame entries of `item` (path, cutoff, ...).
            if dataset_type not in ('male_data', 'female_data', 'total_data'):
                continue
            logger.info(f"-------- Dataset Type: {dataset_type} --------")

            # Positional split: the first `cutoff` share of rows trains the model.
            lenght_cutoff = int(len(data["Anno"].unique()) * item["cutoff"])
            train_data, test_data = data[:lenght_cutoff], data[lenght_cutoff:]

            # Prepare features (year) and target (total population).
            x_train = train_data["Anno"].values.reshape(-1, 1)
            y_train = train_data["Popolazione_Totale"].values

            x_test = test_data["Anno"].values.reshape(-1, 1)
            y_test = test_data["Popolazione_Totale"].values

            # Random forest hyper-parameters.
            rf_params = {
                "n_estimators": 1000,
                "max_depth": 5,
                "random_state": 42,
                "max_features": 'log2',
            }

            logger.info("Model Info:")
            logger.info(f"\tNumber of Estimators: {rf_params['n_estimators']}")
            logger.info(f"\tMax Depth: {rf_params['max_depth']}")
            logger.info(f"\tMax Features: {rf_params['max_features']}\n")

            rf_model = RandomForestRegressor(**rf_params)
            rf_model.fit(x_train, y_train)
            y_pred_rf = rf_model.predict(x_test)

            # Performance metrics.
            mae = mean_absolute_error(y_test, y_pred_rf)
            mse = mean_squared_error(y_test, y_pred_rf)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred_rf)

            logger.info(f"Metrics:")
            for metric_name, metric_value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse)):
                logger.info(f"\t{metric_name}: {metric_value:.4f}")
            logger.info(f"\tR2 Score: {r2:.4f}\n")

            # Persist the fitted model.
            joblib.dump(rf_model, model_dir + f'model_{dataset_type}.pkl')

    except Exception as e:
        logger.error(f"Errore durante l'addestramento del modello per il dataset {code}: {str(e)}")

    finally:
        logger.info(f"=====================================\n")
        close_logger(logger)


# Post-Processing
Confrontare i dati reali con i dati previsti dal modello

In [5]:
# Caricamento Dataset e Modelli

# datasets[code][sex]            -> frame of one sex for the area
# models[code][model name][ages][sex] -> fitted model loaded from disk
datasets = {}
models = {}
model_dir = DEMO_MODEL_PATH

# for code in ITALIAN_REGION_CODE.values():
for code in ['ITA']:
    path = CLEANED_DATA_DEMOGRAPHICS + f'by_area/{code}/'

    # Load the dataset and split it by sex.
    dataset = pd.read_parquet(path + 'popolazione.parquet')
    sesso_by_key = {"male": "M", "female": "F", "total": "T"}
    datasets[code] = {
        key: dataset[dataset["Sesso"] == sesso]
        for key, sesso in sesso_by_key.items()
    }

    # Load the linear regression models of every age label and sex.
    lr_root = DEMO_LR_MODEL_PATH.format(code)
    linear_models = {}
    for ages in ['all'] + AGE_GROUP["age_labels"]:
        linear_models[ages] = {
            key: joblib.load(lr_root + f'{ages}/model_{key}_data.pkl')
            for key in ("male", "female", "total")
        }

    item = {"LinearRegression": linear_models}

    # item["PolynomialRegression"] = {}
    # for ages in ['all'] + AGE_GROUP["age_labels"]:
    #     nested_item = {}

    #     nested_item["male"] = joblib.load(DEMO_PR_MODEL_PATH.format(code) + f'{ages}/model_male_data.pkl')
    #     nested_item["female"] = joblib.load(DEMO_PR_MODEL_PATH.format(code) + f'{ages}/model_female_data.pkl')
    #     nested_item["total"] = joblib.load(DEMO_PR_MODEL_PATH.format(code) + f'{ages}/model_total_data.pkl')

    #     item["PolynomialRegression"][ages] = nested_item

    # item["RandomForestRegressor"] = {}
    # for ages in ['all'] + AGE_GROUP["age_labels"]:
    #     nested_item = {}

    #     nested_item["male"] = joblib.load(DEMO_RF_MODEL_PATH.format(code) + f'{ages}/model_male_data.pkl')
    #     nested_item["female"] = joblib.load(DEMO_RF_MODEL_PATH.format(code) + f'{ages}/model_female_data.pkl')
    #     nested_item["total"] = joblib.load(DEMO_RF_MODEL_PATH.format(code) + f'{ages}/model_total_data.pkl')

    #     item["RandomForestRegressor"][ages] = nested_item

    models[code] = item


In [13]:
# [C1] Confronto Realtà vs Predizione - Regressione Lineare

def compare_real_vs_lr(data, model, output_dir, predict_column='Popolazione_Totale', info=None):
    """Plot observed vs. linear-regression values over the years and save the chart.

    The model is evaluated on the ``Anno`` column of ``data`` and compared
    with ``data[predict_column]``. The chart is written to
    ``<output_dir>/real_vs_pred.jpeg``.

    Args:
        data: DataFrame with ``Anno`` and ``predict_column``.
        model: Fitted estimator exposing ``predict`` on a one-column year array.
        output_dir: Directory receiving the image (created if missing).
        predict_column: Column holding the observed values.
        info: Optional sequence ``(code, age, sex)`` added to the title.
    """
    years = data["Anno"]
    observed = data[predict_column].values
    predicted = model.predict(years.values.reshape(-1, 1))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(years, observed, label='Reale', color='blue')
    ax.plot(years, predicted, label='Predetto', color='red')

    ax.set_xlabel('Anno')
    ax.set_ylabel('Popolazione')

    # The bracketed details are only present when `info` is given.
    title = 'Confronto Realtà vs Predizione'
    if info is not None:
        title += f' [Code: {info[0]}, Sesso: {info[2]}, Age: {info[1]}]'
    ax.set_title(title + ' - Regressione Lineare')

    ax.legend()
    ax.grid(True)

    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(os.path.join(output_dir, 'real_vs_pred.jpeg'))
    plt.close(fig)
    
# For every region, compare real and predicted values for each age label and sex.
for code, dataset in datasets.items():
    age_models = models[code]["LinearRegression"]

    # The chart directory depends only on the region, so build it once per region.
    chart_dir = DEMO_MODEL_CHART_PATH.format(code=code)
    os.makedirs(chart_dir, exist_ok=True)

    for age, model in age_models.items():
        # The 'all' entry is compared with the total-population column,
        # every other age label with its own column.
        predict_column = 'Popolazione_Totale' if age == 'all' else age

        for dataset_type, data in dataset.items():
            dest_path = chart_dir + f'linear_regression/by_age/{age}/{dataset_type}/'
            info = [code, age, dataset_type]
            compare_real_vs_lr(data, model[dataset_type], dest_path, predict_column=predict_column, info=info)

        

In [33]:
# [C] Confronto Realtà vs Predizione - Regressione Polinomiale

def compare_real_vs_pl(data, model, poly, output_dir):
    """Plot observed vs. polynomial-regression total population and save the chart.

    The ``Anno`` column is expanded with ``poly.transform`` before being
    passed to ``model.predict``; the result is compared with
    ``Popolazione_Totale``. The chart is written to
    ``<output_dir>/real_vs_pred.jpeg``.

    Args:
        data: DataFrame with ``Anno`` and ``Popolazione_Totale``.
        model: Fitted estimator exposing ``predict`` on the transformed features.
        poly: Fitted transformer exposing ``transform`` on a one-column year array.
        output_dir: Directory receiving the image (created if missing).
    """
    years = data["Anno"]
    observed = data["Popolazione_Totale"].values
    predicted = model.predict(poly.transform(years.values.reshape(-1, 1)))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(years, observed, label='Reale', color='blue')
    ax.plot(years, predicted, label='Predetto', color='red')

    ax.set_xlabel('Anno')
    ax.set_ylabel('Popolazione')
    ax.set_title('Confronto Realtà vs Predizione - Regressione Polinomiale')
    ax.legend()
    ax.grid(True)

    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(os.path.join(output_dir, 'real_vs_pred.jpeg'))
    plt.close(fig)
    
# For every region, compare real and predicted total population per sex.
for code, dataset in datasets.items():
    # The loader stores models as models[code][model name][age label][sex];
    # compare_real_vs_pl plots Popolazione_Totale, i.e. the 'all' models.
    # NOTE(review): the "PolynomialRegression" block of the loader cell is
    # commented out; it must be enabled for this lookup to succeed.
    model = models[code]["PolynomialRegression"]['all']

    # Same path construction as the linear-regression comparison cell.
    # Assumes DEMO_MODEL_CHART_PATH is a '{code}' template - TODO confirm.
    path = DEMO_MODEL_CHART_PATH.format(code=code)
    os.makedirs(path, exist_ok=True)

    for dataset_type, data in dataset.items():
        # Each saved entry bundles the regressor with its feature transformer.
        model_data = model[dataset_type]["model"]
        poly_data = model[dataset_type]["poly_features"]
        compare_real_vs_pl(data, model_data, poly_data, path + f'polynomial_regression/{dataset_type}/')

In [39]:
# [C] Confronto Realtà vs Predizione - Random Forest Regressor

def compare_real_vs_rf(data, model, output_dir):
    """Plot observed vs. random-forest total population and save the chart.

    The model is evaluated on the ``Anno`` column of ``data`` and compared
    with ``Popolazione_Totale``. The chart is written to
    ``<output_dir>/real_vs_pred.jpeg``.

    Args:
        data: DataFrame with ``Anno`` and ``Popolazione_Totale``.
        model: Fitted estimator exposing ``predict`` on a one-column year array.
        output_dir: Directory receiving the image (created if missing).
    """
    years = data["Anno"]
    observed = data["Popolazione_Totale"].values
    predicted = model.predict(years.values.reshape(-1, 1))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(years, observed, label='Reale', color='blue')
    ax.plot(years, predicted, label='Predetto', color='red')

    ax.set_xlabel('Anno')
    ax.set_ylabel('Popolazione')
    ax.set_title('Confronto Realtà vs Predizione - Random Forest Regressor')
    ax.legend()
    ax.grid(True)

    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(os.path.join(output_dir, 'real_vs_pred.jpeg'))
    plt.close(fig)
    
# For every region, compare real and predicted total population per sex.
for code, dataset in datasets.items():
    # The loader stores models as models[code][model name][age label][sex];
    # compare_real_vs_rf plots Popolazione_Totale, i.e. the 'all' models.
    # NOTE(review): the "RandomForestRegressor" block of the loader cell is
    # commented out; it must be enabled for this lookup to succeed.
    model = models[code]["RandomForestRegressor"]['all']

    # Same path construction as the linear-regression comparison cell.
    # Assumes DEMO_MODEL_CHART_PATH is a '{code}' template - TODO confirm.
    path = DEMO_MODEL_CHART_PATH.format(code=code)
    os.makedirs(path, exist_ok=True)

    for dataset_type, data in dataset.items():
        compare_real_vs_rf(data, model[dataset_type], path + f'random_forest/{dataset_type}/')