In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pickle

# Read the historical dataset back from its pickle file.
# pickle.load can execute arbitrary code embedded in the file, so this must only
# ever be pointed at a pickle produced by a trusted step of this project.
with open("assembled_dfs.pkl", "rb") as source_file:
    assembled_dfs = pickle.load(source_file)

# Synthetic horizon: November 2024 through December 2026 (inclusive bounds)
start_date = datetime(year=2024, month=11, day=1)
end_date = datetime(year=2026, month=12, day=31)

# freq='MS' is "month start": one timestamp on the first day of every month
# that falls inside [start_date, end_date]
date_range = pd.date_range(start_date, end_date, freq='MS')

# Get unique Codi_barri values
codi_barri_unique = assembled_dfs['Codi_barri'].unique()

# Dedicated, seeded generator: the temperature jitter below is the only random
# input, so fixing the seed makes the synthetic dataset reproducible on re-run.
SEED = 42
rng = np.random.default_rng(SEED)


def _summarise_barri(historical):
    """Reduce one neighbourhood's historical rows to its extrapolation baselines.

    Parameters
    ----------
    historical : pandas.DataFrame
        Rows belonging to a single Codi_barri. The columns 'year', 'month',
        'population', 'age', 'area_m2' and 'temp' are read.

    Returns
    -------
    dict
        'last_year'         latest value of 'year' in the rows
        'population_growth' mean year-over-year relative change of the yearly
                            mean population; 0.0 when it cannot be computed
        'last_population'   mean population over the rows of 'last_year'
        'avg_age'           mean of 'age' over all rows
        'avg_area'          mean of 'area_m2' over all rows
        'monthly_avg_temp'  {month number 1..12: mean 'temp' for that month}
                            (NaN for a month with no rows)
        'temp_variation'    half the standard deviation of 'temp', or 1 when
                            the standard deviation is undefined
    """
    last_year = historical['year'].max()

    population_growth = historical.groupby('year')['population'].mean().pct_change().mean()
    if pd.isna(population_growth):
        # Fewer than two usable years: no growth rate can be derived. Hold the
        # population constant instead of letting NaN poison every future year.
        population_growth = 0.0

    temp_std = historical['temp'].std()

    return {
        'last_year': last_year,
        'population_growth': population_growth,
        'last_population': historical.loc[historical['year'] == last_year, 'population'].mean(),
        # NOTE(review): this is the mean age over *all* historical years, which is
        # then shifted by the years elapsed since the *last* year -- confirm that
        # this is the intended baseline.
        'avg_age': historical['age'].mean(),
        'avg_area': historical['area_m2'].mean(),
        'monthly_avg_temp': {
            month_number: historical.loc[historical['month'] == month_number, 'temp'].mean()
            for month_number in range(1, 13)
        },
        'temp_variation': 1 if pd.isna(temp_std) else temp_std / 2,
    }


# The baselines depend only on the neighbourhood, never on the target month, so
# they are computed once here instead of once per (month, neighbourhood) pair.
# A list of pairs (not a dict) keeps the original iteration order and stays
# correct even if a code is NaN.
barri_baselines = [
    (codi, _summarise_barri(assembled_dfs[assembled_dfs['Codi_barri'] == codi]))
    for codi in codi_barri_unique
]

# Initialize synthetic dataset
synthetic_data = []

# Generate one synthetic row per (month, Codi_barri), months in the outer loop
for date in date_range:
    year = date.year
    month = date.month

    for codi, baseline in barri_baselines:
        years_ahead = year - baseline['last_year']

        # Population: compound the average yearly growth from the last known year
        new_population = (
            baseline['last_population'] * (1 + baseline['population_growth']) ** years_ahead
        )

        # Age: shift by the number of years since the last available year
        new_age = baseline['avg_age'] + years_ahead

        # Temp: historical average for this calendar month plus a uniform jitter
        jitter = baseline['temp_variation']
        new_temp = baseline['monthly_avg_temp'][month] + rng.uniform(-jitter, jitter)

        # Append the synthetic row (area_m2 is assumed constant over time)
        synthetic_data.append({
            'Codi_barri': codi,
            'year': year,
            'month': month,
            'temp': new_temp,
            'area_m2': baseline['avg_area'],
            'age': new_age,
            'population': new_population
        })

# Turn the list of generated records into a table (one row per month x Codi_barri)
monthly_rows = pd.DataFrame(synthetic_data)

# Expand every monthly row into one row per part of the day: each index label is
# repeated once per daytime, then the labels are cycled in order over the result.
daytimes = ['night', 'morning', 'afternoon', 'evening']
copies_per_row = len(daytimes)
synthetic_df = monthly_rows.loc[monthly_rows.index.repeat(copies_per_row)].reset_index(drop=True)
synthetic_df['daytime'] = daytimes * len(monthly_rows)

# Persist the expanded synthetic table so later steps can reload it
with open("synthetic_data.pkl", "wb") as sink:
    pickle.dump(synthetic_df, sink)


# Preview the first 20 rows (five month/neighbourhood pairs x four daytimes)
synthetic_df.head(20)

Unnamed: 0,Codi_barri,year,month,temp,area_m2,age,population,daytime
0,1,2024,11,12.352019,65.850216,113.733333,2235.0,night
1,1,2024,11,12.352019,65.850216,113.733333,2235.0,morning
2,1,2024,11,12.352019,65.850216,113.733333,2235.0,afternoon
3,1,2024,11,12.352019,65.850216,113.733333,2235.0,evening
4,2,2024,11,15.375396,88.680808,123.582828,3024.7,night
5,2,2024,11,15.375396,88.680808,123.582828,3024.7,morning
6,2,2024,11,15.375396,88.680808,123.582828,3024.7,afternoon
7,2,2024,11,15.375396,88.680808,123.582828,3024.7,evening
8,3,2024,11,14.4866,60.963636,98.304545,1541.6,night
9,3,2024,11,14.4866,60.963636,98.304545,1541.6,morning


In [2]:
# NOTE(review): the NameError recorded below belongs to execution count In [2],
# i.e. this cell was last run before the In [3] cell that defines synthetic_df.
# Re-run the notebook top to bottom (Restart & Run All) to refresh this output.
synthetic_df.head()

NameError: name 'synthetic_df' is not defined