In [None]:
#!pip install geopandas
#!pip install contextily
#!pip list
#!nvcc --version
#!pip install scalecast --upgrade
#!pip install hcrystalball

#!nvidia-smi

In [None]:
# Importaciones de bibliotecas estándar de Python
from datetime import datetime, timedelta

# Importaciones de bibliotecas de terceros
import geopandas as gpd
import pandas as pd
import numpy as np
import pickle
import unicodedata
import seaborn as sns
import matplotlib.pyplot as plt
from scalecast.Forecaster import Forecaster
import tensorflow as tf
from tensorflow.python import keras
from tensorflow.python.keras import layers, Model
from tensorflow.python.keras.optimizers import adam_v2
from tensorflow.python.keras.metrics import Mean
from tensorflow.python.keras.callbacks import EarlyStopping
from keras.losses import MeanSquaredError
from google.colab import drive
from scalecast.Forecaster import Forecaster
from keras.callbacks import EarlyStopping
from sklearn.linear_model import LinearRegression
from scalecast.SeriesTransformer import SeriesTransformer
from hcrystalball.wrappers import ExponentialSmoothingWrapper, ProphetWrapper, SarimaxWrapper
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from hcrystalball.compose import TSColumnTransformer
from hcrystalball.feature_extraction import SeasonalityTransformer, HolidayTransformer
from hcrystalball.wrappers import get_sklearn_wrapper

In [None]:
# Report the GPUs TensorFlow can see. The call is wrapped in print() because a
# notebook cell only echoes its LAST expression, so the bare call's result was
# being discarded without ever being shown.
print(tf.config.list_physical_devices('GPU'))

# Mount Google Drive so the data files under /content/drive are reachable.
drive.mount('/content/drive')

In [None]:
# Read the shapefile stored on the mounted Drive into the GeoDataFrame `gdf`.
gdf=gpd.read_file("/content/drive/MyDrive/PDG/data/MGN2021_MPIO_POLITICO/MGN_MPIO_POLITICO.shp")
def remove_accents_and_uppercase(text):
    """Return ``text`` upper-cased with its combining accent marks removed.

    Values that are not ``str`` are handed back unchanged.
    """
    if isinstance(text, str):
        decomposed = unicodedata.normalize('NFD', text)
        # After NFD decomposition each accent is its own code point in the
        # Unicode category "Mn" (nonspacing mark); keep everything else.
        base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
        return ''.join(base_chars).upper()
    return text

# Columns holding the department and municipality names.
columnas_a_transformar_colombia = ["DPTO_CNMBR", "MPIO_CNMBR"]

# Normalise the names (strip accents, upper-case). Each column is mapped with
# Series.map because DataFrame.applymap is deprecated since pandas 2.1, whereas
# Series.map behaves the same on old and new pandas versions.
for columna in columnas_a_transformar_colombia:
    gdf[columna] = gdf[columna].map(remove_accents_and_uppercase)

# Reproject to EPSG:3857 and store each geometry's centroid y-coordinate.
# Note that 'latitud' is therefore a projected y value in that CRS, not
# geographic degrees; here it is only used as a sort key.
gdf = gdf.to_crs('EPSG:3857')
gdf['latitud'] = gdf['geometry'].centroid.y

# Order rows from largest to smallest 'latitud' and renumber them 0..n-1.
gdf_sorted = gdf.sort_values('latitud', ascending=False).reset_index(drop=True)

In [None]:
# Lookup table: (department name, municipality name) -> row label in gdf_sorted.
municipio_a_indice = {
    (dpto, mpio): idx
    for idx, dpto, mpio in zip(
        gdf_sorted.index, gdf_sorted['DPTO_CNMBR'], gdf_sorted['MPIO_CNMBR']
    )
}

In [None]:
# Read the cleaned records from Drive, then cast the case count to int and
# parse the press date as datetime in one chained step.
feminicidios = pd.read_csv(
    '/content/drive/MyDrive/MGN2021_MPIO_POLITICO/datos_observatorio_limpios.csv',
    encoding='utf-8',
    sep=',',
).assign(
    num_casos=lambda datos: datos['num_casos'].astype(int),
    fecha_en_prensa=lambda datos: pd.to_datetime(datos['fecha_en_prensa']),
)

# Preview the first rows.
feminicidios.head()

In [None]:
# Build the modelling table on a copy so `feminicidios` itself is not modified.
feminicidios_mlr = feminicidios.copy()

feminicidios_mlr['fecha'] = feminicidios_mlr['fecha_en_prensa']

# Translate every (departamento, municipio) pair into its entry of
# `municipio_a_indice`. Zipping the two columns replaces the slower row-wise
# DataFrame.apply; a pair missing from the lookup still raises KeyError.
feminicidios_mlr['ubicacion'] = [
    municipio_a_indice[(departamento, municipio)]
    for departamento, municipio in zip(
        feminicidios_mlr['departamento'], feminicidios_mlr['municipio']
    )
]

# 'num_casos_normalizado' is only added to `feminicidios` by a later cell, so on
# a fresh top-to-bottom run it does not exist yet and a strict drop raised
# KeyError. errors='ignore' drops it when present and skips it otherwise.
feminicidios_mlr = feminicidios_mlr.drop(
    columns=['fecha_en_prensa', 'departamento', 'municipio', 'num_casos_normalizado'],
    errors='ignore',
)

feminicidios_mlr.head()

In [None]:
# Daily calendar from the earliest to the latest press date in the data.
all_dates = pd.date_range(start=feminicidios['fecha_en_prensa'].min(), end=feminicidios['fecha_en_prensa'].max())

# Every distinct (departamento, municipio) pair present in the data.
combinations = feminicidios[['departamento', 'municipio']].drop_duplicates().values.tolist()

for combo in combinations:
    dept, city = combo
    subset = feminicidios[(feminicidios['departamento'] == dept) & (feminicidios['municipio'] == city)]

    # Collapse to a single value per date first: reindex() raises ValueError on
    # an index with duplicate labels, which happens as soon as one municipality
    # has more than one record on the same day.
    daily_cases = subset.groupby('fecha_en_prensa')['num_casos'].sum()

    # Expand to the full calendar, writing 0 for days without records. Using
    # fill_value replaces `subset['num_casos'].fillna(0, inplace=True)`, a
    # chained in-place call that pandas warns about and that no longer updates
    # `subset` once Copy-on-Write is enabled.
    subset = (
        daily_cases
        .reindex(all_dates, fill_value=0)
        .rename_axis('fecha_en_prensa')
        .to_frame('num_casos')
    )

    # One CSV per municipality, written relative to the current working directory.
    subset.to_csv(f"{dept}_{city}_feminicidios.csv", columns=['num_casos'])


In [None]:
# Load one of the per-municipality daily series written by the export loop and
# restore its dtypes (integer counts, datetime dates) in a single chain.
f_depar_mun = pd.read_csv(
    '/content/ANTIOQUIA_BELLO_feminicidios.csv',
    encoding='utf-8',
    sep=',',
).assign(
    num_casos=lambda serie: serie['num_casos'].astype(int),
    fecha_en_prensa=lambda serie: pd.to_datetime(serie['fecha_en_prensa']),
)

# Preview the first rows.
f_depar_mun.head()

In [None]:
# Earliest and latest press date across the whole dataset, computed together.
fecha_mas_antigua, ultima_fecha = feminicidios['fecha_en_prensa'].agg(['min', 'max'])

print(f"{fecha_mas_antigua} - {ultima_fecha}")

In [None]:
# Define model
# Two-step pipeline: a TSColumnTransformer that applies SeasonalityTransformer
# (freq "D") and HolidayTransformer (country code "CO") to the "target" column,
# followed by a RandomForestRegressor wrapped for hcrystalball (random_state=42).
# NOTE(review): `model` is only referenced by the commented-out `f.fit(model)`
# cell that follows, so nothing visible in this notebook actually trains it.
model = Pipeline(
    steps=[
        (
            "preprocessing",
            TSColumnTransformer(
                transformers=[
                    ("seasonality", SeasonalityTransformer(freq="D"), ["target"]),
                    ("holidays", HolidayTransformer(country_code="CO"), ["target"]),
                ]
            ),
        ),
        (
            "model",
            get_sklearn_wrapper(RandomForestRegressor, random_state=42),
        ),
    ]
)


# Forecaster over the single-municipality daily series in `f_depar_mun`
# (values from 'num_casos', dates from 'fecha_en_prensa').
# NOTE(review): the CamelCase keyword arguments below (DateStartActuals ...
# BootstrapSamples) do not look like documented scalecast Forecaster parameters;
# confirm against the installed version whether they have any effect.
# NOTE(review): ForecastLength is 356 here, while later cells use 365
# (seasonal_decompose period, generate_future_dates) - possibly a typo; confirm.
# The test length and future dates are set again in a later cell via
# set_test_length(0.30) and generate_future_dates(365).
f = Forecaster(
    y = f_depar_mun['num_casos'],
    current_dates=f_depar_mun['fecha_en_prensa'],
    DateStartActuals = fecha_mas_antigua,
    DateEndActuals = ultima_fecha,
    Freq = "D",
    ForecastLength = 356,
    Xvars = [],
    Differenced = 0,
    TestLength = .4,
    ValidationLength = 4,
    ValidationMetric=['rmse', 'r2'],
    CILevel = 0.6,
    BootstrapSamples = 5000
)

In [None]:
#f.fit(model)
#predictions = f.predict()


In [None]:
# Partial autocorrelation plot of the series held by `f`, requested for 26 lags.
f.plot_pacf(lags=26)
plt.show()

In [None]:
# Seasonal decomposition of the series held by `f`, using a period of 365.
result = f.seasonal_decompose(period=365)

# result.plot() builds and returns its own figure, so the figure that used to be
# opened beforehand with plt.figure(figsize=(20, 6), dpi=360) was never drawn on
# and only showed up as an extra empty output. That call is removed (its dpi
# never applied to the decomposition plot) and the returned figure is resized.
fig = result.plot()
fig.set_size_inches(20, 6)

plt.show()

In [None]:
# Significance threshold that the ADF p-value is compared against below.
criticalpval = 0.05

In [None]:

# Augmented Dickey-Fuller report for the series held by `f`. Of the six values
# returned with full_res=True, only the first two (test statistic, p-value) are used.
print('-' * 100)
print('Antioquia Augmented Dickey-Fuller results:')
stat, pval, _, _, _, _ = f.adf_test(full_res=True)
print(f'the test-stat value is: {stat:.2f}')
print(f'the p-value is {pval:.4f}')
# The series is reported as stationary when the p-value is below the threshold.
if pval < criticalpval:
    print('the series is stationary')
else:
    print('the series is not stationary')
print('-' * 100)

In [None]:
f.set_test_length(0.30)       # 1. test-set length 0.30 (presumably read as a fraction of the observations; confirm)
f.generate_future_dates(365) # 2. 365 future points to forecast
f.set_estimator('lstm')     # 3. LSTM neural network
f.eval_cis()  # 4. presumably enables the confidence intervals used by the ci=True plots below; confirm

In [None]:
# Fit the LSTM with no extra arguments and store it under 'lstm_default'.
result1 = f.manual_forecast(call_me='lstm_default')

# plot_test_set() draws on a figure it creates itself, so the plt.figure(...)
# call that used to precede it only produced an extra empty figure. The desired
# size is passed through the method's own `figsize` argument instead.
f.plot_test_set(ci=True, figsize=(20, 6))
plt.show()



In [None]:
# Second LSTM variant: same estimator, now with lags=24; stored as 'lstm_24lags'.
f.manual_forecast(call_me='lstm_24lags',lags=24)
# Plot test-set predictions with confidence intervals.
f.plot_test_set(ci=True)

In [None]:
# Third LSTM variant: lags=24 trained for 5 epochs, with validation_split=.2
# and shuffle=True passed through to the fit.
f.manual_forecast(
    call_me='lstm_24lags_5epochs',
    lags=24,
    epochs=5,
    validation_split=.2,
    shuffle=True,
)
# Plot test-set predictions with confidence intervals.
f.plot_test_set(ci=True)

In [None]:
# Fourth LSTM variant: three LSTM layers of 16 units with zero dropout, up to 25
# epochs, stopped early when 'val_loss' has not improved for 5 epochs.
# NOTE(review): EarlyStopping is imported twice at the top of the file (from
# tensorflow.python.keras.callbacks and from keras.callbacks); the later import
# is the one bound here. Confirm it matches the Keras flavour scalecast uses.
# NOTE(review): here the callback is passed as a single object, whereas the
# later 'lstm_best' cells pass a list; confirm both forms are accepted.
f.manual_forecast(
    call_me='lstm_24lags_earlystop_3layers',
    lags=24,
    epochs=25,
    validation_split=.2,
    shuffle=True,
    callbacks=EarlyStopping(
        monitor='val_loss',
        patience=5,
    ),
    lstm_layer_sizes=(16,16,16),
    dropout=(0,0,0),
)

# Plot test-set predictions with confidence intervals.
f.plot_test_set(ci=True)

In [None]:
# Wrap the Forecaster in a SeriesTransformer and difference the series; the
# same `transformer` is used later to revert the differencing.
transformer = SeriesTransformer(f)
f = transformer.DiffTransform()

f.add_ar_terms(4)  # Add 4 autoregressive terms
f.add_seasonal_regressors('day', dummy=True)  # Add 'day' seasonal regressors as dummy variables
f.add_seasonal_regressors('month','quarter', dummy=True)  # Add 'month' and 'quarter' seasonal regressors as dummy variables
f.add_seasonal_regressors('year')  # Add a 'year' seasonal regressor
f.add_time_trend()  # Add a time-trend term


In [None]:
# Early-stopping callback: watches 'val_loss' with a patience of 10 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# Larger LSTM stored as 'lstm_best': 42 lags, four layers of 72 units, batch
# size 32, up to 1000 epochs. Given the cell order, `f` is the differenced
# Forecaster at this point, so this model is fit on the differenced series.
f.manual_forecast(
    call_me='lstm_best',
    lags=42,
    batch_size=32,
    epochs=1000,
    validation_split=.2,
    shuffle=True,
    activation='tanh',
    optimizer='Adam',
    learning_rate=0.005,
    lstm_layer_sizes=(72,)*4,
    dropout=(0.2,)*4,  # Dropout of 0.2 on each layer, intended to promote generalization
    callbacks=[early_stopping],  # Early stopping, intended to prevent overfitting
    plot_loss=True
)


In [None]:
# Fit the 'mlr' estimator on the (still differenced) series.
f.set_estimator('mlr')
f.manual_forecast(call_me='mlr')

# Undo the differencing for every model fit AFTER DiffTransform(): both
# 'lstm_best' and 'mlr'. Previously only 'mlr' was reverted, which left
# 'lstm_best' in differenced units while it was plotted next to 'mlr' under a
# "Level Data" title. The earlier LSTM models were fit before the transform, are
# already on level data, and therefore stay excluded from the revert.
f = transformer.DiffRevert(
    exclude_models = [m for m in f.history if m not in ('lstm_best', 'mlr')]
)

# NOTE(review): the daily series contains zeros, so TestSetMAPE may be undefined
# for ordering purposes; confirm, or order by another metric such as TestSetRMSE.
f.plot_test_set(order_by='TestSetMAPE',models=['lstm_best','mlr'])
plt.title('Top-2 Models Test-set Performance - Level Data',size=16)
plt.show()

In [None]:
# Export the 'lvl_test_set_predictions' table from the Forecaster; as the last
# expression of the cell, whatever it returns is displayed.
f.export('lvl_test_set_predictions')

In [None]:
# Aliases for the two ends of the observed period.
fecha_inicial, fecha_final = fecha_mas_antigua, ultima_fecha

# Length of the observed period, in whole days.
diferencia = fecha_final - fecha_inicial
num_dias = diferencia.days
print(num_dias)

In [None]:
# Min-max scale 'num_casos' into [0, 1].
min_num_casos = feminicidios['num_casos'].min()
max_num_casos = feminicidios['num_casos'].max()
if max_num_casos == min_num_casos:
    # Every row has the same count: the range is zero and the usual formula
    # would divide by zero (yielding NaN), so use a constant 0.0 instead.
    feminicidios['num_casos_normalizado'] = 0.0
else:
    feminicidios['num_casos_normalizado'] = (feminicidios['num_casos'] - min_num_casos) / (max_num_casos - min_num_casos)

In [None]:
# First and last press date, and the daily range that spans them.
fecha_min, fecha_max = (
    feminicidios['fecha_en_prensa'].min(),
    feminicidios['fecha_en_prensa'].max(),
)
rango_fechas = pd.date_range(start=fecha_min, end=fecha_max)

In [None]:
# Early-stopping callback (initial version): watches 'val_loss' with a patience of 10 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# Same configuration as the earlier 'lstm_best' cell except batch_size=256.
# NOTE(review): it reuses call_me='lstm_best', so it presumably replaces the
# earlier entry with that name in f.history; confirm this is intended.
# NOTE(review): given the cell order, `f` is the Forecaster returned by
# DiffRevert at this point, i.e. this fit runs after the revert; confirm.
f.manual_forecast(
    call_me='lstm_best',
    lags=42,
    batch_size=256,
    epochs=1000,
    validation_split=.2,
    shuffle=True,
    activation='tanh',
    optimizer='Adam',
    learning_rate=0.005,
    lstm_layer_sizes=(72,)*4,
    dropout=(0.2,)*4,  # Dropout of 0.2 on each layer, intended to promote generalization
    callbacks=[early_stopping],  # Early stopping, intended to prevent overfitting
    plot_loss=True
)

# Plot the two best models by TestSetMAPE, with confidence intervals.
f.plot_test_set(order_by='TestSetMAPE', models='top_2', ci=True)
