In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

Carga de datos.

In [None]:
# Relative path: it is resolved against the current working directory of the kernel.
csv_path = '../data/processed/incidents_temperature.csv'
data = pd.read_csv(csv_path)

# Preview the first rows as the cell output.
data.head()

Convertimos la columna 'state' en variables numéricas mediante one-hot encoding.

In [None]:
# One-hot encode 'state'. drop_first=True yields k-1 dummy columns for k states,
# so the omitted state acts as the baseline category.
#
# `data` is reassigned in place, so on a second execution of this cell the
# 'state' column is already gone and get_dummies would raise KeyError. The
# guard makes the cell safe to re-run, while still failing loudly when the
# column was never present and no 'state_*' dummies exist either.
if 'state' in data.columns:
    data = pd.get_dummies(data, columns=['state'], drop_first=True)
elif not any(str(col).startswith('state_') for col in data.columns):
    raise KeyError("'state' column not found in data and no 'state_*' dummy columns are present")

Definimos las características y la variable objetivo.

In [None]:
# Columns whose name starts with 'state_' (the dummies produced by the one-hot encoding).
state_columns = list(filter(lambda name: name.startswith('state_'), data.columns))

# Cast them to int so that boolean dummies become 0/1.
data[state_columns] = data[state_columns].astype(int)

# Predictors: calendar fields, the temperature and the state dummies.
features = ['year', 'month', 'average_temperature', *state_columns]

# Design matrix and target.
X, y = data[features], data['n_incidents']

Dividimos los datos en entrenamiento y test.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Creamos y entrenamos el modelo de regresión lineal.

In [None]:
# fit() returns the estimator itself, so creation and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# Keep the fitted estimator as the cell output, as the original cell did.
model

Predecimos el conjunto de test.

In [None]:
# Predict the target for the held-out test rows; y_pred is compared against
# y_test by the evaluation and plotting cells.
y_pred = model.predict(X_test)

Evaluamos el modelo.

In [None]:
# Every metric shares the (y_true, y_pred) call signature, so evaluate them in
# one pass and unpack the results in the same order as the functions.
mae, mape, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (
        mean_absolute_error,
        mean_absolute_percentage_error,
        mean_squared_error,
        r2_score,
    )
)

In [None]:
# Report the test-set metrics, one per line.
print(
    f'MAE: {mae}',
    f'MAPE: {mape}',
    f'MSE: {mse}',
    f'R^2: {r2}',
    sep='\n',
)

Visualizamos los valores residuales.

In [None]:
# Residual = actual - predicted, for each test row.
residuals = y_test - y_pred

# Scatter the residuals against the row index of the test rows, with a
# dashed reference line at zero error.
fig, ax = plt.subplots()
sns.scatterplot(x=y_test.index, y=residuals, ax=ax)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_title('Residuals Plot')
ax.set_xlabel('Index')
ax.set_ylabel('Residuals')
plt.show()

Visualizamos la distribución de valores reales y predichos.

In [None]:
# Overlay the distributions of the actual and predicted test values.
#
# Both histograms must share the same bin edges to be comparable: without an
# explicit range, each histplot call derives its 30 bins from its own data
# extremes, giving different bin widths for the two series. A common binrange
# spanning both series fixes that.
value_range = (
    min(y_test.min(), y_pred.min()),
    max(y_test.max(), y_pred.max()),
)
sns.histplot(y_test, bins=30, binrange=value_range, kde=True, label='Actual', color='blue')
sns.histplot(y_pred, bins=30, binrange=value_range, kde=True, label='Predicted', color='orange')
plt.title('Distribution of Actual and Predicted Values')
plt.xlabel('Incidents per 100,000 Inhabitants')
plt.ylabel('Frequency')
plt.legend()
plt.show()

Visualizamos las predicciones del modelo en contraposición con los valores reales.

In [None]:
# Predicted vs. actual values for the test rows.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)

# Dashed identity line over the range of the actual values: points on it are
# perfect predictions.
lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], linestyle='--', color='red', linewidth=2)

ax.set_xlabel('Actual Incidents per 100,000 Inhabitants')
ax.set_ylabel('Predicted Incidents per 100,000 Inhabitants')
ax.set_title('Linear Regression Results')
plt.show()

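Vemos el coeficiente de la variable 'average_temperature' para comprobar su importancia en el modelo: indica el cambio esperado en 'n_incidents' por cada unidad adicional de temperatura media, manteniendo constantes el resto de variables.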

In [None]:
# model.coef_ follows the column order of the training matrix, so pair each
# feature name with its fitted coefficient and look the temperature up by name.
coefficients_by_feature = dict(zip(X.columns, model.coef_))
coefficient_average_temperature = coefficients_by_feature['average_temperature']
print(f'Coefficient of average_temperature: {coefficient_average_temperature}')

Realizamos un análisis estadístico para comprobar si la hipótesis puede afirmarse.

In [None]:
# statsmodels' OLS does not add an intercept on its own, so append a constant
# column to the design matrix first.
X_with_constant = sm.add_constant(X)

# Fit on the full dataset (X, y), not only the training split, and print the
# summary table (coefficients, standard errors, p-values).
model_stats = sm.OLS(endog=y, exog=X_with_constant).fit()
print(model_stats.summary())