In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.svm import SVR

In [None]:
# Load the semicolon-separated dataset; on any failure, print the error
# and call sys.exit(1).
import sys

try:
    df = pd.read_csv('input/AIRPOL_data.csv', sep=';')
except Exception as read_error:
    print("Erro ao ler o arquivo:", read_error)
    sys.exit(1)

In [None]:
################################ Data pre-processing ##############################################
# Column groups used by the preprocessing/model cells below.
categorical_cols = ['Country', 'NUTS_Code', 'Air_Pollutant', 'Outcome']
numeric_cols = ['Affected_Population', 'Populated_Area[km2]', 'Air_Pollution_Average[ug/m3]']


def _decimal_comma_to_float(series):
    """Return ``series`` as float.

    String columns have ',' replaced by '.' before the cast. Columns that are
    already numeric (read_csv parsed them directly, or this cell was run
    before) are only cast, because the ``.str`` accessor raises on them.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(float)
    return series.str.replace(',', '.').astype(float)


# Fix numeric columns that were read as strings (decimal comma).
for col in numeric_cols:
    df[col] = _decimal_comma_to_float(df[col])

# Rename 'Value' to 'Premature_Deaths' (a no-op when already renamed).
df = df.rename(columns={'Value': 'Premature_Deaths'})

# Make sure the target column is numeric as well.
df['Premature_Deaths'] = _decimal_comma_to_float(df['Premature_Deaths'])

###################################################################################################

## 4.2.3 Desenvolva modelos para prever mortes prematuras (Premature_Deaths)

In [None]:
# Keep only the countries of interest (Southern Europe) and renumber the rows.
countries_of_interest = ['Greece', 'Spain', 'Italy', 'Portugal']
southern_europe_mask = df['Country'].isin(countries_of_interest)
df_4_2_3 = df[southern_europe_mask].reset_index(drop=True)

# Features are every column except the target; the target is Premature_Deaths.
y = df_4_2_3['Premature_Deaths']
X = df_4_2_3.drop(columns=['Premature_Deaths'])

# 5-fold cross-validation splitter, shuffled with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Shared preprocessing step: standardise the numeric columns and one-hot
# encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),  # standardise numeric columns
        # handle_unknown='ignore': a category seen only in a held-out fold is
        # encoded as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

### a) Regressão linear múltipla.

In [None]:
# Multiple linear regression estimator.
model_lr = LinearRegression()

# Pipeline: shared preprocessing followed by the linear model.
pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor), ('model', model_lr)])

# Per-fold accumulators for the metrics and the residuals.
mae_lr = []
rmse_lr = []
r2_lr = []
residuals_lr = []

In [None]:
# Train and evaluate the multiple linear regression pipeline with K-fold CV.
# The accumulators are re-created here so that re-running this cell does not
# append a second set of folds to the lists.
mae_lr, rmse_lr, r2_lr, residuals_lr = [], [], [], []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    pipeline_lr.fit(X_train, y_train)
    y_pred = pipeline_lr.predict(X_test)
    mae_lr.append(mean_absolute_error(y_test, y_pred))
    rmse_lr.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2_lr.append(r2_score(y_test, y_pred))
    residuals_lr.append(y_test - y_pred)

# Show the fold-averaged results as a table.
# The newline sits outside the replacement field: a backslash inside an
# f-string expression is a SyntaxError before Python 3.12, and padding
# '\nModelo' to 30 characters left the header one column short of the row below.
print(f"\n{'Modelo':<30}{'MAE':>10}{'RMSE':>10}{'R2':>10}")
print(f"{'Multiple Linear Regression':<30}{np.mean(mae_lr):>10.2f}{np.mean(rmse_lr):>10.2f}{np.mean(r2_lr):>10.2f}")

# Show the metrics of every individual fold.
print("\nMétricas por fold:")
for i, (mae, rmse, r2) in enumerate(zip(mae_lr, rmse_lr, r2_lr), 1):
    print(f"Fold {i}: MAE = {mae:.2f} | RMSE = {rmse:.2f} | R2 = {r2:.2f}")

In [None]:
# ---------------- Residual analysis (first fold only) ----------------
first_fold_residuals = residuals_lr[0]

fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(range(len(first_fold_residuals)), first_fold_residuals)
ax.axhline(0, color='red', linestyle='--')  # zero-residual reference line
ax.set_title('Residuals (Fold 1)')
ax.set_xlabel('Sample')
ax.set_ylabel('Residual')
fig.tight_layout()
plt.show()

In [None]:
# ---------------- Distribution of the per-fold error metrics ----------------
fig, axs = plt.subplots(1, 3, figsize=(15, 4))

# One box plot per metric, drawn left to right.
panels = [(mae_lr, 'MAE '), (rmse_lr, 'RMSE'), (r2_lr, 'R2')]
for axis, (fold_values, panel_title) in zip(axs, panels):
    axis.boxplot(fold_values)
    axis.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# ---------------- Linear regression model coefficients ----------------
# Refit the pipeline on the whole dataset before reading the coefficients.
pipeline_lr.fit(X, y)

# Rebuild the feature names in the order the ColumnTransformer emits them:
# numeric columns first, then the one-hot encoded categorical columns.
fitted_encoder = pipeline_lr.named_steps['preprocessor'].named_transformers_['cat']
feature_names = (
    numeric_cols +
    list(fitted_encoder.get_feature_names_out(categorical_cols))
)
coefs = pipeline_lr.named_steps['model'].coef_

coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefs
})

# Split the coefficients into numeric and categorical features.
is_numeric_feature = coef_df['Feature'].isin(numeric_cols)
numeric_coef_df = coef_df[is_numeric_feature]
# Explicit copy: a column is added below, and assigning onto a boolean-mask
# slice is a chained assignment (SettingWithCopyWarning).
categorical_coef_df = coef_df[~is_numeric_feature].copy()

# Sort the categorical coefficients by absolute magnitude, largest first.
categorical_coef_df['AbsCoefficient'] = categorical_coef_df['Coefficient'].abs()
categorical_coef_df = categorical_coef_df.sort_values(by='AbsCoefficient', ascending=False)

print("\nCoeficientes das variáveis numéricas:\n")
print(numeric_coef_df[['Feature', 'Coefficient']].to_string(index=False))

print("\nTop 10 coeficientes das variáveis categóricas:\n")
print(categorical_coef_df[['Feature', 'Coefficient']].head(10).to_string(index=False))