# Bibliotecas

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import numpy as np


from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from category_encoders import BinaryEncoder, CountEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# Funções 

# Carregando dados

In [42]:
df = pd.read_csv('data/train_cleaned.csv')

# Modelagem

## Regressão Linear

### Preparar dataset

In [47]:
# Definir as colunas categóricas e numéricas
categorical_cols_binary = ['NearestStation', 'Use']
categorical_cols_frequency = ['Type', 'Region', 'LandShape', 'Structure', 'Direction', 'Classification', 'CityPlanning']
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
numeric_cols.remove('TradePrice_log') # remove o target para ele não ser processado no pipeline

In [49]:
# Separar características e alvo
X = df.drop(columns=['TradePrice_log'])
y = df['TradePrice_log']

# Dividir os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Criar pipeline
Pré-processamento e modelo de RL.

Encoding das categóricas:
- NearestStation, Use = Binary Encoding ou Frequency Encoding
- Outras: Frequency Encoding

In [50]:
# Criar o ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Imputa valores faltantes
            ('scaler', StandardScaler())                # Normaliza os dados
        ]), numeric_cols),
        
        ('binary', BinaryEncoder(cols=categorical_cols_binary), categorical_cols_binary),
        
        ('freq', CountEncoder(), categorical_cols_frequency)
    ],
    remainder='passthrough'  # Inclui todas as outras colunas que não estão listadas acima
)

In [51]:
# Criar o pipeline completo com o preprocessor e o modelo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())  # Modelo de Regressão Linear
])


### Treinar Modelo e fazer previsões

In [52]:
# Ajustar o pipeline aos dados de treino
pipeline.fit(X_train, y_train)

# Fazer previsões nos dados de teste
y_pred = pipeline.predict(X_test)

### Avaliar modelo

In [53]:
# Calcular RMLSE
rmlse = (mean_squared_log_error(y_test, y_pred))**0.5

# Calcular MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)

# Calcular R^2
r2 = r2_score(y_test, y_pred)

# Criar um DataFrame para armazenar as métricas
metrics_df = pd.DataFrame({
    'Model': ['Linear Regression'],
    'RMLSE': [rmlse],
    'MAPE': [mape],
    'R2': [r2]
})

print(metrics_df.round(4))


               Model   RMLSE    MAPE     R2
0  Linear Regression  0.0304  0.0232  0.424
