In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# --- 1. DATA LOADING AND TRANSFORMATION ---
# Read the raw CSV and expose the "reviews" column under the name "reviews_count".
df = pd.read_csv("new_kindle_data.csv").rename(columns={"reviews": "reviews_count"})

# Target (y): log(1 + x) of the review count, as prescribed by the methodology.
y = np.log1p(df["reviews_count"])

# Features (X): every column except the target.
X = df.drop(columns="reviews_count")

# --- 2. FEATURE ENGINEERING (BINNING) ---
# Number of most frequent values that keep their own category, per column.
TOP_N_AUTHORS = 50
TOP_N_CATEGORIES = 50


def _bin_top_n(values, top_n, other_label):
    """Collapse every value outside the `top_n` most frequent into `other_label`.

    Parameters
    ----------
    values : pandas.Series
        Column to bin.
    top_n : int
        How many of the most frequent values are kept unchanged.
    other_label : str
        Replacement for every value that is not among the kept ones
        (missing values are never among them, so they get this label too).

    Returns
    -------
    (pandas.Series, pandas.Index)
        The binned column and the index of values that were kept.
    """
    top_values = values.value_counts().nlargest(top_n).index
    # Vectorized membership test instead of a per-row Python lambda.
    binned = values.where(values.isin(top_values), other_label)
    return binned, top_values


# NOTE(review): both top-N lists are derived from all rows of X, i.e. before the
# train/test split done later in this script, so test rows help decide which
# values get their own bin. Consider deriving them from the training rows only.

# A) Author binning
author_binned, top_authors = _bin_top_n(
    X['author_name'], TOP_N_AUTHORS, 'Author_Other'
)
X['author_binned'] = author_binned

# B) Category name binning
category_name_binned, top_categories = _bin_top_n(
    X['category_name'], TOP_N_CATEGORIES, 'Category_Other'
)
X['category_name_binned'] = category_name_binned

# Final feature set fed to the model.
numerical_features = ["price", "stars"]

# Columns treated as categorical: the two binned columns first, then the flag columns.
categorical_features_binned = ["author_binned", "category_name_binned"] + [
    "isEditorsPick",
    "isGoodReadsChoice",
    "isKindleUnlimited",
]

# Numerical columns come first, categorical ones after.
all_features_rf = [*numerical_features, *categorical_features_binned]

# Keep only the processed/binned features for the Random Forest model.
X_rf = X[all_features_rf]

# --- 3. PRE-PROCESSING (COLUMN TRANSFORMER) ---
# Numerical columns are standardized; categorical columns are one-hot encoded.
# The encoder ignores categories unseen at fit time and returns a dense array
# (sparse_output=False), which the original author notes as suitable for RandomForest.
rf_transformers = [
    ('num', StandardScaler(), numerical_features),
    (
        'cat',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False),
        categorical_features_binned,
    ),
]

# Any column not listed above is dropped.
preprocess_rf = ColumnTransformer(transformers=rf_transformers, remainder='drop')

# --- 4. DATA SPLIT ---
# Hold out 20% of the rows as the test set; the seed is fixed for reproducibility.
(
    X_train_rf,
    X_test_rf,
    y_train_rf,
    y_test_rf,
) = train_test_split(X_rf, y, random_state=42, test_size=0.2)

# --- 5. FINAL PIPELINE (RANDOM FOREST) ---
# Regressor whose hyperparameters are set by the grid search that follows.
rf_model_final = RandomForestRegressor(random_state=42)

# Pre-processing and model are chained so both are fitted together.
pipeline_rf = Pipeline(
    steps=[("preprocess", preprocess_rf), ("regressor", rf_model_final)]
)

# --- 6. GRID SEARCH (Random Forest tuning) ---
# The grid holds a single combination: the best parameters identified in the
# accompanying report (n_estimators=150, max_depth=20).
param_grid_rf = dict(
    regressor__n_estimators=[150],
    regressor__max_depth=[20],
)

print("Iniciando Grid Search para o Random Forest...")

# Shuffled K-Fold cross-validation with 3 folds (cv=3, as used in the report).
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Candidates are ranked by negative MSE; n_jobs=-1 uses all available cores.
search_options = {
    "cv": kf,
    "scoring": 'neg_mean_squared_error',
    "verbose": 1,
    "n_jobs": -1,
}
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, **search_options)
grid_search_rf.fit(X_train_rf, y_train_rf)

# Pipeline refitted with the best parameter combination found.
best_model_rf = grid_search_rf.best_estimator_
print("\nMelhores parâmetros do Random Forest:", grid_search_rf.best_params_)

# --- 7. FINAL EVALUATION ---
# Predict on the held-out test set with the best pipeline from the grid search.
y_pred_rf = best_model_rf.predict(X_test_rf)

# The target was log1p-transformed earlier in this script, so the error
# metrics below are expressed in log units, not in raw review counts.
r2_rf = r2_score(y_test_rf, y_pred_rf)
mse_rf = mean_squared_error(y_test_rf, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

# Emit the whole report with one print call; the output text is unchanged.
report_lines = [
    "\n=== RESULTADOS FINAIS: RANDOM FOREST (OTIMIZADO) ===",
    f"R² (Variação Explicada): {r2_rf:.4f}",
    f"MSE: {mse_rf:.4f}",
    f"RMSE: {rmse_rf:.4f}",
]
print("\n".join(report_lines))

Iniciando Grid Search para o Random Forest...
Fitting 3 folds for each of 1 candidates, totalling 3 fits

Melhores parâmetros do Random Forest: {'regressor__max_depth': 20, 'regressor__n_estimators': 150}

=== RESULTADOS FINAIS: RANDOM FOREST (OTIMIZADO) ===
R² (Variação Explicada): 0.8595
MSE: 1.4273
RMSE: 1.1947
