In [None]:
### 🧪 Kit Prático – Semana 11: Random Forest e Comparação de Modelos
# Tema: Ensemble Learning para melhorar a performance
# Ferramenta: Google Colab
# Dataset: https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Step 1: Load the data
# The Titanic CSV is read straight from its public GitHub URL, so this cell
# needs network access on every fresh run.
DATA_URL = (
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)
df = pd.read_csv(DATA_URL)

# Step 2: Prepare the data
# Keep only the target column and the four features the models use; .copy()
# gives an independent frame so the edits below never touch `df`.
df_model = df[['Survived', 'Pclass', 'Sex', 'Age', 'Fare']].copy()

# Fill missing ages with the column mean. The result is assigned back to the
# column rather than calling fillna(inplace=True) on `df_model['Age']`: that
# chained in-place form raises a FutureWarning on pandas 2.x and stops
# updating `df_model` once Copy-on-Write is the default (pandas 3.0), which
# would silently leave the NaNs in place.
# NOTE(review): the mean is computed over all rows, before the train/test
# split performed later in this cell.
df_model['Age'] = df_model['Age'].fillna(df_model['Age'].mean())

# Encode sex as 0/1; Series.map turns any value missing from the dict into NaN.
df_model['Sex'] = df_model['Sex'].map({'male': 0, 'female': 1})

# Feature matrix and target vector.
X = df_model[['Pclass', 'Sex', 'Age', 'Fare']]
y = df_model['Survived']

# Step 3: Split into train and test sets
# 20% of the rows are held out for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Train the Decision Tree model
# A single tree capped at depth 4. fit() returns the estimator itself, so
# construction and training are chained.
modelo_arvore = DecisionTreeClassifier(max_depth=4, random_state=42).fit(
    X_train, y_train
)
# Predictions on the held-out test set, used for evaluation further down.
y_pred_arvore = modelo_arvore.predict(X_test)

# Step 5: Train the Random Forest model
# An ensemble of 100 trees with a fixed seed. fit() returns the estimator
# itself, so construction and training are chained.
modelo_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
# Predictions on the held-out test set, used for evaluation further down.
y_pred_rf = modelo_rf.predict(X_test)

# Step 6: Evaluate the models
# For each model, print a heading, the test-set accuracy and the per-class
# classification report, in the order: decision tree, then random forest.
# NOTE(review): the heading/label strings look like mis-encoded UTF-8
# (mojibake); they are kept byte-for-byte here -- confirm the file encoding.
for titulo, predicoes in (
    ("\nüå≥ √Årvore de Decis√£o", y_pred_arvore),
    ("\nüå≤ Random Forest", y_pred_rf),
):
    print(titulo)
    print("Acur√°cia:", accuracy_score(y_test, predicoes))
    print(classification_report(y_test, predicoes))

# DESAFIO EXTRA:
# Testar a Random Forest variando o número de árvores (n_estimators) para ver o impacto na acurácia.


In [None]:
# Extra challenge: sweep the number of trees and record test-set accuracy.
# NOTE(review): the printed label looks like mis-encoded UTF-8 (mojibake);
# it is kept byte-for-byte here -- confirm the file encoding.
estimators = [10, 20, 50, 100, 200, 500]
acuracias_rf = []

for n in estimators:
    # Same seed for every forest, so only the forest size varies between runs.
    modelo_teste_rf = RandomForestClassifier(n_estimators=n, random_state=42).fit(
        X_train, y_train
    )

    y_pred_teste = modelo_teste_rf.predict(X_test)
    acc = accuracy_score(y_test, y_pred_teste)
    acuracias_rf.append(acc)

    print(f"N√∫mero de √Årvores (n_estimators): {n:3d} | Acur√°cia: {acc:.4f}")

# Plot accuracy against forest size using the explicit Axes interface.
# NOTE(review): the title/label strings look like mis-encoded UTF-8
# (mojibake); they are kept byte-for-byte here -- confirm the file encoding.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(estimators, acuracias_rf, marker='o', linestyle='-')
ax.set_title('üìà Impacto do n_estimators na Acur√°cia do Random Forest')
ax.set_xlabel('N√∫mero de √Årvores na Floresta (n_estimators)')
ax.set_ylabel('Acur√°cia no Conjunto de Teste')
# The tested sizes span 10 to 500, so a log x-axis spreads the points out.
ax.set_xscale('log')
ax.grid(True)
plt.show()
