In [1]:
# Paso 1: Instalación e importación de librerías
!pip install -q kaggle
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Paso 2: Subir credenciales de Kaggle (kaggle.json)
from google.colab import files
files.upload()  # Subir kaggle.json

# Paso 3: Configurar kaggle y descargar datos
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c playground-series-s4e2
!unzip -q playground-series-s4e2.zip

# Step 4: load the competition files extracted into the working directory
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("sample_submission.csv")

print("Dimensiones del train:", train.shape)
print("Dimensiones del test:", test.shape)
# A bare `train.head()` is only rendered when it is the last expression of a
# cell; in the middle of a cell its result is discarded, so display it
# explicitly.
display(train.head())

# Step 5: exploratory analysis
# Name of the label column in this competition's train.csv. The guard below
# turns a wrong name into an error that lists the columns actually present.
TARGET = 'NObeyesdad'
if TARGET not in train.columns:
    raise KeyError(
        f"La columna objetivo '{TARGET}' no está en train.csv; "
        f"columnas disponibles: {list(train.columns)}"
    )

print("Clases únicas:", train[TARGET].unique())
print("\nDistribución de clases:\n", train[TARGET].value_counts())

# Step 6: preprocessing
X = train.drop(['id', TARGET], axis=1)
y = train[TARGET]
X_test = test.drop(['id'], axis=1)

# Encode categorical variables as integer codes.
cat_cols = X.select_dtypes(include='object').columns
for col in cat_cols:
    le = LabelEncoder()
    # Fit on the categories of BOTH frames: LabelEncoder.transform raises
    # ValueError for any value it did not see during fit, so a category that
    # appears only in the test set would abort the run.
    le.fit(pd.concat([X[col], X_test[col]], ignore_index=True))
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

# Optional scaling (Gradient Boosting and Random Forest do not need it; kept
# for clarity).
# NOTE(review): the scaler is fitted on the full training frame before the
# validation split below, so validation rows contribute to the mean/std.
# Revisit this if a scale-sensitive model is ever added.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Hold out 20% for local validation, stratified so both splits keep the
# class proportions of y.
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Step 7: fit both candidate models on the training split, then predict the
# held-out validation split with each of them.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
rf_preds = rf.predict(X_val)
gb_preds = gb.predict(X_val)

# Step 8: per-class precision / recall / F1 for each model
rf_report = classification_report(y_val, rf_preds)
gb_report = classification_report(y_val, gb_preds)
print("Random Forest:\n", rf_report)
print("Gradient Boosting:\n", gb_report)

# Confusion matrix for the Random Forest, with class names on both axes
class_labels = np.unique(y)
rf_confusion = confusion_matrix(y_val, rf_preds)

fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(
    rf_confusion,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Matriz de confusión - Random Forest")
ax.set_xlabel("Predicho")
ax.set_ylabel("Real")
plt.show()

# Step 9: refit the Random Forest on the full training set and predict the
# competition test set.
rf.fit(X_scaled, y)
predicciones = rf.predict(X_test_scaled)

# Step 10: build submission.csv
# Use the prediction column that sample_submission.csv itself defines instead
# of a hard-coded name: assigning to a name the file does not contain appends
# an extra column and leaves the real one at its placeholder values.
# Assumes the last column of the sample file is the prediction column -- confirm.
target_col = sample.columns[-1]

# Pair every prediction with the id of the test row it was computed from,
# rather than relying on the sample file being in the same row order.
submission = pd.DataFrame({'id': test['id'], target_col: predicciones})
submission.to_csv('submission.csv', index=False)

# Step 11: download the file to upload it on Kaggle
from google.colab import files
files.download('submission.csv')


Saving kaggle.json to kaggle.json
403 Client Error: Forbidden for url: https://www.kaggle.com/api/v1/competitions/data/download-all/playground-series-s4e2
unzip:  cannot find or open playground-series-s4e2.zip, playground-series-s4e2.zip.zip or playground-series-s4e2.zip.ZIP.


FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'