<a href="https://colab.research.google.com/github/isabelsanchez2-cpu/IA2025/blob/main/03_modelo_con_preprocesado_de_tal_forma_y_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# 03 - modelo SVM OPTIMIZADO

import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV
import gc
import warnings
warnings.filterwarnings('ignore')

# %% Load data
# Both CSV files are read from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

print(f'Train: {train.shape}, Test: {test.shape}')

# %% Split into feature frames, target and identifiers
# Features are every column except the identifier (and, for train, the target).
X_train = train.drop(columns=['ID', 'RENDIMIENTO_GLOBAL'])
X_test = test.drop(columns=['ID'])

# The target is copied so it does not keep a view onto the raw frame.
y_train = train['RENDIMIENTO_GLOBAL'].copy()

# Identifiers are kept aside; the test ones label the submission rows later.
train_ids = train['ID'].values
test_ids = test['ID'].values

# The raw frames are no longer needed: drop the references and collect.
del train
del test
gc.collect()


print('\n--- Preprocesamiento ---')

# Partition the feature columns by dtype: 64-bit numerics vs. object (text).
numeric_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X_train.select_dtypes(include=['object']).columns)

print(f'Numéricas: {len(numeric_cols)}, Categóricas: {len(categorical_cols)}')

# Cleaning: treat empty strings, a single space and the literal text 'nan'
# as missing values so the imputer below can fill them.
blank_tokens = ['', ' ', 'nan']
for col in categorical_cols:
    X_train[col] = X_train[col].replace(blank_tokens, np.nan)
    X_test[col] = X_test[col].replace(blank_tokens, np.nan)

# Imputation: statistics are learned on train only and reused on test.
if numeric_cols:
    # Numeric gaps are filled with the per-column median.
    median_imputer = SimpleImputer(strategy='median')
    X_train[numeric_cols] = median_imputer.fit_transform(X_train[numeric_cols])
    X_test[numeric_cols] = median_imputer.transform(X_test[numeric_cols])
    del median_imputer

if categorical_cols:
    # Categorical gaps are filled with the per-column most frequent value.
    mode_imputer = SimpleImputer(strategy='most_frequent')
    X_train[categorical_cols] = mode_imputer.fit_transform(X_train[categorical_cols])
    X_test[categorical_cols] = mode_imputer.transform(X_test[categorical_cols])
    del mode_imputer

# %% Encoding
# Each categorical column is mapped to integer codes. The encoder is fitted
# on the train and test values together so that every value seen in either
# split has a code and transform() cannot fail on an unseen label.
for col in categorical_cols:
    # Stringify once per split and reuse for both fit and transform.
    train_as_str = X_train[col].astype(str)
    test_as_str = X_test[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_as_str, test_as_str]))

    X_train[col] = le.transform(train_as_str)
    X_test[col] = le.transform(test_as_str)

    # Release the temporary string copies before the next column.
    del train_as_str, test_as_str

print(f' {len(categorical_cols)} categóricas codificadas')
gc.collect()

# %% Final clean-up
# Infinite values are first turned into NaN, then every remaining NaN is
# replaced by 0 so the scaler receives only finite numbers.
non_finite = [np.inf, -np.inf]
X_train = X_train.replace(non_finite, np.nan)
X_train = X_train.fillna(0)
X_test = X_test.replace(non_finite, np.nan)
X_test = X_test.fillna(0)

print('\n--- Escalado ---')
# Scaling statistics come from the training frame only; the same
# transformation is then applied to the test frame.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f' Features escaladas: {X_train_scaled.shape}')

# Only the scaled matrices are used from here on.
del X_train
del X_test
gc.collect()

print('\n' + '=' * 70)

print('=' * 70)


print('\n[1/2] LinearSVC (100x más rápido que SVM RBF)...')
# First ensemble member: a linear SVM.
#   dual=False              -> solve the primal problem
#   class_weight='balanced' -> per-class weights derived from y_train
#   random_state=42         -> fixed seed
linear_svm = LinearSVC(
    dual=False,
    C=1.0,
    class_weight='balanced',
    max_iter=2000,
    random_state=42,
)

print('  Entrenando...')
linear_svm.fit(X_train_scaled, y_train)

# Accuracy on the training data itself (no hold-out set is used here).
train_acc_1 = linear_svm.score(X_train_scaled, y_train)
print(f'  LinearSVC entrenado - Train Acc: {train_acc_1:.4f}')


print('\n[2/2] SGDClassifier (ultra-rápido)...')
# Second ensemble member: a linear model trained by stochastic gradient
# descent. With the hinge loss and an L2 penalty it optimises an SVM-style
# objective; the same seed and class weighting as the first model are used.
sgd_model = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

print('  Entrenando...')
sgd_model.fit(X_train_scaled, y_train)

# Accuracy on the training data itself (no hold-out set is used here).
train_acc_2 = sgd_model.score(X_train_scaled, y_train)
print(f'  SGD entrenado - Train Acc: {train_acc_2:.4f}')

print('\n--- Generando Predicciones con Ensemble ---')

# Hard-label predictions from both fitted models on the scaled test matrix.
pred1 = linear_svm.predict(X_test_scaled)
pred2 = sgd_model.predict(X_test_scaled)

print(f'Tipo de pred1: {type(pred1[0])}, Ejemplo: {pred1[:5]}')
print(f'Tipo de pred2: {type(pred2[0])}, Ejemplo: {pred2[:5]}')

# Normalise both outputs to plain string arrays so they can be compared
# element-wise regardless of the dtype each estimator returned.
pred1_str = np.asarray(pred1).astype(str)
pred2_str = np.asarray(pred2).astype(str)

# Two-voter hard voting. When the models agree, that label is used. When they
# disagree each label has exactly one vote, so a tie-break rule is required.
# The previous max(set(votes), key=votes.count) picked whichever label came
# first in set iteration order, which for strings depends on per-process hash
# randomisation -- the submission could change from run to run. Ties are now
# resolved deterministically in favour of the model with the higher training
# accuracy (LinearSVC if equal).
# NOTE: with only two voters this means the ensemble output equals the
# tie-breaking model's output; a third model is needed for the vote to
# actually change any prediction.
tie_breaker = pred1_str if train_acc_1 >= train_acc_2 else pred2_str
predictions = np.where(pred1_str == pred2_str, pred1_str, tie_breaker)

print(f' {len(predictions)} predicciones generadas')
print(f'Tipo de predicciones: {type(predictions[0])}, Ejemplos: {predictions[:5]}')

# %% Distribution
# Report how many test rows were assigned to each predicted class, with the
# share of the total as a percentage.
print('\n--- Distribución de Predicciones ---')
unique, counts = np.unique(predictions, return_counts=True)
n_predictions = len(predictions)
for idx, val in enumerate(unique):
    count = counts[idx]
    pct = count / n_predictions * 100
    print(f'  Clase {val}: {count} ({pct:.1f}%)')


# Assemble the submission: one row per test ID with its predicted class.
submission_columns = {
    'ID': test_ids,
    'RENDIMIENTO_GLOBAL': predictions,
}
submission = pd.DataFrame(submission_columns)

# Sanity checks printed before writing: frame shape, number of distinct IDs
# and the sorted set of predicted classes.
unique_id_count = submission['ID'].nunique()
predicted_classes = sorted(submission['RENDIMIENTO_GLOBAL'].unique())

print('\n--- Verificación ---')
print(f' Shape: {submission.shape}')
print(f' IDs únicos: {unique_id_count}')
print(f' Clases: {predicted_classes}')

# Written to the working directory without the DataFrame index column.
submission.to_csv('submission_svm_.csv', index=False)


# %% Clean-up
# The scaled matrices are not needed after prediction; release them.
del X_train_scaled
del X_test_scaled
gc.collect()


# Closing banner followed by the training accuracy of both models.
print('\n' + '=' * 70)

print('=' * 70)

print(f'✓ Train Accuracy: LinearSVC={train_acc_1:.4f}, SGD={train_acc_2:.4f}')


Train: (692500, 21), Test: (296786, 20)

--- Preprocesamiento ---
Numéricas: 5, Categóricas: 14
 14 categóricas codificadas

--- Escalado ---
 Features escaladas: (692500, 19)


[1/2] LinearSVC (100x más rápido que SVM RBF)...
  Entrenando...
  LinearSVC entrenado - Train Acc: 0.3759

[2/2] SGDClassifier (ultra-rápido)...
  Entrenando...
  SGD entrenado - Train Acc: 0.3526

--- Generando Predicciones con Ensemble ---
Tipo de pred1: <class 'str'>, Ejemplo: ['alto' 'medio-alto' 'bajo' 'bajo' 'medio-alto']
Tipo de pred2: <class 'numpy.str_'>, Ejemplo: ['medio-alto' 'bajo' 'bajo' 'bajo' 'bajo']
 296786 predicciones generadas
Tipo de predicciones: <class 'numpy.str_'>, Ejemplos: ['alto' 'bajo' 'bajo' 'bajo' 'bajo']

--- Distribución de Predicciones ---
  Clase alto: 98169 (33.1%)
  Clase bajo: 152852 (51.5%)
  Clase medio-alto: 19321 (6.5%)
  Clase medio-bajo: 26444 (8.9%)

--- Verificación ---
 Shape: (296786, 2)
 IDs únicos: 296786
 Clases: ['alto', 'bajo', 'medio-alto', 'medio-bajo']

✓ 