In [None]:
# Imports y carga del dataset (sin rutas rigidas)
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, classification_report, confusion_matrix

# Locate the dataset by walking the current working tree, so no absolute
# path is hard-coded; the first match in os.walk order wins.
csv_path = next(
    (
        os.path.join(folder, 'fallos_producto.csv')
        for folder, _subdirs, names in os.walk('.')
        if 'fallos_producto.csv' in names
    ),
    None,
)
if csv_path is None:
    raise FileNotFoundError('fallos_producto.csv no encontrado')

# Load the raw data and preview the first rows.
df_fallos = pd.read_csv(csv_path)
df_fallos.head()

Realiza un análisis exploratorio de datos (AED) sobre el conjunto de datos.

In [None]:
# Structural overview of the raw dataset.
# A notebook cell only renders its LAST bare expression, so every
# intermediate result is passed to display() (injected into the namespace by
# the IPython/Jupyter kernel) instead of being computed and thrown away.
df_fallos.info()
display(df_fallos.head())

# Ten columns with the highest fraction of missing values.
display(df_fallos.isna().mean().sort_values(ascending=False).head(10))

# How many rows have a missing target (True) versus a known one (False).
display(df_fallos['failure'].isna().value_counts(dropna=False))

# Sanity check: expected categorical columns that are absent from the data
# (an empty list means all of them are present).
cat_cols = ['product_code', 'attribute_0', 'attribute_1']
missing_cat = [c for c in cat_cols if c not in df_fallos.columns]
missing_cat

Estadísticos iniciales. 0.2 puntos

In [None]:
# Summary statistics for every column (numeric and non-numeric), transposed
# so that each original column becomes one row of the report.
df_fallos.describe(include='all').transpose()

Distribuciones de las variables numéricas del conjunto de datos. 0.3 puntos

In [None]:
# One 30-bin histogram per numeric column, drawn on a shared 12x8 figure.
numeric_features = df_fallos.select_dtypes(include='number')
numeric_features.hist(bins=30, figsize=(12, 8))
plt.suptitle('Distribuciones numericas')
plt.show()

Matriz de correlación. 0.5 puntos

In [None]:
# Pairwise correlation between the numeric columns, drawn as a heatmap
# with a diverging palette centred on zero.
numeric_only_df = df_fallos.select_dtypes(include='number')
corr = numeric_only_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Matriz de correlacion')
plt.show()

Realiza el preprocesamiento de datos de tu problema.

In [None]:
# Separate rows with a known target (labeled) from rows whose target is NaN
# (unlabeled); both are independent copies of the raw frame.
has_label = df_fallos['failure'].notna()
df_labeled = df_fallos[has_label].copy()
df_unlabeled = df_fallos[~has_label].copy()

# Features and target are built from the labeled rows only.
y_labeled = df_labeled['failure'].astype(int)
X_labeled = df_labeled.drop(columns=['failure'])

# Categorical columns are listed explicitly; every remaining column except
# the 'id' column is treated as numeric.
cat_cols = ['product_code', 'attribute_0', 'attribute_1']
non_numeric = cat_cols + ['id']
num_cols = [col for col in X_labeled.columns if col not in non_numeric]

# Unlabeled rows keep the same feature columns (target removed).
X_unlabeled = df_unlabeled.drop(columns=['failure'])

# Anti-leakage guard: the unlabeled partition must not contain any target value.
assert df_unlabeled['failure'].isna().all()

Reserva un conjunto de datos para validación y otro para testeo. 0.5 puntos

In [None]:
# Stratified 70 / 15 / 15 split performed ONLY on labeled rows:
# first hold out 30 %, then cut that hold-out in half (valid / test).
X_train, X_temp, y_train, y_temp = train_test_split(
    X_labeled,
    y_labeled,
    test_size=0.30,
    stratify=y_labeled,
    random_state=42,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

# Mandatory anti-leakage checks.
# 1) Evaluation targets are fully known.
assert y_valid.notna().all()
assert y_test.notna().all()

# 2) The three partitions share no row index.
train_idx, valid_idx, test_idx = (
    set(part.index) for part in (X_train, X_valid, X_test)
)
assert train_idx.isdisjoint(valid_idx)
assert train_idx.isdisjoint(test_idx)
assert valid_idx.isdisjoint(test_idx)

# 3) Evaluation rows map back to labeled rows of the raw frame.
assert df_fallos.loc[X_valid.index, "failure"].notna().all()
assert df_fallos.loc[X_test.index, "failure"].notna().all()

# Partition sizes, displayed as the cell output.
pd.Series({
    name: len(part)
    for name, part in (("train", X_train), ("valid", X_valid), ("test", X_test))
})

Columnas inútiles, valores sin sentido y atípicos. 1 punto

In [None]:
# Drop 'id' from every partition: it is a row identifier, not a feature.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
X_train, X_valid, X_test, X_unlabeled = (
    part.drop(columns=['id'], errors='ignore')
    for part in (X_train, X_valid, X_test, X_unlabeled)
)

# Outlier rules, if ever applied, must be derived from train ONLY.
# With no business criterion available, the documented decision is to
# remove no outliers.
tuple(part.shape for part in (X_train, X_valid, X_test, X_unlabeled))

Tratamiento de valores nulos. 0.5 puntos

In [None]:
# Preprocessing recipes are only DEFINED here; they are fitted later.

# Numeric branch: median imputation followed by standardisation.
num_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipe = Pipeline(steps=num_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding.
# handle_unknown="ignore" lets transform() accept categories not seen in fit().
cat_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipe = Pipeline(steps=cat_steps)

# Ten columns of the full dataset with the highest share of missing values
# (displayed for reference only; nothing is fitted on it).
missing_ratio = df_fallos.isna().mean()
missing_ratio.sort_values(ascending=False).head(10)

Análisis de variabilidad. 0.5 puntos

In [None]:
# Low-variance screening computed ONLY on the numeric columns of train.
var_series = X_train[num_cols].var(numeric_only=True)
is_near_constant = var_series < 1e-6
low_var_cols = list(var_series.index[is_near_constant])

# Remove near-constant columns from every partition and from num_cols,
# so all downstream steps see the same feature set.
if low_var_cols:
    X_train, X_valid, X_test, X_unlabeled = (
        part.drop(columns=low_var_cols)
        for part in (X_train, X_valid, X_test, X_unlabeled)
    )
    num_cols = [col for col in num_cols if col not in low_var_cols]

# Display the dropped columns (empty list when none qualify).
low_var_cols

Columnas categóricas. 0.5 puntos

In [None]:
# Combine both branches; the transformer is fitted on train ONLY and then
# merely applied to valid, test and the unlabeled rows.
branches = [
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
]
preprocessor = ColumnTransformer(transformers=branches)

X_train_proc = preprocessor.fit_transform(X_train)
X_valid_proc, X_test_proc, X_unlabeled_proc = (
    preprocessor.transform(part) for part in (X_valid, X_test, X_unlabeled)
)

# Resulting matrix shapes, displayed as the cell output.
tuple(
    matrix.shape
    for matrix in (X_train_proc, X_valid_proc, X_test_proc, X_unlabeled_proc)
)

Reducción de la dimensionalidad. 1 punto

In [None]:
# Dimensionality reduction with TruncatedSVD on the preprocessed matrix.
# Component count: one fewer than the number of features, capped at 100
# and never below 2.
n_features = X_train_proc.shape[1]
max_components = max(2, min(100, n_features - 1))

# Fitted on train only; the other partitions are projected with that fit.
svd = TruncatedSVD(n_components=max_components, random_state=42)
X_train_svd = svd.fit_transform(X_train_proc)
X_valid_svd, X_test_svd, X_unlabeled_svd = (
    svd.transform(matrix)
    for matrix in (X_valid_proc, X_test_proc, X_unlabeled_proc)
)

# Total share of variance retained by the kept components.
svd.explained_variance_ratio_.sum()

Realiza un etiquetado automático. 1 punto

In [None]:
# Iterative self-training: at each round a logistic regression is fitted on
# the current (augmented) training set, unlabeled rows predicted with a
# confidence of at least `threshold` are pseudo-labeled and moved into the
# training set, and the loop stops when nothing new qualifies, when no
# unlabeled rows remain, or after `max_iters` rounds.
threshold = 0.90
max_iters = 10

# Start from the real training rows (SVD space), keeping the original index.
X_train_aug = pd.DataFrame(X_train_svd, index=X_train.index)
y_train_aug = y_train.copy()

# Pool of rows that are still unlabeled.
X_unl = pd.DataFrame(X_unlabeled_svd, index=X_unlabeled.index)
unlabeled_stats = []

base_model = LogisticRegression(max_iter=2000, class_weight="balanced")

for it in range(1, max_iters + 1):
    # Nothing left to label: record the fact and stop.
    if X_unl.shape[0] == 0:
        unlabeled_stats.append({"iter": it, "added": 0, "remaining": 0})
        break

    base_model.fit(X_train_aug, y_train_aug)
    proba = base_model.predict_proba(X_unl)
    max_proba = proba.max(axis=1)

    # Rows whose most likely class reaches the confidence threshold.
    pseudo_mask = max_proba >= threshold
    added = int(pseudo_mask.sum())
    remaining = int((~pseudo_mask).sum())
    unlabeled_stats.append({"iter": it, "added": added, "remaining": remaining})

    # No confident prediction this round: further rounds would not change anything.
    if added == 0:
        break

    confident_rows = X_unl.loc[pseudo_mask]
    # predict_proba columns follow base_model.classes_, so the argmax column
    # position must be mapped back to the actual class label rather than
    # being used as the label itself.
    y_pseudo = base_model.classes_[proba[pseudo_mask].argmax(axis=1)]

    X_train_aug = pd.concat([X_train_aug, confident_rows])
    y_train_aug = pd.concat(
        [y_train_aug, pd.Series(y_pseudo, index=confident_rows.index)]
    )
    X_unl = X_unl.loc[~pseudo_mask]

# Per-iteration bookkeeping. display() is used because a bare expression in
# the middle of a cell is not rendered (display is provided by the
# IPython/Jupyter kernel).
stats_df = pd.DataFrame(unlabeled_stats)
display(stats_df)

# Size of the unlabeled pool after each round.
plt.plot(stats_df["iter"], stats_df["remaining"])
plt.title("Unlabeled restantes por iteracion")
plt.xlabel("Iteracion")
plt.ylabel("Remaining")
plt.show()

Entrena y optimiza distintos modelos supervisados.

In [None]:
# Evaluation helper used to score predictions on the validation set.
def eval_metrics(y_true, y_pred):
    """Score predictions against the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"f1"`` and ``"balanced_accuracy"``, each
        holding the value returned by the matching scikit-learn metric
        function called with its default arguments.
    """
    scorers = (
        ("accuracy", accuracy_score),
        ("f1", f1_score),
        ("balanced_accuracy", balanced_accuracy_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# Validation features as a DataFrame, matching the container type used for
# the augmented training set and keeping the original row index.
X_valid_svd_df = pd.DataFrame(data=X_valid_svd, index=X_valid.index)

# Shared 5-fold stratified, shuffled and seeded splitter for all grid searches.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

Modelo 1. 1 punto

In [None]:
# Model 1: class-balanced logistic regression.
# The grid varies the inverse regularisation strength C with the liblinear
# solver; candidates are ranked by cross-validated F1 on the augmented
# training set.
model1 = LogisticRegression(max_iter=2000, class_weight="balanced")
param_grid1 = dict(
    C=[0.1, 1.0, 10.0],
    solver=["liblinear"],
)
grid1 = GridSearchCV(
    estimator=model1,
    param_grid=param_grid1,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
)
grid1.fit(X_train_aug, y_train_aug)

# Score the refitted best candidate on the validation split.
best1 = grid1.best_estimator_
pred1 = best1.predict(X_valid_svd_df)
metrics1 = eval_metrics(y_valid, pred1)
grid1.best_params_, metrics1

Modelo 2. 1 punto

In [None]:
# Model 2: class-balanced random forest (seeded).
# The grid covers forest size, maximum depth and the minimum number of
# samples needed to split a node; candidates are ranked by cross-validated
# F1 on the augmented training set.
model2 = RandomForestClassifier(random_state=42, class_weight="balanced")
param_grid2 = dict(
    n_estimators=[200, 400],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)
grid2 = GridSearchCV(
    estimator=model2,
    param_grid=param_grid2,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
)
grid2.fit(X_train_aug, y_train_aug)

# Score the refitted best candidate on the validation split.
best2 = grid2.best_estimator_
pred2 = best2.predict(X_valid_svd_df)
metrics2 = eval_metrics(y_valid, pred2)
grid2.best_params_, metrics2

Modelo 3. 1 punto

In [None]:
# Model 3: class-balanced SVC with an RBF kernel.
# probability=True is required so the model exposes predict_proba for the
# soft-voting ensemble. Those probability estimates rely on an internal
# shuffled cross-validation, so random_state is fixed to keep the results
# reproducible, like every other seeded step in this notebook.
model3 = SVC(probability=True, class_weight="balanced", random_state=42)
param_grid3 = {
    "C": [0.5, 1.0, 2.0],
    "kernel": ["rbf"],
    "gamma": ["scale", "auto"]
}

# Candidates are ranked by cross-validated F1 on the augmented training set.
grid3 = GridSearchCV(model3, param_grid3, cv=cv, scoring="f1", n_jobs=-1)
grid3.fit(X_train_aug, y_train_aug)

# Score the refitted best candidate on the validation split.
best3 = grid3.best_estimator_
pred3 = best3.predict(X_valid_svd_df)
metrics3 = eval_metrics(y_valid, pred3)
grid3.best_params_, metrics3

Crea un modelo ensemble y explica el criterio que utilizas. 1 punto

In [None]:
# Ensemble criterion: soft voting, each model weighted by its validation F1
# (normalised to sum to 1). If every F1 is zero, fall back to equal weights.
f1_scores = np.array([m["f1"] for m in (metrics1, metrics2, metrics3)])
f1_total = f1_scores.sum()
if f1_total > 0:
    weights = (f1_scores / f1_total).tolist()
else:
    weights = [1, 1, 1]

members = [("lr", best1), ("rf", best2), ("svc", best3)]
ensemble = VotingClassifier(estimators=members, voting="soft", weights=weights)

# Fit on the augmented training set, then score on the validation split.
ensemble.fit(X_train_aug, y_train_aug)
valid_pred = ensemble.predict(X_valid_svd_df)
ensemble_metrics = eval_metrics(y_valid, valid_pred)
weights, ensemble_metrics

In [None]:
# Final evaluation on the held-out test split ONLY (run once, at the end).
X_test_svd_df = pd.DataFrame(X_test_svd, index=X_test.index)
test_pred = ensemble.predict(X_test_svd_df)

# Per-class precision / recall / F1 report.
print(classification_report(y_test, test_pred))

# The same three headline metrics that were reported on validation, so the
# test figures can be compared with them directly.
print("Accuracy:", accuracy_score(y_test, test_pred))
print("F1:", f1_score(y_test, test_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test, test_pred))

print("Confusion matrix:")
print(confusion_matrix(y_test, test_pred))

Por completar tras ejecutar: accuracy, f1, balanced_accuracy, matriz de confusión y conclusión final.