In [1]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# from ydata_profiling import ProfileReport

In [2]:
TRAIN_FILE = "data/train.parquet"
TARGET_CHANNELS_FILE = "data/target_channels.csv"
TEST_FILE = "data/test.parquet"

In [3]:
starter_ch = [f"channel_{i}" for i in range(41, 47)]

df_train = pd.read_parquet(
    TRAIN_FILE,
    columns=["id", "is_anomaly"] + starter_ch
)

df_test = pd.read_parquet(
    TEST_FILE,
    columns=["id"] + starter_ch
)

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)
df_train.head()

Train shape: (14728321, 8)
Test shape: (521280, 7)


Unnamed: 0,id,is_anomaly,channel_41,channel_42,channel_43,channel_44,channel_45,channel_46
0,0,0,0.812578,0.786344,0.7719,0.799178,0.816855,0.765296
1,1,0,0.812578,0.786344,0.7719,0.799178,0.816855,0.765296
2,2,0,0.821213,0.789557,0.770317,0.809411,0.816006,0.765296
3,3,0,0.819642,0.786344,0.770317,0.80705,0.816855,0.766985
4,4,0,0.821996,0.788753,0.770317,0.807837,0.818551,0.761073


In [4]:
# profile = ProfileReport(df_train, title="Profiling Report")
# profile.to_file("report.html")

In [5]:
print(df_train.dtypes)

id              int64
is_anomaly      uint8
channel_41    float32
channel_42    float32
channel_43    float32
channel_44    float32
channel_45    float32
channel_46    float32
dtype: object


¿Por qué este método?

    Identificación clara de missing: marcamos explícitamente los 0.0 como NaN, distinguiéndolos de ceros reales (aunque aquí 0 no es físico).

    Sencillez y eficiencia: LOCF/FOCB es muy rápido en series largas y no inventa valores nuevos, simplemente replica un valor cercano.

    Preserva la dinámica local: al copiar el último valor conocido, no introduces saltos extraños ni picos artificiales que desentonen en tu ventana de análisis.

    Adecuado para huecos pequeños y dispersos: en tu serie los ceros aparecen en bloques cortos. Este método arrastra un valor ligeramente desfasado solo durante ese breve intervalo, minimizando el sesgo.

Si luego quisieras suavizar aún más, podrías incorporar interpolación lineal o un filtro de Kalman, pero para arrancar rápido y sin romper la continuidad, LOCF+FOCB es la opción más práctica.

In [6]:
# 1. Detectar canales con ceros sentinela
zero_counts = (df_train[starter_ch] == 0.0).sum()
zero_chs = zero_counts[zero_counts > 0].index.tolist()
print("Canales a imputar:", zero_chs)

# 2. Reemplazar 0.0 → NaN
df_train[zero_chs] = df_train[zero_chs].replace(0.0, np.nan)

# 3. Imputar con forward-fill y backward-fill
df_train[zero_chs] = df_train[zero_chs].ffill().bfill()

# 4. Verificar que ya no quedan NaN
print("NaNs tras imputación:", df_train[zero_chs].isna().sum().sum())


Canales a imputar: ['channel_41', 'channel_42', 'channel_43', 'channel_44', 'channel_45', 'channel_46']
NaNs tras imputación: 0


In [7]:
scaler = StandardScaler()

df_train[starter_ch] = scaler.fit_transform(df_train[starter_ch])

In [8]:
# División temporal manual (80 % primeros para train, 20 % últimos para validation)
split = int(len(df_train) * 0.8)

train_df = df_train.iloc[:split].reset_index(drop=True)
val_df   = df_train.iloc[split:].reset_index(drop=True)

X_train = train_df[starter_ch]
y_train = train_df["is_anomaly"]
X_val   = val_df[starter_ch]
y_val   = val_df["is_anomaly"]


In [9]:
# Parámetro de ventana
W = 100

# 1️⃣ Crear rolling window sobre X_train
roll_train = X_train.rolling(window=W, min_periods=1)
X_train_feat = pd.DataFrame(index=X_train.index)

for ch in starter_ch:
    X_train_feat[f"{ch}_mean_{W}"] = roll_train[ch].mean()
    X_train_feat[f"{ch}_std_{W}"]  = roll_train[ch].std().fillna(0)
    X_train_feat[f"{ch}_min_{W}"]  = roll_train[ch].min()
    X_train_feat[f"{ch}_max_{W}"]  = roll_train[ch].max()
    X_train_feat[f"{ch}_diff"]     = X_train[ch].diff().fillna(0)

# 2️⃣ Crear rolling window sobre X_val
roll_val = X_val.rolling(window=W, min_periods=1)
X_val_feat = pd.DataFrame(index=X_val.index)

for ch in starter_ch:
    X_val_feat[f"{ch}_mean_{W}"] = roll_val[ch].mean()
    X_val_feat[f"{ch}_std_{W}"]  = roll_val[ch].std().fillna(0)
    X_val_feat[f"{ch}_min_{W}"]  = roll_val[ch].min()
    X_val_feat[f"{ch}_max_{W}"]  = roll_val[ch].max()
    X_val_feat[f"{ch}_diff"]     = X_val[ch].diff().fillna(0)

# 3️⃣ Verificar shapes
print("Train features shape:", X_train_feat.shape)
print("Val   features shape:", X_val_feat.shape)

Train features shape: (11782656, 30)
Val   features shape: (2945665, 30)


In [10]:
# 4️⃣ Escalado de las features de ventana
from sklearn.preprocessing import StandardScaler

scaler_feat = StandardScaler()
X_train_feat_scaled = scaler_feat.fit_transform(X_train_feat)
X_val_feat_scaled   = scaler_feat.transform(X_val_feat)

# Verificar
print("Train feat scaled shape:", X_train_feat_scaled.shape)
print("Val   feat scaled shape:", X_val_feat_scaled.shape)

Train feat scaled shape: (11782656, 30)
Val   feat scaled shape: (2945665, 30)


## Entrenamiento de modelos

In [11]:
import xgboost as xgb
from sklearn.metrics import f1_score

# Entrenar modelo con todos los datos disponibles
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric="logloss",
    tree_method="gpu_hist"  # Usa "gpu_hist" si tienes GPU
)

model.fit(X_train_feat_scaled, y_train)

# Evaluación en validación
y_val_pred = model.predict(X_val_feat_scaled)
print("F1-score en validación:", f1_score(y_val, y_val_pred))



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




F1-score en validación: 0.09993570050725155


In [12]:
# Cargar test
df_test = pd.read_parquet(TEST_FILE, columns=["id"] + starter_ch)

# Imputar ceros como en train
df_test[zero_chs] = df_test[zero_chs].replace(0.0, np.nan)
df_test[zero_chs] = df_test[zero_chs].ffill().bfill()

# Escalar
df_test[starter_ch] = scaler.transform(df_test[starter_ch])

# Crear features de rolling window
roll_test = df_test[starter_ch].rolling(window=W, min_periods=1)
X_test_feat = pd.DataFrame(index=df_test.index)

for ch in starter_ch:
    X_test_feat[f"{ch}_mean_{W}"] = roll_test[ch].mean()
    X_test_feat[f"{ch}_std_{W}"]  = roll_test[ch].std().fillna(0)
    X_test_feat[f"{ch}_min_{W}"]  = roll_test[ch].min()
    X_test_feat[f"{ch}_max_{W}"]  = roll_test[ch].max()
    X_test_feat[f"{ch}_diff"]     = df_test[ch].diff().fillna(0)

# Escalar features
X_test_feat_scaled = scaler_feat.transform(X_test_feat)

# Predicción
y_pred_test = model.predict(X_test_feat_scaled)

# Guardar CSV
submission = pd.DataFrame({
    "id": df_test["id"],
    "is_anomaly": y_pred_test.astype(int)
})
submission.to_csv("submission.csv", index=False)
print("✅ Archivo 'submission.csv' generado con éxito.")


✅ Archivo 'submission.csv' generado con éxito.
