In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from lightgbm import LGBMClassifier

In [2]:
train_log = pd.read_csv("train_log.csv")

print(train_log.shape)
print(train_log.isna().mean().sort_values(ascending=False))

(3043, 8)
Z_err                  1.0
object_id              0.0
Z                      0.0
EBV                    0.0
SpecType               0.0
English Translation    0.0
split                  0.0
target                 0.0
dtype: float64


In [3]:
train_log = train_log.drop(columns=["Z_err"])

In [5]:
y = train_log["target"]
print(y.value_counts(normalize=True))

target
0    0.951364
1    0.048636
Name: proportion, dtype: float64


In [7]:
def extract_lc_features(df):
    features = []

    for obj_id, g in df.groupby("object_id"):
        feat = {"object_id": obj_id}

        for f in ["g", "r", "i", "z", "y"]:
            gf = g[g["Filter"] == f]

            feat[f"{f}_mean"] = gf["Flux"].mean()
            feat[f"{f}_std"]  = gf["Flux"].std()
            feat[f"{f}_max"]  = gf["Flux"].max()
            feat[f"{f}_min"]  = gf["Flux"].min()
            feat[f"{f}_nobs"] = len(gf)

        feat["n_obs"] = len(g)
        feat["duration"] = g["Time (MJD)"].max() - g["Time (MJD)"].min()
        feat["flux_skew"] = g["Flux"].skew()
        feat["flux_kurt"] = g["Flux"].kurt()

        features.append(feat)

    return pd.DataFrame(features).fillna(0)

In [8]:
train_feats = []

for i in range(1, 21):
    split = f"split_{i:02d}"
    print(f"Processing {split}")

    train_lc = pd.read_csv(f"{split}/train_full_lightcurves.csv")
    train_feats.append(extract_lc_features(train_lc))

train_feats = pd.concat(train_feats, ignore_index=True)

Processing split_01
Processing split_02
Processing split_03
Processing split_04
Processing split_05
Processing split_06
Processing split_07
Processing split_08
Processing split_09
Processing split_10
Processing split_11
Processing split_12
Processing split_13
Processing split_14
Processing split_15
Processing split_16
Processing split_17
Processing split_18
Processing split_19
Processing split_20


In [14]:
X = train_df.drop(
    columns=["target", "object_id", "SpecType", "English Translation", "split"],
    errors="ignore"
)

y = train_df["target"]

In [15]:
pos_weight = (y == 0).sum() / (y == 1).sum()


In [16]:
model = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("clf", LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.8,
        class_weight={0:1, 1:pos_weight},
        objective="binary",
        metric="auc",
        random_state=42
    ))
])

In [17]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
aucs = []

for fold, (tr, val) in enumerate(skf.split(X, y)):
    model.fit(X.iloc[tr], y.iloc[tr])
    preds = model.predict_proba(X.iloc[val])[:,1]

    auc = roc_auc_score(y.iloc[val], preds)
    aucs.append(auc)

    print(f"Fold {fold+1} AUC: {auc:.4f}")

print("Mean AUC:", np.mean(aucs))

[LightGBM] [Info] Number of positive: 118, number of negative: 2316
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6773
[LightGBM] [Info] Number of data points in the train set: 2434, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499154 -> initscore=-0.003384
[LightGBM] [Info] Start training from score -0.003384




Fold 1 AUC: 0.7609
[LightGBM] [Info] Number of positive: 118, number of negative: 2316
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000211 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6777
[LightGBM] [Info] Number of data points in the train set: 2434, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499154 -> initscore=-0.003384
[LightGBM] [Info] Start training from score -0.003384




Fold 2 AUC: 0.8440
[LightGBM] [Info] Number of positive: 118, number of negative: 2316
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6777
[LightGBM] [Info] Number of data points in the train set: 2434, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499154 -> initscore=-0.003384
[LightGBM] [Info] Start training from score -0.003384
Fold 3 AUC: 0.7693
[LightGBM] [Info] Number of positive: 119, number of negative: 2316
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6768
[LightGBM] [Info] Number of data points in the train set: 2435, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501264 -> initscore=0.005055
[LightGBM] [Info] Start training from







Fold 4 AUC: 0.8465
[LightGBM] [Info] Number of positive: 119, number of negative: 2316
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000267 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6761
[LightGBM] [Info] Number of data points in the train set: 2435, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501264 -> initscore=0.005055
[LightGBM] [Info] Start training from score 0.005055
Fold 5 AUC: 0.8085
Mean AUC: 0.8058170051416434




In [18]:
test_probs = model.predict_proba(X_test)[:, 1]
test_preds = (test_probs > best_t).astype(int)

submission = pd.DataFrame({
    "object_id": test_df["object_id"],
    "prediction": test_preds
})

submission.to_csv("newSubmission.csv", index=False)

NameError: name 'X_test' is not defined