In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

train_log = pd.read_csv('drive/MyDrive/data/train_log.csv')
train_full_lightcurves = pd.read_csv('drive/MyDrive/data/split_01/train_full_lightcurves.csv')

In [21]:
print(train_full_lightcurves.columns)
print(train_log.columns)

Index(['object_id', 'Time (MJD)', 'Flux', 'Flux_err', 'Filter'], dtype='object')
Index(['object_id', 'Z', 'Z_err', 'EBV', 'SpecType', 'English Translation',
       'split', 'target'],
      dtype='object')


In [2]:
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis

def extract_basic_features(lc):
    rows = []

    for obj_id, g in lc.groupby("object_id"):
        row = {"object_id": obj_id}

        # ====================
        # GLOBAL FLUX FEATURES
        # ====================
        flux = g["Flux"].values
        time = g["Time (MJD)"].values

        row["flux_mean"] = np.mean(flux)
        row["flux_std"] = np.std(flux)
        row["flux_min"] = np.min(flux)
        row["flux_max"] = np.max(flux)
        row["flux_median"] = np.median(flux)

        row["flux_amp"] = row["flux_max"] - row["flux_min"]
        row["flux_range"] = row["flux_amp"]

        # Quantiles
        q25 = np.quantile(flux, 0.25)
        q75 = np.quantile(flux, 0.75)
        row["flux_q25"] = q25
        row["flux_q75"] = q75
        row["flux_iqr"] = q75 - q25

        # Distribution shape
        row["flux_skew"] = skew(flux) if len(flux) > 2 else 0.0
        row["flux_kurtosis"] = kurtosis(flux) if len(flux) > 3 else 0.0

        # Ratios statistiques
        eps = 1e-6
        row["flux_std_over_mean"] = row["flux_std"] / (abs(row["flux_mean"]) + eps)
        row["flux_max_over_mean"] = row["flux_max"] / (abs(row["flux_mean"]) + eps)

        # ====================
        # TEMPORAL FEATURES
        # ====================
        row["time_span"] = time.max() - time.min()
        row["time_mean"] = np.mean(time)
        row["time_std"] = np.std(time)
        row["time_gap_mean"] = np.mean(np.diff(np.sort(time))) if len(time) > 2 else 0.0

        peak_idx = np.argmax(flux)
        t_peak = time[peak_idx]

        row["peak_time_frac"] = (
            (t_peak - time.min()) / (row["time_span"] + eps)
        )

        if len(time) > 1 and np.unique(time).size > 1:
            slope = np.polyfit(time, flux, 1)[0]
            row["flux_slope"] = slope if np.isfinite(slope) else 0.0
        else:
            row["flux_slope"] = 0.0

        row["obs_density"] = len(time) / (row["time_span"] + eps)

        # ====================
        # RISE / DECAY FEATURES
        # ====================
        flux_before = flux[time <= t_peak]
        flux_after = flux[time > t_peak]

        row["rise_rate"] = (
            (flux_before.max() - flux_before.min()) /
            (t_peak - time.min() + eps)
            if len(flux_before) > 1 else 0.0
        )

        row["decay_rate"] = (
            (flux_after.max() - flux_after.min()) /
            (time.max() - t_peak + eps)
            if len(flux_after) > 1 else 0.0
        )

        row["asymmetry"] = row["rise_rate"] / (row["decay_rate"] + eps)

        # ====================
        # FILTER-WISE FEATURES
        # ====================
        for f in ["u", "g", "r", "i", "z", "y"]:
            gf = g[g["Filter"] == f]
            f_flux = gf["Flux"].values

            row[f"n_obs_{f}"] = len(f_flux)

            if len(f_flux) > 0:
                row[f"flux_mean_{f}"] = np.mean(f_flux)
                row[f"flux_std_{f}"] = np.std(f_flux)
                row[f"flux_amp_{f}"] = np.max(f_flux) - np.min(f_flux)
            else:
                row[f"flux_mean_{f}"] = 0.0
                row[f"flux_std_{f}"] = 0.0
                row[f"flux_amp_{f}"] = 0.0

        # ====================
        # INTER-FILTER RATIOS
        # ====================
        row["g_r_ratio"] = row["flux_mean_g"] / (row["flux_mean_r"] + eps)
        row["r_i_ratio"] = row["flux_mean_r"] / (row["flux_mean_i"] + eps)
        row["i_z_ratio"] = row["flux_mean_i"] / (row["flux_mean_z"] + eps)

        # ====================
        # COMBINED FEATURES
        # ====================
        row["flux_amp_x_density"] = row["flux_amp"] * row["obs_density"]
        row["slope_x_amp"] = row["flux_slope"] * row["flux_amp"]


        row["flux_p10"] = np.percentile(flux, 10)
        row["flux_p90"] = np.percentile(flux, 90)
        row["flux_p90_p10"] = row["flux_p90"] - row["flux_p10"]
        row["flux_median"] = np.median(flux)
        row["flux_mad"] = np.median(np.abs(flux - row["flux_median"]))


        if len(time) > 2:
            dt = np.diff(np.sort(time))
            df = np.diff(flux[np.argsort(time)])
            row["flux_change_mean"] = np.mean(np.abs(df))
            row["flux_change_std"] = np.std(df)
            row["flux_change_rate"] = np.mean(np.abs(df) / (dt + 1e-6))
        else:
            row["flux_change_mean"] = 0
            row["flux_change_std"] = 0
            row["flux_change_rate"] = 0


        row["n_peaks"] = np.sum(flux > (row["flux_mean"] + 2 * row["flux_std"]))
        row["peak_ratio"] = row["n_peaks"] / (len(flux) + 1e-6)


        row["early_flux_mean"] = np.mean(flux[time < np.percentile(time, 30)])
        row["late_flux_mean"]  = np.mean(flux[time > np.percentile(time, 70)])
        row["early_late_ratio"] = row["early_flux_mean"] / (row["late_flux_mean"] + 1e-6)


        rows.append(row)

    df = pd.DataFrame(rows)

    # Nettoyage final
    num_cols = df.columns.drop("object_id")
    df[num_cols] = df[num_cols].replace([np.inf, -np.inf], 0).fillna(0)

    return df


In [3]:
DATA_DIR = "drive/MyDrive/data"  # dossier racine kaggle
SPLITS = [f"split_{i:02d}" for i in range(1, 21)]



all_train_features = []
all_train_labels = []

for split in SPLITS:
    print(f"Processing {split}...")

    # chemins
    train_lc_path = f"{DATA_DIR}/{split}/train_full_lightcurves.csv"
    #train_log_path = f"{DATA_DIR}/{split}/train_log.csv"

    # chargement
    train_lc  = pd.read_csv(train_lc_path)
    #train_log = pd.read_csv(train_log_path)

    # extraction features
    feats = extract_basic_features(train_lc)

    # jointure labels
    feats = feats.merge(
        train_log[["object_id", "target"]],
        on="object_id",
        how="inner"
    )

    all_train_features.append(feats)

# dataset final
train_df = pd.concat(all_train_features, ignore_index=True)

print(train_df.shape)
train_df.head()


Processing split_01...
Processing split_02...
Processing split_03...
Processing split_04...
Processing split_05...
Processing split_06...
Processing split_07...
Processing split_08...
Processing split_09...
Processing split_10...
Processing split_11...
Processing split_12...
Processing split_13...
Processing split_14...
Processing split_15...
Processing split_16...
Processing split_17...
Processing split_18...
Processing split_19...
Processing split_20...
(3043, 67)


Unnamed: 0,object_id,flux_mean,flux_std,flux_min,flux_max,flux_median,flux_amp,flux_range,flux_q25,flux_q75,...,flux_mad,flux_change_mean,flux_change_std,flux_change_rate,n_peaks,peak_ratio,early_flux_mean,late_flux_mean,early_late_ratio,target
0,Dornhoth_fervain_onodrim,0.928483,4.766352,-2.756285,25.047343,-0.36784,27.803628,27.803628,-1.288658,0.984804,...,1.056697,1.696714,3.804346,437241.687603,3,0.046154,-0.699784,-1.325416,0.527973,0
1,Dornhoth_galadh_ylf,0.388622,1.367368,-1.747082,11.375499,0.094237,13.122581,13.122581,-0.099817,0.442587,...,0.278818,0.778374,1.240541,215882.267442,6,0.035928,0.120878,0.292355,0.413462,0
2,Elrim_melethril_thul,1.691347,2.602937,-6.400816,6.617915,1.076724,13.018732,13.018732,0.300356,3.161688,...,1.143666,1.348214,1.804061,384639.465704,0,0.0,0.669518,1.458723,0.458975,0
3,Ithil_tobas_rodwen,0.375366,0.85922,-7.641818,5.353821,0.327391,12.995639,12.995639,-0.035096,0.729434,...,0.376897,0.632979,1.057125,321605.375697,17,0.021303,-0.073222,0.434552,-0.1685,0
4,Mirion_adar_Druadan,0.233832,1.142101,-3.060399,5.384463,0.308845,8.444862,8.444862,-0.565717,0.842176,...,0.656705,0.659981,1.018386,221888.33915,2,0.015504,0.069015,-0.716978,-0.096259,0


In [4]:
# aucune fuite
assert train_df["object_id"].is_unique

# pas de NaN
assert train_df.isna().sum().sum() == 0

# séparation X / y
X = train_df.drop(columns=["object_id", "target"])
y = train_df["target"]

print(X.shape, y.value_counts())


(3043, 65) target
0    2895
1     148
Name: count, dtype: int64


In [5]:
#from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import numpy as np

from sklearn.model_selection import GroupKFold

groups = train_df["object_id"]
cv = GroupKFold(n_splits=5)


scores = []

for train_idx, val_idx in cv.split(X, y, groups):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = LGBMClassifier(
        n_estimators=600,
        learning_rate=0.05,
        num_leaves=63,
        class_weight="balanced",
        random_state=42
    )

    model.fit(X_tr, y_tr)
    probas = model.predict_proba(X_val)[:, 1]

    score = roc_auc_score(y_val, probas)
    scores.append(score)

print("CV ROC AUC (GROUP) :", np.mean(scores))



[LightGBM] [Info] Number of positive: 112, number of negative: 2322
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15176
[LightGBM] [Info] Number of data points in the train set: 2434, number of used features: 65
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 128, number of negative: 2306
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15166
[LightGBM] [Info] Number of data points in the train set: 2434, number of used features: 65
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Nu

TEST ET SOUMISSION

In [6]:
all_test_features = []

for split in SPLITS:
    print(f"Processing test {split}...")
    test_lc_path = f"{DATA_DIR}/{split}/test_full_lightcurves.csv"
    test_lc = pd.read_csv(test_lc_path)

    feats = extract_basic_features(test_lc)
    all_test_features.append(feats)

# concaténation finale
test_df = pd.concat(all_test_features, ignore_index=True)

# features uniquement
X_test = test_df.drop(columns=["object_id"])

print(test_df.shape)


Processing test split_01...
Processing test split_02...
Processing test split_03...
Processing test split_04...
Processing test split_05...
Processing test split_06...
Processing test split_07...
Processing test split_08...
Processing test split_09...
Processing test split_10...
Processing test split_11...
Processing test split_12...
Processing test split_13...
Processing test split_14...
Processing test split_15...
Processing test split_16...
Processing test split_17...
Processing test split_18...
Processing test split_19...
Processing test split_20...
(7135, 66)


In [7]:
# from lightgbm import LGBMClassifier


# all_test_features = []

# for split in SPLITS:
#     print(f"Processing test {split}...")
#     test_lc_path = f"{DATA_DIR}/{split}/test_full_lightcurves.csv"
#     test_lc = pd.read_csv(test_lc_path)
#     feats = extract_basic_features(test_lc)  # features v3
#     all_test_features.append(feats)

# # concaténation finale
# test_df = pd.concat(all_test_features, ignore_index=True)
# X_test = test_df.drop(columns=["object_id"])


# # final_model = LGBMClassifier(
# #     n_estimators=500,
# #     learning_rate=0.05,
# #     num_leaves=31,
# #     class_weight="balanced",
# #     random_state=42
# # )

# # final_model.fit(X, y)

# # test_probas = final_model.predict_proba(X_test)[:, 1]

# # BEST_THRESHOLD = best  # valeur trouvée au-dessus

# # test_df["target"] = (test_probas >= BEST_THRESHOLD).astype(int)
# # #nouveau


In [8]:

#A SUPPRIMER

# from sklearn.metrics import f1_score
# scale_pos_weight = (len(y) - y.sum()) / y.sum()

# final_model = LGBMClassifier(
#     n_estimators=1500,
#     learning_rate=0.02,
#     num_leaves=127,
#     max_depth=-1,
#     min_child_samples=20,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     scale_pos_weight=scale_pos_weight,
#     random_state=42,
#     n_jobs=-1
# )

# model.fit(X, y)

# # Probabilités sur validation
# probas_val = model.predict_proba(X_val)[:, 1]

# # Recherche du meilleur seuil
# thresholds = np.linspace(0.05, 0.95, 200)
# scores = []

# for t in thresholds:
#     preds = (probas_val >= t).astype(int)
#     scores.append(f1_score(y_val, preds))

# BEST_THRESHOLD = thresholds[np.argmax(scores)]
# BEST_SCORE = max(scores)

# print("BEST_THRESHOLD =", BEST_THRESHOLD)
# print("BEST_F1 =", BEST_SCORE)


In [9]:
all_test_features = []

for split in SPLITS:
    print(f"Processing test {split}...")
    test_lc_path = f"{DATA_DIR}/{split}/test_full_lightcurves.csv"
    test_lc = pd.read_csv(test_lc_path)

    feats = extract_basic_features(test_lc)
    all_test_features.append(feats)

# Concaténation finale
test_df = pd.concat(all_test_features, ignore_index=True)

X_test = test_df.drop(columns=["object_id"])


Processing test split_01...
Processing test split_02...
Processing test split_03...
Processing test split_04...
Processing test split_05...
Processing test split_06...
Processing test split_07...
Processing test split_08...
Processing test split_09...
Processing test split_10...
Processing test split_11...
Processing test split_12...
Processing test split_13...
Processing test split_14...
Processing test split_15...
Processing test split_16...
Processing test split_17...
Processing test split_18...
Processing test split_19...
Processing test split_20...


In [10]:
neg, pos = y.value_counts()
scale_pos_weight = neg / pos

models = [
    LGBMClassifier(
        n_estimators=600,
        num_leaves=63,
        learning_rate=0.05,
        scale_pos_weight=scale_pos_weight,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=1,
        n_jobs=-1
    ),
    LGBMClassifier(
        n_estimators=800,
        num_leaves=127,
        learning_rate=0.03,
        scale_pos_weight=scale_pos_weight,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=2,
        n_jobs=-1
    ),
    LGBMClassifier(
        n_estimators=400,
        num_leaves=31,
        learning_rate=0.07,
        scale_pos_weight=scale_pos_weight,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=3,
        n_jobs=-1
    ),
]

probas_test = np.zeros(len(X_test))

for m in models:
    m.fit(X, y)
    probas_test += m.predict_proba(X_test)[:, 1]

probas_test /= len(models)



from sklearn.calibration import CalibratedClassifierCV

calibrator = CalibratedClassifierCV(
    models[1],  # le plus fort
    method="isotonic",
    cv=5
)

calibrator.fit(X, y)
probas_test = calibrator.predict_proba(X_test)[:, 1]
THRESHOLD = 0.10  # on ajustera après
test_df["target"] = (probas_test >= THRESHOLD).astype(int)


[LightGBM] [Info] Number of positive: 148, number of negative: 2895
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002497 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15204
[LightGBM] [Info] Number of data points in the train set: 3043, number of used features: 65
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048636 -> initscore=-2.973528
[LightGBM] [Info] Start training from score -2.973528
[LightGBM] [Info] Number of positive: 148, number of negative: 2895
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15204
[LightGBM] [Info] Number of data points in the train set: 3043, number of used features: 65
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048636 -> initscore=-2.973528
[LightGBM] [Info] Start training from score -2.973528
[LightGBM] [Info] 

In [12]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [13]:
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from catboost import CatBoostClassifier
import numpy as np

# =====================
# 1. Déséquilibre
# =====================
neg, pos = y.value_counts()
scale_pos_weight = neg / pos

# =====================
# 2. LightGBM (base)
# =====================
lgb_base = LGBMClassifier(
    n_estimators=800,
    num_leaves=63,
    learning_rate=0.03,
    scale_pos_weight=scale_pos_weight,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# =====================
# 3. Calibration
# =====================
lgb_calibrated = CalibratedClassifierCV(
    estimator=lgb_base,
    method="isotonic",
    cv=5
)

# ⚠️ OBLIGATOIRE
lgb_calibrated.fit(X, y)

# =====================
# 4. CatBoost
# =====================
cat_model = CatBoostClassifier(
    iterations=1200,
    learning_rate=0.03,
    depth=8,
    loss_function="Logloss",
    eval_metric="AUC",
    scale_pos_weight=scale_pos_weight,
    verbose=False,
    random_seed=42
)

cat_model.fit(X, y)

# =====================
# 5. Prédictions
# =====================
proba_lgb = lgb_calibrated.predict_proba(X_test)[:, 1]
proba_cat = cat_model.predict_proba(X_test)[:, 1]

# =====================
# 6. Ensemble pondéré
# =====================
probas_ensemble = 0.6 * proba_lgb + 0.4 * proba_cat

# =====================
# 7. Seuil
# =====================
THRESHOLD = 0.10
test_df["target"] = (probas_ensemble >= THRESHOLD).astype(int)


[LightGBM] [Info] Number of positive: 118, number of negative: 2316
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15167
[LightGBM] [Info] Number of data points in the train set: 2434, number of used features: 65
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048480 -> initscore=-2.976912
[LightGBM] [Info] Start training from score -2.976912
[LightGBM] [Info] Number of positive: 118, number of negative: 2316
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15171
[LightGBM] [Info] Number of data points in the train set: 2434, number of used features: 65
[LightGBM] [Info] [binar

In [14]:
#!pip install catboost


In [15]:
#DUPLICATION
# submission = (
#     test_df[["object_id", "target"]]
#     .groupby("object_id", as_index=False)
#     .max()   # max = logique OR → parfait pour binaire
# )

submission = (
    test_df[["object_id", "target"]]
    .drop_duplicates(subset="object_id", keep="last")
)

#VERIFICATION CRITIQUE AVANT SOUMISSION
print("Shape :", submission.shape)
print("Target dtype :", submission["target"].dtype)
print("Unique targets :", submission["target"].unique())
print("Duplicate object_id :", submission["object_id"].duplicated().sum())
#SAUVEGARDE POUR KAGGLE
#test_df["target"] = (test_df["target"] >= 0.5).astype(int)
submission.to_csv("/content/submission.csv", index=False)
print("submission.csv généré — prêt pour Kaggle ✅")


Shape : (7135, 2)
Target dtype : int64
Unique targets : [0 1]
Duplicate object_id : 0
submission.csv généré — prêt pour Kaggle ✅


In [16]:
print(submission["target"].min(), submission["target"].max())


0 1


In [17]:
sub = pd.read_csv('submission.csv')
sub.head()

Unnamed: 0,object_id,target
0,Elrim_sador_hun,0
1,Eluwaith_Mithrim_nothrim,0
2,Eru_heledir_archam,0
3,Gonhir_anann_fuin,0
4,Gwathuirim_eilian_fervain,0
