In [24]:
import pandas as pd
import numpy as np
import os
import lightgbm as lgb
from scipy.optimize import curve_fit
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from joblib import Parallel, delayed
import warnings

In [25]:
warnings.filterwarnings('ignore')

In [26]:
DATA_PATH = '/kaggle/input/mallorn-dataset' 
print(f"Data Path: {DATA_PATH}")

Data Path: /kaggle/input/mallorn-dataset


In [27]:
def bazin_func(t, A, B, t0, tau_fall, tau_rise):
    with np.errstate(over='ignore', invalid='ignore'):
        flux = A * (np.exp(-(t - t0) / tau_fall) / (1 + np.exp(-(t - t0) / tau_rise))) + B
    return np.nan_to_num(flux)

In [28]:
def fit_bazin(time, flux, flux_err):
    if len(time) < 5: 
        return {k: np.nan for k in ['A', 'B', 't0', 'tau_fall', 'tau_rise', 'chi2']}

    peak_idx = np.argmax(flux)
    # Initial guesses
    p0 = [flux[peak_idx], np.min(flux), time[peak_idx], 50.0, 10.0]
    # Bounds
    bounds = ([0, -np.inf, time.min()-50, 1e-3, 1e-3], [np.inf, np.inf, time.max()+50, 500, 500])

    try:
        popt, _ = curve_fit(bazin_func, time, flux, p0=p0, sigma=flux_err, bounds=bounds, maxfev=1000)
        residuals = flux - bazin_func(time, *popt)
        chi2 = np.sum((residuals / flux_err)**2) / (len(time) - 5)
        return {'A': popt[0], 'B': popt[1], 't0': popt[2], 'tau_fall': popt[3], 'tau_rise': popt[4], 'chi2': chi2}
    except:
        return {k: np.nan for k in ['A', 'B', 't0', 'tau_fall', 'tau_rise', 'chi2']}

In [29]:
def get_gp_prediction(time, flux, flux_err, t_query):
    if len(time) < 3: return np.nan
    kernel = C(1.0) * RBF(length_scale=20.0) + WhiteKernel(noise_level=1.0)
    gp = GaussianProcessRegressor(kernel=kernel, alpha=flux_err**2, n_restarts_optimizer=0)
    try:
        gp.fit(time.reshape(-1, 1), flux)
        pred, _ = gp.predict(np.array([[t_query]]), return_std=True)
        return pred[0]
    except:
        return np.nan

In [30]:
def calculate_stetson(flux, flux_err):
    n = len(flux)
    if n < 2: return np.nan, np.nan
    
    mean_flux = np.mean(flux)
    delta = (flux - mean_flux) / (flux_err + 1e-6) 
    
    abs_delta_mean = np.mean(np.abs(delta))
    delta_sq_mean = np.mean(delta**2)
    k = (1 / np.sqrt(n)) * (abs_delta_mean / np.sqrt(delta_sq_mean))

    j = np.sum(np.sign(delta[:-1] * delta[1:]) * np.sqrt(np.abs(delta[:-1] * delta[1:])))
    j = (j / (n - 1)) * np.sign(j) 
    
    return j, k

In [31]:
def process_single_object(obj_id, df_obj):
    feats = {'object_id': obj_id}
    t_min = df_obj['Time (MJD)'].min()
    df_obj['Time_Rel'] = df_obj['Time (MJD)'] - t_min
    
    filters = ['u', 'g', 'r', 'i', 'z', 'y']
    peak_time = np.nan
    max_flux_global = -np.inf
    
    for f in filters:
        df_f = df_obj[df_obj['Filter'] == f]
        if df_f.empty:
            for stat in ['mean', 'max', 'min', 'std', 'skew', 'stetson_j', 'stetson_k']:
                feats[f'{f}_{stat}'] = np.nan
            continue
            
        feats[f'{f}_mean'] = df_f['Flux'].mean()
        feats[f'{f}_max'] = df_f['Flux'].max()
        feats[f'{f}_min'] = df_f['Flux'].min()
        feats[f'{f}_std'] = df_f['Flux'].std()
        feats[f'{f}_skew'] = df_f['Flux'].skew()
        
        j, k = calculate_stetson(df_f['Flux'].values, df_f['Flux_err'].values)
        feats[f'{f}_stetson_j'] = j
        feats[f'{f}_stetson_k'] = k
        
        if f in ['g', 'r']:
            current_max = df_f['Flux'].max()
            if current_max > max_flux_global:
                max_flux_global = current_max
                peak_time = df_f.loc[df_f['Flux'].idxmax(), 'Time_Rel']

        if f in ['g', 'r', 'i']:
            bazin = fit_bazin(df_f['Time_Rel'].values, df_f['Flux'].values, df_f['Flux_err'].values)
            for k_bazin, v_bazin in bazin.items():
                feats[f'bazin_{f}_{k_bazin}'] = v_bazin

    if not np.isnan(peak_time):
        flux_at_peak = {}
        for f in filters:
            df_f = df_obj[df_obj['Filter'] == f]
            flux_at_peak[f] = get_gp_prediction(
                df_f['Time_Rel'].values, df_f['Flux'].values, df_f['Flux_err'].values, peak_time
            )
        pairs = [('u','g'), ('g','r'), ('r','i'), ('i','z')]
        for f1, f2 in pairs:
            val1, val2 = flux_at_peak.get(f1, np.nan), flux_at_peak.get(f2, np.nan)
            feats[f'gp_color_{f1}_{f2}'] = val1 - val2 if (not np.isnan(val1) and not np.isnan(val2)) else np.nan
            
    return feats

In [32]:
def extract_features_parallel(log_df, data_path, n_jobs=-1):
    print("Loading raw lightcurves into memory...")
    all_chunks = []
    unique_splits = log_df['split'].unique()
    
    for split in unique_splits:
        is_train = 'target' in log_df.columns
        filename = 'train_full_lightcurves.csv' if is_train else 'test_full_lightcurves.csv'
        
        path = os.path.join(data_path, split, filename)
        if os.path.exists(path):
            df_chunk = pd.read_csv(path)
            valid_ids = set(log_df[log_df['split'] == split]['object_id'])
            df_chunk = df_chunk[df_chunk['object_id'].isin(valid_ids)]
            all_chunks.append(df_chunk)
            
    if not all_chunks:
        print("Error: No lightcurves found!")
        return pd.DataFrame()

    full_lc = pd.concat(all_chunks)
    
    grouped = full_lc.groupby('object_id')
    object_ids = list(grouped.groups.keys())
    
    print(f"Extracting features for {len(object_ids)} objects using {n_jobs} cores...")
    
    results = Parallel(n_jobs=n_jobs, backend='loky')(
        delayed(process_single_object)(obj_id, grouped.get_group(obj_id))
        for obj_id in tqdm(object_ids)
    )
    
    return pd.DataFrame(results)

from tqdm.auto import tqdm

In [33]:
print("--- PROCESSING TRAIN DATA ---")
train_log = pd.read_csv(os.path.join(DATA_PATH, 'train_log.csv'))

df_train_features = extract_features_parallel(train_log, DATA_PATH, n_jobs=4)

df_train_final = train_log.merge(df_train_features, on='object_id', how='left')
print(f"Train Data Shape: {df_train_final.shape}")

--- PROCESSING TRAIN DATA ---
Loading raw lightcurves into memory...
Extracting features for 3043 objects using 4 cores...


  0%|          | 0/3043 [00:00<?, ?it/s]

  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return tra

Train Data Shape: (3043, 72)




In [34]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder

In [35]:
ignore_cols = ['object_id', 'target', 'split', 'English Translation', 'SpecType']
features = [c for c in df_train_final.columns if c not in ignore_cols]

In [36]:
X = df_train_final[features].copy()

X = X.replace([np.inf, -np.inf], np.nan)

X = X.fillna(0)
y = df_train_final['target']

In [37]:
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

In [38]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
models_ensemble = []

In [39]:
from catboost import CatBoostClassifier 

In [40]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    print(f"Fold {fold+1}: Running SMOTE...")
    try:
        smote = SMOTE(sampling_strategy=0.2, random_state=42)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    except ValueError:
        X_train_res, y_train_res = X_train, y_train

    clf_lgb = lgb.LGBMClassifier(
        objective='binary', boosting_type='gbdt', learning_rate=0.03,
        num_leaves=31, max_depth=10, n_estimators=1000, 
        subsample=0.8, colsample_bytree=0.7, verbose=-1, random_state=42
    )
    
    clf_xgb = XGBClassifier(
        objective='binary:logistic', eval_metric='logloss', learning_rate=0.03,
        max_depth=8, n_estimators=1000, subsample=0.8, colsample_bytree=0.7,
        random_state=42, use_label_encoder=False
    )
    
    clf_cat = CatBoostClassifier(
        loss_function='Logloss', eval_metric='F1', iterations=1000,
        learning_rate=0.03, depth=6, l2_leaf_reg=3,
        verbose=0, random_seed=42, allow_writing_files=False
    )
    
    eclf = VotingClassifier(
        estimators=[('lgb', clf_lgb), ('xgb', clf_xgb), ('cat', clf_cat)],
        voting='soft',
        weights=[1, 1, 1] 
    )
    
    eclf.fit(X_train_res, y_train_res)
    
    val_probs = eclf.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_probs
    
    best_f1, best_th = 0, 0.5
    for th in np.linspace(0.1, 0.9, 100):
        score = f1_score(y_val, (val_probs > th).astype(int))
        if score > best_f1: best_f1, best_th = score, th
            
    print(f"Fold {fold+1} Trifecta F1: {best_f1:.4f} (Th: {best_th:.2f})")
    models_ensemble.append(eclf)

Fold 1: Running SMOTE...
Fold 1 Trifecta F1: 0.4364 (Th: 0.21)
Fold 2: Running SMOTE...
Fold 2 Trifecta F1: 0.5263 (Th: 0.21)
Fold 3: Running SMOTE...
Fold 3 Trifecta F1: 0.6970 (Th: 0.10)
Fold 4: Running SMOTE...
Fold 4 Trifecta F1: 0.6552 (Th: 0.29)
Fold 5: Running SMOTE...
Fold 5 Trifecta F1: 0.5846 (Th: 0.20)


In [41]:
thresholds = np.linspace(0.01, 0.99, 200)
f1_list = [f1_score(y, (oof_preds > t).astype(int)) for t in thresholds]
global_best_thresh = thresholds[np.argmax(f1_list)]
print(f"\nGlobal Best F1: {np.max(f1_list):.4f} at Threshold: {global_best_thresh:.2f}")


Global Best F1: 0.5515 at Threshold: 0.21


In [42]:
test_log = pd.read_csv(os.path.join(DATA_PATH, 'test_log.csv'))

In [None]:
df_test_features = extract_features_parallel(test_log, DATA_PATH, n_jobs=4)

Loading raw lightcurves into memory...
Extracting features for 7135 objects using 4 cores...


  0%|          | 0/7135 [00:00<?, ?it/s]

  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return transform * (func(xdata, *params) - ydata)
  return tra

In [47]:
df_test_final = test_log.merge(df_test_features, on='object_id', how='left')
print(f"Test Data Shape: {df_test_final.shape}")

Test Data Shape: (7135, 71)


In [48]:
X_test = df_test_final[features].copy()
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)

for col in X_test.select_dtypes(include=['object']).columns:
    X_test[col] = pd.to_numeric(X_test[col], errors='coerce').fillna(0)

test_probs = np.zeros(len(X_test))
for model in models_ensemble:
    test_probs += model.predict_proba(X_test)[:, 1] / len(models_ensemble)

predictions = (test_probs > global_best_thresh).astype(int)

In [49]:
submission = pd.DataFrame({
    'object_id': df_test_final['object_id'],
    'prediction': predictions
})

submission.to_csv('submission_2.csv', index=False)
print("\nSuccess! Saved submission.csv with SMOTE & Ensemble.")
print(submission.head())


Success! Saved submission.csv with SMOTE & Ensemble.
                      object_id  prediction
0      Eluwaith_Mithrim_nothrim           0
1            Eru_heledir_archam           0
2             Gonhir_anann_fuin           0
3  Gwathuirim_haradrim_tegilbor           0
4              achas_minai_maen           0
