In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from pathlib import Path
import pickle
import sys

# --- Inputs: engineered feature tables -------------------------------------
FEAT_DIR = Path('/kaggle/input/wdwfdws')
FE_TRAIN = FEAT_DIR.joinpath('fe_v2_train.parquet')
FE_TEST  = FEAT_DIR.joinpath('fe_v2_test.parquet')

# --- Outputs: artefacts written by this cell -------------------------------
WORK_DIR = Path('/kaggle/working')
MODEL_PKL = WORK_DIR.joinpath('lgbm_baseline.pkl')
OOF_CSV   = WORK_DIR.joinpath('oof_preds.csv')
TEST_CSV  = WORK_DIR.joinpath('test_preds.csv')

# Fast path: when both the pickled model and the test predictions are already
# on disk, stop here instead of retraining.
# NOTE(review): sys.exit raises SystemExit; inside a notebook kernel this may be
# reported as a cell error and halt "Run All" — confirm that later cells still
# execute when this branch is taken.
if all(artefact.exists() for artefact in (MODEL_PKL, TEST_CSV)):
    print('Baseline model already trained – skipping.')
    sys.exit(0)

# Load the pre-computed feature tables for train and test.
train = pd.read_parquet(FE_TRAIN)
test  = pd.read_parquet(FE_TEST)

TARGET = 'y'
ID_COLS = ['id1', 'id2', 'id3', 'id5']

# Label vector and feature matrices: the target and whichever identifier
# columns are present are removed from the features.
y = train[TARGET].astype('float32')
X = train.drop(columns=[TARGET] + [col for col in ID_COLS if col in train.columns])
X_test = test.drop(columns=[col for col in ID_COLS if col in test.columns])

# Ensure all columns are numeric. errors='coerce' turns any value that cannot
# be parsed as a number into NaN rather than raising.
for df in [X, X_test]:
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Sanity check on dtypes. 'number' covers every numeric width (int8, uint16,
# float16, ...); the previous ['int', 'float', 'bool'] list only matched a few
# specific widths and would flag down-cast features as non-numeric.
bad_cols = X.select_dtypes(exclude=['number', 'bool']).columns
assert len(bad_cols) == 0, f"Non-numeric columns remain: {bad_cols}"

# The model is later applied to X_test positionally, so make sure it exposes
# exactly the training features in the training order.
missing_in_test = X.columns.difference(X_test.columns)
assert len(missing_in_test) == 0, f"Test set is missing feature columns: {list(missing_in_test)}"
X_test = X_test[X.columns]

# MAP@7 scorer
def apk(actual, pred, k=7):
    """Average precision at cut-off ``k`` for a single ranked prediction list.

    Args:
        actual: collection of relevant items.
        pred: ranked predictions, best first; only the first ``k`` are scored.
        k: cut-off rank.

    Returns:
        Sum of precision values at each rank where a new relevant item is hit,
        divided by ``min(len(actual), k)``; ``0.0`` when ``actual`` is empty.
        A repeated prediction is only credited the first time it appears.
    """
    ranked = pred[:k] if len(pred) > k else pred
    hits = 0
    precision_sum = 0.0
    seen = []  # items already ranked above the current position
    for rank, item in enumerate(ranked, start=1):
        if item in actual and item not in seen:
            hits += 1
            precision_sum += hits / rank
        seen.append(item)
    if not actual:
        return 0.0
    return precision_sum / min(len(actual), k)

def mapk(df, k=7):
    """Mean of ``apk`` over the ``id2`` groups of ``df``.

    Args:
        df: frame with columns ``id2`` (grouping key), ``id3`` (item id),
            ``y_true`` (rows equal to 1 are the relevant items) and ``y_pred``
            (score used to rank the items, highest first).
        k: cut-off rank forwarded to ``apk``.

    Returns:
        ``np.mean`` of the per-group average-precision values.
    """
    per_group = [
        apk(
            group.loc[group.y_true == 1, 'id3'].tolist(),
            group.sort_values('y_pred', ascending=False)['id3'].tolist(),
            k,
        )
        for _, group in df.groupby('id2')
    ]
    return np.mean(per_group)

# LightGBM hyper-parameters. The number of boosting rounds is deliberately NOT
# listed here: LightGBM gives any `num_iterations` alias found in params
# (such as `n_estimators`) precedence over the `num_boost_round` argument of
# lgb.train, which previously made the 10000 passed below dead code.
params = dict(
    objective='binary',
    metric='binary_logloss',  # required for early stopping
    learning_rate=0.05,
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    max_depth=-1,
    verbose=-1,
)

# Single source of truth for the round budget. 1000 is the value that was
# effectively in force before (the params alias won over the argument).
NUM_BOOST_ROUND = 1000
EARLY_STOPPING_ROUNDS = 100
LOG_PERIOD = 100

# NOTE(review): folds are contiguous, unshuffled row blocks and are not grouped
# by id2, while the CV metric ranks rows within id2 — confirm that this split
# matches the intended validation scheme.
n_splits = 4
folds = KFold(n_splits=n_splits, shuffle=False)

# Out-of-fold predictions for train, and fold-averaged predictions for test.
oof_pred = np.zeros(len(train), dtype='float32')
test_pred = np.zeros(len(test), dtype='float32')

for fold, (tr_idx, val_idx) in enumerate(folds.split(train)):
    print(f'fold {fold+1}/{n_splits} ...', end='')
    dtrain = lgb.Dataset(X.iloc[tr_idx], label=y.iloc[tr_idx])
    dval = lgb.Dataset(X.iloc[val_idx], label=y.iloc[val_idx])

    clf = lgb.train(
        params,
        dtrain,
        valid_sets=[dval],
        valid_names=['val'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(period=LOG_PERIOD)
        ],
        num_boost_round=NUM_BOOST_ROUND
    )

    # Score the held-out rows with the best iteration found by early stopping,
    # and add this fold's share of the averaged test prediction.
    oof_pred[val_idx] = clf.predict(X.iloc[val_idx], num_iteration=clf.best_iteration)
    test_pred += clf.predict(X_test, num_iteration=clf.best_iteration) / n_splits
    print('done')

# Persist the booster currently bound to `clf`, i.e. the model from the final
# fold only (the test predictions, by contrast, average every fold).
with MODEL_PKL.open('wb') as f:
    pickle.dump(clf, f)

# Out-of-fold frame: identifiers plus the true label and the held-out score.
oof_df = train[['id1', 'id2', 'id3']].assign(y_true=y.values, y_pred=oof_pred)
score = mapk(oof_df, k=7)
print(f'CV MAP@7 = {score:.5f}')
oof_df.to_csv(OOF_CSV, index=False)

# Test predictions keyed by all four identifier columns.
sub = test[['id1', 'id2', 'id3', 'id5']].assign(pred=test_pred)
sub.to_csv(TEST_CSV, index=False)
print('Artefacts saved:', MODEL_PKL.name, OOF_CSV.name, TEST_CSV.name)


fold 1/4 ...



Training until validation scores don't improve for 100 rounds
[100]	val's binary_logloss: 0.116586
Early stopping, best iteration is:
[87]	val's binary_logloss: 0.116438
done
fold 2/4 ...



Training until validation scores don't improve for 100 rounds
[100]	val's binary_logloss: 0.107993
[200]	val's binary_logloss: 0.108425
Early stopping, best iteration is:
[101]	val's binary_logloss: 0.107975
done
fold 3/4 ...



Training until validation scores don't improve for 100 rounds
[100]	val's binary_logloss: 0.0908431
[200]	val's binary_logloss: 0.0903453
Early stopping, best iteration is:
[194]	val's binary_logloss: 0.0903195
done
fold 4/4 ...



Training until validation scores don't improve for 100 rounds
[100]	val's binary_logloss: 0.113605
Early stopping, best iteration is:
[88]	val's binary_logloss: 0.113534
done
CV MAP@7 = 0.04832
Artefacts saved: lgbm_baseline.pkl oof_preds.csv test_preds.csv


In [3]:
import pandas as pd
from pathlib import Path

WORK_DIR = Path('/kaggle/working')
TEAM_NAME = 'amex'  # <-- Replace with your team name
TEST_PRED = WORK_DIR.joinpath('test_preds.csv')
SUBMISSION = WORK_DIR.joinpath(f'r2_submission_{TEAM_NAME}.csv')

# Columns the submission must contain, in this exact order.
required_cols = ['id1', 'id2', 'id3', 'id5', 'pred']

# Load the saved test predictions and keep only the required columns
# (a missing column raises KeyError here).
# NOTE(review): the preview below shows id2/id3 serialised as floats
# (e.g. 1362907.0) — confirm the grader accepts that format.
sub = pd.read_csv(TEST_PRED).loc[:, required_cols]

# Write the submission file and show a short preview.
sub.to_csv(SUBMISSION, index=False)
print("Submission file saved as {}".format(SUBMISSION))
print(sub.head())


Submission file saved as /kaggle/working/r2_submission_amex.csv
                                               id1        id2       id3  \
0   1362907_91950_16-23_2023-11-04 18:56:26.000794  1362907.0   91950.0   
1      1082599_88356_16-23_2023-11-04 06:08:53.373  1082599.0   88356.0   
2  1888466_958700_16-23_2023-11-05 10:07:28.000725  1888466.0  958700.0   
3     1888971_795739_16-23_2023-11-04 12:25:28.244  1888971.0  795739.0   
4      1256369_82296_16-23_2023-11-05 06:45:26.657  1256369.0   82296.0   

          id5      pred  
0  2023-11-04  0.005301  
1  2023-11-04  0.012505  
2  2023-11-05  0.943889  
3  2023-11-04  0.006382  
4  2023-11-05  0.006091  
