<a href="https://colab.research.google.com/github/jaadu-1/Algo-trading/blob/main/notebook674cd42c99.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

krish39696_submission_path = kagglehub.dataset_download('krish39696/submission')
krish39696_krish_data_path = kagglehub.dataset_download('krish39696/krish-data')

print('Data source import complete.')


In [None]:
!pip install lightgbm
!pip install pyarrow --quiet

import pandas as pd
import pyarrow.parquet as pq
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score



In [None]:
# Base data path
DATA_PATH = "/kaggle/input/krish-data"


In [None]:
# Core datasets
train = pd.read_parquet(f"{DATA_PATH}/train_data.parquet")
test = pd.read_parquet(f"{DATA_PATH}/test_data.parquet")
submission = pd.read_csv(f"{DATA_PATH}/685404e30cfdb_submission_template.csv")

# Supplementary files
events = pd.read_parquet(f"{DATA_PATH}/add_event.parquet")
offers = pd.read_parquet(f"{DATA_PATH}/offer_metadata.parquet")
data_dict = pd.read_csv(f"{DATA_PATH}/data_dictionary.csv")

# This file was uploaded separately
transactions = pd.read_parquet("/kaggle/input/krish-data/add_trans.parquet")

print("✅ All 7 datasets loaded successfully.")


✅ All 7 datasets loaded successfully.


In [None]:
# ====================
# STEP X: Preprocessing
# ====================

from sklearn.preprocessing import LabelEncoder

# Separate column types
num_cols = train.select_dtypes(include=['float64', 'int64']).columns
cat_cols = train.select_dtypes(include=['object']).columns

# Ensure we drop the target variable if it's in cat_cols
if 'y' in cat_cols:
    cat_cols = cat_cols.drop('y')

# Handle numeric columns (fill missing with mean)
for col in num_cols:
    if col != 'y':  # don't fill target column
        train[col] = train[col].fillna(train[col].mean())
        test[col] = test[col].fillna(train[col].mean())

# Handle categorical columns (label encoding)
for col in cat_cols:
    if col in test.columns:
        le = LabelEncoder()
        combined = pd.concat([train[col], test[col]], axis=0).astype(str)
        le.fit(combined)
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))
    else:
        print(f"⚠️ Skipping {col} - not found in test set.")

print("✅ Preprocessing done.")


✅ Preprocessing done.


In [None]:
# Get only numerical columns (excluding 'y' if present)
num_cols = train.select_dtypes(include=['float64', 'int64']).columns.drop('y', errors='ignore')

# Fit: train only
train['num_mean'] = train[num_cols].mean(axis=1)
train['num_std'] = train[num_cols].std(axis=1)
train['num_min'] = train[num_cols].min(axis=1)
train['num_max'] = train[num_cols].max(axis=1)

# Apply same columns to test
test['num_mean'] = test[num_cols].mean(axis=1)
test['num_std'] = test[num_cols].std(axis=1)
test['num_min'] = test[num_cols].min(axis=1)
test['num_max'] = test[num_cols].max(axis=1)

print("✅ Basic row-wise statistical features added.")


  train['num_mean'] = train[num_cols].mean(axis=1)
  train['num_std'] = train[num_cols].std(axis=1)
  train['num_min'] = train[num_cols].min(axis=1)
  train['num_max'] = train[num_cols].max(axis=1)
  test['num_mean'] = test[num_cols].mean(axis=1)
  test['num_std'] = test[num_cols].std(axis=1)
  test['num_min'] = test[num_cols].min(axis=1)


✅ Basic row-wise statistical features added.


  test['num_max'] = test[num_cols].max(axis=1)


In [None]:
# Apply only if categorical exists
if 'id2' in train.columns:
    freq_map = train['id2'].value_counts().to_dict()
    train['id2_freq'] = train['id2'].map(freq_map)
    test['id2_freq'] = test['id2'].map(freq_map)
    print("✅ Frequency encoding for id2 added.")

✅ Frequency encoding for id2 added.


  train['id2_freq'] = train['id2'].map(freq_map)
  test['id2_freq'] = test['id2'].map(freq_map)


In [None]:
# === Create aggregate sum feature from f23 to f27 ===
cols_to_sum = [f"f{i}" for i in range(23, 28) if f"f{i}" in train.columns]

train["f23_f27_sum"] = train[cols_to_sum].sum(axis=1)
test["f23_f27_sum"] = test[cols_to_sum].sum(axis=1)

print("✅ Created feature: f23_f27_sum")


  train["f23_f27_sum"] = train[cols_to_sum].sum(axis=1)


✅ Created feature: f23_f27_sum


  test["f23_f27_sum"] = test[cols_to_sum].sum(axis=1)


In [None]:
# === Feature Engineering: Interest Scores (f1 to f12) ===
interest_cols = [f"f{i}" for i in range(1, 13) if f"f{i}" in train.columns]

# Sum, mean, std, max, min, range
for df in [train, test]:
    df["interest_sum"] = df[interest_cols].sum(axis=1)
    df["interest_mean"] = df[interest_cols].mean(axis=1)
    df["interest_max"] = df[interest_cols].max(axis=1)
    df["interest_min"] = df[interest_cols].min(axis=1)
    df["interest_std"] = df[interest_cols].std(axis=1)
    df["interest_range"] = df["interest_max"] - df["interest_min"]

    # Find index of top topic (f1 → 1, f2 → 2, ...)
    df["top_topic_idx"] = df[interest_cols].idxmax(axis=1).str.extract(r"f(\d+)").astype(float)

print("✅ Created features based on interest scores (f1 to f12).")


  df["interest_sum"] = df[interest_cols].sum(axis=1)
  df["interest_mean"] = df[interest_cols].mean(axis=1)
  df["interest_max"] = df[interest_cols].max(axis=1)
  df["interest_min"] = df[interest_cols].min(axis=1)
  df["interest_std"] = df[interest_cols].std(axis=1)
  df["interest_range"] = df["interest_max"] - df["interest_min"]
  df["top_topic_idx"] = df[interest_cols].idxmax(axis=1).str.extract(r"f(\d+)").astype(float)
  df["interest_sum"] = df[interest_cols].sum(axis=1)
  df["interest_mean"] = df[interest_cols].mean(axis=1)
  df["interest_max"] = df[interest_cols].max(axis=1)
  df["interest_min"] = df[interest_cols].min(axis=1)
  df["interest_std"] = df[interest_cols].std(axis=1)
  df["interest_range"] = df["interest_max"] - df["interest_min"]


✅ Created features based on interest scores (f1 to f12).


  df["top_topic_idx"] = df[interest_cols].idxmax(axis=1).str.extract(r"f(\d+)").astype(float)


In [None]:
# === Feature Engineering: Engagement Score (f14 to f21) ===
engagement_cols = [f"f{i}" for i in range(14, 22) if f"f{i}" in train.columns]

for df in [train, test]:
    df["engagement_score"] = df[engagement_cols].sum(axis=1)

print("✅ Created engagement_score from f14 to f21.")


  df["engagement_score"] = df[engagement_cols].sum(axis=1)


✅ Created engagement_score from f14 to f21.


  df["engagement_score"] = df[engagement_cols].sum(axis=1)


In [None]:
# === Feature Engineering: Combined Feature from f39 to f75 ===
block_39_75_cols = [f"f{i}" for i in range(39, 76) if f"f{i}" in train.columns]

for df in [train, test]:
    df["feature_block_39_75"] = df[block_39_75_cols].sum(axis=1)

print("✅ Created feature_block_39_75 from f39 to f75.")


✅ Created feature_block_39_75 from f39 to f75.


  df["feature_block_39_75"] = df[block_39_75_cols].sum(axis=1)
  df["feature_block_39_75"] = df[block_39_75_cols].sum(axis=1)


In [None]:
# Step: Time spent features
time_cols = [f"f{i}" for i in range(59, 76) if i != 67]

# 1. Total time spent
train["total_time_spent"] = train[time_cols].sum(axis=1)
test["total_time_spent"] = test[time_cols].sum(axis=1)

# 2. Mean time spent
train["mean_time_spent"] = train[time_cols].mean(axis=1)
test["mean_time_spent"] = test[time_cols].mean(axis=1)

# 3. Standard deviation of time spent
train["std_time_spent"] = train[time_cols].std(axis=1)
test["std_time_spent"] = test[time_cols].std(axis=1)

# 4. Max time spent on any activity
train["max_time_spent"] = train[time_cols].max(axis=1)
test["max_time_spent"] = test[time_cols].max(axis=1)

# 5. Time spent range (max - min)
train["range_time_spent"] = train[time_cols].max(axis=1) - train[time_cols].min(axis=1)
test["range_time_spent"] = test[time_cols].max(axis=1) - test[time_cols].min(axis=1)

print("✅ 5 time-spent features created from f59-f75 (excluding f67).")

  train["total_time_spent"] = train[time_cols].sum(axis=1)
  test["total_time_spent"] = test[time_cols].sum(axis=1)
  train["mean_time_spent"] = train[time_cols].mean(axis=1)
  test["mean_time_spent"] = test[time_cols].mean(axis=1)
  train["std_time_spent"] = train[time_cols].std(axis=1)
  test["std_time_spent"] = test[time_cols].std(axis=1)
  train["max_time_spent"] = train[time_cols].max(axis=1)
  test["max_time_spent"] = test[time_cols].max(axis=1)
  train["range_time_spent"] = train[time_cols].max(axis=1) - train[time_cols].min(axis=1)


✅ 5 time-spent features created from f59-f75 (excluding f67).


  test["range_time_spent"] = test[time_cols].max(axis=1) - test[time_cols].min(axis=1)


In [None]:
# Step: Ratio-based features
ratio_cols = [f"f{i}" for i in range(78, 94)]

# 1. Sum of ratios
train["sum_ratios"] = train[ratio_cols].sum(axis=1)
test["sum_ratios"] = test[ratio_cols].sum(axis=1)

# 2. Mean of ratios
train["mean_ratio"] = train[ratio_cols].mean(axis=1)
test["mean_ratio"] = test[ratio_cols].mean(axis=1)

# 3. Max ratio value
train["max_ratio"] = train[ratio_cols].max(axis=1)
test["max_ratio"] = test[ratio_cols].max(axis=1)

# 4. Std deviation of ratios
train["std_ratio"] = train[ratio_cols].std(axis=1)
test["std_ratio"] = test[ratio_cols].std(axis=1)

# 5. Ratio spread (max - min)
train["ratio_spread"] = train[ratio_cols].max(axis=1) - train[ratio_cols].min(axis=1)
test["ratio_spread"] = test[ratio_cols].max(axis=1) - test[ratio_cols].min(axis=1)

print("✅ 5 ratio-based features created from f78 to f93.")


  train["sum_ratios"] = train[ratio_cols].sum(axis=1)
  test["sum_ratios"] = test[ratio_cols].sum(axis=1)
  train["mean_ratio"] = train[ratio_cols].mean(axis=1)
  test["mean_ratio"] = test[ratio_cols].mean(axis=1)
  train["max_ratio"] = train[ratio_cols].max(axis=1)
  test["max_ratio"] = test[ratio_cols].max(axis=1)
  train["std_ratio"] = train[ratio_cols].std(axis=1)
  test["std_ratio"] = test[ratio_cols].std(axis=1)
  train["ratio_spread"] = train[ratio_cols].max(axis=1) - train[ratio_cols].min(axis=1)


✅ 5 ratio-based features created from f78 to f93.


  test["ratio_spread"] = test[ratio_cols].max(axis=1) - test[ratio_cols].min(axis=1)


In [None]:
# Step: Behavioral category features
behavior_cols = [f"f{i}" for i in range(94, 114)]

# 1. Sum of behavioral scores (activity intensity)
train["behavior_sum"] = train[behavior_cols].sum(axis=1)
test["behavior_sum"] = test[behavior_cols].sum(axis=1)

# 2. Mean behavioral score
train["behavior_mean"] = train[behavior_cols].mean(axis=1)
test["behavior_mean"] = test[behavior_cols].mean(axis=1)

# 3. Count of non-zero behaviors (how many categories are active)
train["behavior_active_count"] = (train[behavior_cols] != 0).sum(axis=1)
test["behavior_active_count"] = (test[behavior_cols] != 0).sum(axis=1)

# 4. Max behavioral score (dominant behavior strength)
train["behavior_max"] = train[behavior_cols].max(axis=1)
test["behavior_max"] = test[behavior_cols].max(axis=1)

# 5. Binary behavior pattern (is any one behavior dominant)
train["behavior_high_variance"] = train[behavior_cols].std(axis=1) > 1.0
test["behavior_high_variance"] = test[behavior_cols].std(axis=1) > 1.0

print("✅ 5 behavioral features created from f94 to f113.")


  train["behavior_sum"] = train[behavior_cols].sum(axis=1)
  test["behavior_sum"] = test[behavior_cols].sum(axis=1)
  train["behavior_mean"] = train[behavior_cols].mean(axis=1)
  test["behavior_mean"] = test[behavior_cols].mean(axis=1)
  train["behavior_active_count"] = (train[behavior_cols] != 0).sum(axis=1)
  test["behavior_active_count"] = (test[behavior_cols] != 0).sum(axis=1)
  train["behavior_max"] = train[behavior_cols].max(axis=1)
  test["behavior_max"] = test[behavior_cols].max(axis=1)


✅ 5 behavioral features created from f94 to f113.


  train["behavior_high_variance"] = train[behavior_cols].std(axis=1) > 1.0
  test["behavior_high_variance"] = test[behavior_cols].std(axis=1) > 1.0


In [None]:
# ================================
# Logical Feature Engineering: Ratios, Clicks, CTRs (Safe Version)
# ================================

# 1. Define relevant columns
ratio_cols = [f"f{i}" for i in range(113, 123)]
click_cols = [f"f{i}" for i in range(124, 130)]
ctr_cols = [f"f{i}" for i in range(130, 139)]

# 2. Create logical derived features
def create_logical_features(df):
    features = {}

    # --- Ratio Features ---
    features["ratio_mean"] = df[ratio_cols].mean(axis=1)
    features["ratio_std"] = df[ratio_cols].std(axis=1)
    features["ratio_max"] = df[ratio_cols].max(axis=1)
    features["ratio_min"] = df[ratio_cols].min(axis=1)
    features["ratio_range"] = features["ratio_max"] - features["ratio_min"]

    # --- Click Features ---
    features["total_clicks"] = df[click_cols].sum(axis=1)
    features["click_mean"] = df[click_cols].mean(axis=1)
    features["click_std"] = df[click_cols].std(axis=1)
    features["click_max"] = df[click_cols].max(axis=1)

    # --- CTR Features ---
    features["ctr_mean"] = df[ctr_cols].mean(axis=1)
    features["ctr_std"] = df[ctr_cols].std(axis=1)
    features["ctr_max"] = df[ctr_cols].max(axis=1)
    features["ctr_min"] = df[ctr_cols].min(axis=1)

    # --- Interaction Features ---
    features["click_to_ratio"] = features["total_clicks"] / (features["ratio_mean"] + 1e-5)
    features["ctr_to_ratio"] = features["ctr_mean"] / (features["ratio_mean"] + 1e-5)
    features["engagement_score"] = (
        0.5 * features["total_clicks"] +
        0.3 * features["ctr_mean"] +
        0.2 * features["ratio_mean"]
    )

    features["ctr_spread"] = features["ctr_max"] - features["ctr_min"]
    features["ctr_to_clicks"] = features["ctr_mean"] / (features["total_clicks"] + 1e-5)

    return pd.DataFrame(features)

# 3. Reset index for clean concat
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# 4. Apply feature creation
train_new_feats = create_logical_features(train)
test_new_feats = create_logical_features(test)

# 5. Concatenate without warning or index issues
train = pd.concat([train, train_new_feats], axis=1)
test = pd.concat([test, test_new_feats], axis=1)

print("✅ Logical CTR/Click/Ratio features added (safe and aligned)")


✅ Logical CTR/Click/Ratio features added (safe and aligned)


In [None]:
import numpy as np

# =============================
# STEP: Ratio Features (f152/f174 to f162/f183)
# =============================

# Define numerator and denominator column lists
numerator_cols = [f"f{i}" for i in range(152, 163)]   # f152 to f162
denominator_cols = [f"f{i}" for i in range(174, 185)] # f174 to f184 (need 11 total)

# Safety check
assert len(numerator_cols) == len(denominator_cols), "Mismatch in ratio feature count."

def make_ratio_features(df):
    # 1. Compute ratios
    ratio_df = pd.DataFrame()
    for num_col, den_col in zip(numerator_cols, denominator_cols):
        ratio_df[f"{num_col}_over_{den_col}"] = df[num_col] / (df[den_col] + 1e-5)

    # 2. Create aggregate features from all 11 ratios
    agg_feats = pd.DataFrame()
    agg_feats["ratio_block_mean"] = ratio_df.mean(axis=1)
    agg_feats["ratio_block_std"] = ratio_df.std(axis=1)
    agg_feats["ratio_block_max"] = ratio_df.max(axis=1)
    agg_feats["ratio_block_min"] = ratio_df.min(axis=1)
    agg_feats["ratio_block_sum"] = ratio_df.sum(axis=1)
    agg_feats["ratio_block_skew"] = ratio_df.skew(axis=1)
    agg_feats["ratio_block_kurt"] = ratio_df.kurtosis(axis=1)

    # 3. Top 3, Bottom 3 features
    agg_feats["top1_ratio"] = ratio_df.apply(lambda x: np.sort(x)[-1], axis=1)
    agg_feats["top2_ratio"] = ratio_df.apply(lambda x: np.sort(x)[-2], axis=1)
    agg_feats["top3_ratio"] = ratio_df.apply(lambda x: np.sort(x)[-3], axis=1)
    agg_feats["bottom1_ratio"] = ratio_df.apply(lambda x: np.sort(x)[0], axis=1)
    agg_feats["bottom2_ratio"] = ratio_df.apply(lambda x: np.sort(x)[1], axis=1)
    agg_feats["bottom3_ratio"] = ratio_df.apply(lambda x: np.sort(x)[2], axis=1)

    # 4. Quantiles
    agg_feats["ratio_q25"] = ratio_df.quantile(0.25, axis=1)
    agg_feats["ratio_q50"] = ratio_df.quantile(0.50, axis=1)
    agg_feats["ratio_q75"] = ratio_df.quantile(0.75, axis=1)

    # 5. Range and ratio between top and bottom
    agg_feats["ratio_range"] = agg_feats["top1_ratio"] - agg_feats["bottom1_ratio"]
    agg_feats["top1_to_sum_ratio"] = agg_feats["top1_ratio"] / (agg_feats["ratio_block_sum"] + 1e-5)

    # 6. Add original ratios (11)
    all_features = pd.concat([ratio_df, agg_feats], axis=1)

    return all_features

# Apply and add to train/test
train_ratios = make_ratio_features(train)
test_ratios = make_ratio_features(test)

# Reset index to avoid fragmentation issues
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Concatenate safely
train = pd.concat([train, train_ratios], axis=1)
test = pd.concat([test, test_ratios], axis=1)

print("✅ Ratio features (f152/f174 to f162/f183) added (30+ total)")


✅ Ratio features (f152/f174 to f162/f183) added (30+ total)


In [None]:
# =============================
# STEP: Add feature from f226 to f309
# =============================

# List of relevant columns
f226_to_f309 = [f"f{i}" for i in range(226, 310)]

# Check if all columns exist
missing_cols = [col for col in f226_to_f309 if col not in train.columns]
if missing_cols:
    print(f"⚠️ Warning: These columns are missing and will be skipped: {missing_cols}")
    f226_to_f309 = [col for col in f226_to_f309 if col in train.columns]

# Compute total sum across these features
train["f226_to_f309_sum"] = train[f226_to_f309].sum(axis=1)
test["f226_to_f309_sum"] = test[f226_to_f309].sum(axis=1)

print("✅ Feature 'f226_to_f309_sum' added to train and test")


✅ Feature 'f226_to_f309_sum' added to train and test


In [None]:
# =============================
# STEP: Add features from f361 to f366
# =============================

f361_366 = [f"f{i}" for i in range(361, 367)]

# Check existence
missing_cols = [col for col in f361_366 if col not in train.columns]
if missing_cols:
    print(f"⚠️ Skipping missing columns: {missing_cols}")
    f361_366 = [col for col in f361_366 if col in train.columns]

# Compute aggregate features
train_f361_366 = train[f361_366]
test_f361_366 = test[f361_366]

agg_feats_train = pd.DataFrame({
    "f361_366_sum": train_f361_366.sum(axis=1),
    "f361_366_mean": train_f361_366.mean(axis=1),
    "f361_366_std": train_f361_366.std(axis=1),
    "f361_366_max": train_f361_366.max(axis=1),
    "f361_366_min": train_f361_366.min(axis=1),
})

agg_feats_test = pd.DataFrame({
    "f361_366_sum": test_f361_366.sum(axis=1),
    "f361_366_mean": test_f361_366.mean(axis=1),
    "f361_366_std": test_f361_366.std(axis=1),
    "f361_366_max": test_f361_366.max(axis=1),
    "f361_366_min": test_f361_366.min(axis=1),
})

# Concatenate in one go to avoid fragmentation warning
train = pd.concat([train, agg_feats_train], axis=1)
test = pd.concat([test, agg_feats_test], axis=1)

print("✅ f361 to f366 features engineered and added.")


✅ f361 to f366 features engineered and added.


In [None]:
# ============================
# 🟡 ADVANCED EVENTS FEATURES (Optimized & Fixed)
# ============================

# Ensure required columns exist
required_cols = ['id2', 'id4', 'id6', 'id7']
missing_cols = [col for col in required_cols if col not in events.columns]
if missing_cols:
    print(f"⚠️ Skipping advanced event features. Missing columns: {missing_cols}")
else:
    # Convert id6 and id7 to numeric for aggregation
    events["id6"] = pd.to_numeric(events["id6"], errors="coerce")
    events["id7"] = pd.to_numeric(events["id7"], errors="coerce")

    # --- 1. Count frequency of event types (id4) per user (LIMIT high-cardinality danger)
    # Only take top 20 most common id4 values to avoid memory blow-up
    top_id4_values = events["id4"].value_counts().nlargest(20).index
    filtered_events = events[events["id4"].isin(top_id4_values)]

    id4_counts = (
        filtered_events.groupby(["id2", "id4"])
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )
    id4_counts.columns = ['id2'] + [f"event_type_{col}" for col in id4_counts.columns if col != 'id2']

    # --- 2. Aggregate statistics for id6 and id7
    event_stats = events.groupby("id2").agg(
        id6_mean=("id6", "mean"),
        id6_std=("id6", "std"),
        id6_sum=("id6", "sum"),
        id7_mean=("id7", "mean"),
        id7_max=("id7", "max"),
        id7_min=("id7", "min")
    ).reset_index()

    # --- Merge into train and test
    train = train.merge(id4_counts, on="id2", how="left")
    test = test.merge(id4_counts, on="id2", how="left")

    train = train.merge(event_stats, on="id2", how="left")
    test = test.merge(event_stats, on="id2", how="left")

    print("✅ Advanced event features (id4, id6, id7) merged successfully.")


✅ Advanced event features (id4, id6, id7) merged successfully.


In [None]:
# ✅ Load Offer Metadata
offer = pd.read_parquet("/kaggle/input/krish-data/offer_metadata.parquet")

# Ensure 'id3' is string for merging
offer["id3"] = offer["id3"].astype(str)
train["id3"] = train["id3"].astype(str)
test["id3"] = test["id3"].astype(str)

# Select only the useful columns
offer_sub = offer[["id3", "id9", "f375", "f376", "f378"]].copy()

# Convert all necessary fields to numeric
offer_sub["f375"] = pd.to_numeric(offer_sub["f375"], errors="coerce")
offer_sub["f376"] = pd.to_numeric(offer_sub["f376"], errors="coerce")
offer_sub["f378"] = pd.to_numeric(offer_sub["f378"], errors="coerce")

# 🛠️ Feature 1: Combined discount strength (mean of f375, f376)
offer_sub["discount_strength"] = offer_sub[["f375", "f376"]].mean(axis=1)

# 🛠️ Feature 2: Use f378 directly as interest_score
offer_sub["interest_score"] = offer_sub["f378"]

# 🛠️ Feature 3: Encode id9 (offer group)
offer_sub["offer_group"] = offer_sub["id9"].astype(str)

# Optional Encoding
from sklearn.preprocessing import LabelEncoder
le_offer = LabelEncoder()
offer_sub["offer_group_encoded"] = le_offer.fit_transform(offer_sub["offer_group"])

# Final cleanup: Drop redundant
offer_features = offer_sub[["id3", "discount_strength", "interest_score", "offer_group_encoded"]]

# 🔁 Merge with train/test
train = train.merge(offer_features, on="id3", how="left")
test = test.merge(offer_features, on="id3", how="left")

print("✅ Clean offer features extracted and merged.")


✅ Clean offer features extracted and merged.


In [None]:
# STEP: Load and prepare transaction data
trans = pd.read_parquet("/kaggle/input/krish-data/add_trans.parquet")

# Convert 'id2' to string for safe merging
trans['id2'] = trans['id2'].astype(str)
train['id2'] = train['id2'].astype(str)
test['id2'] = test['id2'].astype(str)

# Convert relevant columns to correct types
trans["f367"] = pd.to_numeric(trans["f367"], errors="coerce")
trans["f371"] = pd.to_numeric(trans["f371"], errors="coerce")
trans["f370"] = pd.to_datetime(trans["f370"], errors="coerce")

# 🛠️ Feature: Time since last transaction
latest_trans = trans.groupby("id2")["f370"].max().reset_index()
latest_trans["days_since_last_transaction"] = (pd.to_datetime("today") - latest_trans["f370"]).dt.days
latest_trans.drop(columns=["f370"], inplace=True)

# 🛠️ Feature: Aggregate f367 and f371 (mean, sum, std, max, min)
trans_agg = trans.groupby("id2").agg({
    "f367": ['mean', 'std', 'sum', 'max', 'min'],
    "f371": ['mean', 'std', 'sum', 'max', 'min']
}).reset_index()

# Flatten MultiIndex columns
trans_agg.columns = ['id2'] + [f"{col}_{agg}" for col, agg in trans_agg.columns.tolist()[1:]]

# Merge all transaction features
trans_features = trans_agg.merge(latest_trans, on="id2", how="left")

# Final Merge with train/test
train = train.merge(trans_features, on="id2", how="left")
test = test.merge(trans_features, on="id2", how="left")

print("✅ Cleaned and enhanced transaction features merged.")


✅ Cleaned and enhanced transaction features merged.


In [None]:
from sklearn.model_selection import train_test_split

# Drop ID columns or unnecessary ones
drop_cols = ['id1', 'id3', 'id5', 'y']
X = train.drop(columns=[col for col in drop_cols if col in train.columns], errors='ignore')
y = train['y']
X_test = test[X.columns]  # Align test to train columns

# Final shape print
print(f"✅ Final training data shape: {X.shape}")
print(f"✅ Final test data shape: {X_test.shape}")


✅ Final training data shape: (770164, 491)
✅ Final test data shape: (369301, 495)


In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("✅ Train-validation split completed.")


✅ Train-validation split completed.


In [None]:
# Drop identifiers
X = train.drop(columns=['y', 'id2', 'id3', 'id4', 'id7'], errors='ignore')
y = train['y'].astype(float)

# 🚨 Step 1: Remove duplicate columns
X = X.loc[:, ~X.columns.duplicated()]

# 🚨 Step 2: Sanitize feature names
X.columns = (
    X.columns
    .astype(str)
    .str.strip()                     # Remove leading/trailing spaces
    .str.replace(r"[^\w]+", "_", regex=True)  # Replace special characters with _
)

# Optional: check if any column is still problematic
bad_cols = [col for col in X.columns if any(c in col for c in ['"', "'", '\\', '\n'])]
if bad_cols:
    print("⚠️ Still problematic column names:", bad_cols)
else:
    print("✅ Column names cleaned.")


✅ Column names cleaned.


In [None]:
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import early_stopping, log_evaluation

# -------------------------------
# ✅ Drop identifier columns
# -------------------------------
X = train.drop(columns=['y', 'id2', 'id3', 'id4', 'id7'], errors='ignore')
y = train['y'].astype(int)  # Must be 0 or 1 for classification

# -------------------------------
# ✅ Remove duplicate column names (e.g., engagement_score)
# -------------------------------
X = X.loc[:, ~X.columns.duplicated()]

# -------------------------------
# ✅ Sanitize column names (fix special characters for LightGBM)
# -------------------------------
X.columns = X.columns.str.replace(r"[^\w]", "_", regex=True)

# -------------------------------
# ✅ Split into train/validation
# -------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------------------
# ✅ Match sanitization in validation set
# -------------------------------
X_val.columns = X_train.columns  # Ensure same sanitized names

# -------------------------------
# ✅ Train LightGBM classifier
# -------------------------------
model = lgb.LGBMClassifier(
    objective='binary',
    learning_rate=0.05,
    num_leaves=128,
    max_depth=-1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    n_estimators=1000,
    random_state=42
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=100)
    ]
)

# -------------------------------
# ✅ Validation Prediction and AUC
# -------------------------------
val_preds = model.predict_proba(X_val)[:, 1]
val_score = roc_auc_score(y_val, val_preds)
print(f"✅ Validation ROC AUC: {val_score:.5f}")


[LightGBM] [Info] Number of positive: 29702, number of negative: 586429
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.882764 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 62453
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 405
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048207 -> initscore=-2.982837
[LightGBM] [Info] Start training from score -2.982837
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.0775969
[200]	valid_0's binary_logloss: 0.0742437
[300]	valid_0's binary_logloss: 0.0730113
[400]	valid_0's binary_logloss: 0.0723582
[500]	valid_0's binary_logloss: 0.0720636
Early stopping, best iteration is:
[526]	valid_0's binary_logloss: 0.0720437
✅ Validation ROC AUC: 0.95625


In [None]:
# -------------------------------
# ✅ Drop identifier columns from test
# -------------------------------
X_test = test.drop(columns=['y', 'id2', 'id3', 'id4', 'id7'], errors='ignore')

# -------------------------------
# ✅ Remove duplicate columns if any
# -------------------------------
X_test = X_test.loc[:, ~X_test.columns.duplicated()]
X_train = X_train.loc[:, ~X_train.columns.duplicated()]

# -------------------------------
# ✅ Align test columns with train columns
# -------------------------------
X_test = X_test.copy()
missing_cols = [col for col in X_train.columns if col not in X_test.columns]
extra_cols = [col for col in X_test.columns if col not in X_train.columns]

# Add missing columns as 0
for col in missing_cols:
    X_test[col] = 0

# Drop any extra columns
X_test.drop(columns=extra_cols, inplace=True)

# Ensure exact same column order
X_test = X_test[X_train.columns]


In [None]:
# -------------------------------
# ✅ Evaluate using ROC AUC on validation
# -------------------------------
val_preds = model.predict_proba(X_val)[:, 1]  # Get probability for class 1
val_score = roc_auc_score(y_val, val_preds)
print(f"✅ Validation ROC AUC: {val_score:.5f}")


✅ Validation ROC AUC: 0.95625


In [None]:
# -------------------------------
# ✅ Predict probabilities for test set
# -------------------------------
test_preds = model.predict_proba(X_test)[:, 1]  # Probability of taking the offer

# -------------------------------
# ✅ Save to submission file
# -------------------------------
submission = pd.read_csv('/kaggle/input/krish-data/685404e30cfdb_submission_template.csv')
submission['pred'] = test_preds
submission.to_csv('/kaggle/working/final_submission.csv', index=False)
print("✅ Submission saved to /kaggle/working/final_submission.csv")


✅ Submission saved to /kaggle/working/final_submission.csv
