In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input tree.
input_tree = os.walk('/kaggle/input')
for dirname, _, filenames in input_tree:
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
# FEATURE-FACTORY v1 for AMEX Challenge
import pandas as pd, numpy as np, gc, time, sys
from pathlib import Path

RAW_DIR = Path('/kaggle/input/amexda')            # original competition data

# Locate cleaned data produced in Step-3
def locate_clean_dir(work_dir: Path = Path('/kaggle/working'),
                     input_root: Path = Path('/kaggle/input'),
                     required: tuple = ('train_clean.parquet', 'test_clean.parquet')) -> Path:
    """Return the directory that holds the cleaned parquet files.

    The working directory is preferred (same-session run); otherwise every
    entry directly under ``input_root`` is tried in sorted order, so the
    choice is reproducible when more than one dataset qualifies.

    Parameters
    ----------
    work_dir : directory checked first.
    input_root : directory whose immediate children are checked next.
    required : file names that must ALL be present in the chosen directory.
        Both the train and the test file are required by default because
        both are read from the returned directory.

    Returns
    -------
    Path of the first directory containing every file in ``required``.

    Raises
    ------
    FileNotFoundError
        If no candidate directory contains all required files (this also
        covers a missing ``input_root``).
    """
    def has_all(folder: Path) -> bool:
        # A candidate only qualifies when every required file is present.
        return all((folder / name).exists() for name in required)

    work_dir, input_root = Path(work_dir), Path(input_root)
    if has_all(work_dir):
        return work_dir
    # iterdir() yields entries in arbitrary order; sort for a deterministic pick.
    if input_root.is_dir():
        for ds in sorted(input_root.iterdir()):
            if has_all(ds):
                return ds
    raise FileNotFoundError(
        f"{' / '.join(required)} not found. "
        'Add the Data-Prep notebook output as a dataset or keep running in the same session.')

# Resolve the folder holding the cleaned parquet files and report it.
CLEAN_DIR = locate_clean_dir()
print('✓ using cleaned files from →', CLEAN_DIR)

# Everything this cell produces is written to the Kaggle working directory.
WORK_DIR = Path('/kaggle/working')
WORK_DIR.mkdir(parents=True, exist_ok=True)
OUT_TRAIN = WORK_DIR.joinpath('fe_v1_train.parquet')
OUT_TEST = WORK_DIR.joinpath('fe_v1_test.parquet')

# Fast-path: nothing to do when both feature files are already on disk.
# NOTE(review): sys.exit raises SystemExit; inside a notebook this is
# presumably surfaced as a cell error rather than a clean skip — confirm.
outputs_ready = all(path.exists() for path in (OUT_TRAIN, OUT_TEST))
if outputs_ready:
    print('✔ fe_v1_* already exists – skipping regeneration.')
    sys.exit(0)

# 1 ▸ load cleaned data (timed so the cost of the read is visible)
t0 = time.time()
train = pd.read_parquet(CLEAN_DIR / 'train_clean.parquet')
test  = pd.read_parquet(CLEAN_DIR / 'test_clean.parquet')
print(f'loaded clean data  ({time.time()-t0:.1f}s)')

# Normalise the join keys id2 / id3 to int64 in both frames.
# Unparseable values are coerced to NaN, which the int64 cast then rejects.
for frame in (train, test):
    for key in ('id2', 'id3'):
        frame[key] = pd.to_numeric(frame[key], errors='coerce').astype('int64')

# 2 ▸ minimal events table (impression / click)
events_cols = ['id2', 'id3', 'id4', 'id7']
events = pd.read_parquet(RAW_DIR / 'add_event.parquet', columns=events_cols)

# Same int64 key normalisation as applied to train / test.
for key in ('id2', 'id3'):
    events[key] = pd.to_numeric(events[key], errors='coerce').astype('int64')

# Every row counts as one impression; a row is a click when id7 is present.
events['imp'] = 1
events['click'] = (~events['id7'].isna()).astype('int8')

def _ctr_rollup(pairs, key, prefix):
    """Sum impression / click counts of `pairs` per `key` and append the ratio as `<prefix>_ctr`."""
    totals = (pairs.groupby(key)
                   .agg(**{f'{prefix}_imp_cnt': ('imp_cnt', 'sum'),
                           f'{prefix}_click_cnt': ('click_cnt', 'sum')})
                   .reset_index())
    totals[f'{prefix}_ctr'] = totals[f'{prefix}_click_cnt'] / totals[f'{prefix}_imp_cnt']
    return totals


# 3 ▸ customer × offer aggregates: impressions, clicks and their ratio per (id2, id3)
pair_specs = {'imp_cnt': ('imp', 'sum'), 'click_cnt': ('click', 'sum')}
agg = events.groupby(['id2', 'id3']).agg(**pair_specs).reset_index()
agg['ctr_id2_id3'] = agg['click_cnt'].div(agg['imp_cnt'])

# 4 ▸ global offer popularity & customer engagement, rolled up from the pair table
offer_agg = _ctr_rollup(agg, 'id3', 'offer')
cust_agg = _ctr_rollup(agg, 'id2', 'cust')

# The raw events are no longer needed; release them before the merges.
del events
gc.collect()

# 5 ▸ merge all signals into train & test
def enrich(df: pd.DataFrame,
           pair_stats: "pd.DataFrame | None" = None,
           offer_stats: "pd.DataFrame | None" = None,
           cust_stats: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Left-join the event aggregates onto ``df`` and return the result.

    Parameters
    ----------
    df : frame with ``id2`` and ``id3`` columns; it is copied, never mutated.
    pair_stats : aggregates keyed on (``id2``, ``id3``); defaults to the
        module-level ``agg``.
    offer_stats : aggregates keyed on ``id3``; defaults to the module-level
        ``offer_agg``.
    cust_stats : aggregates keyed on ``id2``; defaults to the module-level
        ``cust_agg``.

    Returns
    -------
    A new frame with the aggregate columns appended. Every numeric column
    except the join keys has NaN replaced by 0 and is stored as float32.
    The keys ``id2`` / ``id3`` stay int64: float32 cannot represent every
    integer above 2**24, so casting them would silently alter large ids.

    Raises
    ------
    ValueError
        If ``id2`` / ``id3`` contain values that cannot be parsed as numbers
        (they are coerced to NaN, which the int64 cast rejects).
    """
    # Fall back to the notebook-level aggregate tables when none are passed in.
    pair_stats = agg if pair_stats is None else pair_stats
    offer_stats = offer_agg if offer_stats is None else offer_stats
    cust_stats = cust_agg if cust_stats is None else cust_stats

    key_cols = ['id2', 'id3']
    out = df.copy()
    for key in key_cols:
        out[key] = pd.to_numeric(out[key], errors='coerce').astype('int64')

    out = (out.merge(pair_stats,  on=key_cols, how='left')
              .merge(offer_stats, on='id3',    how='left')
              .merge(cust_stats,  on='id2',    how='left'))

    # Compact the numeric features, but leave the identifier columns untouched.
    num_cols = [col for col in out.select_dtypes(include=[np.number]).columns
                if col not in key_cols]
    if num_cols:
        out = (out.fillna(dict.fromkeys(num_cols, 0))
                  .astype(dict.fromkeys(num_cols, 'float32')))
    return out

# Attach the event aggregates; train is rebound before test is processed.
train = enrich(train)
test = enrich(test)

# 6 ▸ save feature matrices (index is not written)
for frame, target in ((train, OUT_TRAIN), (test, OUT_TEST)):
    frame.to_parquet(target, index=False)
print('🎉 fe_v1 files written →', OUT_TRAIN, ',', OUT_TEST)


✓ using cleaned files from → /kaggle/input/jnbhjkbkbk
loaded clean data  (19.5s)
🎉 fe_v1 files written → /kaggle/working/fe_v1_train.parquet , /kaggle/working/fe_v1_test.parquet


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# Paths: raw competition data, the previously built v1 features, and the output folder
RAW_DIR = Path('/kaggle/input/amexda')
FEAT_DIR = Path('/kaggle/input/wdwfdws')
WORK_DIR = Path('/kaggle/working')

# Load previous features
train, test = (pd.read_parquet(FEAT_DIR / fname)
               for fname in ('fe_v1_train.parquet', 'fe_v1_test.parquet'))

# Load transaction data and coerce the columns used below;
# values that cannot be parsed become NaN / NaT.
trans = pd.read_parquet(RAW_DIR / 'add_trans.parquet')
for col, parse in (('id2', pd.to_numeric),      # customer key
                   ('f367', pd.to_numeric),     # transaction amount
                   ('f370', pd.to_datetime)):   # transaction date
    trans[col] = parse(trans[col], errors='coerce')

# Aggregate features per customer: non-null amount count, sum, mean, and latest date
trans_specs = {
    'trans_count': ('f367', 'count'),
    'trans_total_amt': ('f367', 'sum'),
    'trans_avg_amt': ('f367', 'mean'),
    'last_trans_date': ('f370', 'max'),
}
trans_agg = trans.groupby('id2').agg(**trans_specs).reset_index()

def _add_trans_features(df, recency_fill=None):
    """Join the per-customer transaction aggregates onto ``df``.

    Parameters
    ----------
    df : frame with ``id2`` (customer key) and ``id5`` (impression date).
        Both columns are parsed in place, as before.
    recency_fill : value used for rows with no recency. When None it is
        taken from this frame's own maximum ``trans_recency_days``.

    Returns
    -------
    (frame, recency_fill) : the merged frame without ``last_trans_date`` and
    the fill value actually used, so the caller can reuse it for another frame.
    """
    df['id2'] = pd.to_numeric(df['id2'], errors='coerce')
    df['id5'] = pd.to_datetime(df['id5'], errors='coerce')  # impression date
    out = df.merge(trans_agg, on='id2', how='left')

    # NOTE(review): last_trans_date is the customer's latest transaction overall,
    # so it can fall after the impression and make recency negative — confirm
    # whether that is intended.
    out['trans_recency_days'] = (out['id5'] - out['last_trans_date']).dt.days
    if recency_fill is None:
        recency_fill = out['trans_recency_days'].max()
    out['trans_recency_days'] = out['trans_recency_days'].fillna(recency_fill)

    # Customers without any transaction get zero count / amount features.
    for col in ('trans_count', 'trans_total_amt', 'trans_avg_amt'):
        out[col] = out[col].fillna(0)
    return out.drop(columns=['last_trans_date']), recency_fill


# For recency, need the impression date from train/test.
# The fill value for missing recency is learned on train and reused for test,
# so the same "no transaction" sentinel appears in both feature matrices
# (previously each frame used its own maximum).
train, recency_fill = _add_trans_features(train)
test, recency_fill = _add_trans_features(test, recency_fill)

# Save as new feature set
train.to_parquet(WORK_DIR / 'fe_v2_train.parquet', index=False)
test.to_parquet(WORK_DIR / 'fe_v2_test.parquet', index=False)
print('🎉 v2 features with transaction recency/frequency saved!')


🎉 v2 features with transaction recency/frequency saved!
