In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df1=pd.read_csv('../data/item_properties_part1.csv')
df2=pd.read_csv('../data/item_properties_part2.csv')

In [3]:
p1 = df1.copy()
p2 = df2.copy()

items = pd.concat([p1, p2], ignore_index=True)
cats = items[items["property"] == "categoryid"]

cats = cats[["itemid", "value"]].drop_duplicates()
cats = cats.rename(columns={"value": "category_id"})

In [4]:
cats

Unnamed: 0,itemid,category_id
0,460429,1338
140,281245,1277
151,35575,1059
189,8313,1147
197,55102,47
...,...,...
20275531,261475,959
20275658,28038,828
20275696,372949,209
20275797,255696,1404


In [5]:


def load_and_clean_events(path, cat_df):
    """
    Preprocess events data and create LightGBM-friendly features.
    """

    # -----------------------
    # Load
    # -----------------------
    df = pd.read_csv(path)

    # -----------------------
    # Time features
    # -----------------------
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")
    df["hour"] = df["timestamp"].dt.hour
    df["dayofweek"] = df["timestamp"].dt.dayofweek

    # -----------------------
    # Label
    # -----------------------
    df["label"] = (df["event"] == "transaction").astype(int)

    # -----------------------
    # Merge category
    # -----------------------
    df = df.merge(cat_df, on="itemid", how="left")
    df["category_id"] = df["category_id"].fillna(-1).astype(int)

    # -----------------------
    # Item popularity (GLOBAL)
    # -----------------------
    item_popularity = (
        df.groupby("itemid")
        .size()
        .reset_index(name="item_popularity")
    )
    df = df.merge(item_popularity, on="itemid", how="left")

    # -----------------------
    # Category popularity
    # -----------------------
    category_popularity = (
        df.groupby("category_id")
        .size()
        .reset_index(name="category_popularity")
    )
    df = df.merge(category_popularity, on="category_id", how="left")

    # -----------------------
    # User activity level
    # -----------------------
    user_activity = (
        df.groupby("visitorid")
        .size()
        .reset_index(name="user_event_count")
    )
    df = df.merge(user_activity, on="visitorid", how="left")

    # -----------------------
    # Recency feature (hours since event)
    # -----------------------
    max_ts = df["timestamp"].max()
    df["event_age_hours"] = (
        (max_ts - df["timestamp"]).dt.total_seconds() / 3600
    )

    # -----------------------
    # Final feature set
    # -----------------------
    df = df[
        [
            "visitorid",
            "itemid",
            "category_id",
            "hour",
            "dayofweek",
            "item_popularity",
            "category_popularity",
            "user_event_count",
            "event_age_hours",
            "label",
            "timestamp",
        ]
    ]

    return df


In [6]:
df=load_and_clean_events("../data/events.csv",cats)

In [7]:
df

Unnamed: 0,visitorid,itemid,category_id,hour,dayofweek,item_popularity,category_popularity,user_event_count,event_age_hours,label,timestamp
0,257597,355908,1173,5,1,57,24572,2,2589.959909,0,2015-06-02 05:02:12.117
1,992329,248676,1231,5,1,34,5286,30,2589.159340,0,2015-06-02 05:50:14.164
2,111016,318965,-1,5,1,15,255585,2,2589.774434,0,2015-06-02 05:13:19.827
3,483717,253185,914,5,1,14,8050,3,2589.786632,0,2015-06-02 05:12:35.914
4,951259,367447,1613,5,1,520,40207,2,2589.958523,0,2015-06-02 05:02:17.106
...,...,...,...,...,...,...,...,...,...,...,...
2965026,591435,261427,1623,3,5,166,2650,1,1151.778291,0,2015-08-01 03:13:05.939
2965027,762376,115946,1616,3,5,197,2635,1,1151.492957,0,2015-08-01 03:30:13.142
2965028,1251746,78144,969,2,5,103,6279,2,1152.046461,0,2015-08-01 02:57:00.527
2965029,1184451,283392,1589,3,5,15,7897,1,1151.849190,0,2015-08-01 03:08:50.703


In [8]:
df=df.sort_values('timestamp')

In [26]:
itemids=[1,2,4,6,8,22,44]

In [27]:
visitorid=11

In [28]:
from datetime import datetime
import pandas as pd

FEATURES = [
    "category_id",
    "hour",
    "dayofweek",
    "item_popularity",
    "category_popularity",
    "user_event_count",
    "event_age_hours",
]


def build_features(df, visitorid, itemids):
    now = datetime.now()

    base = (
        df[df["itemid"].isin(itemids)]
        .drop_duplicates("itemid")
        [["itemid", "category_id", "item_popularity", "category_popularity"]]
        .copy()
    )

    user_df = df[df["visitorid"] == visitorid]

    base["hour"] = now.hour
    base["dayofweek"] = now.weekday()

    base["user_event_count"] = (
        base["itemid"]
        .map(user_df["itemid"].value_counts())
        .fillna(0)
    )

    last_event = user_df.groupby("itemid")["timestamp"].max()

    base["event_age_hours"] = (
        (now - base["itemid"].map(last_event))
        .dt.total_seconds() / 3600
    ).fillna(9999)

    X = base[FEATURES]
    X["itemid"] = base["itemid"]

    return X


In [29]:
build_features(df,visitorid,itemids)

Unnamed: 0,category_id,hour,dayofweek,item_popularity,category_popularity,user_event_count,event_age_hours,itemid
1657680,1091,1,2,29,2812,0.0,9999.0,6
1813915,1359,1,2,2,996,0.0,9999.0,22
649330,1038,1,2,3,585,0.0,9999.0,4


In [9]:
cutoff = df["timestamp"].quantile(0.8)

train_df = df[df["timestamp"] <= cutoff]
test_df  = df[df["timestamp"] > cutoff]


In [10]:
features = [
    "category_id",
    "hour",
    "dayofweek",
    "item_popularity",
    "category_popularity",
    "user_event_count",
    "event_age_hours"
]
target = "label"


In [24]:
X_train

Unnamed: 0,category_id,hour,dayofweek,item_popularity,category_popularity,user_event_count,event_age_hours
1569985,1130,3,6,63,1116,3,3311.995390
1571976,463,3,6,79,3938,1,3311.993472
1570011,-1,3,6,248,255585,1,3311.992983
1572508,1628,3,6,63,1429,1,3311.989898
1569962,1130,3,6,63,1116,3,3311.989322
...,...,...,...,...,...,...,...
976269,858,1,1,110,14852,4,745.725490
965842,-1,1,1,3,255585,2,745.725203
959078,1573,1,1,107,3011,1,745.724981
981072,792,1,1,117,5708,3,745.723949


In [11]:
X_train = train_df[features]
y_train = train_df[target]

X_test  = test_df[features]
y_test  = test_df[target]

In [12]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV


In [13]:
pos = y_train.sum()
neg = len(y_train) - pos

scale_pos_weight = neg / pos
print("scale_pos_weight:", round(scale_pos_weight, 2))


scale_pos_weight: 123.97


In [18]:
param_grid = {
    "num_leaves": [31, 63],
    "min_child_samples": [50],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
}


In [19]:
base_model = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1
)


In [20]:
grid = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[LightGBM] [Info] Number of positive: 18981, number of negative: 2353044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012865 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1220
[LightGBM] [Info] Number of data points in the train set: 2372025, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008002 -> initscore=-4.820027
[LightGBM] [Info] Start training from score -4.820027


0,1,2
,estimator,LGBMClassifie...838944207366))
,param_grid,"{'colsample_bytree': [0.8], 'min_child_samples': [50], 'num_leaves': [31, 63], 'subsample': [0.8]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.05
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [21]:
print("Best parameters found:")
print(grid.best_params_)

print("\nBest CV ROC-AUC:")
print(round(grid.best_score_, 4))


Best parameters found:
{'colsample_bytree': 0.8, 'min_child_samples': 50, 'num_leaves': 63, 'subsample': 0.8}

Best CV ROC-AUC:
0.7656


In [22]:
best_model = grid.best_estimator_


In [None]:
y_test_proba = best_model.predict_proba(X_test)[:, 1]

test_auc = roc_auc_score(y_test, y_test_proba)

print("\nFinal Test ROC-AUC:", round(test_auc, 4))



Final Test ROC-AUC: 0.8399
