# Insider Trial Day
## Personalized Recommendation Model & Inference

In [1]:
import pandas as pd

# Load the raw event log, parse the 'date' column into datetimes, and order
# the events chronologically (ties on the timestamp are broken by userId).
df = (
    pd.read_parquet(path="train.parquet", engine="fastparquet")
    .assign(date=lambda events: pd.to_datetime(events["date"]))
    .sort_values(by=["date", "userId"])
)

In [2]:
import ast


# Helper function to safely evaluate items
def safe_eval(item):
    """Parse a stringified list literal, leaving every other string untouched.

    Args:
        item: Raw cell value as a string. Only strings that start with "["
            and end with "]" are handed to ``ast.literal_eval``.

    Returns:
        The parsed literal when ``item`` looks like a list and parses cleanly;
        ``[item]`` (the raw string wrapped in a list) when it looks like a
        list but cannot be parsed; otherwise ``item`` itself.
    """
    if item.startswith("[") and item.endswith("]"):
        try:
            return ast.literal_eval(item)
        # Swallow only the failures literal_eval raises for bad input. A bare
        # ``except`` would also hide KeyboardInterrupt/SystemExit and make a
        # long aggregation impossible to interrupt.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return [item]
    return item


def get_items(item_list):
    """Flatten an iterable of raw item strings into one flat list.

    Entries equal to the literal string "[]" are skipped. Every other entry
    is passed through ``safe_eval``: a list result is spliced into the output,
    anything else is kept as a single element.
    """
    flattened = []
    parsed_values = (safe_eval(raw) for raw in item_list if raw != "[]")
    for parsed in parsed_values:
        flattened += parsed if isinstance(parsed, list) else [parsed]
    return flattened

In [3]:
# Collapse the event log to one row per (userId, sessionId)
session_aggregations = {
    'date': ['min', 'max'],                      # first / last event time
    'pageType': lambda x: list(x),               # every page type, in row order
    'itemId': get_items,                         # flattened item ids
    'category': get_items,                       # flattened categories
    'productPrice': ['mean', 'max', 'min'],      # current-price statistics
    'oldProductPrice': ['mean', 'max', 'min'],   # old-price statistics
}
session_df = (
    df.groupby(['userId', 'sessionId'])
    .agg(session_aggregations)
    .reset_index()
)

# Replace the two-level aggregated header with flat names (same order as above)
session_df.columns = [
    'userId', 'sessionId',
    'session_start', 'session_end',
    'page_types', 'items', 'categories',
    'avg_price', 'max_price', 'min_price',
    'avg_old_price', 'max_old_price', 'min_old_price',
]

# Sessions without any price information get 0 for every price statistic
price_columns = [
    'avg_price', 'min_price', 'max_price',
    'avg_old_price', 'min_old_price', 'max_old_price',
]
session_df[price_columns] = session_df[price_columns].fillna(0)

# Session duration in seconds
session_duration = session_df['session_end'] - session_df['session_start']
session_df['session_length'] = session_duration.dt.total_seconds()

# Sizes of the collected lists (duplicates are counted)
session_df['num_items'] = session_df['items'].map(len)
session_df['num_categories'] = session_df['categories'].map(len)
session_df['num_page_types'] = session_df['page_types'].map(len)

# 1 when the session reached a 'success' page, else 0
session_df['purchase'] = session_df['page_types'].map(lambda pages: int('success' in pages))

# Calendar features taken from the session start
session_start = session_df['session_start']
session_df['hour_of_day'] = session_start.dt.hour
session_df['day_of_week'] = session_start.dt.dayofweek

In [4]:
# Per-user aggregates over all of the user's sessions
sessions_by_user = session_df.groupby('userId')
user_session_count = sessions_by_user['sessionId'].nunique().rename('total_sessions')
user_purchase_count = sessions_by_user['purchase'].sum().rename('total_purchases')
user_avg_session_duration = sessions_by_user['session_length'].mean().rename('avg_session_duration')

# Attach the three aggregates to every session row of the user in one merge
user_stats = pd.concat(
    [user_session_count, user_purchase_count, user_avg_session_duration],
    axis=1,
).reset_index()
session_df = session_df.merge(user_stats, on='userId', how='left')

# Order each user's sessions chronologically so shift(1) means "previous session"
session_df = session_df.sort_values(by=['userId', 'session_start'])

# Recency: days between this session's start and the user's previous session end
previous_session_end = session_df.groupby('userId')['session_end'].shift(1)
days_since_previous = (
    (session_df['session_start'] - previous_session_end).dt.total_seconds()
    / (60 * 60 * 24)
)
# A user's first session has no predecessor; use the largest observed gap instead
session_df['recency'] = days_since_previous.fillna(days_since_previous.max())

In [5]:
TOP_N = 3  # number of favourite categories / items kept per user


def _top_values_per_user(sessions, list_column, name, n=TOP_N):
    """Return, per user, the ``n`` most frequent values found in ``list_column``.

    Counting stays in long (userId, value, count) form. A dense user x value
    table would be far larger, and ranking its rows would let values with a
    count of 0 into the "top" list of users who have fewer than ``n``
    distinct values.

    Args:
        sessions: Session-level frame with a 'userId' column and a
            list-valued ``list_column``.
        list_column: Name of the list-valued column to count.
        name: Name given to the returned Series.
        n: Maximum number of values kept per user.

    Returns:
        Series indexed by userId holding a list of at most ``n`` values,
        most frequent first. Users with no values at all are absent.
    """
    counts = (
        sessions[['userId', list_column]]
        .explode(list_column)                 # empty lists become NaN ...
        .groupby(['userId', list_column])     # ... which groupby drops as a key
        .size()
        .reset_index(name='n_occurrences')
    )
    return (
        counts.sort_values('n_occurrences', ascending=False, kind='stable')
        .groupby('userId')
        .head(n)                              # keeps the sorted order within each user
        .groupby('userId')[list_column]
        .agg(list)
        .rename(name)
    )


# Most visited categories per user, attached to every session row of that user
user_top_categories = _top_values_per_user(session_df, 'categories', 'top_categories')
session_df = session_df.merge(user_top_categories, on='userId', how='left')

# Similarly, for items
user_top_items = _top_values_per_user(session_df, 'items', 'top_items')
session_df = session_df.merge(user_top_items, on='userId', how='left')

# Users without any item / category have no match in the merge: give them empty lists
session_df['top_items'] = session_df['top_items'].apply(
    lambda value: value if isinstance(value, list) else []
)
session_df['top_categories'] = session_df['top_categories'].apply(
    lambda value: value if isinstance(value, list) else []
)

In [6]:
# From here on `df` is the session-level frame (a copy of session_df),
# no longer the raw event log loaded in the first cell.
df = session_df.copy(deep=True)

# Model Training

## LightFM 

1. Hybrid Approach:
   LightFM is a hybrid model that combines collaborative filtering with content-based filtering.
   - It can leverage user-item interactions together with user and item features (this notebook passes both feature matrices to the model).
   - This hybrid approach helps mitigate the cold-start problem for new items or users, as it can make recommendations based on item features even when there's no interaction data.

2. Scalability:
   LightFM is built to handle large-scale datasets efficiently. It uses stochastic gradient descent for optimization, which allows it to scale to large numbers of users and items.

3. Handling Sparse Data:
   E-commerce datasets are often very sparse (users interact with only a tiny fraction of available items). LightFM is designed to handle this sparsity well.

4. Feature Incorporation:
   The ability to incorporate item features (in our case, categories) allows the model to understand similarities between items beyond just user interactions. This can lead to more diverse recommendations.


#### Alternatives Considered:
- Simple collaborative filtering methods (like user-user or item-item similarity) were deemed too simplistic for the complexity of e-commerce data.
- Matrix Factorization techniques (like SVD) don't easily incorporate item features.
- Deep learning models (like neural collaborative filtering) were considered potentially overkill for the dataset size and would require more computational resources.

#### Potential Limitations:
- If the dataset is extremely large, even LightFM might struggle, and we might need to consider distributed computing solutions.
- If we need real-time updates to the model, we might need to consider online learning approaches, which LightFM doesn't natively support.


In [7]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from sklearn.preprocessing import LabelEncoder


def _iter_feature_rows(frame):
    """Lazily yield ``(entity_id, {feature_name: weight})`` for each row.

    The first column of ``frame`` is taken as the entity id and every
    remaining column as a feature. Uses ``itertuples`` rather than
    ``iterrows`` so no Series is built per row.
    """
    feature_names = list(frame.columns[1:])
    for entity_id, *weights in frame.itertuples(index=False, name=None):
        yield entity_id, dict(zip(feature_names, weights))


# Flatten sessions into (user, item, category) rows.
# NOTE(review): items and categories are paired by position. The two lists are
# built independently per session (each drops its own "[]" entries), so the
# pairing is only correct if every event carried both an item and a category
# -- TODO confirm against the raw data.
item_data = []
for user_id, session_items, session_categories in zip(
    df["userId"], df["items"], df["categories"]
):
    for item, cat in zip(session_items, session_categories):
        if item != "[]" and cat != "[]":
            # A category entry may itself be a list: one row per non-empty member
            members = cat if isinstance(cat, list) else [cat]
            item_data.extend((user_id, item, c) for c in members if c != "[]")

item_df = pd.DataFrame(item_data, columns=["user_id", "item_id", "category"])

# Encode categories as integer labels (stringified first so every value is comparable)
le = LabelEncoder()
item_df["category"] = le.fit_transform(item_df["category"].apply(str))

# Item features: per item, the number of interaction rows in each encoded category
item_features = pd.get_dummies(item_df["category"], prefix="cat")
item_features["item_id"] = item_df["item_id"]
item_features = item_features.groupby("item_id").sum().reset_index()

# User features: the numeric session-level columns, keyed by user_id (first column).
# The frame keeps one row per session, so a user can appear several times.
user_features = df.drop(
    columns=[
        "sessionId", "session_start", "session_end", "page_types",
        "items", "categories", "top_items", "top_categories",
    ]
).rename(columns={"userId": "user_id"})
# Keep only sessions that contain at least one item and one category
user_features = user_features[
    (user_features["num_items"] > 0) & (user_features["num_categories"] > 0)
]

# 3. Create LightFM Dataset
dataset = Dataset()
dataset.fit(
    item_df["user_id"],
    item_df["item_id"],
    user_features=user_features.columns[1:],
    item_features=item_features.columns[1:],
)

# Create interaction matrix
interactions, weights = dataset.build_interactions(
    zip(item_df["user_id"], item_df["item_id"])
)

# Create user features matrix
user_features_matrix = dataset.build_user_features(_iter_feature_rows(user_features))

# Create item features matrix
item_features_matrix = dataset.build_item_features(_iter_feature_rows(item_features))



In [8]:
# 4. Train the Model
# WARP-loss LightFM with 30 latent components and a fixed random_state
model = LightFM(no_components=30, learning_rate=0.05, loss="warp", random_state=42)

# Fit on the interaction matrix together with both side-feature matrices
fit_options = dict(
    user_features=user_features_matrix,
    item_features=item_features_matrix,
    epochs=50,
    num_threads=4,
)
model.fit(interactions, **fit_options)

<lightfm.lightfm.LightFM at 0x3d1cad790>

# Inference

In [9]:
# 5. Generate Recommendations
def get_recommendations(
    user_id, model, dataset, user_features_matrix, item_features_matrix, n=5
):
    """Return the ids of the ``n`` highest-scoring items for ``user_id``.

    Args:
        user_id: External user id, as known to ``dataset``.
        model: Fitted model exposing ``predict(user_idx, item_idxs,
            item_features=..., user_features=...)``.
        dataset: Object exposing ``mapping()`` and ``interactions_shape()``.
        user_features_matrix: User feature matrix forwarded to ``predict``.
        item_features_matrix: Item feature matrix forwarded to ``predict``.
        n: Maximum number of items to return; ``None`` returns every item.

    Returns:
        List of external item ids ordered by descending score. Empty when the
        user is unknown to ``dataset`` or when ``n`` is zero or negative.
    """
    # A negative n would otherwise slice as [:-k] and return almost every item
    if n is not None and n <= 0:
        return []

    user_id_map, _, item_id_map, _ = dataset.mapping()
    # Exit before doing any per-item work for users the dataset has never seen
    if user_id not in user_id_map:
        return []

    _, n_items = dataset.interactions_shape()
    scores = model.predict(
        user_id_map[user_id],
        np.arange(n_items),
        item_features=item_features_matrix,
        user_features=user_features_matrix,
    )
    # Negate so that argsort's ascending order puts the best score first
    top_indices = np.argsort(-scores)[:n]

    # Translate internal item indices back to external item ids
    reverse_item_map = {index: item for item, index in item_id_map.items()}
    return [reverse_item_map[index] for index in top_indices]

In [10]:
# Example usage: recommend for the user of the first interaction row
user_id = item_df["user_id"].iloc[0]

recommendations = get_recommendations(
    user_id=user_id,
    model=model,
    dataset=dataset,
    user_features_matrix=user_features_matrix,
    item_features_matrix=item_features_matrix,
)
print(f"Recommendations for user {user_id}:")
print(recommendations)

Recommendations for user 0001d86ea81e6eef12cebaa1dcbdadc2:
['e24b3c0c71eac81c49867fda76bcd1a3', '126b7550606fcc57843c50adf557f482', '4fcb83d905354249b964950d8fb3a2e3', 'e21e83f213fff4d98aa3df84e088ec1b', 'c78fb70476792ceacefd511d0abb4c5c']


# Evaluate the model

In [11]:
# Evaluate the model
from lightfm.evaluation import precision_at_k, auc_score

# Both metrics are computed on the same interactions the model was fitted on,
# so they describe training fit only.
feature_kwargs = dict(
    user_features=user_features_matrix,
    item_features=item_features_matrix,
)

# Per-user scores averaged over all users
train_precision = precision_at_k(model, interactions, k=10, **feature_kwargs).mean()
train_auc = auc_score(model, interactions, **feature_kwargs).mean()

print(f"Train Precision@10: {train_precision:.4f}")

print(f"Train AUC: {train_auc:.4f}")

Train Precision@10: 0.0273
Train AUC: 0.9146


In [13]:
import joblib

import os

# Persist the fitted model together with the dataset mappings and feature
# tables/matrices built above.
ARTIFACT_DIR = 'model_artifacts'

# The output directory must exist before joblib can write into it; create it
# so the cell also works on a fresh checkout.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

artifacts = {
    'lightfm_model': model,
    'lightfm_dataset': dataset,
    'item_features_matrix': item_features_matrix,
    'item_features': item_features,
    'user_features_matrix': user_features_matrix,
    'user_features': user_features,
}
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, f'{ARTIFACT_DIR}/{artifact_name}.joblib')

['model_artifacts/user_features.joblib']

# Results

1. Train Precision@10: 0.0273 (2.73%)
2. Train AUC: 0.9146 (91.46%)

#### 1. Precision@10 (0.0273):
   This score suggests that, on average, 2.73% of the top 10 recommended items for each user are relevant (i.e., items that the user has actually interacted with). At first glance, this might seem low, but it's important to consider a few factors:
   - In recommendation systems, especially with large item catalogs, even seemingly low precision scores can be valuable.
   - The score depends on the sparsity of the data. If users typically interact with only a small fraction of available items, a lower precision is expected.

#### 2. AUC (0.9146):
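   This is a good score on its face. An AUC (Area Under the ROC Curve) of 0.9146 means that, for a given user, the model ranks an item they interacted with above a randomly chosen other item about 91% of the time; a perfect AUC would be 1.0. Note, however, that both metrics above are computed on the same interactions the model was trained on, so they measure how well the model fits the training data rather than how well it generalizes. A held-out split is needed before concluding that the model has strong predictive power.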
