In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

# Load the classification dataset
df = pd.read_csv("final_pet_food_dataset.csv")

# Encode categorical features
df_encoded = df.copy()
for col in df_encoded.select_dtypes(include='object').columns:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Feature Generation 
# Ratio Feature
df_encoded["avg_minutes_per_session"] = (
    df_encoded["total_minutes_on_website"] / (df_encoded["total_web_sessions"] + 1)
)

# Additive Feature
df_encoded["total_engagement"] = (
    df_encoded["total_web_sessions"] + df_encoded["total_web_sessions_since_last_order"]
)

# KMeans Clustering Feature
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_features = df_encoded[["total_web_sessions", "total_minutes_on_website", "total_order_kcal"]]
df_encoded["user_cluster"] = kmeans.fit_predict(cluster_features)

# View new features
print("Newly Created Features:")
print(df_encoded[["avg_minutes_per_session", "total_engagement", "user_cluster"]].head())

# Optional: Save the new dataset
df_encoded.to_csv("classification_with_generated_features.csv", index=False)

Newly Created Features:
   avg_minutes_per_session  total_engagement  user_cluster
0                 6.000000                 1             0
1                 1.500000                14             0
2               205.000000                 3             0
3               101.625000                 9             1
4                 9.882353                17             1


We engineered three new features to enhance model signal: a ratio-based engagement metric (avg_minutes_per_session), an additive session activity metric (total_engagement), and a behavior-based cluster label (user_cluster) via KMeans. These features capture complex user behavior and can improve model interpretability and performance.