In [None]:
# Mount Google Drive inside the Colab runtime; the CSV files read below live
# under this mount point.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
# Load the orders table from the mounted Drive folder and preview its first rows.
orders = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/ml_project/orders.csv',
)
orders.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [None]:
# Keep only the orders whose eval_set is 'prior'.
prior_orders = orders.loc[orders['eval_set'].eq('prior')]

In [None]:
# Load the product lines of the prior orders.
order_prior = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/ml_project/order_products__prior.csv',
)

In [None]:
# Attach the order-level columns to every product line of the prior orders.
# A left join keeps every row of order_prior.
prior_df = order_prior.merge(
    prior_orders,
    how='left',
    on='order_id',
)

In [None]:
# Number of distinct prior orders placed by each user.
user_total_orders = (
    prior_df
    .groupby(by='user_id')['order_id']
    .nunique()
    .rename('user_total_orders')
)

In [None]:
# Share of each user's prior product lines that are flagged as reordered.
user_reorder_ratio = (
    prior_df
    .groupby(by='user_id')['reordered']
    .mean()
    .rename('user_reorder_ratio')
)

The user reorder ratio was computed by taking the mean of the reordered variable across all prior user–product interactions, resulting in a user-level feature that reflects each user’s tendency to repurchase previously ordered products using historical data only.

In [None]:
# Average number of products per order for each user:
# first count the products in every (user, order), then average per user.
user_avg_basket_size = (
    prior_df
    .groupby(['user_id', 'order_id'])['product_id']
    .count()
    .groupby(level='user_id')
    .mean()
    .rename('user_avg_basket_size')
)

In [None]:
# Mean gap (in days) between a user's consecutive prior orders.
# prior_df holds one row per ordered product, so averaging it directly would
# count each order's gap once per item and bias the mean toward large baskets.
# Collapse to one row per order first so every order contributes exactly once.
# Deduplicating prior_df (rather than switching to another frame) keeps the
# user index identical to the other user-level series built from prior_df.
user_mean_days_between_orders = (
    prior_df
    .drop_duplicates(subset='order_id')
    .groupby('user_id')['days_since_prior_order']
    .mean()
    .rename('user_mean_days_between_orders')
)

In [None]:
# days_since_prior_order of each user's most recent prior order.
# GroupBy.last() returns the last non-null value in *row order*, and at this
# point prior_df is still in the row order produced by the merge (it is only
# sorted by order_number in a later cell). Sort explicitly by order_number so
# "last" really means the latest order rather than whichever row comes last.
# One row per order is enough, so deduplicate before sorting to keep it cheap.
user_days_since_last_order = (
    prior_df
    .drop_duplicates(subset='order_id')
    [['user_id', 'order_number', 'days_since_prior_order']]
    .sort_values(['user_id', 'order_number'])
    .groupby('user_id')['days_since_prior_order']
    .last()
    .rename('user_days_since_last_order')
)

Average basket size, mean days between orders, and recency were computed as user-level features using prior orders only to summarize user purchasing behavior and timing.

In [None]:
# Put the user-level series side by side (aligned on user_id) and turn the
# user_id index back into a regular column.
user_features = pd.concat(
    [
        user_total_orders,
        user_reorder_ratio,
        user_avg_basket_size,
        user_mean_days_between_orders,
        user_days_since_last_order,
    ],
    axis='columns',
).reset_index()

In [None]:
# Downcast the user-level columns to 32-bit types.
user_features = user_features.astype(
    dict(
        user_total_orders='int32',
        user_avg_basket_size='float32',
        user_reorder_ratio='float32',
        user_mean_days_between_orders='float32',
        user_days_since_last_order='float32',
    )
)

In [None]:
# Share of each product's prior purchase lines that are flagged as reordered.
product_reorder_rate = (
    prior_df
    .groupby(by='product_id')['reordered']
    .mean()
    .rename('product_reorder_rate')
)

In [None]:
# Mean add_to_cart_order value of each product across the prior order lines.
product_avg_cart_position = (
    order_prior
    .groupby(by='product_id')['add_to_cart_order']
    .mean()
    .rename('product_avg_cart_position')
)

In [None]:
# Attach each order's order_number to its product lines ...
product_time = order_prior.merge(
    prior_orders[['order_id', 'order_number']],
    how='left',
    on='order_id',
)
# ... then take, per product, the mean order_number at which it was bought.
product_popularity_over_time = (
    product_time
    .groupby(by='product_id')['order_number']
    .mean()
    .rename('product_popularity_over_time')
)

In [None]:
# Number of prior order lines for every (user, product) pair.
user_product_purchase_count = (
    prior_df
    .groupby(['user_id', 'product_id'])
    .size()
    .reset_index(name='user_product_purchase_count')
)

In [None]:
# Mean of the reordered flag for every (user, product) pair.
# The resulting Series keeps the name 'reordered' and a (user_id, product_id) index.
user_product_avg_reorder_prob = (
    prior_df
    .groupby(['user_id', 'product_id'])['reordered']
    .mean()
)

In [None]:
# First draft of the user-product table: purchase counts plus the mean
# reordered flag. NOTE: user_product_features is reassigned from scratch in a
# later cell, so this intermediate result is superseded there.
user_product_features = pd.merge(
    user_product_purchase_count,
    user_product_avg_reorder_prob,
    how='left',
    on=['user_id', 'product_id'],
)

In [None]:
# Mean of the reordered flag for every (user, product) pair, as a flat frame
# with columns user_id, product_id, user_product_reorder_ratio.
user_product_reorder_ratio = (
    prior_df
    .groupby(['user_id', 'product_id'])['reordered']
    .mean()
    .reset_index(name='user_product_reorder_ratio')
)

In [None]:
# days_since_prior_order of the most recent prior order in which each user
# bought each product.
# GroupBy.last() picks the last non-null value in *row order*, and prior_df is
# not yet sorted by order_number at this point (that happens in a later cell),
# so without an explicit sort "last" is just whichever row happens to come last.
# Sorting by order_number alone is sufficient: groupby keeps the row order
# within each (user_id, product_id) group, so the newest order ends up last.
# Only the four needed columns are sorted to avoid copying the whole frame.
user_product_last_purchase = (
    prior_df[['user_id', 'product_id', 'order_number', 'days_since_prior_order']]
    .sort_values('order_number', kind='stable')
    .groupby(['user_id', 'product_id'])['days_since_prior_order']
    .last()
    .rename('user_product_days_since_last_purchase')
    .reset_index()
)

In [None]:
# Assemble the user-product table: purchase count, reorder ratio and the
# last-purchase gap, all keyed on (user_id, product_id).
user_product_features = (
    user_product_purchase_count
    .merge(
        user_product_reorder_ratio,
        how='left',
        on=['user_id', 'product_id'],
    )
    .merge(
        user_product_last_purchase,
        how='left',
        on=['user_id', 'product_id'],
    )
)

In [None]:
# Downcast the user-product columns to 32-bit types.
user_product_features = user_product_features.astype(
    dict(
        user_product_purchase_count='int32',
        user_product_reorder_ratio='float32',
        user_product_days_since_last_purchase='float32',
    )
)

In [None]:
# Expose the hour of day under the feature name order_hour.
orders['order_hour'] = orders['order_hour_of_day'].copy()

In [None]:
# Expose the day of week under the feature name order_dow_feature.
orders['order_dow_feature'] = orders['order_dow'].copy()

In [None]:
# Map order_number onto a repeating 1..12 cycle.
# NOTE(review): in the preview above order_number looks like a per-user
# sequence counter (1, 2, 3, ...), not a calendar field, so this "month" is a
# position in that counter's cycle rather than a real month - confirm intended.
orders['order_month'] = orders['order_number'].sub(1).mod(12).add(1)

In [None]:
# Index (starting at 1) of the 12-order block that order_number falls into.
# NOTE(review): derived from the order sequence number, not from a date -
# confirm that this proxy "year" is intended.
orders['order_year'] = orders['order_number'].sub(1).floordiv(12).add(1)

In [None]:
# Bucket order_month into four groups: {12, 1, 2} -> 1, {3, 4, 5} -> 2,
# {6, 7, 8} -> 3, {9, 10, 11} -> 4.
# NOTE(review): order_month is itself derived from order_number, so this is a
# proxy "season" rather than a calendar season - confirm intended.
orders['season'] = orders['order_month'].mod(12).floordiv(3).add(1)

In [None]:
# Downcast the derived order-level columns to small integer types.
orders = orders.astype(
    dict(
        order_hour='int8',
        order_dow_feature='int8',
        order_month='int8',
        order_year='int16',
        season='int8',
    )
)

In [None]:
# Order the product lines chronologically within each user
# (user_id first, then the user's order_number).
prior_df = prior_df.sort_values(
    by=['user_id', 'order_number'],
    ascending=True,
)

In [None]:
# Product lines belonging to each user's three most recent prior orders.
# prior_df has one row per ordered product, so groupby('user_id').tail(3) would
# keep only the last three *product rows* (frequently all from a single order)
# instead of three orders. Dense-rank each user's order numbers from newest
# (rank 1) to oldest - rows of the same order share a rank - and keep every
# row whose order is among the newest three. Row order of prior_df is preserved.
last_3_orders = prior_df[
    prior_df
    .groupby('user_id')['order_number']
    .rank(method='dense', ascending=False)
    <= 3
]

In [None]:
# Number of distinct orders each user has inside the last_3_orders window.
user_last3_orders_count = (
    last_3_orders
    .groupby(by='user_id')['order_id']
    .nunique()
    .rename('user_last3_orders_count')
)

In [None]:
# Average products per order inside the last_3_orders window:
# count products per (user, order), then average those counts per user.
user_last3_avg_basket = (
    last_3_orders
    .groupby(['user_id', 'order_id'])['product_id']
    .count()
    .groupby(level='user_id')
    .mean()
    .rename('user_last3_avg_basket')
)

In [None]:
# Combine the two window series side by side (aligned on user_id) and turn
# the user_id index back into a regular column.
user_window_features = pd.concat(
    [
        user_last3_orders_count,
        user_last3_avg_basket,
    ],
    axis='columns',
).reset_index()

In [None]:
# log(1 + x) transform of the per-user order count.
user_features['log_user_total_orders'] = (
    user_features['user_total_orders'].pipe(np.log1p)
)

In [None]:
# Broadcast the user-level features onto every (user, product) row.
# Only user_product_features and user_features are combined here; the
# product-level, order-level and last-3-orders frames built above are not merged.
final_feature_matrix = pd.merge(
    user_product_features,
    user_features,
    how='left',
    on='user_id',
)
final_feature_matrix.shape

(13307953, 11)

In [None]:
# Preview the first rows of the merged matrix.
final_feature_matrix.head(n=5)

Unnamed: 0,user_id,product_id,user_product_purchase_count,user_product_reorder_ratio,user_product_days_since_last_purchase,user_total_orders,user_reorder_ratio,user_avg_basket_size,user_mean_days_between_orders,user_days_since_last_order,log_user_total_orders
0,1,196,10,0.9,19.0,10,0.694915,5.9,20.25926,19.0,2.397895
1,1,10258,9,0.888889,19.0,10,0.694915,5.9,20.25926,19.0,2.397895
2,1,10326,1,0.0,28.0,10,0.694915,5.9,20.25926,19.0,2.397895
3,1,12427,10,0.9,19.0,10,0.694915,5.9,20.25926,19.0,2.397895
4,1,13032,3,0.666667,30.0,10,0.694915,5.9,20.25926,19.0,2.397895


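The user–product features and the user-level features from the previous steps were consolidated into a single merged feature matrix with one row per (user, product) pair.  
The product-level, order-time, and last-3-orders window features computed above are not yet part of this matrix and must be merged in separately if they are to be used. This dataset is the input space for downstream classification and regression models.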