In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

In [63]:
df = pd.read_csv('../data/cleaned_data.csv')
df['order_date'] = pd.to_datetime(df['order_date'])
df['region'] = df['region'].astype('category')

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88076 entries, 0 to 88075
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   anon_user_id         88076 non-null  int64         
 1   order_date           88076 non-null  datetime64[ns]
 2   price_tier           88076 non-null  int64         
 3   dose_bucket          88076 non-null  int64         
 4   region               88076 non-null  category      
 5   is_partner_pharmacy  88076 non-null  int64         
 6   is_paid              88076 non-null  int64         
dtypes: category(1), datetime64[ns](1), int64(5)
memory usage: 4.1 MB
None


In [73]:
print("--- USER RE-ORDER BEHAVIOUR FEATURES ---")

df = df.sort_values(by=['anon_user_id', 'order_date']).reset_index(drop=True)
df['order_number'] = df.groupby('anon_user_id').cumcount() + 1

# days between orders
df['days_since_last_order'] = df.groupby('anon_user_id')['order_date'].diff().dt.days
df['days_since_last_order'] = df['days_since_last_order'].fillna(0).astype(int)

df['dose_change'] = df.groupby('anon_user_id')['dose_bucket'].diff().fillna(0).astype(int)

df['total_orders'] = df.groupby('anon_user_id')['order_number'].transform('max')

--- USER RE-ORDER BEHAVIOUR FEATURES ---


In [65]:
user_intervals = df.groupby('anon_user_id')['days_since_last_order'].agg(['mean', 'median', 'max']).reset_index()

In [79]:
print("--- REGION-WISE ORDER PATTERNS ---")
print(df['region'].value_counts())
region_retention = df.groupby('region')['days_since_last_order'].agg(['mean', 'median', 'max']).reset_index()
print(region_retention)

region_num_orders = df.groupby('region')['order_number'].agg(['mean', 'median', 'max']).reset_index()
print(region_num_orders)


--- REGION-WISE ORDER PATTERNS ---
region
Ontario     32755
Quebec      23045
West        22286
Other        5160
Prairies     4830
Name: count, dtype: int64
     region       mean  median  max
0   Ontario  26.134483    23.0  362
1     Other  25.515891    24.0  348
2  Prairies  27.097516    25.0  318
3    Quebec  26.686396    24.0  363
4      West  26.167280    23.0  358
     region      mean  median  max
0   Ontario  4.310365     3.0   28
1     Other  3.710465     3.0   24
2  Prairies  4.151346     3.0   24
3    Quebec  3.965372     3.0   30
4      West  4.050301     3.0   27


  region_retention = df.groupby('region')['days_since_last_order'].agg(['mean', 'median', 'max']).reset_index()
  region_num_orders = df.groupby('region')['order_number'].agg(['mean', 'median', 'max']).reset_index()


In [75]:
print("--- PHARMACY-LEVEL ORDER PATTERNS ---")
pharmacy_retention = df.groupby('is_partner_pharmacy')['days_since_last_order'].agg(['mean', 'median', 'max']).reset_index()
print(pharmacy_retention)

pharmacy_num_orders = df.groupby('is_partner_pharmacy')['total_orders'].agg(['mean', 'median', 'max']).reset_index()
print(pharmacy_num_orders)

--- PHARMACY-LEVEL ORDER PATTERNS ---
   is_partner_pharmacy       mean  median  max
0                    0  25.787572    24.0  363
1                    1  26.951609    23.0  362
   is_partner_pharmacy      mean  median  max
0                    0  5.857361     5.0   30
1                    1  8.932125     9.0   30


In [85]:
# number of total users with dosage change
dosage_change_counts = df[df['dose_change'] != 0].groupby('anon_user_id').size().reset_index(name='dosage_change_count')
# relationship between dosage changes and total orders
dosage_change_analysis = dosage_change_counts.merge(df.groupby('anon_user_id')['total_orders'].max().reset_index(), on='anon_user_id')
print(dosage_change_analysis.describe())

       anon_user_id  dosage_change_count  total_orders
count  1.565000e+03          1565.000000   1565.000000
mean   1.850232e+17             1.233227      8.075399
std    5.355709e+18             0.521821      5.120736
min   -9.221975e+18             1.000000      2.000000
25%   -4.571146e+18             1.000000      4.000000
50%    4.210113e+17             1.000000      6.000000
75%    4.911719e+18             1.000000     11.000000
max    9.197411e+18             5.000000     30.000000
