### Imports

In [1]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates

In [2]:
# setting up pandas options
# max_colwidth=None removes the column-width limit, so long cell values
# (item lists, rule labels) are displayed in full instead of being truncated
pd.set_option('display.max_colwidth', None)

### Loading data

In [3]:
# Raw event file that the rest of the notebook reads from
path = '~/ecomm-open-cdp/2019-Oct.csv'
# Read just the first data row (first column used as the index) to inspect the
# column layout without loading the whole file
header = pd.read_csv(path, index_col=0, nrows=1)
header

Unnamed: 0_level_0,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
event_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c


In [None]:
# Stream the file in 10,000-row chunks and keep, from each chunk, only the
# purchase events that have a category_code; the surviving rows are then
# concatenated into one frame.
data_chunks = pd.read_csv(path, chunksize=10000, iterator=True)
store_data = pd.concat([
    chunk[chunk['category_code'].notna() & (chunk['event_type'] == 'purchase')]
    for chunk in data_chunks
])

In [None]:
store_data.info()

In [None]:
store_data.head()

In [None]:
store_data.shape

In [None]:
#saving filtered data
# DataFrame.to_csv is the pandas CSV writer (the top-level pandas module has no
# save_csv function, and the frame to write must be named explicitly).
# index=False leaves the row labels out so the saved file has the same columns
# as the source CSV.
store_data.to_csv('~/ecomm-open-cdp/2019-csv-filt.csv', index=False)

### Profiling data

In [None]:
# Build the profiling report for the filtered purchases. The title is passed by
# keyword: the second positional parameter of ProfileReport is `minimal`, so a
# positional string would be taken as that flag instead of as the report title.
data_profile = ProfileReport(store_data, title='Ecommerce data profile', explorative=True)

In [None]:
data_profile.to_widgets()

### Filtering data

In [None]:
# Columns kept for basket construction: the two keys used to group rows into
# transactions, plus the category code used as the item label
FILT_COLS = ['user_id', 'user_session', 'category_code']

In [None]:
# Restrict the purchase rows to the columns listed in FILT_COLS
filt_data = store_data[FILT_COLS]
filt_data.head()

### Preparing transaction data

In [None]:
# One group per (user_id, user_session) pair; each group is treated as one
# transaction in the steps that follow
trans_groups = filt_data.groupby(['user_id','user_session'])

In [None]:
print('Total transactions in the dataset', trans_groups.ngroups)

In [None]:
group_sizes =  trans_groups.size()
group_sizes

In [None]:
#Transaction aggregator
def transaction_agg(df):
    """Collapse one group of purchase rows into a single-row frame.

    The returned DataFrame has one column, ``items``, whose only cell holds a
    list of the distinct ``category_code`` values found in ``df``. The order
    of the values inside that list is not defined.
    """
    distinct_codes = set(df['category_code'])
    basket = list(distinct_codes)
    return pd.DataFrame({'items': [basket]})

In [None]:
#aggregating transactions across userids and sessionids
# apply() calls transaction_agg once per (user_id, user_session) group;
# reset_index() then turns the index levels (including the group keys) back
# into ordinary columns next to 'items'
transactions_df = trans_groups.apply(transaction_agg).reset_index()

In [None]:
transactions_df

In [None]:
# Plain Python list of baskets: one list of category codes per transaction
transactions = transactions_df['items'].tolist()

In [None]:
#one hot encoding transactions
# Learn the item vocabulary and encode every basket in one call; the encoded
# array is wrapped in a frame whose columns are the items the encoder learned
te = TransactionEncoder()
items = te.fit_transform(transactions)
items_df = pd.DataFrame(items, columns=te.columns_)

### Market Basket analysis

In [None]:
#exploring support
# The mean of each one-hot column is the share of transactions that contain
# that item, i.e. the item's support
supports = items_df.mean()
supports

In [None]:
supports.nlargest(10)

In [None]:
# supports plot 10 largest
# Pick the ten highest-support items once and reuse them for both the bar
# positions and the bar heights
top_supports = supports.nlargest(10)
fig, ax = plt.subplots(figsize=(7,5))
ax.bar(top_supports.index, top_supports)
ax.set(xlabel='Products', ylabel='Support')
# Item names are long dotted category codes, so draw them vertically
plt.xticks(rotation=90)
plt.show()

In [None]:
supports.nsmallest(10)

In [None]:
# supports plot 10 smallest
# Pick the ten lowest-support items once and reuse them for both the bar
# positions and the bar heights
bottom_supports = supports.nsmallest(10)
fig, ax = plt.subplots(figsize=(7,5))
ax.bar(bottom_supports.index, bottom_supports)
ax.set_xlabel('Products')
ax.set_ylabel('Support')
plt.xticks(rotation=90)
# tight_layout() fits the margins to what is on the figure at the moment it is
# called, so it has to run after the bars, axis labels and rotated tick labels
# exist; called on the empty figure it cannot make room for the long labels.
plt.tight_layout()
plt.show()

In [None]:
#apriori algorithm
# Mine itemsets of at most 4 items whose support is at least 0.0001;
# use_colnames=True reports itemsets by item name rather than column index
frequent_itemsets = apriori(items_df, min_support = 0.0001, max_len = 4, use_colnames=True)

In [None]:
len(frequent_itemsets)

In [None]:
frequent_itemsets

In [None]:
# get association rules with pruning
# Derive rules from the frequent itemsets and keep only those whose lift
# reaches the 1.1 threshold
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.1) 

In [None]:
len(rules)

In [None]:
rules

### Visualizing rules

In [None]:
# Flatten each antecedent / consequent itemset into a comma-separated label
# for the tables and plots below. The items are sorted before joining: set
# iteration order is arbitrary, and sorting gives every itemset one canonical
# label, so string comparisons on these columns (pivot keys, isin lookups)
# are reliable and the labels are the same from run to run.
rules['antecedents'] = rules['antecedents'].apply(lambda a: ','.join(sorted(a)))
rules['consequents'] = rules['consequents'].apply(lambda a: ','.join(sorted(a)))

In [None]:
# Antecedent-by-consequent matrix of lift values; pairs with no rule are
# left empty (NaN)
lift_table = rules.pivot(index='antecedents', columns='consequents', values='lift')
lift_table

In [None]:
# Heatmap of the lift matrix, with each cell annotated with its value
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(lift_table, annot=True, ax=ax)
plt.show()

In [None]:
# Use the row index as a per-rule label; it is the class column that
# parallel_coordinates uses to draw and colour one line per rule
rules['rule'] = rules.index
coords = rules[['antecedents','consequents','rule']]
fig, ax = plt.subplots(figsize=(12,10))
# Each line links a rule's antecedent (left axis) to its consequent (right axis)
parallel_coordinates(coords, 'rule', colormap = 'ocean', ax=ax)
plt.show()

### Cross-sells and promotional bundles

In [None]:
# Cross-sell and promotional bucket heuristic:
# 1- Group rules by lift/support to gather bidirectional relations
# 2- Compare the confidence of the rules within each group:
#     a- If the difference in confidence is less than a threshold, create a bucket - the items are very closely related
#     b- Else - use the strongest directional relation for cross-sell

#### Cross-sell and promotional bucket heuristic:
#### 1. Group rules by lift/support to gather bidirectional relations
#### 2. Compare the confidence of the rules within each group:
####   a- If the difference in confidence is less than a threshold, create a bucket - the items are very closely related
####   b- Else - use the strongest directional relation for cross-sell

#### Promotional bundle logic

In [None]:
# Upper bound on the confidence gap between the rules of one lift group for
# that group to be treated as a promotional bundle
CONF_THRESH = 0.05

In [None]:
#rounding off floats before grouping
# Rounding to 5 decimals lets rules with the same lift land in the same
# groupby key despite floating-point noise
rules['lift'] = rules['lift'].round(5)

In [None]:
# promotional bundles logic
# Keep the lift groups whose first two rules have nearly equal confidence
# (gap below CONF_THRESH). The length guard skips groups that hold a single
# rule, which would otherwise raise IndexError on .iloc[1]; such groups are
# simply not treated as bundles.
promos_filt = rules.groupby('lift').filter(
    lambda x: len(x) > 1
    and np.abs(x['confidence'].iloc[0] - x['confidence'].iloc[1]) < CONF_THRESH
)

# One row per retained lift value, listing the antecedents of its rules as the
# bundle. Selecting the column before apply() keeps the result a Series even
# when no group passed the filter.
promos = (
    promos_filt.groupby('lift')['antecedents']
    .apply(list)
    .to_frame('promo_buckets')
    .reset_index()
)

#### Promotional bundles

In [None]:
promos

#### Cross-sell logic

In [None]:
#cross sell logic
# Candidates are the rules whose antecedent label does not occur among the
# antecedents of the promotional-bundle rules.
# NOTE(review): the exclusion is by antecedent label, not by rule, so a rule
# outside every bundle is still dropped when its antecedent appears in one -
# confirm this is intended.
cross_sells_cand = rules.loc[~rules['antecedents'].isin(promos_filt['antecedents'])]
cross_sells_cand

In [None]:
# Within each lift group keep the rule(s) with the highest confidence, i.e.
# the strongest direction of the relation; rules tied at the maximum are all
# kept. reset_index(drop=True) discards the group/row index left by apply().
cross_sells = cross_sells_cand.groupby('lift')\
               .apply(lambda x: x[x['confidence'] ==  x['confidence'].max()])\
               .reset_index(drop=True)

#### Cross-sell items

In [None]:
cross_sells