### Imports

In [1]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import parallel_coordinates

In [2]:
# Display configuration: never truncate the text shown inside a DataFrame cell.
pd.options.display.max_colwidth = None

### Loading data

In [3]:
# Location of the basket table; the preview below shows one True/False column per product category.
path = '~/ecomm-open-cdp/items_ohe_2019_oct.csv'
# read_csv (without index_col) already returns a fresh 0..n-1 RangeIndex, so the
# former reset_index(drop=True) only made an extra copy of the whole frame.
items_df = pd.read_csv(path)

In [4]:
# 'Unnamed: 0' is not an item column (presumably the row index saved into the CSV - confirm).
# errors='ignore' keeps this cell re-runnable: because it reassigns items_df, a second
# execution would otherwise raise KeyError once the column is already gone.
items_df = items_df.drop(columns=['Unnamed: 0'], errors='ignore')
items_df.head()

Unnamed: 0,accessories.bag,accessories.umbrella,accessories.wallet,apparel.belt,apparel.costume,apparel.dress,apparel.jacket,apparel.jeans,apparel.jumper,apparel.scarf,...,kids.skates,kids.swing,kids.toys,medicine.tools.tonometer,sport.bicycle,sport.ski,sport.snowboard,sport.tennis,sport.trainer,stationery.cartrige
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Market Basket analysis

In [5]:
# Exploring support: the support of an item is the mean of its column, i.e. the
# share of rows in which the item is flagged True.
# DataFrame.mean() is the built-in column-wise reduction and replaces apply(np.mean).
supports = items_df.mean()
supports

accessories.bag         0.002452
accessories.umbrella    0.000048
accessories.wallet      0.000577
apparel.belt            0.000041
apparel.costume         0.000957
                          ...   
sport.ski               0.000033
sport.snowboard         0.000012
sport.tennis            0.000019
sport.trainer           0.000722
stationery.cartrige     0.000219
Length: 121, dtype: float64

In [6]:
# Ten items with the highest support.
supports.nlargest(10)

electronics.smartphone              0.589841
electronics.audio.headphone         0.056664
electronics.video.tv                0.039745
electronics.clocks                  0.033081
appliances.kitchen.washer           0.029997
computers.notebook                  0.028236
appliances.environment.vacuum       0.023924
appliances.kitchen.refrigerators    0.021000
electronics.tablet                  0.010202
auto.accessories.player             0.008786
dtype: float64

In [7]:
# Bar chart of the ten highest-support items.
top_supports = supports.nlargest(10)

fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(top_supports.index, top_supports)
ax.set(xlabel='Products', ylabel='Support')
plt.xticks(rotation=90)  # category names are long; draw them vertically
plt.show()

  plt.show()


In [None]:
# Ten items with the lowest support.
supports.nsmallest(10)

In [None]:
# Bar chart of the ten lowest-support items.
bottom_supports = supports.nsmallest(10)  # computed once and reused for labels and heights

fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(bottom_supports.index, bottom_supports)
ax.set_xlabel('Products')
ax.set_ylabel('Support')
plt.xticks(rotation=90)
# tight_layout() fits the layout to the artists that exist when it is called, so it
# has to run after the bars, axis labels and rotated tick labels are in place.
fig.tight_layout()
plt.show()

In [None]:
# Apriori: mine itemsets of at most 4 items whose support is at least 0.0001,
# labelled with the column names rather than column positions.
frequent_itemsets = apriori(
    items_df,
    min_support=0.0001,
    max_len=4,
    use_colnames=True,
)

In [None]:
# Number of frequent itemsets returned by apriori.
len(frequent_itemsets)

In [None]:
# Inspect the mined itemsets.
frequent_itemsets

In [None]:
# Build association rules from the frequent itemsets and prune them by lift,
# using 1.1 as the minimum threshold.
rules = association_rules(
    frequent_itemsets,
    metric='lift',
    min_threshold=1.1,
)

In [None]:
# Number of rules that survived the lift pruning.
len(rules)

In [None]:
# Inspect the pruned rules.
rules

### Visualizing rules

In [None]:
# Convert the antecedent/consequent itemsets into comma-separated strings so they can
# serve as axis labels and grouping keys.
# - sorted(): set iteration order is arbitrary, so sorting makes each label deterministic.
# - isinstance guard: this cell overwrites the columns in place; without the guard a
#   second run would iterate over the characters of the already-joined string
#   ('a,b' -> 'a,,,b') and silently corrupt every label.
for col in ('antecedents', 'consequents'):
    rules[col] = rules[col].apply(
        lambda items: items if isinstance(items, str) else ','.join(sorted(items))
    )

In [None]:
# Reshape the rules into an antecedents x consequents matrix holding the lift of each rule.
lift_table = rules.pivot(
    index='antecedents',
    columns='consequents',
    values='lift',
)
lift_table

In [None]:
# Heatmap of the lift matrix, with each cell annotated with its value.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(data=lift_table, annot=True, ax=ax)
plt.show()

In [None]:
# Parallel-coordinates plot with the antecedents and consequents columns as axes.
rules['rule'] = rules.index  # row number of the rule, used below as the class column
coords = rules.loc[:, ['antecedents', 'consequents', 'rule']]

fig, ax = plt.subplots(figsize=(12, 10))
parallel_coordinates(coords, class_column='rule', colormap='ocean', ax=ax)
plt.show()

### Cross-sells and promotional bundles

In [None]:
# Cross-sells and promotional bucket heuristic:
# 1- Group by lift/support to gather bidirectional relations
# 2- Compare the confidence of the two directions:
#     a- If the difference in confidence is less than a threshold,
#        create a bucket - the items are very closely related
#     b- Otherwise, use the strongest directional relation for cross-sell

#### Cross-sells and promotional bucket heuristic

1. Group by lift/support to gather bidirectional relations.
2. Compare the confidence of the two directions:
   - a. If the difference in confidence is less than a threshold, create a bucket: the items are very closely related.
   - b. Otherwise, use the strongest directional relation for cross-sell.

#### Promotional bundle logic

In [None]:
# Largest allowed gap between the confidences of the first two rules sharing a lift
# value for those rules to be kept as a promotional bundle.
CONF_THRESH = 0.05

In [None]:
# Round lift to 5 decimals before it is used as a grouping key.
rules['lift'] = rules['lift'].round(5)

In [None]:
# Promotional bundles logic.
# Rules sharing the same (rounded) lift are treated as the directions of one relation.
# A lift group is kept when the confidences of its first two rules differ by less
# than CONF_THRESH.
# The len() guard skips groups containing a single rule; indexing .iloc[1] on such a
# group would raise IndexError and abort the whole cell.
promos_filt = rules.groupby('lift').filter(
    lambda group: len(group) >= 2
    and np.abs(group['confidence'].iloc[0] - group['confidence'].iloc[1]) < CONF_THRESH
)

# One row per lift value, listing the antecedents of the rules in that group.
# Selecting the column before apply() returns a Series even when promos_filt is
# empty, so to_frame() keeps working, and no grouping column is passed to apply.
promos = (
    promos_filt.groupby('lift')['antecedents']
    .apply(list)
    .to_frame('promo_buckets')
    .reset_index()
)

#### Promotional bundles

In [None]:
# Promotional bundles: one row per lift value with the list of bundled antecedents.
promos

#### Cross-sell logic

In [None]:
# Cross-sell candidates: rules whose antecedent does not appear among the
# antecedents already kept for promotional bundles.
in_promo = rules['antecedents'].isin(promos_filt['antecedents'])
cross_sells_cand = rules.loc[~in_promo]
cross_sells_cand

In [None]:
# For every lift value keep only the rule(s) with the highest confidence, i.e. the
# strongest direction of each relation.
# transform('max') broadcasts each group's maximum back onto its rows, which replaces
# the per-group apply(); that apply relied on the grouping column being passed to the
# function, a behaviour deprecated in recent pandas releases.
best_confidence = cross_sells_cand.groupby('lift')['confidence'].transform('max')
cross_sells = (
    cross_sells_cand[cross_sells_cand['confidence'] == best_confidence]
    .sort_values('lift', kind='stable')  # same ordering as before: by lift, ties in original order
    .reset_index(drop=True)
)

#### Cross-sell items

In [None]:
# Cross-sell rules: the highest-confidence rule(s) for each lift value.
cross_sells