## Import Packages

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from operator import attrgetter
from scipy import stats
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules

  return datetime.utcnow().replace(tzinfo=utc)


## Import CSV

In [None]:
# Load the raw retail export; the first line of the file supplies the column names.
df = pd.read_csv(
    "/content/drive/MyDrive/online_retail_II.csv",
    header=0,
)
df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


## Data Cleaning

In [None]:
# Work on a copy so the raw frame loaded from the CSV stays untouched.
df_resik = df.copy()

# Parse the invoice timestamp once. errors='coerce' turns unparsable values into
# NaT instead of aborting the cell (a preceding astype('datetime64[ns]') would
# raise on such values and make the coercion pointless).
df_resik['InvoiceDate'] = pd.to_datetime(df_resik['InvoiceDate'], errors='coerce')

# Month bucket (period[M]) derived from the invoice timestamp.
df_resik['year_month'] = df_resik['InvoiceDate'].dt.to_period('M')

# Drop rows that lack a customer id or a description.
# .copy() makes the result an independent frame so the column assignments
# below cannot raise SettingWithCopyWarning.
df_resik = df_resik.dropna(subset=['Customer ID', 'Description']).copy()

# Normalise product descriptions to lower case.
df_resik['Description'] = df_resik['Description'].str.lower()

# Keep only rows whose StockCode is NOT made up solely of digits
# (a missing StockCode counts as "not all digits" and is kept).
# The pattern is a raw string so that \d is not an invalid escape sequence.
# NOTE(review): the comment that used to sit here talked about removing codes
# such as "p", "w" and "post", but this mask discards every all-digit code and
# retains the letter-bearing ones (e.g. 79323P). Confirm the intended direction.
has_non_digit_code = ~df_resik['StockCode'].str.match(r'^\d+$', na=False)
df_resik = df_resik[has_non_digit_code].copy()

# Flag the order status: invoices whose number starts with 'C' are cancellations.
# NOTE(review): the labels (leading space in ' Cancelled', spelling of
# 'Delivereed') are kept verbatim because later cells may compare against them.
df_resik['order_status'] = np.where(df_resik['Invoice'].str[:1] == 'C', ' Cancelled', 'Delivereed')

# Use the magnitude of the quantity (negative quantities become positive) ...
df_resik['Quantity'] = df_resik['Quantity'].abs()

# ... so the only rows left to discard here are those with zero quantity.
df_resik = df_resik[df_resik['Quantity'] > 0].copy()

# Line amount = quantity * unit price.
df_resik['Amount'] = df_resik['Quantity'] * df_resik['Price']

# For every StockCode pick the description that appears on the most distinct
# invoices. After sorting by order count (descending) within each StockCode,
# the first row per StockCode is the winner; ties resolve to the first row in
# that sorted order.
most_freq_product_name = (
    df_resik
    .groupby(['StockCode', 'Description'], as_index=False)
    .agg(order_cnt=('Invoice', 'nunique'))
    .sort_values(['StockCode', 'order_cnt'], ascending=[True, False])
    .drop_duplicates('StockCode', keep='first')
    .drop(columns=['order_cnt'])
)

# Replace each row's description with the most frequent one for its StockCode.
df_resik = df_resik.merge(
    most_freq_product_name.rename(columns={'Description': 'most_freq_product_name'}),
    on='StockCode',
    how='left'
)
df_resik['Description'] = df_resik['most_freq_product_name']
df_resik = df_resik.drop(columns=['most_freq_product_name'])

# Store the customer id as a string.
# NOTE(review): the ids are floats at this point, so the string form keeps the
# trailing ".0" (e.g. "13085.0") - confirm whether that is wanted.
df_resik['Customer ID'] = df_resik['Customer ID'].astype(str)

# Outlier removal: keep rows whose Quantity and Amount are both within
# 3 standard deviations of their column mean (|z-score| < 3).
z_scores = np.abs(stats.zscore(df_resik[['Quantity', 'Amount']]))
df_resik = df_resik[(z_scores < 3).all(axis=1)].reset_index(drop=True)
df_resik

  df_resik = df_resik[~df_resik['StockCode'].str.match('^\d+$', case=False, na=False)]


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,year_month,order_status,Amount
0,489434,79323P,pink cherry lights,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,2009-12,Delivereed,81.00
1,489434,79323W,white cherry lights,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,2009-12,Delivereed,81.00
2,489436,48173C,doormat black flock,10,2009-12-01 09:06:00,5.95,13078.0,United Kingdom,2009-12,Delivereed,59.50
3,489436,35004B,set of 3 black flying ducks,12,2009-12-01 09:06:00,4.65,13078.0,United Kingdom,2009-12,Delivereed,55.80
4,489436,84596F,small marshmallows pink bowl,8,2009-12-01 09:06:00,1.25,13078.0,United Kingdom,2009-12,Delivereed,10.00
...,...,...,...,...,...,...,...,...,...,...,...
93933,581579,85099C,jumbo bag baroque black white,10,2011-12-09 12:19:00,1.79,17581.0,United Kingdom,2011-12,Delivereed,17.90
93934,581580,84993A,75 green petit four cases,2,2011-12-09 12:20:00,0.42,12748.0,United Kingdom,2011-12,Delivereed,0.84
93935,581580,85049A,traditional christmas ribbons,1,2011-12-09 12:20:00,1.25,12748.0,United Kingdom,2011-12,Delivereed,1.25
93936,581580,85049E,scandinavian reds ribbons,2,2011-12-09 12:20:00,1.25,12748.0,United Kingdom,2011-12,Delivereed,2.50


In [None]:
# Summarise the cleaned frame: column dtypes, non-null counts and memory usage.
df_resik.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93938 entries, 0 to 93937
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Invoice       93938 non-null  object        
 1   StockCode     93938 non-null  object        
 2   Description   93938 non-null  object        
 3   Quantity      93938 non-null  int64         
 4   InvoiceDate   93938 non-null  datetime64[ns]
 5   Price         93938 non-null  float64       
 6   Customer ID   93938 non-null  object        
 7   Country       93938 non-null  object        
 8   year_month    93938 non-null  period[M]     
 9   order_status  93938 non-null  object        
 10  Amount        93938 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(6), period[M](1)
memory usage: 7.9+ MB


## Prepare the online retail basket data

Create the online retail basket DataFrame (one row per invoice, one column per product description).

In [None]:
# Basket matrix: one row per invoice, one column per product description.
# Each cell holds the number of distinct stock codes recorded for that product
# on that invoice; combinations that never occur are filled with 0.
london = df_resik.pivot_table(
    index='Invoice',
    columns='Description',
    values='StockCode',
    aggfunc='nunique',
    fill_value=0,
)
london

Description,4 purple flock dinner candles,silver cherry lights,white cherry lights,3 black cats w hearts blank card,3 gardenia morris boxed candles,3 rose morris boxed candles,3 white choc morris boxed candles,3d dog picture playing cards,3d sheet of cat stickers,3d sheet of dog stickers,...,yellow purple daisy felt purse kit,yellow red flower piggy bank,yellow shark helicopter,yellow vw beetle ceramic money box,yellow/blue retro radio,yellow/orange flower design plate,yellow/pink ceramic candle holder,yellow/pink flower design big mug,yuletide images gift wrap set,yuletide images s/6 paper boxes
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489434,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489439,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C581228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C581229,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C581330,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C581409,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Summarise the basket matrix: index range, number of columns, dtypes and memory usage.
london.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26005 entries, 489434 to C581499
Columns: 1300 entries,  4 purple flock dinner candles to yuletide images s/6 paper boxes
dtypes: int64(1300)
memory usage: 258.1+ MB


## Encode the basket DataFrame: True for all values above 0 and False for all values equal to 0

In [None]:
def encode(x):
  """Return True when a basket count is positive, otherwise False.

  Always returns a plain bool: zero, negative and NaN inputs all give False
  (a comparison with NaN is False), so no input falls through to an implicit
  None.

  Args:
    x: A single numeric cell value from the basket matrix.

  Returns:
    bool: True if x > 0, else False.
  """
  return bool(x > 0)

# Boolean basket matrix: True where the invoice contains the product (count > 0),
# False otherwise. A vectorized comparison replaces the deprecated, element-wise
# DataFrame.applymap(encode) and yields the same bool frame for the
# non-negative counts held in `london`.
london_encode = london > 0
london_encode

  london_encode = london.applymap(encode)


Description,4 purple flock dinner candles,silver cherry lights,white cherry lights,3 black cats w hearts blank card,3 gardenia morris boxed candles,3 rose morris boxed candles,3 white choc morris boxed candles,3d dog picture playing cards,3d sheet of cat stickers,3d sheet of dog stickers,...,yellow purple daisy felt purse kit,yellow red flower piggy bank,yellow shark helicopter,yellow vw beetle ceramic money box,yellow/blue retro radio,yellow/orange flower design plate,yellow/pink ceramic candle holder,yellow/pink flower design big mug,yuletide images gift wrap set,yuletide images s/6 paper boxes
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489434,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
489436,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
489437,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
489438,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
489439,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C581228,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C581229,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C581330,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C581409,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Summarise the boolean basket matrix: index range, number of columns, dtypes and memory usage.
london_encode.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26005 entries, 489434 to C581499
Columns: 1300 entries,  4 purple flock dinner candles to yuletide images s/6 paper boxes
dtypes: bool(1300)
memory usage: 32.4+ MB



## Take transactions with more than 1 unique product

In [None]:
# Keep only the invoices (rows) on which more than one product column is set.
london_filter = london_encode.loc[
    lambda basket: basket.gt(0).sum(axis=1).gt(1)
]
london_filter

Description,4 purple flock dinner candles,silver cherry lights,white cherry lights,3 black cats w hearts blank card,3 gardenia morris boxed candles,3 rose morris boxed candles,3 white choc morris boxed candles,3d dog picture playing cards,3d sheet of cat stickers,3d sheet of dog stickers,...,yellow purple daisy felt purse kit,yellow red flower piggy bank,yellow shark helicopter,yellow vw beetle ceramic money box,yellow/blue retro radio,yellow/orange flower design plate,yellow/pink ceramic candle holder,yellow/pink flower design big mug,yuletide images gift wrap set,yuletide images s/6 paper boxes
Invoice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489434,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
489436,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
489437,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
489438,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
489439,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C579926,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C579929,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C580954,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C580971,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Summarise the filtered basket matrix: index range, number of columns, dtypes and memory usage.
london_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16581 entries, 489434 to C581229
Columns: 1300 entries,  4 purple flock dinner candles to yuletide images s/6 paper boxes
dtypes: bool(1300)
memory usage: 20.7+ MB


## Applying the Apriori algorithm

Create a list of frequent itemsets (sets of products that are frequently purchased together)

In [None]:
# Mine itemsets with a support of at least 0.01 over the multi-product invoices,
# order them from most to least frequent, and record how many products each
# itemset contains.
frequent_itemsets = (
    apriori(london_filter, min_support=.01, use_colnames=True)
    .sort_values('support', ascending=False)
    .reset_index(drop=True)
    .assign(product_cnt=lambda itemsets_df: itemsets_df['itemsets'].apply(len))
)
frequent_itemsets

Unnamed: 0,support,itemsets,product_cnt
0,0.230083,(white hanging heart t-light holder),1
1,0.162837,(jumbo bag red retrospot),1
2,0.091551,(wooden frame antique white ),1
3,0.084494,(jumbo bag strawberry),1
4,0.079850,(jumbo bag baroque black white),1
...,...,...,...
154,0.010192,(rose scent candle in jewelled box),1
155,0.010132,"(feather pen,coal black)",1
156,0.010132,"(black/blue polkadot umbrella, edwardian paras...",2
157,0.010132,"(white hanging heart t-light holder, tea time ...",2


## Calculate the support, confidence, and lift values of each possible product pair

In [None]:
# Derive association rules from the frequent itemsets, keeping only rules whose
# confidence is at least 0.7, then list the best-supported (and, within equal
# support, most confident) rules first.
assosiation_product = (
    association_rules(
        frequent_itemsets,
        metric='confidence',
        min_threshold=0.7,
    )
    .sort_values(['support', 'confidence'], ascending=[False, False])
    .reset_index(drop=True)
)
assosiation_product


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(blue 3 piece mini dots cutlery set),(pink 3 piece mini dots cutlery set),0.052349,0.05838,0.03703,0.707373,12.11669,1.0,0.033974,3.217819,0.968151,0.502455,0.689231,0.670835
1,(blue happy birthday bunting),(pink happy birthday bunting),0.035402,0.045293,0.026476,0.747871,16.511906,1.0,0.024873,3.786575,0.973916,0.48832,0.735909,0.666212
2,(green 3 piece mini dots cutlery set),(red 3 piece mini dots cutlery set),0.032326,0.054158,0.023702,0.733209,13.538238,1.0,0.021951,3.545253,0.957074,0.377522,0.717933,0.585424
3,(green 3 piece mini dots cutlery set),(pink 3 piece mini dots cutlery set),0.032326,0.05838,0.022978,0.710821,12.175745,1.0,0.021091,3.256182,0.948532,0.33927,0.692892,0.552208
4,"(red 3 piece mini dots cutlery set, pink 3 pie...",(blue 3 piece mini dots cutlery set),0.030758,0.052349,0.022857,0.743137,14.195805,1.0,0.021247,3.689328,0.959055,0.379379,0.728948,0.589887
5,"(red 3 piece mini dots cutlery set, blue 3 pie...",(pink 3 piece mini dots cutlery set),0.032507,0.05838,0.022857,0.703154,12.044418,1.0,0.02096,3.172082,0.947784,0.335993,0.68475,0.547341
6,"(jumbo bag strawberry, jumbo bag baroque blac...",(jumbo bag red retrospot),0.030155,0.162837,0.022194,0.736,4.519858,1.0,0.017284,3.171072,0.802968,0.129944,0.684649,0.436148
7,(small marshmallows pink bowl),(small dolly mix design orange bowl),0.024305,0.029974,0.018334,0.754342,25.166503,1.0,0.017606,3.948691,0.984185,0.510067,0.746752,0.683006
8,"(green 3 piece mini dots cutlery set, blue 3 p...",(pink 3 piece mini dots cutlery set),0.020505,0.05838,0.017731,0.864706,14.811661,1.0,0.016534,6.959799,0.952007,0.289941,0.856318,0.584212
9,"(pink 3 piece mini dots cutlery set, green 3 p...",(red 3 piece mini dots cutlery set),0.022978,0.054158,0.017731,0.771654,14.248093,1.0,0.016487,4.142134,0.951683,0.298477,0.758579,0.549524
