In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import networkx as nx
import warnings
from itertools import permutations

from mlxtend.preprocessing import TransactionEncoder

In [2]:
class Columns:
    INVOICE_NO = 'invoice_no'
    STOCK_CODE = 'stock_code'
    DESCRIPTION = 'description'

In [3]:
gifts_df = pd.read_csv('../datasets/market_basket/online_retail.csv')

gifts_df.rename(columns={ \
    'InvoiceNo' : 'invoice_no', \
    'StockCode' : 'stock_code', \
    'Description' : 'description' \
    }, inplace=True)

gifts_df.head()

Unnamed: 0,invoice_no,stock_code,description
0,562583,35637A,IVORY STRING CURTAIN WITH POLE
1,562583,35638A,PINK AND BLACK STRING CURTAIN
2,562583,84927F,PSYCHEDELIC TILE HOOK
3,562583,22425,ENAMEL COLANDER CREAM
4,562583,16008,SMALL FOLDING SCISSOR(POINTED EDGE)


In [4]:
gifts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227760 entries, 0 to 227759
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   invoice_no   227760 non-null  object
 1   stock_code   227760 non-null  object
 2   description  227404 non-null  object
dtypes: object(3)
memory usage: 5.2+ MB


In [5]:
#Remove leading and trailing characters in the Description column
gifts_df[Columns.DESCRIPTION] = gifts_df[Columns.DESCRIPTION].str.strip()
gifts_df.head()

Unnamed: 0,invoice_no,stock_code,description
0,562583,35637A,IVORY STRING CURTAIN WITH POLE
1,562583,35638A,PINK AND BLACK STRING CURTAIN
2,562583,84927F,PSYCHEDELIC TILE HOOK
3,562583,22425,ENAMEL COLANDER CREAM
4,562583,16008,SMALL FOLDING SCISSOR(POINTED EDGE)


In [6]:
#Dropping the rows without any invoice number
row_count = len(gifts_df)

gifts_df.dropna(subset=[Columns.INVOICE_NO], inplace=True)
gifts_df[Columns.INVOICE_NO] = gifts_df[Columns.INVOICE_NO].astype('str')

print(f'Row count dropped from {row_count} to {len(gifts_df)}')

gifts_df.dtypes

Row count dropped from 227760 to 227760


invoice_no     object
stock_code     object
description    object
dtype: object

In [7]:
# Dropping all transactions which were done on credit
row_count = len(gifts_df)

filt = ~gifts_df[Columns.INVOICE_NO].str.contains('C')
gifts_df = gifts_df[filt]

print(f'Row count dropped from {row_count} to {len(gifts_df)}')

Row count dropped from 227760 to 224372


In [8]:
print(f'Number of transactions is {len(gifts_df[Columns.INVOICE_NO].unique())}')
print(f'Number of items is {len(gifts_df[Columns.DESCRIPTION].unique())}')

Number of transactions is 8410
Number of items is 3447


In [9]:
# Recover unique InvoiceNo's.
invoice_numbers = gifts_df[Columns.INVOICE_NO].unique()
print(f'{len(invoice_numbers)} unique invoice numbers was found')

8410 unique invoice numbers was found


In [12]:
#Create a basket of items for each transaction
transactions = []

i = 1
for invoice_no in invoice_numbers:
    
    if i % 500 == 0:
        print(f'invoice {i} processed')
        
    filt = gifts_df[Columns.INVOICE_NO] == invoice_no
    transaction = list(gifts_df[filt].description.astype(str))
    transactions.append(transaction)
    i = i + 1

invoice 500 processed
invoice 1000 processed
invoice 1500 processed
invoice 2000 processed
invoice 2500 processed
invoice 3000 processed
invoice 3500 processed
invoice 4000 processed
invoice 4500 processed
invoice 5000 processed
invoice 5500 processed
invoice 6000 processed
invoice 6500 processed
invoice 7000 processed
invoice 7500 processed
invoice 8000 processed


[['IVORY STRING CURTAIN WITH POLE',
  'PINK AND BLACK STRING CURTAIN',
  'PSYCHEDELIC TILE HOOK',
  'ENAMEL COLANDER CREAM',
  'SMALL FOLDING SCISSOR(POINTED EDGE)',
  'JIGSAW TOADSTOOLS 3 PIECE'],
 ['MULTI COLOUR SILVER T-LIGHT HOLDER',
  'GINGHAM HEART  DOORSTOP RED',
  'PAPER CHAIN KIT RETROSPOT',
  'VINTAGE UNION JACK BUNTING',
  'SPOTTY BUNTING',
  'PAPER BUNTING RETROSPOT',
  'TEA TIME PARTY BUNTING',
  'BLUE HAPPY BIRTHDAY BUNTING',
  'PINK HAPPY BIRTHDAY BUNTING',
  'SET 2 PANTRY DESIGN TEA TOWELS',
  'BLUE HAPPY BIRTHDAY BUNTING'],
 ['COLOURING PENCILS BROWN TUBE'],
 ['HOME BUILDING BLOCK WORD',
  'HEART OF WICKER LARGE',
  'NOEL WOODEN BLOCK LETTERS',
  'PEACE WOODEN BLOCK LETTERS',
  'PARTY CONE CHRISTMAS DECORATION',
  'SET OF 6 T-LIGHTS SNOWMEN'],
 ['WRAP PAISLEY PARK',
  'WRAP VINTAGE LEAF DESIGN',
  'EMPIRE GIFT WRAP',
  'WRAP SUKI AND FRIENDS',
  'JUMBO BAG RED RETROSPOT',
  'JUMBO BAG ALPHABET',
  'JUMBO BAG VINTAGE LEAF',
  'JUMBO STORAGE BAG SUKI',
  'JUMBO BAG VINTA

In [13]:
transactions[0]

['IVORY STRING CURTAIN WITH POLE',
 'PINK AND BLACK STRING CURTAIN',
 'PSYCHEDELIC TILE HOOK',
 'ENAMEL COLANDER CREAM',
 'SMALL FOLDING SCISSOR(POINTED EDGE)',
 'JIGSAW TOADSTOOLS 3 PIECE']

In [18]:
# Instantiate transaction encoder.
encoder = TransactionEncoder()

# One-hot encode transactions.
item_map = encoder.fit(transactions).transform(transactions)

# Use unique items as column headers.
item_map_df = pd.DataFrame(item_map, columns = encoder.columns_).drop('nan', axis=1)

item_map_df.to_csv(index=False)

filepath = Path('../datasets/outputs/market_basket/gits_item_map.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
item_map_df.to_csv(filepath)

# Print onehot header.
item_map_df.head()

Unnamed: 0,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,12 PENCILS SMALL TUBE RED RETROSPOT,12 PENCILS SMALL TUBE SKULL,...,wet boxes,wet pallet,wet rusty,wet?,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,True,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
