In [1]:
import pandas as pd
import numpy as np
import random
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
data = 'Online Retail.xlsx'
df = pd.read_excel(data)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
df['Country'].value_counts()

United Kingdom          495478
Germany                   9495
France                    8557
EIRE                      8196
Spain                     2533
Netherlands               2371
Belgium                   2069
Switzerland               2002
Portugal                  1519
Australia                 1259
Norway                    1086
Italy                      803
Channel Islands            758
Finland                    695
Cyprus                     622
Sweden                     462
Unspecified                446
Austria                    401
Denmark                    389
Japan                      358
Poland                     341
Israel                     297
USA                        291
Hong Kong                  288
Singapore                  229
Iceland                    182
Canada                     151
Greece                     146
Malta                      127
United Arab Emirates        68
European Community          61
RSA                         58
Lebanon 

In [4]:
df['Description'] = df['Description'].str.strip()
df.dropna(axis=0, subset=['InvoiceNo'], inplace=True)
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
df = df[~df['InvoiceNo'].str.contains('C')]

In [5]:
basket = (df[df['Country'] =="United Kingdom"]
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))

In [6]:
basket.head()

Description,*Boombox Ipod Classic,*USB Office Mirror Ball,10 COLOUR SPACEBOY PEN,12 COLOURED PARTY BALLOONS,12 DAISY PEGS IN WOOD BOX,12 EGG HOUSE PAINTED WOOD,12 HANGING EGGS HAND PAINTED,12 IVORY ROSE PEG PLACE SETTINGS,12 MESSAGE CARDS WITH ENVELOPES,12 PENCIL SMALL TUBE WOODLAND,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
536365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
basket.shape

(18667, 4175)

In [8]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

basket_sets = basket.applymap(encode_units)
basket_sets.drop('POSTAGE', inplace=True, axis=1)

In [9]:
basket_sets.shape

(18667, 4174)

In [10]:
frequent_itemsets = apriori(basket_sets, min_support=0.01, use_colnames=True)

In [11]:
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules.head()

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(JUMBO BAG BAROQUE BLACK WHITE, JUMBO BAG SCA...",(JUMBO BAG RED RETROSPOT),0.013178,0.10382,0.010982,0.833333,8.026746,0.009614,5.377083
1,"(JUMBO BAG OWLS, JUMBO SHOPPER VINTAGE RED PAI...",(JUMBO BAG RED RETROSPOT),0.0135,0.10382,0.010125,0.75,7.224071,0.008723,3.584722
2,"(JUMBO BAG OWLS, JUMBO BAG RED RETROSPOT)",(JUMBO SHOPPER VINTAGE RED PAISLEY),0.019821,0.060695,0.010125,0.510811,8.41598,0.008922,1.920126
3,(JUMBO BAG CHARLIE AND LOLA TOYS),(JUMBO BAG RED RETROSPOT),0.015803,0.10382,0.010286,0.650847,6.269025,0.008645,2.56673
4,"(LUNCH BAG RED RETROSPOT, RED RETROSPOT CHARLO...",(STRAWBERRY CHARLOTTE BAG),0.019339,0.036053,0.011839,0.612188,16.980268,0.011142,2.485606


In [12]:
rules["antecedant_len"] = rules["antecedants"].apply(lambda x: len(x))
rules["consequentst_len"] = rules["consequents"].apply(lambda x: len(x))
rules.head()

Unnamed: 0,antecedants,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedant_len,consequentst_len
0,"(JUMBO BAG BAROQUE BLACK WHITE, JUMBO BAG SCA...",(JUMBO BAG RED RETROSPOT),0.013178,0.10382,0.010982,0.833333,8.026746,0.009614,5.377083,2,1
1,"(JUMBO BAG OWLS, JUMBO SHOPPER VINTAGE RED PAI...",(JUMBO BAG RED RETROSPOT),0.0135,0.10382,0.010125,0.75,7.224071,0.008723,3.584722,2,1
2,"(JUMBO BAG OWLS, JUMBO BAG RED RETROSPOT)",(JUMBO SHOPPER VINTAGE RED PAISLEY),0.019821,0.060695,0.010125,0.510811,8.41598,0.008922,1.920126,2,1
3,(JUMBO BAG CHARLIE AND LOLA TOYS),(JUMBO BAG RED RETROSPOT),0.015803,0.10382,0.010286,0.650847,6.269025,0.008645,2.56673,1,1
4,"(LUNCH BAG RED RETROSPOT, RED RETROSPOT CHARLO...",(STRAWBERRY CHARLOTTE BAG),0.019339,0.036053,0.011839,0.612188,16.980268,0.011142,2.485606,2,1


In [13]:
rules['antecedant_len'].value_counts()

2    641
1    300
3    109
4      5
Name: antecedant_len, dtype: int64

In [14]:
rules['consequentst_len'].value_counts()

1    948
2    102
3      5
Name: consequentst_len, dtype: int64

In [15]:
rules.shape

(1055, 11)

In [16]:
antec = rules['antecedants'].values.tolist()

In [17]:
conse = rules['consequents'].values.tolist()

In [18]:
def process(antec,conse):
    all_a = []
    all_c = []
    for a,c in zip(antec,conse):
        str_a = ', '.join(list(a))
        str_c = ', '.join(list(c))
        all_a.append(str_a)
        all_c.append(str_c)
        
    return all_a,all_c

In [19]:
ar_antec, ar_conse = process(antec,conse)

In [20]:
labels = np.ones(rules.shape[0], dtype=int)

In [21]:
col = ["Association Rule antecedants", 
       "Association Rule consequents", "label"]

In [22]:
all_items_df = pd.DataFrame({"Association Rule antecedants":ar_antec,
                             "Association Rule consequents":ar_conse,
                             "label":labels
                            }, columns = col)

In [23]:
all_items_df.head(10)

Unnamed: 0,Association Rule antecedants,Association Rule consequents,label
0,"JUMBO BAG BAROQUE BLACK WHITE, JUMBO BAG SCAN...",JUMBO BAG RED RETROSPOT,1
1,"JUMBO BAG OWLS, JUMBO SHOPPER VINTAGE RED PAISLEY",JUMBO BAG RED RETROSPOT,1
2,"JUMBO BAG OWLS, JUMBO BAG RED RETROSPOT",JUMBO SHOPPER VINTAGE RED PAISLEY,1
3,JUMBO BAG CHARLIE AND LOLA TOYS,JUMBO BAG RED RETROSPOT,1
4,"LUNCH BAG RED RETROSPOT, RED RETROSPOT CHARLOT...",STRAWBERRY CHARLOTTE BAG,1
5,"LUNCH BAG RED RETROSPOT, STRAWBERRY CHARLOTTE BAG",RED RETROSPOT CHARLOTTE BAG,1
6,HERB MARKER MINT,HERB MARKER BASIL,1
7,HERB MARKER BASIL,HERB MARKER MINT,1
8,"WOODEN PICTURE FRAME WHITE FINISH, WHITE HANGI...",WOODEN FRAME ANTIQUE WHITE,1
9,"WOODEN FRAME ANTIQUE WHITE, WHITE HANGING HEAR...",WOODEN PICTURE FRAME WHITE FINISH,1


In [24]:
#============create label 0 rule=============

# get the items
items = basket.columns
no_repeat_items = list(set(items))
len(no_repeat_items)

4175

In [25]:
no_repeat_items[:10]

['add stock to allocate online orders',
 'HANGING JAM JAR T-LIGHT HOLDER',
 'QUEEN OF THE SKIES HOLIDAY PURSE',
 'SET 10 CARDS WISHING TREE 17116',
 'WALL ART VINTAGE HEART',
 'MAGNETS PACK OF 4 VINTAGE LABELS',
 '4 BURGUNDY WINE DINNER CANDLES',
 'CERAMIC STRAWBERRY CAKE MONEY BANK',
 'CITRUS GARLAND FELT FLOWERS',
 'LUNCH BAG VINTAGE DOILEY']

In [26]:
def random_select(items):
    length = 2000
    all_a = []
    all_c = []
    
    for i in range(length):
        a_len = random.randint(0,1)
        c_len = random.randint(0,1)
        
        temp_a = []
        temp_c = []
        for j in range(a_len+1):
            item = random.choice(items)
            temp_a.append(item)
        for j in range(c_len+1):
            item = random.choice(items)
            temp_c.append(item)
            
        strtemp_a = ', '.join(temp_a)
        strtemp_c = ', '.join(temp_c)
        
        all_a.append(strtemp_a)
        all_c.append(strtemp_c)
        
    return all_a, all_c

In [27]:
all_a, all_c = random_select(no_repeat_items)

In [29]:
all_a[:10]

['VINTAGE BILLBOARD MUG',
 'COLOUR GLASS. STAR T-LIGHT HOLDER, FOLDING UMBRELLA WHITE/RED POLKADOT',
 'LIGHT DECORATION BATTERY OPERATED, BLUE EGG  SPOON',
 'YELLOW BREAKFAST CUP AND SAUCER',
 'sold with wrong barcode',
 'CLASSIC BICYCLE CLIPS',
 'MINI PAINT SET VINTAGE',
 '3 TIER SWEETHEART GARDEN SHELF',
 'RED REFECTORY CLOCK',
 'ZINC HEART FLOWER T-LIGHT HOLDER',
 'BIRD ON BRANCH CANVAS SCREEN, CLEAR CRYSTAL STAR PHONE CHARM',
 'BLUE/GREEN SHELL NECKLACE W PENDANT, BOTANICAL LILY GIFT WRAP',
 'HEART OF WICKER LARGE, FRAPPUCINO SCARF KNITTING KIT',
 'RABBIT EASTER DECORATION, SNACK TRAY RED GINGHAM',
 "CRACKED GLAZE NECKLACE BROWN, MUM'S KITCHEN CLOCK",
 'VINTAGE CREAM 3 BASKET CAKE STAND, RUSTIC STRAWBERRY JAM POT SMALL',
 'SET OF 16 VINTAGE RED CUTLERY',
 'DISCO BALL CHRISTMAS DECORATION, MONEY BOX FIRST ADE DESIGN',
 'BOYS VINTAGE TIN SEASIDE BUCKET',
 'MULTICOLOUR 3D BALLS GARLAND, WOVEN BUBBLE GUM CUSHION COVER',
 'HANGING HEART MIRROR DECORATION, sold as set on dotcom',
 'RED R

In [30]:
zeros_labels = np.zeros(len(all_a), dtype=int)

zeros_label_df = pd.DataFrame({"Association Rule antecedants":all_a,
                             "Association Rule consequents":all_c,
                             "label":zeros_labels
                            }, columns = col)

In [31]:
zeros_label_df.head()

Unnamed: 0,Association Rule antecedants,Association Rule consequents,label
0,VINTAGE BILLBOARD MUG,"VINTAGE DONKEY TAIL GAME, SET/2 RED RETROSPOT ...",0
1,"COLOUR GLASS. STAR T-LIGHT HOLDER, FOLDING UMB...",PINK POT PLANT CANDLE,0
2,"LIGHT DECORATION BATTERY OPERATED, BLUE EGG S...","PINK RETRO BIG FLOWER BAG, SET OF 72 PINK HEAR...",0
3,YELLOW BREAKFAST CUP AND SAUCER,lost,0
4,sold with wrong barcode,ZINC HEART LATTICE TRAY OVAL,0


In [32]:
# concatenate the label 1 and label 0 data
all_df = pd.concat([all_items_df, zeros_label_df], axis=0)

In [34]:
all_df.shape

(3055, 3)

In [35]:
not_duplicated_row = all_df.duplicated(['Association Rule antecedants', 'Association Rule consequents'])
not_duplicated_row.value_counts()

False    3055
dtype: int64

In [36]:
# shuffle
all_df = all_df[~not_duplicated_row].sample(frac=1).reset_index(drop=True)
all_df.head(10)

Unnamed: 0,Association Rule antecedants,Association Rule consequents,label
0,"WOODLAND CHARLOTTE BAG, CHARLOTTE BAG PINK POL...","CHARLOTTE BAG SUKI DESIGN, RED RETROSPOT CHARL...",1
1,FRENCH ENAMEL WATER BASIN,"FRENCH BLUE METAL DOOR SIGN 7, CHEST OF DRAWER...",0
2,"NATURAL SLATE HEART CHALKBOARD, DOTCOM POSTAGE",JUMBO BAG RED RETROSPOT,1
3,GIRLS PARTY BAG,BLUE COAT RACK PARIS FASHION,0
4,"LUNCH BAG SPACEBOY DESIGN, LUNCH BAG WOODLAND",LUNCH BAG CARS BLUE,1
5,"REGENCY CAKESTAND 3 TIER, NUMBER TILE VINTAGE ...","Damages/samples, DRAWER KNOB CRACKLE GLAZE GREEN",0
6,"3 BLACK CATS W HEARTS BLANK CARD, ILLUSTRATED ...",NECKLACE+BRACELET SET BLUE BLOSSOM,0
7,VEGETABLE MAGNETIC SHOPPING LIST,SET OF 6 ICE CREAM SKITTLES,0
8,SMALL JEWELLERY STAND,PINK ROSEBUD & PEARL NECKLACE,0
9,"RED RETROSPOT CHARLOTTE BAG, WOODLAND CHARLOTT...",STRAWBERRY CHARLOTTE BAG,1


In [37]:
all_df.to_csv('answer.csv')