Import Data
- Make a dataframe from the csv
    - There is some inconsistent data (some rows formatted poorly), so we will skip them    


In [23]:
import pandas as pd

FILE = "./data/Assignment-1_Data.csv"

data = pd.read_csv(FILE, sep=";", on_bad_lines="skip", low_memory=False)

print(data.head())

   BillNo                             Itemname  Quantity              Date  \
0  536365   WHITE HANGING HEART T-LIGHT HOLDER         6  01.12.2010 08:26   
1  536365                  WHITE METAL LANTERN         6  01.12.2010 08:26   
2  536365       CREAM CUPID HEARTS COAT HANGER         8  01.12.2010 08:26   
3  536365  KNITTED UNION FLAG HOT WATER BOTTLE         6  01.12.2010 08:26   
4  536365       RED WOOLLY HOTTIE WHITE HEART.         6  01.12.2010 08:26   

  Price  CustomerID         Country  
0  2,55     17850.0  United Kingdom  
1  3,39     17850.0  United Kingdom  
2  2,75     17850.0  United Kingdom  
3  3,39     17850.0  United Kingdom  
4  3,39     17850.0  United Kingdom  


Clean Data
- Only data we care about is BillNo, Itemname, and Country 
    - We drop the other columns
- Remove entries that are missing data (BillNo, Itemname, or Country)
- Also remove whitespace from Itemname
 

In [24]:
columns_to_keep = ['BillNo', 'Itemname', 'Country']

data = data[columns_to_keep]

#Drop rows with missing values
data.dropna(inplace=True)

data['Itemname'] = data['Itemname'].str.strip()

print(data.head())

   BillNo                             Itemname         Country
0  536365   WHITE HANGING HEART T-LIGHT HOLDER  United Kingdom
1  536365                  WHITE METAL LANTERN  United Kingdom
2  536365       CREAM CUPID HEARTS COAT HANGER  United Kingdom
3  536365  KNITTED UNION FLAG HOT WATER BOTTLE  United Kingdom
4  536365       RED WOOLLY HOTTIE WHITE HEART.  United Kingdom


Group Data
- Group transaction data by country
- Find countries with a decent number of rows (transaction data)  
- We keep the countries with a lot of transaction data 
        - United Kingdom, France, and Germany  

In [25]:
for country, data in country_datas.items():
        print(f"Number of Rows for {country}:")
        print(data.shape[0]) # number of rows
        print("\n")
        
countries_to_keep = ['United Kingdom', 'France', 'Germany', 'Australia', 'Austria', 'Bahrain', 'Belgium']

country_datas = {key: value for key, value in country_datas.items() if key in countries_to_keep}

for country, data in country_datas.items():
        print(f"Data for {country}:")
        print(data.head()) 
        print("\n")

Number of Rows for Australia:
1185


Number of Rows for Austria:
398


Number of Rows for Bahrain:
18


Number of Rows for Belgium:
2031


Number of Rows for France:
8408


Number of Rows for Germany:
9042


Number of Rows for United Kingdom:
486167


Data for Australia:
     BillNo                           Itemname    Country
195  536389       CHRISTMAS LIGHTS 10 REINDEER  Australia
196  536389   VINTAGE UNION JACK CUSHION COVER  Australia
197  536389  VINTAGE HEADS AND TAILS CARD GAME  Australia
198  536389    SET OF 3 COLOURED  FLYING DUCKS  Australia
199  536389         SET OF 3 GOLD FLYING DUCKS  Australia


Data for Austria:
       BillNo                            Itemname  Country
33323  539330  CERAMIC CAKE STAND + HANGING CAKES  Austria
33324  539330  MINI CAKE STAND WITH HANGING CAKES  Austria
33325  539330               JAM JAR WITH PINK LID  Austria
33326  539330  SET3 BOOK BOX GREEN GINGHAM FLOWER  Austria
33327  539330         GREY HEART HOT WATER BOTTLE  Austria


Data

Modify Data to be Transactions for TransactionEncoder
- Make transactions by joining items with the same BillNo
- Make it so transactions data for each country is setup for the transaction encoder

In [26]:
country_transactions = {}

for country, data in country_datas.items():
    country_transactions[country] = data.groupby(['BillNo'])['Itemname'].apply(lambda x: ','.join(x)).reset_index()
        
for country, transactions in country_transactions.items():
    transactions.drop(columns=['BillNo'], inplace=True)
    transactions.rename(columns={'Itemname': 'Items'}, inplace=True)
    
for country, transactions in country_transactions.items():
    print(f"Transactions for {country}:")
    print(transactions.head()) 
    print("\n")
    
for country, transactions in country_transactions.items():
    country_transactions[country] = transactions['Items'].apply(lambda x: x.split(',')).tolist()
    
for country, transactions in country_transactions.items():
    print(f"Transactions for {country}:")
    print(transactions[0]) 
    print("\n")

Transactions for Australia:
                                               Items
0  CHRISTMAS LIGHTS 10 REINDEER,VINTAGE UNION JAC...
1  20 DOLLY PEGS RETROSPOT,ASSORTED BOTTLE TOP  M...
2  DOORMAT UNION FLAG,CORONA MEXICAN TRAY,LUNCH B...
3  RIBBON REEL LACE DESIGN,RIBBON REEL HEARTS DES...
4          PORCELAIN ROSE SMALL,PORCELAIN ROSE LARGE


Transactions for Austria:
                                               Items
0  CERAMIC CAKE STAND + HANGING CAKES,MINI CAKE S...
1  JUMBO BAG OWLS,JUMBO STORAGE BAG SUKI,PLASTERS...
2  BROCADE RING PURSE,PLASTERS IN TIN CIRCUS PARA...
3  PLASTERS IN TIN CIRCUS PARADE,RETROSPOT TEA SE...
4  PACK OF 6 SWEETIE GIFT BOXES,PACK OF 6 PANNETO...


Transactions for Bahrain:
                                               Items
0                 OCEAN SCENT CANDLE IN JEWELLED BOX
1  GROW A FLYTRAP OR SUNFLOWER IN TIN,ICE CREAM S...
2  CERAMIC CAKE STAND + HANGING CAKES,MINI CAKE S...


Transactions for Belgium:
                                        

Make One-Hot Encoded Dataframe for FP-Growth Algorithm
- The fpgrowth function from the mlxtend library expects data in a one-hot encoded pandas DataFrame
 

In [27]:
from mlxtend.preprocessing import TransactionEncoder

for country, transactions in country_transactions.items():
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    data = pd.DataFrame(te_ary, columns=te.columns_)
    country_transactions[country] = data
    
for country, transactions in country_transactions.items():
    print(f"Transactions for {country}:")
    print(transactions.head())
    print("\n")

Transactions for Australia:
           1 HANGER    BIRTHDAY CARD   FRONT  DOOR   GARAGE DESIGN   SHED  \
0  False       False           False         False           False  False   
1  False       False           False         False           False  False   
2  False       False           False         False           False  False   
3  False       False           False         False           False  False   
4  False       False           False         False           False  False   

   10 COLOUR SPACEBOY PEN  12 PENCIL SMALL TUBE WOODLAND  \
0                   False                          False   
1                   False                          False   
2                   False                          False   
3                   False                          False   
4                   False                          False   

   12 PENCILS TALL TUBE POSY  12 PENCILS TALL TUBE RED RETROSPOT  ...  \
0                      False                               False  ...   
1 

In [28]:
print(len(country_transactions))
from mlxtend.frequent_patterns import fpgrowth, association_rules

fq_itemsets = {}
fq_rules = {}
# Apply FP-Growth to each country's transactions
for country, transactions in country_transactions.items():
    # print(country)
    frequent_itemsets = fpgrowth(transactions, min_support=0.1, use_colnames=True)
    top_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
    rules = association_rules(top_itemsets, metric='confidence', min_threshold=0.2)
    fq_itemsets[country] = frequent_itemsets
    fq_rules[country] = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]


7


In [29]:
for country, itemsets in fq_itemsets.items():
    print(f"Frequent itemsets for {country}")
    print(len(itemsets))
    print(itemsets.sort_values(by='support', ascending=False).head(5))
    print("\n")

Frequent itemsets for Australia
30
     support                            itemsets
21  0.157895     (RED TOADSTOOL LED NIGHT LIGHT)
4   0.157895  (SET OF 3 CAKE TINS PANTRY DESIGN)
9   0.140351         (LUNCH BAG SPACEBOY DESIGN)
16  0.140351   (ROSES REGENCY TEACUP AND SAUCER)
11  0.140351                     (PARTY BUNTING)


Frequent itemsets for Austria
8888
     support                                           itemsets
0   0.823529                                          (POSTAGE)
65  0.235294      (POSTAGE, ROUND SNACK BOXES SET OF4 WOODLAND)
68  0.235294  (POSTAGE, ROUND SNACK BOXES SET OF4 WOODLAND, ...
67  0.235294       (POSTAGE, ROUND SNACK BOXES SET OF 4 FRUITS)
66  0.235294  (ROUND SNACK BOXES SET OF4 WOODLAND, ROUND SNA...


Frequent itemsets for Bahrain
8205
       support                                           itemsets
0     0.666667               (OCEAN SCENT CANDLE IN JEWELLED BOX)
1     0.666667               (NOVELTY BISCUITS CAKE STAND 3 TIER)
8203  0.333333 

In [30]:
for country, rules in fq_rules.items():
    print(f"Association rules for {country}")
    print(len(rules))
    print(rules.head(5))
    print("\n")

Association rules for Australia
4
                    antecedents                   consequents   support  \
0    (ALARM CLOCK BAKELIKE RED)  (ALARM CLOCK BAKELIKE GREEN)  0.105263   
1  (ALARM CLOCK BAKELIKE GREEN)    (ALARM CLOCK BAKELIKE RED)  0.105263   
2        (DOLLY GIRL LUNCH BOX)          (SPACEBOY LUNCH BOX)  0.105263   
3          (SPACEBOY LUNCH BOX)        (DOLLY GIRL LUNCH BOX)  0.105263   

   confidence  lift  
0         1.0   9.5  
1         1.0   9.5  
2         1.0   9.5  
3         1.0   9.5  


Association rules for Austria
1062488
                                         antecedents  \
0                                          (POSTAGE)   
1               (ROUND SNACK BOXES SET OF4 WOODLAND)   
2      (POSTAGE, ROUND SNACK BOXES SET OF4 WOODLAND)   
3       (POSTAGE, ROUND SNACK BOXES SET OF 4 FRUITS)   
4  (ROUND SNACK BOXES SET OF4 WOODLAND, ROUND SNA...   

                            consequents   support  confidence      lift  
0  (ROUND SNACK BOXES SET OF4