Import Data
- Make a dataframe from the csv
    - There is some inconsistent data (some rows formatted poorly), so we will skip them    


In [28]:
import pandas as pd

FILE = "./data/Assignment-1_Data.csv"

data = pd.read_csv(FILE, sep=";", on_bad_lines="skip", low_memory=False)

print(data.head())

   BillNo                             Itemname  Quantity              Date  \
0  536365   WHITE HANGING HEART T-LIGHT HOLDER         6  01.12.2010 08:26   
1  536365                  WHITE METAL LANTERN         6  01.12.2010 08:26   
2  536365       CREAM CUPID HEARTS COAT HANGER         8  01.12.2010 08:26   
3  536365  KNITTED UNION FLAG HOT WATER BOTTLE         6  01.12.2010 08:26   
4  536365       RED WOOLLY HOTTIE WHITE HEART.         6  01.12.2010 08:26   

  Price  CustomerID         Country  
0  2,55     17850.0  United Kingdom  
1  3,39     17850.0  United Kingdom  
2  2,75     17850.0  United Kingdom  
3  3,39     17850.0  United Kingdom  
4  3,39     17850.0  United Kingdom  


Clean Data
- Only data we care about is BillNo, Itemname, and Country 
    - We drop the other columns
- Remove entries that are missing data (BillNo, Itemname, or Country)
- Also remove whitespace from Itemname
 

In [29]:
columns_to_keep = ['BillNo', 'Itemname', 'Country']

data = data[columns_to_keep]

#Drop rows with missing values
data.dropna(inplace=True)

data['Itemname'] = data['Itemname'].str.strip()

print(data.head())

   BillNo                             Itemname         Country
0  536365   WHITE HANGING HEART T-LIGHT HOLDER  United Kingdom
1  536365                  WHITE METAL LANTERN  United Kingdom
2  536365       CREAM CUPID HEARTS COAT HANGER  United Kingdom
3  536365  KNITTED UNION FLAG HOT WATER BOTTLE  United Kingdom
4  536365       RED WOOLLY HOTTIE WHITE HEART.  United Kingdom


Group Data
- Group transaction data by country
- Find countries with a decent number of rows (transaction data)  
- We keep the countries with a lot of transaction data 
        - United Kingdom, France, and Germany  

In [30]:
country_datas = {country: data for country, data in data.groupby('Country')}

for country, data in country_datas.items():
        print(f"Number of Rows for {country}:")
        print(data.shape[0]) # number of rows
        print("\n")
        
countries_to_keep = ['United Kingdom', 'France', 'Germany']

country_datas = {key: value for key, value in country_datas.items() if key in countries_to_keep}

for country, data in country_datas.items():
        print(f"Data for {country}:")
        print(data.head()) 
        print("\n")

Number of Rows for Australia:
1185


Number of Rows for Austria:
398


Number of Rows for Bahrain:
18


Number of Rows for Belgium:
2031


Number of Rows for Brazil:
32


Number of Rows for France:
8408


Number of Rows for Germany:
9042


Number of Rows for Greece:
145


Number of Rows for Hong Kong:
284


Number of Rows for Iceland:
182


Number of Rows for Israel:
295


Number of Rows for Italy:
758


Number of Rows for Japan:
321


Number of Rows for Lebanon:
45


Number of Rows for Lithuania:
35


Number of Rows for Malta:
112


Number of Rows for Netherlands:
2363


Number of Rows for Norway:
1072


Number of Rows for Poland:
330


Number of Rows for Portugal:
1501


Number of Rows for RSA:
58


Number of Rows for Saudi Arabia:
9


Number of Rows for Singapore:
222


Number of Rows for Spain:
2485


Number of Rows for Sweden:
451


Number of Rows for Switzerland:
1967


Number of Rows for USA:
179


Number of Rows for United Arab Emirates:
68


Number of Rows for United Kingdom:


Modify Data to be Transactions for TransactionEncoder
- Make transactions by joining items with the same BillNo
- Make it so transactions data for each country is setup for the transaction encoder

In [31]:
country_transactions = {}

for country, data in country_datas.items():
    country_transactions[country] = data.groupby(['BillNo'])['Itemname'].apply(lambda x: ','.join(x)).reset_index()
        
for country, transactions in country_transactions.items():
    transactions.drop(columns=['BillNo'], inplace=True)
    transactions.rename(columns={'Itemname': 'Items'}, inplace=True)
    
for country, transactions in country_transactions.items():
    print(f"Transactions for {country}:")
    print(transactions.head()) 
    print("\n")
    
for country, transactions in country_transactions.items():
    country_transactions[country] = transactions['Items'].apply(lambda x: x.split(',')).tolist()
    
for country, transactions in country_transactions.items():
    print(f"Transactions for {country}:")
    print(transactions[0]) 
    print("\n")

Transactions for France:
                                               Items
0  ALARM CLOCK BAKELIKE PINK,ALARM CLOCK BAKELIKE...
1  PICTURE DOMINOES,MINI JIGSAW SPACEBOY,MINI JIG...
2  EDWARDIAN PARASOL BLACK,EDWARDIAN PARASOL PINK...
3  HOT WATER BOTTLE BABUSHKA,BREAD BIN DINER STYL...
4  JAM MAKING SET PRINTED,SET/4 SKULL BADGES,ROUN...


Transactions for Germany:
                                               Items
0  SET OF 6 T-LIGHTS SANTA,ROTATING SILVER ANGELS...
1  JAM MAKING SET PRINTED,JAM JAR WITH PINK LID,J...
2  FELTCRAFT 6 FLOWER FRIENDS,6 RIBBONS RUSTIC CH...
3                    POSTAGE,JUMBO BAG RED RETROSPOT
4  WOODLAND PARTY BAG + STICKER SET,HAND WARMER O...


Transactions for United Kingdom:
                                               Items
0  WHITE HANGING HEART T-LIGHT HOLDER,WHITE METAL...
1   HAND WARMER UNION JACK,HAND WARMER RED POLKA DOT
2  ASSORTED COLOUR BIRD ORNAMENT,POPPY'S PLAYHOUS...
3  JAM MAKING SET WITH JARS,RED COAT RACK PARIS F...
4          

Make One-Hot Encoded Dataframe for FP-Growth Algorithm
- The fpgrowth function from the mlxtend library expects data in a one-hot encoded pandas DataFrame
 

In [32]:
from mlxtend.preprocessing import TransactionEncoder

for country, transactions in country_transactions.items():
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    data = pd.DataFrame(te_ary, columns=te.columns_)
    country_transactions[country] = data
    
for country, transactions in country_transactions.items():
    print(f"Transactions for {country}:")
    print(transactions.head())
    print("\n")

Transactions for France:
           BACK DOOR   BIRTHDAY CARD   CHOCOLATE  SPOTS   NEW ENGLAND  \
0  False       False           False              False         False   
1  False       False           False              False         False   
2  False       False           False              False         False   
3  False       False           False              False         False   
4   True       False           False              False         False   

    OVERCROWDED POOL.   RETRO SPOT   SHED  10 COLOUR SPACEBOY PEN  \
0               False        False  False                   False   
1               False        False  False                   False   
2               False        False  False                   False   
3               False        False  False                   False   
4               False        False  False                   False   

   12 COLOURED PARTY BALLOONS  ...  WRAP VINTAGE PETALS  DESIGN  \
0                       False  ...                    