In [1]:
import pandas as pd
import random

## Import Data of Transaction

In [2]:
subset_no = 10000
directory = 'Dillards POS/'
skst_file_path = directory + 'skstinfo.csv'
dept_file_path = directory + 'deptinfo.csv'
str_file_path = directory + 'strinfo.csv'
trans_file_path = directory + 'trnsact.csv'
sku_file_path = directory + 'skuinfo.csv'
trans_file_path = directory + 'trans_final.csv'
sku_file_path = directory + 'sku_final.csv'

In [3]:
columns = ['sku', 'storeid', 'register', 'trannum', 'interID', 'saledate', 'stype', 'quantity',
                 'orgprice', 'amt', 'seq', 'mic', 'unkown']

In [4]:
columns_sku = ['sku', 'dept', 'classid', 'upc', 'style', 'color', 'size', 
                       'packsize', 'vendor', 'brand']

In [5]:
# Count the total number of rows in the CSV file (optional but recommended)
total_rows = sum(1 for line in open(trans_file_path, 'r'))

# Number of random rows you want to select
num_random_rows = total_rows

if subset_no == True:
    num_random_rows = subset_no
    

# Calculate the random row indices to skip
skip_rows = sorted(random.sample(range(1, total_rows + 1), total_rows - num_random_rows))

# Read n random rows from the CSV file
trans = pd.read_csv(trans_file_path, skiprows=skip_rows, header=None)

In [6]:
trans.columns = columns

In [7]:
trans.head()

Unnamed: 0,sku,storeid,register,trannum,interID,saledate,stype,quantity,orgprice,amt,seq,mic,unkown
0,1658506,9709,80,4600,218609042,2004-10-08,P,1,13.5,13.5,530300071,281,0
1,1658506,9709,80,4600,218609042,2004-10-08,P,1,13.5,13.5,530400071,281,0
2,1658506,9709,80,4600,0,2005-04-12,P,1,13.5,13.5,556900050,281,0
3,1658506,9709,80,4700,828105953,2004-09-18,P,1,13.5,13.5,908400195,281,0
4,1658506,9709,80,4800,142506388,2005-06-03,P,1,13.5,13.5,839200063,281,0


In [8]:
import matplotlib.pyplot as plt
import numpy as np

## Read Cleaned Data SKU

In [10]:
# Count the total number of rows in the CSV file (optional but recommended)
total_rows = sum(1 for line in open(sku_file_path, 'r'))

# Number of random rows you want to select
num_random_rows = total_rows

if subset_no == True:
    num_random_rows = subset_no
    

# Calculate the random row indices to skip
skip_rows = sorted(random.sample(range(1, total_rows + 1), total_rows - num_random_rows))

# Read n random rows from the CSV file
sku = pd.read_csv(sku_file_path, skiprows=skip_rows, header=None)

In [11]:
sku.columns = columns_sku
sku.head()

Unnamed: 0,sku,dept,classid,upc,style,color,size,packsize,vendor,brand
0,3,6505,113,400000003000,00 F55KT2,WHISPERWHITE,P8EA,1,5119207,TURNBURY
1,4,8101,002,400000004000,22 615CZ4,SPEARMI,S,1,3311144,C A SPOR
2,5,7307,003,400000005000,7LBS 245-01,34 SILVER,KING,1,5510554,BEAU IDE
3,8,3404,00B,400000008000,622 F05H84,MORNING MI,2T,1,2912827,HARTSTRI
4,15,2301,004,400000015000,126 MDU461,255CAMEL,12,1,23272,JONES/LA


## Join Transactions and SKUs

In [12]:
df = pd.merge(trans, sku[['sku', 'brand']], on='sku', how='inner')

## Get Baskets together

In [13]:
df = df.head(1000000)

# Ensure the SKU column is of string data type
df['brand'] = df['brand'].astype(str)

# Group data by unique baskets
baskets = df.groupby(['saledate', 'storeid', 'register', 'trannum'])['brand'].apply(list).reset_index(name='Items')
baskets.head()

Unnamed: 0,saledate,storeid,register,trannum,Items
0,2004-08-01,102,120,700,"[GIBSON O , GIBSON O ]"
1,2004-08-01,102,190,8300,[TEMPTED ]
2,2004-08-01,102,250,2400,[BELIZA ]
3,2004-08-01,102,270,200,[TOMMY HI ]
4,2004-08-01,102,330,800,[CITY TRI ]


In [14]:
# Filter out baskets with only one item
baskets = baskets[baskets['Items'].apply(len) > 1]
baskets.head()

Unnamed: 0,saledate,storeid,register,trannum,Items
0,2004-08-01,102,120,700,"[GIBSON O , GIBSON O ]"
10,2004-08-01,107,140,3700,"[ , , ]"
20,2004-08-01,203,160,3000,"[NOBILITY , NOBILITY , NOBILITY ]"
27,2004-08-01,209,700,2500,"[CALVIN K , CALVIN K , CALVIN K ]"
29,2004-08-01,302,80,800,"[BEACON L , BEACON L ]"


In [15]:
baskets.size

342655

## Apply Market Basket Algorithm

In [18]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Convert the list of items into a one-hot encoded format
te = TransactionEncoder()
encoded_baskets = te.fit(baskets['Items']).transform(baskets['Items'])
encoded_df = pd.DataFrame(encoded_baskets, columns=te.columns_)

# Apply Apriori algorithm to find frequent itemsets
min_support = 0.0001  # You can adjust this threshold based on your dataset
frequent_itemsets = apriori(encoded_df, min_support=min_support, use_colnames=True)

# Generate association rules
min_confidence = 0.7  # You can adjust this threshold as well
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

# Print the frequent itemsets and association rules
print("Frequent Itemsets:")
print(frequent_itemsets)
print("\nAssociation Rules:")
print(rules)

Frequent Itemsets:
      support                itemsets
0    0.015949             (         )
1    0.000204             (9 WEST   )
2    0.000379             (A H SCHR )
3    0.006844             (AGB      )
4    0.000292             (AJI/DBA  )
..        ...                     ...
299  0.000102  (POLO FAS , POLO JEA )
300  0.000423  (POLO FAS , ROUNDTRE )
301  0.000117  (POLO FAS , TURNBURY )
302  0.000146  (RALPH LA , TOMMY HI )
303  0.000102  (TOMMY HI , VANITY F )

[304 rows x 2 columns]


In [20]:
frequent_itemsets.tail(5)

Unnamed: 0,support,itemsets
299,0.000102,"(POLO FAS , POLO JEA )"
300,0.000423,"(POLO FAS , ROUNDTRE )"
301,0.000117,"(POLO FAS , TURNBURY )"
302,0.000146,"(RALPH LA , TOMMY HI )"
303,0.000102,"(TOMMY HI , VANITY F )"
