In [1]:
# Using PCY Implementation

In [2]:
# Loading dataset
from itertools import combinations
data = sc.textFile("Market_Basket_Optimisation.csv")
data.take(10)

['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil',
 'burgers,meatballs,eggs',
 'chutney',
 'turkey,avocado',
 'mineral water,milk,energy bar,whole wheat rice,green tea',
 'low fat yogurt',
 'whole wheat pasta,french fries',
 'soup,light cream,shallot',
 'frozen vegetables,spaghetti,green tea',
 'french fries']

In [3]:
# Total number of transactions
data.count()

7501

In [4]:
# Converting into a list and sorting the values inside everylist
input_list = data.map(lambda x:x.split(','))
input_list = input_list.map(lambda x:sorted(x))
input_list.take(5)

[['almonds',
  'antioxydant juice',
  'avocado',
  'cottage cheese',
  'energy drink',
  'frozen smoothie',
  'green grapes',
  'green tea',
  'honey',
  'low fat yogurt',
  'mineral water',
  'olive oil',
  'salad',
  'salmon',
  'shrimp',
  'spinach',
  'tomato juice',
  'vegetables mix',
  'whole weat flour',
  'yams'],
 ['burgers', 'eggs', 'meatballs'],
 ['chutney'],
 ['avocado', 'turkey'],
 ['energy bar', 'green tea', 'milk', 'mineral water', 'whole wheat rice']]

In [5]:
# Generating pairs from the list available
pairs_input = input_list.map(lambda x: list(combinations(x,2)))
pairs_input = pairs_input.flatMap(lambda x:x)
pairs_input.take(5)

[('almonds', 'antioxydant juice'),
 ('almonds', 'avocado'),
 ('almonds', 'cottage cheese'),
 ('almonds', 'energy drink'),
 ('almonds', 'frozen smoothie')]

In [6]:
# Generating counts of all the pairs being generated
count_pair = pairs_input.map(lambda x:(x,1)).sortByKey()
count_pair_data = count_pair.reduceByKey(lambda x,y: x+y)
count_pair_data.take(5)

[((' asparagus', 'burgers'), 1),
 ((' asparagus', 'chocolate'), 1),
 ((' asparagus', 'energy bar'), 1),
 ((' asparagus', 'ground beef'), 1),
 ((' asparagus', 'shrimp'), 1)]

In [7]:
# Getting count of every item to get support
data = input_list.flatMap(lambda x:x)
data_map = data.map(lambda x: (x,1))
data_map.take(10)

[('almonds', 1),
 ('antioxydant juice', 1),
 ('avocado', 1),
 ('cottage cheese', 1),
 ('energy drink', 1),
 ('frozen smoothie', 1),
 ('green grapes', 1),
 ('green tea', 1),
 ('honey', 1),
 ('low fat yogurt', 1)]

In [8]:
# Reducing the values to get counts
data_count = data_map.reduceByKey(lambda x,y: x+y)
data_count.take(15)

[('almonds', 153),
 ('antioxydant juice', 67),
 ('avocado', 250),
 ('cottage cheese', 239),
 ('energy drink', 200),
 ('frozen smoothie', 475),
 ('green grapes', 68),
 ('green tea', 991),
 ('honey', 356),
 ('low fat yogurt', 574),
 ('mineral water', 1788),
 ('olive oil', 494),
 ('salad', 37),
 ('salmon', 319),
 ('shrimp', 536)]

In [9]:
# Filtering data with support greater than 0.133 ie. out of 7500 transactions select 1000
freq_item = data_count.filter(lambda x: x[1] >=1000).sortByKey()
freq_item = freq_item.map(lambda x:x[0])
freq_item.take(10)

['chocolate', 'eggs', 'french fries', 'mineral water', 'spaghetti']

In [10]:
freq_item.count()

5

In [11]:
# generating combinations
combination = freq_item.map(lambda x: (1,x))
combination= combination.groupByKey().map(lambda x: (x[0],(list(x[1]))))
comb = combination.map(lambda x: (x[0],(list(combinations(x[1],2)))))
comb = comb.flatMap(lambda x:x[1])
comb.take(100)

[('chocolate', 'eggs'),
 ('chocolate', 'french fries'),
 ('chocolate', 'mineral water'),
 ('chocolate', 'spaghetti'),
 ('eggs', 'french fries'),
 ('eggs', 'mineral water'),
 ('eggs', 'spaghetti'),
 ('french fries', 'mineral water'),
 ('french fries', 'spaghetti'),
 ('mineral water', 'spaghetti')]

In [12]:
# Attaching index with the itemsets
comb=comb.zipWithIndex()
comb.take(10)

[(('chocolate', 'eggs'), 0),
 (('chocolate', 'french fries'), 1),
 (('chocolate', 'mineral water'), 2),
 (('chocolate', 'spaghetti'), 3),
 (('eggs', 'french fries'), 4),
 (('eggs', 'mineral water'), 5),
 (('eggs', 'spaghetti'), 6),
 (('french fries', 'mineral water'), 7),
 (('french fries', 'spaghetti'), 8),
 (('mineral water', 'spaghetti'), 9)]

In [13]:
# Creating 5 buckets 
pairs_with_bucketno = comb.map(lambda x:(x[0],(x[1]%5)))
pairs_with_bucketno.take(30)
pairs_with_buc = pairs_with_bucketno.map(lambda xy:(xy[1],xy[0]))
pairs_with_buc.take(10)

[(0, ('chocolate', 'eggs')),
 (1, ('chocolate', 'french fries')),
 (2, ('chocolate', 'mineral water')),
 (3, ('chocolate', 'spaghetti')),
 (4, ('eggs', 'french fries')),
 (0, ('eggs', 'mineral water')),
 (1, ('eggs', 'spaghetti')),
 (2, ('french fries', 'mineral water')),
 (3, ('french fries', 'spaghetti')),
 (4, ('mineral water', 'spaghetti'))]

In [14]:
# Getting count of the occurences
bucketcount = pairs_with_bucketno.join(count_pair_data)
bucketcount.take(20)

[(('chocolate', 'spaghetti'), (3, 295)),
 (('eggs', 'french fries'), (4, 273)),
 (('eggs', 'mineral water'), (0, 382)),
 (('french fries', 'mineral water'), (2, 253)),
 (('chocolate', 'eggs'), (0, 249)),
 (('chocolate', 'french fries'), (1, 258)),
 (('chocolate', 'mineral water'), (2, 396)),
 (('eggs', 'spaghetti'), (1, 274)),
 (('french fries', 'spaghetti'), (3, 207)),
 (('mineral water', 'spaghetti'), (4, 448))]

In [15]:
# Bucket data generated
bucket_freq=bucketcount.map(lambda x:(x[1][0],(x[0],x[1][1]))).sortByKey()
bucket_1 = bucket_freq.map(lambda x: (x[0],(x[1][1]))).groupByKey().sortByKey().map(lambda x : (x[0],(sum(x[1])))).filter(lambda x:x[1]>=50)

bucket_1.take(10)


[(0, 631), (1, 532), (2, 649), (3, 502), (4, 721)]

In [16]:
# Number of unique buckets
freq_bucket = bucket_1.map(lambda x: x[0]).collect()
print(freq_bucket)

[0, 1, 2, 3, 4]


In [17]:

pairs_with_bucketno.take(30)

[(('chocolate', 'eggs'), 0),
 (('chocolate', 'french fries'), 1),
 (('chocolate', 'mineral water'), 2),
 (('chocolate', 'spaghetti'), 3),
 (('eggs', 'french fries'), 4),
 (('eggs', 'mineral water'), 0),
 (('eggs', 'spaghetti'), 1),
 (('french fries', 'mineral water'), 2),
 (('french fries', 'spaghetti'), 3),
 (('mineral water', 'spaghetti'), 4)]

In [18]:
#Checking if the pair matches with the hash generated and itemset in the bucket
bitvector = pairs_with_bucketno.map(lambda x:(x,1 if x[1] in freq_bucket else 0 ))

In [19]:
# Getting a bit vector
bits_pairs= bitvector.map(lambda x:(x[0][0],x[1]))
bits_pairs.take(10)
bits_pairs.count()

10

In [20]:
bits_1 = bits_pairs.filter(lambda x: (x[1] ==1))
bits_1.take(20)

[(('chocolate', 'eggs'), 1),
 (('chocolate', 'french fries'), 1),
 (('chocolate', 'mineral water'), 1),
 (('chocolate', 'spaghetti'), 1),
 (('eggs', 'french fries'), 1),
 (('eggs', 'mineral water'), 1),
 (('eggs', 'spaghetti'), 1),
 (('french fries', 'mineral water'), 1),
 (('french fries', 'spaghetti'), 1),
 (('mineral water', 'spaghetti'), 1)]

In [21]:
bits_1.count()

10

In [22]:
# frequent itemsets generated
freq_itemset_2 = bits_1.map(lambda x : list(x[0]))
freq_itemset_2.take(20)

[['chocolate', 'eggs'],
 ['chocolate', 'french fries'],
 ['chocolate', 'mineral water'],
 ['chocolate', 'spaghetti'],
 ['eggs', 'french fries'],
 ['eggs', 'mineral water'],
 ['eggs', 'spaghetti'],
 ['french fries', 'mineral water'],
 ['french fries', 'spaghetti'],
 ['mineral water', 'spaghetti']]

In [23]:
# Most frequently bought items
freq_1 = []
freq_2 = []
for i in freq_item.collect():
    freq_1.append(i)
freq_1

['chocolate', 'eggs', 'french fries', 'mineral water', 'spaghetti']

In [24]:
# Frequent itemsets obtained using PCY
freq_2.append(freq_itemset_2.collect())
print("Frequent Itemsets generated are: ")
for pair in freq_2[0]:
    print(pair)

Frequent Itemsets generated are: 
['chocolate', 'eggs']
['chocolate', 'french fries']
['chocolate', 'mineral water']
['chocolate', 'spaghetti']
['eggs', 'french fries']
['eggs', 'mineral water']
['eggs', 'spaghetti']
['french fries', 'mineral water']
['french fries', 'spaghetti']
['mineral water', 'spaghetti']


In [25]:
#Using pyspark FPGrowth Library

In [33]:
from pyspark.mllib.fpm import FPGrowth
data = sc.textFile("Market_Basket_Optimisation.csv")
transactions = data.map(lambda line: sorted(set(line.split(','))))
model = FPGrowth.train(transactions, minSupport = 0.1, numPartitions = 10)
result = model.freqItemsets().collect()

print("Frequent Itemsets generated are: ")
for itemset in result:
    print(itemset)

Frequent Itemsets generated are: 
FreqItemset(items=['mineral water'], freq=1788)
FreqItemset(items=['eggs'], freq=1348)
FreqItemset(items=['spaghetti'], freq=1306)
FreqItemset(items=['french fries'], freq=1282)
FreqItemset(items=['chocolate'], freq=1229)
FreqItemset(items=['green tea'], freq=991)
FreqItemset(items=['milk'], freq=972)
