In [None]:
import pandas as pd
import numpy as np

In [None]:
# Run this if the data files are stored in your Google Drive.
# Mounts the Drive at /content/drive; the data paths used further down point under that mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def get_data(file_path):
  """
  Read a basket file into a list of item-id sets.

  Every line is stripped and split on ", ". The first field is dropped
  (presumably a row/receipt id - confirm against the data files) and the
  remaining fields become that row's set of item ids, kept as strings.
  A line with no fields after the first one yields an empty set.
  Params:
    file_path - path to data file containing sparse vector representation
  Return: a list of rows (where rows are sets)
  """
  rows = []
  # `with` guarantees the handle is closed even if reading fails part-way.
  with open(file_path) as f:
    for line in f:
      rows.append(set(line.strip().split(", ")[1:]))
  return rows

In [None]:
def get_goods_names(file_path):
  """
  Build the id -> display-name lookup for the goods table.
  Params:
    file_path - path to a csv file with (at least) the columns Id, Flavor and Food
  Return: dict {str(Id): "<Flavor> <Food>"} with every single-quote character removed from the name
  """
  table = pd.read_csv(file_path)
  # "<Flavor> <Food>", then strip the single quotes that wrap the raw values
  full_names = (table["Flavor"] + " " + table["Food"]).apply(lambda name: name.replace("'", ""))
  return {str(item_id): name for item_id, name in zip(table["Id"], full_names)}

In [None]:
def get_named_item(ids_to_names, id):
  """
  Look up the name of a single item.
  Params:
    ids_to_names - dict {id (str): name (str)}
    id - the item id; it is converted with str() before the lookup
  Return: the name stored under str(id) (KeyError if there is none)
  """
  key = str(id)
  return ids_to_names[key]

In [None]:
def get_named_itemset(ids_to_names, itemset):
  """
  Translate an itemset of ids into a tuple of names, keeping the original order.
  Params:
    ids_to_names - dict {id (str): name (str)}
    itemset - iterable of item ids
  Return: tuple of names, one per id
  """
  return tuple(get_named_item(ids_to_names, item_id) for item_id in itemset)

In [None]:
def get_named_skyline(ids_to_names, skyline_itemsets):
  """
  Translate every skyline itemset from ids to names.
  Params:
    ids_to_names - dict {id (str): name (str)}
    skyline_itemsets - iterable of itemsets (each an iterable of ids)
  Return: list of name tuples, in the same order as skyline_itemsets
  """
  named = []
  for itemset in skyline_itemsets:
    named.append(get_named_itemset(ids_to_names, itemset))
  return named

In [None]:
def get_named_associations(ids_to_names, associations):
  """
  Translate the keys of an associations dict from ids to names.
  Params:
    ids_to_names - dict {id (str): name (str)}
    associations - dict returned by find_associations {tuple(left, right): confidence}
  Return: a dict with the same confidences, keyed by (tuple of left names, right name)
  """
  return {
    (get_named_itemset(ids_to_names, rule[0]), get_named_item(ids_to_names, rule[1])): conf
    for rule, conf in associations.items()
  }

In [None]:
def get_named_association(ids_to_names, association):
  """
  Translate one association rule from ids to names.
  Params:
    ids_to_names - dict {id (str): name (str)}
    association - a single id association (i.e. a tuple(tuple(left), right))
  Return: (tuple of left-hand names, right-hand name)
  """
  left_names = get_named_itemset(ids_to_names, association[0])
  right_name = get_named_item(ids_to_names, association[1])
  return (left_names, right_name)

In [None]:
def print_skyline(skyline_itemsets):
  """
  Print one indented line per itemset, with the members comma-separated inside parentheses.
  Params:
    skyline_itemsets - iterable of itemsets whose members are strings (e.g. item names)
  """
  for members in skyline_itemsets:
    joined = ", ".join(members)
    print("  (" + joined + ")")

def print_associations(associations):
  """
  Print one line per association rule as "left items -> right item : conf NN.NN%".
  Params:
    associations - dict {tuple(left itemset of strings, right item): confidence as a fraction}
  """
  for rule, conf in associations.items():
    antecedent = ", ".join(rule[0])
    percent = conf * 100
    print(f"  {antecedent} -> {rule[1]} : conf {percent:.2f}%")

In [None]:
def print_skyline_with_freqs(skyline_itemsets, frequent_itemsets_info, ids_to_names, num_rows):
  """
  Print every skyline itemset by name together with its relative support.
  Params:
    skyline_itemsets - list of skyline itemsets (which are tuples of ids)
    frequent_itemsets_info - dict of frequent itemsets (tuples of ids) to [skyline_flag, support]
    ids_to_names - map ids to names via a dict {id (str): name (str)}
    num_rows - the integer length of the dataset (i.e. number of rows, aka number of data points)
  """
  for itemset in skyline_itemsets:
    names = ", ".join(get_named_itemset(ids_to_names, itemset))
    # relative support = absolute support count / number of rows
    rsupport = frequent_itemsets_info[itemset][1] / num_rows
    print(f"  ({names}) has rsupport {rsupport:.3f}")

In [None]:
def print_associations_with_sups_and_confs(associations, frequent_itemsets_info, ids_to_names, num_rows):
  """
  Print each association rule by name with its relative support and confidence.
  Params:
    associations - a dict {tuple(skyline frequent itemset subset in ids (tuple), single item id): confidence (>= minConf)}
    frequent_itemsets_info - dict of frequent itemsets (tuples of ids) to [skyline_flag, support]
    ids_to_names - map ids to names via a dict {id (str): name (str)}
    num_rows - the integer length of the dataset (i.e. number of rows, aka number of data points)
  """
  for assoc, conf in associations.items():
    # The rule's support is the support of left + right together; the key is the sorted tuple of all ids.
    rule_items = tuple(sorted(set(assoc[0]) | {assoc[1]}))
    assoc_support = frequent_itemsets_info[rule_items][1] / num_rows
    named_assoc = get_named_association(ids_to_names, assoc)
    print(f"  {', '.join(named_assoc[0])} -> {named_assoc[1]} :   rsupport = {assoc_support:0.3f},   conf = {conf*100:.2f}%")

In [None]:
def iterate_minX(fnxn, fnxn_arg1, fnxn_arg2, minBound, maxBound, increment):
  """
  Iterate through a given range of minRSup or minConf values, running the corresponding given function on the given data.
  This function is meant to help quickly test sets of parameter values.
  It prints out the important results from each iteration and returns all results in a dict keyed by minX value.
  Params:
    fnxn, fnxn_arg1, fnxn_arg2 - 2 fnxn options where _arg1 and _arg2 are the 1st and 2nd args to pass into that fnxn
      apriori(sparse_data, list_of_ids, minRSup)
      find_associations(apriori_res[minRSup][0], apriori_res[minRSup][1], minConf)
    minBound, maxBound, increment - params to np.linspace() to generate test options for the minX to iterate over
      (despite its name, increment is the NUMBER of evenly spaced values to generate, not the step size)
      will act as minRSup if apriori() given as fnxn
      will act as minConf if find_associations() given as fnxn
  Return: dict {minX: return of given fnxn when called with given args and iterated minX value}
    Keys are plain Python floats rounded to 10 decimals, so a lookup such as res[0.006] works
    even when linspace produces 0.006000000000000001.
  """
  res = {}
  for raw_minX in np.linspace(minBound, maxBound, increment):
    # Strip floating-point noise: it would otherwise break dict lookups by literal value
    # and could exclude itemsets/rules whose ratio sits exactly on the threshold.
    minX = round(float(raw_minX), 10)
    res[minX] = fnxn(fnxn_arg1, fnxn_arg2, minX)
    if fnxn is apriori:
      print(f"minRSup {minX} yielded {len(res[minX][1])} skyline frequent itemsets:\n{res[minX][1]}")
    elif fnxn is find_associations:
      print(f"minConf {minX} yielded {len(res[minX])} skyline associations:\n{res[minX]}")
  return res

In [None]:
def find_supported_items(data, candidates, minRSup):
  """
  Count how many rows contain each candidate itemset and keep the candidates that are frequent enough.
  Params:
    data - a list of rows (where rows are sets)
    candidates - a list of (item)sets to search for
    minRSup - min relative support
  Return: dict {tuple(candidate): support count} for the candidates whose
    support count / number of rows is >= minRSup; an empty dict when data is empty
  """
  # Build each candidate's key and set once instead of once per row.
  # Keying by tuple also means a candidate listed twice is only counted once per row.
  cand_sets = {tuple(cand): set(cand) for cand in candidates}
  supports = dict.fromkeys(cand_sets, 0)   # init all counts to 0

  for row in data:              # loop through rows first (1 pass over the data)
    row_items = set(row)
    for key, cand_set in cand_sets.items():
      if cand_set <= row_items:   # candidate is contained in this row
        supports[key] += 1

  num_rows = len(data)
  if num_rows == 0:   # no rows: nothing can be frequent (and avoids dividing by zero)
    return {}

  # return only candidate itemsets meeting min sup
  return {itemset: support for itemset, support in supports.items() if support / num_rows >= minRSup}

In [None]:
def all_subsets_present(superset, itemsets):
  """
  Return whether or not all subsets of superset (obtained by dropping exactly one element) are in itemsets.
  Params:
    superset - a list representing the new superset of length k (sorted)
    itemsets - the itemsets of length k-1 (each sorted); may be a list of tuples or lists,
               or an already-built set/frozenset of tuples (used as-is for fast lookups)
  Return: True if every (k-1)-subset is present, else False (True for an empty superset)
  """
  # Normalise to a set of tuples: constant-time lookups, and lists match as well as tuples.
  if isinstance(itemsets, (set, frozenset)):
    known = itemsets
  else:
    known = {tuple(itemset) for itemset in itemsets}

  items = list(superset)
  # Drop one position at a time; all() stops at the first missing subset.
  return all(tuple(items[:i] + items[i + 1:]) in known for i in range(len(items)))

In [None]:
def gen_candidates(Fprev, kprev):
  """
  Return a list of candidate length-k (item)sets
  Params:
    Fprev - a list of frequent itemsets of length k-1 (tuples or lists)
    kprev - k-1
  Return: list of sorted lists, each a length-k itemset that is the union of two
    itemsets in Fprev and whose (k-1)-subsets are all in Fprev; no duplicates,
    in the order they are first produced
  """
  # given a set of itemsets each of length k-1,
  # generate a new set of itemsets each of length k
  # where each new itemset is a combination of two given itemsets,
  #   and all of its other subsets are also in the given set of itemsets
  prev_sets = [set(itemset) for itemset in Fprev]                  # built once, not once per pair
  prev_lookup = {tuple(sorted(itemset)) for itemset in Fprev}      # sorted tuples for O(1) subset checks
  candidates = []
  seen = set()   # every length-k union already examined (accepted or rejected)
  for i, a in enumerate(prev_sets):
    for b in prev_sets[i + 1:]:
      merged = a | b
      if len(merged) != kprev + 1:   # a and b differ by more than one item: cannot be combined
        continue
      cand = sorted(merged)
      key = tuple(cand)
      if key in seen:                # duplicate superset
        continue
      seen.add(key)
      if all_subsets_present(cand, prev_lookup):
        candidates.append(cand)

  return candidates

#gen_candidates([['a','b','c'], ['a','b','d'],['b','c','d'],['b','c','e']], 3)

In [None]:
def get_skyline_items(freq_itemsets):
  """
  Pick out the itemsets whose skyline flag is set.
  Params:
    freq_itemsets - dict {itemset (tuple): [skyline_flag, support]}
  Return: list of the keys whose flag (element 0 of the value) is not 0, in dict order
  """
  skyline = []
  for itemset, info in freq_itemsets.items():
    if info[0] != 0:
      skyline.append(itemset)
  return skyline

In [None]:
def apriori(data, singles, minRSup):
  """
  Find all frequent itemsets level by level and mark which of them are skyline itemsets
  (frequent itemsets that are not contained in a frequent itemset one level larger).
  Params:
    data - a list of rows (where rows are sets)
    singles - a list of all the singletons
    minRSup - min ratio/relative support
  Return: a tuple (freq_itemsets, skyline_itemsets) where
    freq_itemsets - dict { tuple(itemset) : [ skyline_flag (1/0), freq_count ] }
    skyline_itemsets - list of the keys of freq_itemsets whose flag is still 1
  """
  # find supports of singletons to start: dict { tuple(itemset) : freq_count }
  single_supports = find_supported_items(data, [set([str(single)]) for single in singles], minRSup)
  # dict of frequent itemsets only; values are flag values (1 for skyline, 0 for not)
  # dict { tuple(itemset) : [ skyline_flag, freq_count ] }
  freq_itemsets = {itemset: [1, support] for itemset, support in single_supports.items()}

  # init F1
  Fk = [tuple(item) for item in freq_itemsets.keys()]
  k = 2

  # main loop
  while len(Fk) > 0 and k <= len(data):
    candidates = gen_candidates(Fk, k-1)

    # Calc supports of candidates and prune
    freq_counts = find_supported_items(data, candidates, minRSup) # dict: {tuple(itemset): frequency}

    # Handle flags for finding skyline frequent itemsets
    for new_freq_item, support in freq_counts.items():   # set skyline flag to 1 for all new frequent itemsets
      freq_itemsets[new_freq_item] = [1, support]

    new_sets = [set(new_freq_item) for new_freq_item in freq_counts]
    for old_freq_item in Fk:                    # set skyline flag to 0 for any subsets of new frequent itemsets
      old_set = set(old_freq_item)
      # any() stops at the first containing superset instead of scanning every new itemset
      if any(old_set <= new_set for new_set in new_sets):
        old_key = tuple(old_freq_item)
        freq_itemsets[old_key] = [0, freq_itemsets[old_key][1]]

    Fk = list(freq_counts.keys())  # CAREFUL: don't set Fk to new itemsets until after skyline/flag checks
    k += 1

  return freq_itemsets, get_skyline_items(freq_itemsets)

In [None]:
def find_associations(freq_itemsets, skyline_itemsets, minConf):
  """
  Find the associations (meeting min confidence) in the skyline frequent itemsets.
  For every skyline itemset with at least two items, each item in turn is taken as the
  right-hand side and the remaining items as the left-hand side;
  confidence = support(whole itemset) / support(left-hand side).
  Params:
    freq_itemsets: a dict {itemset (tuple): [0/1 in skyline, support]}
    skyline_itemsets: a list of frequent itemsets (tuples) in the skylines
    minConf: min confidence (a fraction) a rule needs to be kept
  Return: a dict {tuple(skyline frequent itemset subset (tuple), single item): confidence (>= minConf)}
  """
  confs = {}

  for freq_itemset in skyline_itemsets:
    if len(freq_itemset) <= 1:    # skip frequent singletons
      continue

    itemset_support = freq_itemsets[freq_itemset][1]

    # loop through all subsets (remove one item each time to get k subsets)
    for item in freq_itemset:
      subset = list(freq_itemset)
      subset.remove(item)
      subset.sort()               # sorted so the tuple matches the key used in freq_itemsets
      subset_support = freq_itemsets[tuple(subset)][1]
      if subset_support == 0:     # left-hand side never occurs (possible with minRSup 0): confidence undefined
        continue
      conf = itemset_support / subset_support

      if conf >= minConf:
        confs[(tuple(subset), item)] = conf

  return confs

In [None]:
# THIS IS THE OUTERMOST FUNCTION TO RUN
def outer_find_associations(csv_filepath, minRSup, minConf, ids_to_names, rules_and_itemsets=0b11):
  """
  Run the whole pipeline on one data file: load it, mine the frequent itemsets with apriori(),
  derive the association rules with find_associations(), and print the skyline results.
  Params:
    csv_filepath - path to csv data file containing sparse vector representation of the data
    minRSup - the min relative support value to use
    minConf - the min confidence value to use
    ids_to_names - a dictionary mapping ids (used in the dataset itself) to more descriptive item names
    rules_and_itemsets - bit flags: 1 to only print itemsets, 2 to only print association rules, 3 to print both (default)
  """
  data = get_data(csv_filepath)   # list of rows, each row a set of item ids
  num_rows = len(data)
  singles = list(ids_to_names.keys())
  freq_itemsets_info, skyline_itemsets = apriori(data, singles, minRSup)
  associations = find_associations(freq_itemsets_info, skyline_itemsets, minConf)

  show_itemsets = rules_and_itemsets & 0b1    # low bit: itemsets
  show_rules = rules_and_itemsets & 0b10      # second bit: association rules

  if show_itemsets:
    print(f"There are {len(skyline_itemsets)} skyline frequent itemsets for minRSup {minRSup}:")
    print_skyline_with_freqs(skyline_itemsets, freq_itemsets_info, ids_to_names, num_rows)
    print("")
  if show_rules:
    print(f"There are {len(associations)} skyline frequent itemset association rules for minConf {minConf*100:.1f}%:")
    print_associations_with_sups_and_confs(associations, freq_itemsets_info, ids_to_names, num_rows)

RUN ALL ABOVE THIS POINT, THEN INSERT YOUR TEST LINE(S) HERE CALLING THE FUNCTION outer_find_associations() WITH THE CORRECT PARAMS! IGNORE ALL BELOW!










Start of testing with lab datasets

Goods Datasets

In [None]:
# Build the goods id -> name lookup from the goods csv on the mounted Drive.
file_path = "/content/drive/MyDrive/Data/goods.csv"
#file_path = "goods.csv"   # alternative: the file sits next to the notebook
goods_ids_to_names = get_goods_names(file_path)
print(f"Goods (id: name): {goods_ids_to_names}")
# The ids (as strings) are passed to apriori() as the list of singletons in the cells below.
good_ids = list(goods_ids_to_names.keys())

Goods (id: name): {'0': 'Chocolate Cake', '1': 'Lemon Cake', '2': 'Casino Cake', '3': 'Opera Cake', '4': 'Strawberry Cake', '5': 'Truffle Cake', '6': 'Chocolate Eclair', '7': 'Coffee Eclair', '8': 'Vanilla Eclair', '9': 'Napoleon Cake', '10': 'Almond Tart', '11': 'Apple Pie', '12': 'Apple Tart', '13': 'Apricot Tart', '14': 'Berry Tart', '15': 'Blackberry Tart', '16': 'Blueberry Tart', '17': 'Chocolate Tart', '18': 'Cherry Tart', '19': 'Lemon Tart', '20': 'Pecan Tart', '21': 'Ganache Cookie', '22': 'Gongolais Cookie', '23': 'Raspberry Cookie', '24': 'Lemon Cookie', '25': 'Chocolate Meringue', '26': 'Vanilla Meringue', '27': 'Marzipan Cookie', '28': 'Tuile Cookie', '29': 'Walnut Cookie', '30': 'Almond Croissant', '31': 'Apple Croissant', '32': 'Apricot Croissant', '33': 'Cheese Croissant', '34': 'Chocolate Croissant', '35': 'Apricot Danish', '36': 'Apple Danish', '37': 'Almond Twist', '38': 'Almond Bear Claw', '39': 'Blueberry Danish', '40': 'Lemon Lemonade', '41': 'Raspberry Lemonade', 

Goods 5000 Dataset

In [None]:
# Load the 5000-row goods baskets as a list of item-id sets.
goods_5000_data = get_data("/content/drive/MyDrive/Data/5000-out1.csv")
#goods_5000_data = get_data("5000-out1.csv")

In [None]:
# Sweep minRSup over 5 evenly spaced values from 0.005 to 0.009; results are keyed by minRSup.
goods_5000_res = iterate_minX(apriori, goods_5000_data, good_ids, 0.005, 0.009, 5)

minRSup 0.005 yielded 171 skyline frequent itemsets:
[('6',), ('20',), ('0', '4'), ('0', '7'), ('0', '28'), ('0', '42'), ('1', '3'), ('1', '4'), ('1', '5'), ('1', '9'), ('1', '17'), ('1', '18'), ('1', '19'), ('1', '22'), ('1', '27'), ('1', '36'), ('1', '40'), ('1', '42'), ('1', '46'), ('1', '48'), ('1', '49'), ('2', '4'), ('2', '7'), ('2', '9'), ('18', '2'), ('2', '42'), ('3', '4'), ('3', '5'), ('3', '9'), ('25', '3'), ('28', '3'), ('3', '33'), ('3', '42'), ('4', '5'), ('4', '7'), ('4', '9'), ('12', '4'), ('13', '4'), ('14', '4'), ('15', '4'), ('16', '4'), ('18', '4'), ('22', '4'), ('25', '4'), ('27', '4'), ('28', '4'), ('30', '4'), ('32', '4'), ('34', '4'), ('35', '4'), ('36', '4'), ('4', '43'), ('4', '44'), ('4', '46'), ('4', '47'), ('5', '7'), ('17', '5'), ('22', '5'), ('27', '5'), ('28', '5'), ('32', '5'), ('35', '5'), ('37', '5'), ('42', '5'), ('44', '5'), ('45', '5'), ('7', '9'), ('12', '7'), ('22', '7'), ('27', '7'), ('28', '7'), ('33', '7'), ('42', '7'), ('46', '7'), ('48', '7'

In [None]:
# Add a second sweep (9 values from 0.01 to 0.09) to the same results dict.
goods_5000_res.update(iterate_minX(apriori, goods_5000_data, good_ids, 0.01, 0.09, 9))

minRSup 0.01 yielded 26 skyline frequent itemsets:
[('6',), ('8',), ('10',), ('13',), ('20',), ('21',), ('25',), ('26',), ('30',), ('34',), ('38',), ('39',), ('1', '19'), ('4', '9'), ('22', '5'), ('14', '44'), ('27', '28'), ('33', '42'), ('0', '2', '46'), ('18', '3', '35'), ('15', '49', '7'), ('16', '32', '45'), ('17', '29', '47'), ('11', '37', '45', '7'), ('12', '31', '36', '48'), ('23', '24', '40', '41', '43')]
minRSup 0.02 yielded 26 skyline frequent itemsets:
[('6',), ('8',), ('10',), ('13',), ('20',), ('21',), ('25',), ('26',), ('30',), ('34',), ('38',), ('39',), ('1', '19'), ('4', '9'), ('22', '5'), ('14', '44'), ('27', '28'), ('33', '42'), ('0', '2', '46'), ('18', '3', '35'), ('15', '49', '7'), ('16', '32', '45'), ('17', '29', '47'), ('11', '37', '45', '7'), ('12', '31', '36', '48'), ('23', '24', '40', '41', '43')]
minRSup 0.03 yielded 35 skyline frequent itemsets:
[('6',), ('8',), ('10',), ('13',), ('20',), ('21',), ('23',), ('24',), ('25',), ('26',), ('29',), ('30',), ('34',),

In [None]:
# Sweep minConf over 11 evenly spaced values from 0.9 to 1, using the apriori results stored for this minRSup.
# The lookup relies on minRSup being an exact key of goods_5000_res.
minRSup = 0.01
goods_5000_assoc = iterate_minX(find_associations, goods_5000_res[minRSup][0], goods_5000_res[minRSup][1], 0.9, 1, 11)

minConf 0.9 yielded 19 skyline associations:
{(('2', '46'), '0'): 0.9017341040462428, (('0', '2'), '46'): 0.9122807017543859, (('3', '35'), '18'): 0.9444444444444444, (('18', '3'), '35'): 0.9357798165137615, (('49', '7'), '15'): 0.9662162162162162, (('15', '49'), '7'): 0.910828025477707, (('32', '45'), '16'): 0.9425287356321839, (('16', '45'), '32'): 0.9371428571428572, (('17', '29'), '47'): 0.9300699300699301, (('37', '45', '7'), '11'): 1.0, (('11', '45', '7'), '37'): 1.0, (('11', '37', '45'), '7'): 1.0, (('31', '36', '48'), '12'): 0.991304347826087, (('12', '36', '48'), '31'): 1.0, (('12', '31', '48'), '36'): 0.991304347826087, (('24', '40', '41', '43'), '23'): 1.0, (('23', '40', '41', '43'), '24'): 1.0, (('23', '24', '41', '43'), '40'): 1.0, (('23', '24', '40', '43'), '41'): 1.0}
minConf 0.91 yielded 18 skyline associations:
{(('0', '2'), '46'): 0.9122807017543859, (('3', '35'), '18'): 0.9444444444444444, (('18', '3'), '35'): 0.9357798165137615, (('49', '7'), '15'): 0.96621621621621

In [None]:
# Extreme minRSup test
minRSup = 0.001
minConf = 0.95
freq_goods_5000 = apriori(goods_5000_data, good_ids, minRSup)
freq_assoc_goods_5000 = find_associations(freq_goods_5000[0], freq_goods_5000[1], minConf)
print(f"There are {len(freq_goods_5000[1])} skyline frequent itemsets for minRSup {minRSup}:")
print_skyline(get_named_skyline(goods_ids_to_names, freq_goods_5000[1]))
print(f"There are {len(freq_assoc_goods_5000)} skyline frequent itemset association rule for minConf {minConf*100:.1f}%:")
print_associations(get_named_associations(goods_ids_to_names, freq_assoc_goods_5000))

KeyboardInterrupt: ignored

In [None]:
# Final goods_5000 choices
# Re-runs apriori and find_associations with the chosen thresholds, then prints the
# skyline itemsets and the association rules with item names instead of ids.
minRSup = 0.02
minConf = 0.95
freq_goods_5000 = apriori(goods_5000_data, good_ids, minRSup)   # (freq_itemsets dict, skyline list)
freq_assoc_goods_5000 = find_associations(freq_goods_5000[0], freq_goods_5000[1], minConf)
print(f"There are {len(freq_goods_5000[1])} skyline frequent itemsets for minRSup {minRSup}:")
print_skyline(get_named_skyline(goods_ids_to_names, freq_goods_5000[1]))
print("")
print(f"There are {len(freq_assoc_goods_5000)} skyline frequent itemset association rule for minConf {minConf*100:.1f}%:")
print_associations(get_named_associations(goods_ids_to_names, freq_assoc_goods_5000))

There are 26 skyline frequent itemsets for minRSup 0.02:
  (Chocolate Eclair)
  (Vanilla Eclair)
  (Almond Tart)
  (Apricot Tart)
  (Pecan Tart)
  (Ganache Cookie)
  (Chocolate Meringue)
  (Vanilla Meringue)
  (Almond Croissant)
  (Chocolate Croissant)
  (Almond Bear Claw)
  (Blueberry Danish)
  (Lemon Cake, Lemon Tart)
  (Strawberry Cake, Napoleon Cake)
  (Gongolais Cookie, Truffle Cake)
  (Berry Tart, Bottled Water)
  (Marzipan Cookie, Tuile Cookie)
  (Cheese Croissant, Orange Juice)
  (Chocolate Cake, Casino Cake, Chocolate Coffee)
  (Cherry Tart, Opera Cake, Apricot Danish)
  (Blackberry Tart, Single Espresso, Coffee Eclair)
  (Blueberry Tart, Apricot Croissant, Hot Coffee)
  (Chocolate Tart, Walnut Cookie, Vanilla Frappuccino)
  (Apple Pie, Almond Twist, Hot Coffee, Coffee Eclair)
  (Apple Tart, Apple Croissant, Apple Danish, Cherry Soda)
  (Raspberry Cookie, Lemon Cookie, Lemon Lemonade, Raspberry Lemonade, Green Tea)

There are 11 skyline frequent itemset association rule for mi

Goods 20000 Dataset

In [None]:
# Load the 20000-row goods baskets as a list of item-id sets.
goods_20000_data = get_data("/content/drive/MyDrive/Data/20000-out1.csv")
#goods_20000_data = get_data("20000-out1.csv")

In [None]:
# Sweep minRSup over 5 evenly spaced values from 0.01 to 0.05; results are keyed by minRSup.
goods_20000_res = iterate_minX(apriori, goods_20000_data, good_ids, 0.01, 0.05, 5)

minRSup 0.01 yielded 26 skyline frequent itemsets:
[('6',), ('8',), ('10',), ('13',), ('20',), ('21',), ('25',), ('26',), ('30',), ('34',), ('38',), ('39',), ('1', '19'), ('4', '9'), ('22', '5'), ('14', '44'), ('27', '28'), ('33', '42'), ('0', '2', '46'), ('18', '3', '35'), ('15', '49', '7'), ('16', '32', '45'), ('17', '29', '47'), ('11', '37', '45', '7'), ('12', '31', '36', '48'), ('23', '24', '40', '41', '43')]
minRSup 0.02 yielded 26 skyline frequent itemsets:
[('6',), ('8',), ('10',), ('13',), ('20',), ('21',), ('25',), ('26',), ('30',), ('34',), ('38',), ('39',), ('1', '19'), ('4', '9'), ('22', '5'), ('14', '44'), ('27', '28'), ('33', '42'), ('0', '2', '46'), ('18', '3', '35'), ('15', '49', '7'), ('16', '32', '45'), ('17', '29', '47'), ('11', '37', '45', '7'), ('12', '31', '36', '48'), ('23', '24', '40', '41', '43')]
minRSup 0.03 yielded 39 skyline frequent itemsets:
[('6',), ('8',), ('10',), ('12',), ('13',), ('20',), ('21',), ('23',), ('24',), ('25',), ('26',), ('30',), ('31',),

In [None]:
# Add a lower sweep (5 values from 0.005 to 0.009) to the same results dict.
goods_20000_res.update(iterate_minX(apriori, goods_20000_data, good_ids, 0.005, 0.009, 5))

minRSup 0.005 yielded 98 skyline frequent itemsets:
[('6',), ('8',), ('10',), ('13',), ('20',), ('21',), ('25',), ('26',), ('30',), ('34',), ('38',), ('39',), ('1', '4'), ('1', '5'), ('1', '7'), ('1', '9'), ('1', '14'), ('1', '17'), ('1', '19'), ('1', '22'), ('1', '27'), ('1', '28'), ('1', '31'), ('2', '4'), ('3', '9'), ('22', '3'), ('28', '3'), ('3', '42'), ('4', '9'), ('14', '4'), ('19', '4'), ('27', '4'), ('28', '4'), ('5', '7'), ('5', '9'), ('14', '5'), ('22', '5'), ('28', '5'), ('42', '5'), ('45', '5'), ('47', '5'), ('7', '9'), ('19', '7'), ('28', '7'), ('22', '9'), ('28', '9'), ('42', '9'), ('44', '9'), ('14', '22'), ('14', '27'), ('14', '28'), ('14', '33'), ('14', '42'), ('14', '44'), ('15', '28'), ('16', '27'), ('16', '28'), ('17', '28'), ('18', '22'), ('19', '22'), ('19', '45'), ('22', '24'), ('22', '28'), ('22', '29'), ('22', '33'), ('22', '35'), ('22', '42'), ('22', '45'), ('22', '47'), ('22', '48'), ('23', '28'), ('27', '28'), ('27', '32'), ('27', '33'), ('27', '42'), ('27'

I think 0.01 is a good value for minRSup: for the next couple of values below it, the number of skyline frequent itemsets stays at 26, and then it skyrockets, which indicates that the thresholds below that point may be too small.

In [None]:
# Sweep minConf over 11 evenly spaced values from 0.9 to 1, using the apriori results stored for minRSup_20000.
# The lookup relies on minRSup_20000 being an exact key of goods_20000_res.
minRSup_20000 = 0.01
goods_20000_assoc = iterate_minX(find_associations, goods_20000_res[minRSup_20000][0], goods_20000_res[minRSup_20000][1], 0.9, 1, 11)

minConf 0.9 yielded 19 skyline associations:
{(('2', '46'), '0'): 0.9495798319327731, (('0', '2'), '46'): 0.9456066945606695, (('3', '35'), '18'): 0.9457900807381776, (('18', '3'), '35'): 0.9392898052691867, (('49', '7'), '15'): 0.9197952218430034, (('32', '45'), '16'): 0.9287749287749287, (('16', '45'), '32'): 0.9131652661064426, (('29', '47'), '17'): 0.9127625201938611, (('17', '29'), '47'): 0.9247135842880524, (('37', '45', '7'), '11'): 0.9982238010657194, (('11', '45', '7'), '37'): 0.9964539007092199, (('11', '37', '45'), '7'): 0.9946902654867257, (('31', '36', '48'), '12'): 0.9882352941176471, (('12', '36', '48'), '31'): 0.995260663507109, (('12', '31', '48'), '36'): 0.9929078014184397, (('24', '40', '41', '43'), '23'): 0.9975550122249389, (('23', '40', '41', '43'), '24'): 0.9975550122249389, (('23', '24', '41', '43'), '40'): 1.0, (('23', '24', '40', '43'), '41'): 1.0}
minConf 0.91 yielded 19 skyline associations:
{(('2', '46'), '0'): 0.9495798319327731, (('0', '2'), '46'): 0.9456

Beyond 95% confidence the skyline seems to remain constant, so I think we should use a minimum confidence of 95%.

In [None]:
# Final goods_20000 choices
# Re-runs apriori and find_associations with the chosen thresholds, then prints the
# skyline itemsets and the association rules with item names instead of ids.
# NOTE(review): the confidence sweep for this dataset uses minRSup_20000 = 0.01, but this cell uses 0.02 - confirm which is intended.
minRSup = 0.02
minConf = 0.95
freq_goods_20000 = apriori(goods_20000_data, good_ids, minRSup)   # (freq_itemsets dict, skyline list)
freq_assoc_goods_20000 = find_associations(freq_goods_20000[0], freq_goods_20000[1], minConf)
print(f"There are {len(freq_goods_20000[1])} skyline frequent itemsets for minRSup {minRSup}:")
print_skyline(get_named_skyline(goods_ids_to_names, freq_goods_20000[1]))
print("")
print(f"There are {len(freq_assoc_goods_20000)} skyline frequent itemset association rule for minConf {minConf*100:.1f}%:")
print_associations(get_named_associations(goods_ids_to_names, freq_assoc_goods_20000))

There are 26 skyline frequent itemsets for minRSup 0.02:
  (Chocolate Eclair)
  (Vanilla Eclair)
  (Almond Tart)
  (Apricot Tart)
  (Pecan Tart)
  (Ganache Cookie)
  (Chocolate Meringue)
  (Vanilla Meringue)
  (Almond Croissant)
  (Chocolate Croissant)
  (Almond Bear Claw)
  (Blueberry Danish)
  (Lemon Cake, Lemon Tart)
  (Strawberry Cake, Napoleon Cake)
  (Gongolais Cookie, Truffle Cake)
  (Berry Tart, Bottled Water)
  (Marzipan Cookie, Tuile Cookie)
  (Cheese Croissant, Orange Juice)
  (Chocolate Cake, Casino Cake, Chocolate Coffee)
  (Cherry Tart, Opera Cake, Apricot Danish)
  (Blackberry Tart, Single Espresso, Coffee Eclair)
  (Blueberry Tart, Apricot Croissant, Hot Coffee)
  (Chocolate Tart, Walnut Cookie, Vanilla Frappuccino)
  (Apple Pie, Almond Twist, Hot Coffee, Coffee Eclair)
  (Apple Tart, Apple Croissant, Apple Danish, Cherry Soda)
  (Raspberry Cookie, Lemon Cookie, Lemon Lemonade, Raspberry Lemonade, Green Tea)

There are 10 skyline frequent itemset association rule for mi

In [None]:
# Load the 75000-row goods baskets as a list of item-id sets.
goods_75000_data = get_data("/content/drive/MyDrive/Data/75000-out1.csv")
#goods_75000_data = get_data("75000-out1.csv")


In [None]:
# Sweep minRSup over 5 evenly spaced values from 0.01 to 0.05; results are keyed by minRSup.
# NOTE: the recorded output of this cell is a KeyboardInterrupt. If the call is interrupted the
# assignment never happens, and the later cells that read goods_75000_res need it from an earlier complete run.
goods_75000_res = iterate_minX(apriori, goods_75000_data, good_ids, 0.01, 0.05, 5)

KeyboardInterrupt: ignored

In [None]:
# Add a lower sweep (5 values from 0.005 to 0.009) to the same results dict.
# Requires goods_75000_res to already exist.
goods_75000_res.update(iterate_minX(apriori, goods_75000_data, good_ids, 0.005, 0.009, 5))

minRSup 0.005 yielded 85 skyline frequent itemsets:
[('6',), ('8',), ('10',), ('13',), ('20',), ('21',), ('25',), ('26',), ('30',), ('34',), ('38',), ('39',), ('0', '28'), ('1', '4'), ('1', '9'), ('1', '14'), ('1', '18'), ('1', '19'), ('1', '22'), ('1', '27'), ('1', '28'), ('1', '42'), ('4', '5'), ('4', '7'), ('4', '9'), ('14', '4'), ('18', '4'), ('19', '4'), ('22', '4'), ('27', '4'), ('28', '4'), ('35', '4'), ('4', '42'), ('4', '44'), ('14', '5'), ('22', '5'), ('28', '5'), ('33', '5'), ('42', '5'), ('28', '7'), ('14', '9'), ('19', '9'), ('22', '9'), ('27', '9'), ('28', '9'), ('35', '9'), ('14', '18'), ('14', '22'), ('14', '27'), ('14', '28'), ('14', '33'), ('14', '42'), ('14', '44'), ('15', '28'), ('18', '28'), ('18', '42'), ('19', '28'), ('19', '42'), ('22', '27'), ('22', '28'), ('22', '42'), ('27', '28'), ('27', '32'), ('27', '33'), ('27', '42'), ('28', '32'), ('28', '33'), ('28', '35'), ('28', '42'), ('28', '45'), ('28', '46'), ('28', '49'), ('32', '42'), ('33', '42'), ('35', '42')

KeyboardInterrupt: ignored

.02 Seems to be a good bc after that everything seems to 

In [None]:
# Sweep minConf over 11 evenly spaced values from 0.9 to 1, using the apriori results stored for minRSup_75000.
# The lookup relies on minRSup_75000 being an exact key of goods_75000_res.
minRSup_75000 = 0.02
goods_75000_assoc = iterate_minX(find_associations, goods_75000_res[minRSup_75000][0], goods_75000_res[minRSup_75000][1], 0.9, 1, 11)

minConf 0.9 yielded 20 skyline associations:
{(('2', '46'), '0'): 0.9474082482027999, (('0', '2'), '46'): 0.9395872420262664, (('3', '35'), '18'): 0.9553765106910443, (('18', '3'), '35'): 0.9477405471872118, (('49', '7'), '15'): 0.9222423146473779, (('15', '49'), '7'): 0.9230769230769231, (('32', '45'), '16'): 0.9280060309084056, (('16', '45'), '32'): 0.936834094368341, (('29', '47'), '17'): 0.9396067415730337, (('17', '29'), '47'): 0.9369747899159664, (('37', '45', '7'), '11'): 0.9928876244665719, (('11', '45', '7'), '37'): 0.9938300901756051, (('11', '37', '45'), '7'): 0.9952471482889734, (('31', '36', '48'), '12'): 0.9897435897435898, (('12', '36', '48'), '31'): 0.9929260450160772, (('12', '31', '48'), '36'): 0.9910141206675225, (('24', '40', '41', '43'), '23'): 1.0, (('23', '40', '41', '43'), '24'): 0.9993573264781491, (('23', '24', '41', '43'), '40'): 1.0, (('23', '24', '40', '43'), '41'): 1.0}
minConf 0.91 yielded 20 skyline associations:
{(('2', '46'), '0'): 0.9474082482027999, 

In [None]:
# Add a further sweep (5 values from 0.003 to 0.008) to the 75000-row results.
# Runs on goods_75000_data so that goods_75000_res only ever holds results for the 75000-row dataset.
goods_75000_res.update(iterate_minX(apriori, goods_75000_data, good_ids, 0.003, 0.008, 5))

In [None]:
# Final goods_75000 choices
# Re-runs apriori and find_associations with the chosen thresholds, then prints the
# skyline itemsets and the association rules with item names instead of ids.
minRSup = 0.02
minConf = 0.97
freq_goods_75000 = apriori(goods_75000_data, good_ids, minRSup)   # (freq_itemsets dict, skyline list)
freq_assoc_goods_75000 = find_associations(freq_goods_75000[0], freq_goods_75000[1], minConf)
print(f"There are {len(freq_goods_75000[1])} skyline frequent itemsets for minRSup {minRSup}:")
print_skyline(get_named_skyline(goods_ids_to_names, freq_goods_75000[1]))
print("")
print(f"There are {len(freq_assoc_goods_75000)} skyline frequent itemset association rule for minConf {minConf*100:.1f}%:")
print_associations(get_named_associations(goods_ids_to_names, freq_assoc_goods_75000))

There are 26 skyline frequent itemsets for minRSup 0.02:
  (Chocolate Eclair)
  (Vanilla Eclair)
  (Almond Tart)
  (Apricot Tart)
  (Pecan Tart)
  (Ganache Cookie)
  (Chocolate Meringue)
  (Vanilla Meringue)
  (Almond Croissant)
  (Chocolate Croissant)
  (Almond Bear Claw)
  (Blueberry Danish)
  (Lemon Cake, Lemon Tart)
  (Strawberry Cake, Napoleon Cake)
  (Gongolais Cookie, Truffle Cake)
  (Berry Tart, Bottled Water)
  (Marzipan Cookie, Tuile Cookie)
  (Cheese Croissant, Orange Juice)
  (Chocolate Cake, Casino Cake, Chocolate Coffee)
  (Cherry Tart, Opera Cake, Apricot Danish)
  (Blackberry Tart, Single Espresso, Coffee Eclair)
  (Blueberry Tart, Apricot Croissant, Hot Coffee)
  (Chocolate Tart, Walnut Cookie, Vanilla Frappuccino)
  (Apple Pie, Almond Twist, Hot Coffee, Coffee Eclair)
  (Apple Tart, Apple Croissant, Apple Danish, Cherry Soda)
  (Raspberry Cookie, Lemon Cookie, Lemon Lemonade, Raspberry Lemonade, Green Tea)

There are 10 skyline frequent itemset association rule for mi

In [None]:
# Final FINAL goods 75000 output
# Prints the same results with relative supports added. Nothing is recomputed here:
# freq_goods_75000 and freq_assoc_goods_75000 must already exist, and the two thresholds
# below are only used in the printed headers, so they have to match the values used to compute them.
minRSup_75000 = 0.02
minConf_75000 = 0.97
print(f"There are {len(freq_goods_75000[1])} skyline frequent itemsets for minRSup {minRSup_75000}:")
print_skyline_with_freqs(freq_goods_75000[1], freq_goods_75000[0], goods_ids_to_names, len(goods_75000_data))
print("")
print(f"There are {len(freq_assoc_goods_75000)} skyline frequent itemset association rules for minConf {minConf_75000*100:.1f}%:")
print_associations_with_sups_and_confs(freq_assoc_goods_75000, freq_goods_75000[0], goods_ids_to_names, len(goods_75000_data))

There are 26 skyline frequent itemsets for minRSup 0.02:
  (Chocolate Eclair) has rsupport 0.042
  (Vanilla Eclair) has rsupport 0.043
  (Almond Tart) has rsupport 0.042
  (Apricot Tart) has rsupport 0.042
  (Pecan Tart) has rsupport 0.043
  (Ganache Cookie) has rsupport 0.043
  (Chocolate Meringue) has rsupport 0.042
  (Vanilla Meringue) has rsupport 0.042
  (Almond Croissant) has rsupport 0.043
  (Chocolate Croissant) has rsupport 0.043
  (Almond Bear Claw) has rsupport 0.042
  (Blueberry Danish) has rsupport 0.044
  (Lemon Cake, Lemon Tart) has rsupport 0.037
  (Strawberry Cake, Napoleon Cake) has rsupport 0.043
  (Gongolais Cookie, Truffle Cake) has rsupport 0.044
  (Berry Tart, Bottled Water) has rsupport 0.038
  (Marzipan Cookie, Tuile Cookie) has rsupport 0.051
  (Cheese Croissant, Orange Juice) has rsupport 0.043
  (Chocolate Cake, Casino Cake, Chocolate Coffee) has rsupport 0.033
  (Cherry Tart, Opera Cake, Apricot Danish) has rsupport 0.041
  (Blackberry Tart, Single Espresso

Fantasy Bingo Dataset

In [None]:
# Load the bingo baskets as a list of id sets (same file format and parser as the goods datasets).
bingo_baskets = get_data("/content/drive/MyDrive/Data/bingoBaskets.csv")
#bingo_baskets = get_data("bingoBaskets.csv")

In [None]:
def get_author_names(file_path):
  """
  Read a pipe-separated (" | ") file into a lookup dict.
  Each non-blank line is stripped and split on " | "; field 0 becomes the key and field 1 the value
  (presumably author id and author name - confirm against the file). Blank lines are skipped.
  Params:
    file_path - path to the psv file
  Return: dict {field 0 (str): field 1 (str)}
  Raises: IndexError for a non-blank line that has no " | " separator
  """
  d = {}
  # `with` guarantees the handle is closed even if a line fails to parse.
  with open(file_path) as f:
    for line in f:
      stripped = line.strip()
      if not stripped:          # tolerate blank lines (e.g. a trailing newline at the end of the file)
        continue
      fields = stripped.split(' | ')
      d[fields[0]] = fields[1]
  return d

In [None]:
# Build the author lookup from the psv file on the mounted Drive.
file_path = "/content/drive/MyDrive/Data/authorlist.psv"
#file_path = "authorlist.psv"   # alternative: the file sits next to the notebook
author_id_to_names = get_author_names(file_path)
# A dict keys view (not a list); it is only iterated when passed to apriori() as the singletons.
author_ids = author_id_to_names.keys()

In [None]:
# Sweep apriori over the bingo baskets. 0.03, 0.1 and 11 are forwarded to iterate_minX
# (presumably 11 minRSup values between 0.03 and 0.1 -- TODO confirm against iterate_minX).
bingo_res = iterate_minX(apriori, bingo_baskets, author_ids, 0.03, 0.1, 11)

minRSup 0.03 yielded 1022 skyline frequent itemsets:
[('3',), ('8',), ('18',), ('42',), ('49',), ('85',), ('107',), ('125',), ('136',), ('183',), ('190',), ('195',), ('204',), ('215',), ('239',), ('259',), ('276',), ('279',), ('305',), ('338',), ('347',), ('421',), ('435',), ('441',), ('444',), ('482',), ('507',), ('522',), ('524',), ('548',), ('551',), ('641',), ('648',), ('672',), ('689',), ('725',), ('728',), ('752',), ('753',), ('803',), ('805',), ('818',), ('852',), ('861',), ('863',), ('870',), ('878',), ('912',), ('947',), ('954',), ('965',), ('993',), ('1014',), ('1044',), ('1107',), ('1115',), ('1156',), ('1178',), ('1195',), ('1224',), ('1244',), ('1245',), ('1247',), ('1259',), ('1274',), ('1290',), ('1405',), ('6', '91'), ('368', '6'), ('445', '6'), ('6', '743'), ('1029', '6'), ('1109', '6'), ('13', '32'), ('13', '48'), ('13', '68'), ('109', '13'), ('13', '166'), ('13', '197'), ('13', '218'), ('13', '544'), ('13', '644'), ('13', '668'), ('13', '747'), ('13', '880'), ('1117'

In [None]:
# Second apriori sweep with 0.027, 0.029 and 3 (presumably 3 minRSup values in that range --
# TODO confirm against iterate_minX). Rebinds bingo_res, which an earlier cell also assigns.
bingo_res = iterate_minX(apriori, bingo_baskets, author_ids, 0.027, 0.029, 3)

minRSup 0.027 yielded 1353 skyline frequent itemsets:
[('2',), ('3',), ('8',), ('25',), ('42',), ('49',), ('72',), ('85',), ('107',), ('125',), ('136',), ('190',), ('195',), ('204',), ('210',), ('239',), ('276',), ('295',), ('305',), ('338',), ('339',), ('421',), ('435',), ('441',), ('444',), ('482',), ('512',), ('522',), ('524',), ('548',), ('551',), ('631',), ('633',), ('641',), ('648',), ('687',), ('689',), ('697',), ('728',), ('753',), ('760',), ('803',), ('805',), ('818',), ('851',), ('852',), ('863',), ('878',), ('912',), ('947',), ('954',), ('965',), ('993',), ('1014',), ('1044',), ('1087',), ('1107',), ('1156',), ('1178',), ('1195',), ('1199',), ('1214',), ('1224',), ('1233',), ('1259',), ('1274',), ('1282',), ('1290',), ('1322',), ('1334',), ('1362',), ('1405',), ('445', '6'), ('564', '6'), ('1109', '6'), ('1283', '6'), ('13', '32'), ('13', '48'), ('13', '68'), ('109', '13'), ('13', '166'), ('13', '168'), ('13', '197'), ('13', '240'), ('13', '668'), ('13', '747'), ('13', '880'

In [None]:
# Chosen minimum relative support for the bingo dataset.
minRSup_bingo = 0.065
# The result is indexed as [0] and [1] by later cells; [1] is what they report as the skyline
# frequent itemsets.
bingo_apriori_final = apriori(bingo_baskets, author_ids, minRSup_bingo)

In [None]:
# Sweep find_associations over the final apriori result. 0.5, 1 and 11 are forwarded to
# iterate_minX (presumably 11 minConf values between 0.5 and 1 -- TODO confirm). A later cell
# indexes bingo_assoc by a confidence threshold.
bingo_assoc = iterate_minX(find_associations, bingo_apriori_final[0], bingo_apriori_final[1], 0.5, 1, 11)

minConf 0.5 yielded 64 skyline associations:
{(('13',), '91'): 0.5853658536585366, (('13',), '1109'): 0.5609756097560976, (('544',), '91'): 0.6176470588235294, (('564',), '91'): 0.6451612903225806, (('617',), '91'): 0.5757575757575758, (('668',), '91'): 0.6060606060606061, (('694',), '91'): 0.5, (('802',), '91'): 0.5416666666666666, (('1172',), '91'): 0.5517241379310345, (('1284',), '91'): 0.5333333333333333, (('197',), '743'): 0.5384615384615384, (('197',), '1109'): 0.6666666666666666, (('564',), '445'): 0.5806451612903226, (('467',), '1109'): 0.5, (('564',), '743'): 0.5483870967741935, (('564',), '1029'): 0.6129032258064516, (('564',), '1109'): 0.5806451612903226, (('564',), '1283'): 0.5483870967741935, (('694',), '576'): 0.5227272727272727, (('617',), '644'): 0.5454545454545454, (('1172',), '644'): 0.5517241379310345, (('1279',), '644'): 0.56, (('1321',), '644'): 0.5666666666666667, (('694',), '1109'): 0.5681818181818182, (('802',), '1029'): 0.5, (('802',), '1109'): 0.5, (('849',), 

In [None]:
# Final bingo choices
# Prints the skyline frequent itemsets and association rules chosen for the bingo dataset.
# minRSup_bingo repeats the value assigned where bingo_apriori_final is computed; in this cell
# it only labels the printed header.
minRSup_bingo = 0.065
# bingo_assoc is indexed by this threshold below, so it must be one of the keys produced by the
# iterate_minX sweep over find_associations.
# NOTE(review): this is a float-keyed lookup -- confirm 0.65 is an exact key of bingo_assoc.
minConf_bingo = 0.65
print(f"There are {len(bingo_apriori_final[1])} skyline frequent itemsets for minRSup {minRSup_bingo}:")
print_skyline_with_freqs(bingo_apriori_final[1], bingo_apriori_final[0], author_id_to_names, len(bingo_baskets))
print("")
print(f"There are {len(bingo_assoc[minConf_bingo])} skyline frequent itemset association rules for minConf {minConf_bingo*100:.1f}%:")
print_associations_with_sups_and_confs(bingo_assoc[minConf_bingo], bingo_apriori_final[0], author_id_to_names, len(bingo_baskets))

There are 200 skyline frequent itemsets for minRSup 0.065:
  (Abercrombie, Joe) has rsupport 0.066
  (Anders, Charlie Jane) has rsupport 0.099
  (Atwood, Margaret) has rsupport 0.095
  (Bardugo, Leigh) has rsupport 0.103
  (Bear, Elizabeth) has rsupport 0.119
  (Beaulieu, Bradley P.) has rsupport 0.099
  (Bennett, Robert Jackson) has rsupport 0.128
  (Brett, Peter V.) has rsupport 0.066
  (Brown, Pierce) has rsupport 0.103
  (Bujold, Lois McMaster) has rsupport 0.091
  (Butler, Octavia E.) has rsupport 0.091
  (Carey, Mike / Carey, M. R.) has rsupport 0.074
  (Carriger, Gail) has rsupport 0.086
  (Clarke, Susanna) has rsupport 0.066
  (Drake, Darrell) has rsupport 0.119
  (El-Mohtar, Amal) has rsupport 0.066
  (Elliott, Kate / Rasmussen, Alis A.) has rsupport 0.086
  (Erikson, Steven) has rsupport 0.086
  (Hill, Joe) has rsupport 0.070
  (Huff, Tanya) has rsupport 0.082
  (Jones, Diana Wynne) has rsupport 0.115
  (Jordan, Robert) has rsupport 0.066
  (Kowal, Mary Robinette) has rsuppor

In [None]:
# TEST OF OUTERMOST FUNCTION
# Calls outer_find_associations directly on the goods 75000 data file path, with 0.02 and 0.96
# as thresholds (presumably minRSup and minConf, in that order -- TODO confirm).
# NOTE(review): the meaning of rules_and_itemsets=3 is not visible in this cell -- confirm
# against the definition of outer_find_associations.
outer_find_associations("/content/drive/MyDrive/Data/75000-out1.csv", 0.02, 0.96, goods_ids_to_names, rules_and_itemsets=3)

There are 26 skyline frequent itemsets for minRSup 0.02:
  (Chocolate Eclair) has rsupport 0.042
  (Vanilla Eclair) has rsupport 0.043
  (Almond Tart) has rsupport 0.042
  (Apricot Tart) has rsupport 0.042
  (Pecan Tart) has rsupport 0.043
  (Ganache Cookie) has rsupport 0.043
  (Chocolate Meringue) has rsupport 0.042
  (Vanilla Meringue) has rsupport 0.042
  (Almond Croissant) has rsupport 0.043
  (Chocolate Croissant) has rsupport 0.043
  (Almond Bear Claw) has rsupport 0.042
  (Blueberry Danish) has rsupport 0.044
  (Lemon Cake, Lemon Tart) has rsupport 0.037
  (Strawberry Cake, Napoleon Cake) has rsupport 0.043
  (Gongolais Cookie, Truffle Cake) has rsupport 0.044
  (Berry Tart, Bottled Water) has rsupport 0.038
  (Marzipan Cookie, Tuile Cookie) has rsupport 0.051
  (Cheese Croissant, Orange Juice) has rsupport 0.043
  (Chocolate Cake, Casino Cake, Chocolate Coffee) has rsupport 0.033
  (Cherry Tart, Opera Cake, Apricot Danish) has rsupport 0.041
  (Blackberry Tart, Single Espresso

In [None]:
# Look up the author name for id "91". The id -> name dict returned by get_author_names is
# bound to author_id_to_names; author_names_to_id is never defined.
author_id_to_names["91"]

NameError: ignored