# Project 1 Association Analysis

## Task

* Dataset1: Select from kaggle.com / UCI
* Dataset2: Use IBM Quest Synthetic Data Generator 
    * https://sourceforge.net/projects/ibmquestdatagen/ 
    * Generate different datasets
* Implement **Apriori Algorithm** and apply on these datasets 
    * Hash? Tree? (optional)
    * **FP-growth**
* Use association analysis tools (e.g. WEKA) to generate association rules from the datasets you generate
* Compare your results

In [1]:
import itertools
import pandas as pd
import matplotlib.pyplot as plt

## Dataset 2

### Parse Data Generated by IBM Quest Synthetic Data Generator

利用 IBM Quest Synthetic Data Generator 的 lit 模式產生資料集，調整的參數如下:

```
-ntran 1
-tlen 3
-nitems 20
-npats 10
-patlen 5
```

In [2]:
inputfile = open('data/data_1_3_20_10_5.data', 'r')
outputfile = open('data/data_1_3_20_10_5.csv', 'w')

In [3]:
outputfile.write('CustID,TransID,Item\n')

20

In [4]:
while True:

    s = inputfile.readline().rstrip('\n')
    
    # if this is the eof
    if len(s) == 0:
        break
        
    # Extract dimensions from first line. Cast values to integers from strings.
    CustID, TransID, Item = (int(val) for val in s.split())
    output_str = '%d,%d,%d\n' % (CustID, TransID, Item)
    outputfile.write(output_str)

In [5]:
inputfile.close()
outputfile.close()

### Before Getting Started

In [6]:
# Load data
FILE_PATH = 'data/fcam.csv'
# FILE_PATH = 'data/data_1_3_20_10_5.csv'
df = pd.read_csv(FILE_PATH)
df = df.astype({'Item': str})
# Parameters
MINSUP = 3
MINCONF = 0.8
# Candidate 1-itemset
C1_df = df['Item'].value_counts()
# Frequent 1-itemset
L1_df = C1_df.loc[C1_df.values >= MINSUP]
L1 = L1_df.index.values.tolist()
tmp = L1_df.values.tolist()
L1_freq = {key: value for key, value in zip(L1, tmp)}

In [7]:
print(L1)

['c', 'f', 'm', 'a', 'p', 'b']


In [8]:
L1_freq

{'c': 4, 'f': 4, 'm': 3, 'a': 3, 'p': 3, 'b': 3}

In [9]:
# Init dictionary for every transaction
trans_num = df['TransID'].max()
transaction_db = {}
for i in range(1, trans_num + 1):
    transaction_db[i] = []
# Extract info from df to dictionary
df_num = len(df)
for i in range(df_num):
    index = df.iloc[i][0]
    item = df.iloc[i][2]    
    transaction_db[index] += [item]

In [10]:
transaction_db

{1: ['a', 'c', 'd', 'f', 'g', 'i', 'm', 'p'],
 2: ['a', 'b', 'c', 'f', 'i', 'm', 'o'],
 3: ['b', 'f', 'h', 'j', 'o'],
 4: ['b', 'c', 'k', 's', 'p'],
 5: ['a', 'c', 'e', 'f', 'l', 'm', 'n', 'p']}

### Apriori Algorithm

In [11]:
def Apriori_gen(x, k):
    Ck = []
    # Combination of k items in Lk
    for subset1 in itertools.combinations(x, k):
        # Change subset1 into `set` type for set operation
        tmp = [set(item) for item in subset1]
        union_result = set()
        # Combination of k-1 items in subset1
        for subset2 in itertools.combinations(tmp, k - 1):
            # Intersection of all items in subset2 (k-1 items)
            result = subset2[0]
            for i in range(k - 1):
                result = result.intersection(subset2[i])
            union_result = union_result.union(result)
        if len(union_result) == k:
            Ck.append(list(union_result))
    return Ck

In [12]:
def Apriori(tdb, L1, minsup):
    Lk = [(item,) for item in L1]
    Lk_freq = {(key,): value for (key, value) in L1_freq.items()}
    k = 2
    FreqPat = []
    FreqPat_freq = {}
    while Lk != []:
        # Add Lk in freqent patterns            
        for item in Lk:
            FreqPat.append(item)
        # Add Lk_freq in FreqPat_freq
        FreqPat_freq.update(Lk_freq)
        # Use previous Lk to generate Ck
        Ck = Apriori_gen(Lk, k)
        # Count the number of every item in Ck appears in DB
        Ck_freq = {}
        for item in Ck:
            count = 0
            for transaction in tdb.values():
                if all(x in transaction for x in item):
                    count += 1
            Ck_freq[tuple(item)] = count
        # Generate Lk
        Lk = []
        Lk_freq = {}
        for (key, value) in Ck_freq.items():
            if value >= minsup:
                # Because tuple has order
                new_key = tuple(sorted(key))
                Lk.append(new_key)
                Lk_freq[new_key] = value
        k += 1
    return FreqPat, FreqPat_freq

In [13]:
FreqPat, FreqPat_freq = Apriori(transaction_db, L1_freq, MINSUP)
FreqPat

[('c',),
 ('f',),
 ('m',),
 ('a',),
 ('p',),
 ('b',),
 ('c', 'f'),
 ('c', 'm'),
 ('a', 'c'),
 ('c', 'p'),
 ('f', 'm'),
 ('a', 'f'),
 ('a', 'm'),
 ('c', 'f', 'm'),
 ('a', 'c', 'f'),
 ('a', 'c', 'm'),
 ('a', 'f', 'm'),
 ('a', 'c', 'f', 'm')]

### FP-growth

In [14]:
class HeaderTableNode:
    def __init__(self):
        self.head = None
        self.tail = None

class FPtreeNode:
    def __init__(self, val, parent=None):
        self.val = val
        self.count = 1
        self.parent = parent
        self.children = []
        self.next = None
    def insert_frequent_items(self, items, hdtable):
        # If there is no frequent item
        if len(items) == 0:
            return
        item = items[0]
        for child in self.children:
            if child.val == item:
                child.count += 1
                child.insert_frequent_items(items[1:], hdtable)
                return
        # If cannot find the item among children
        new_child = FPtreeNode(item, self)
        # Add new node to header table
        if hdtable[item].head == None:
            hdtable[item].head = new_child
            hdtable[item].tail = new_child
        else:
            hdtable[item].tail.next = new_child
            hdtable[item].tail = new_child
        # Add new node to current node's children
        self.children.append(new_child)
        new_child.insert_frequent_items(items[1:], hdtable)

class CondPatternBase:
    def __init__(self, pattern, freq):
        self.pattern = pattern
        self.freq = freq

class FreqPattern:
    def __init__(self, pattern, support):
        self.pattern = pattern
        self.support = support

In [15]:
# Init dictionary for ordered frequent items of every transaction
ofi = {}
for i in range(1, trans_num + 1):
    ofi[i] = []
# Construct ordered frequent items of every transaction
for i in range(1, trans_num + 1):
    for item in L1:
        if item in transaction_db[i]:
            ofi[i] += [item]

# Init header table
HeaderTable = {}
for item in L1:
    new_node = HeaderTableNode()
    HeaderTable[item] = new_node

# Construct FP-tree
FPtree = FPtreeNode('root')
for i in range(1, trans_num + 1):
    FPtree.insert_frequent_items(ofi[i], HeaderTable)



In [16]:
ofi

{1: ['c', 'f', 'm', 'a', 'p'],
 2: ['c', 'f', 'm', 'a', 'b'],
 3: ['f', 'b'],
 4: ['c', 'p', 'b'],
 5: ['c', 'f', 'm', 'a', 'p']}

In [17]:
# Generate conditional pattern base
CondBase = {}
for item in L1:
    # Init
    CondBase[item] = []
    # Start from head, and no need to traverse the leaf node
    listnode = HeaderTable[item].head
    treenode = listnode.parent
    # Traversal of linked-list
    while True:
        pattern = []
        # Traversal of tree
        while True:
            if treenode.val == 'root':
                # print()
                break
            # print('%s ' % treenode.val, end = '')
            pattern.insert(0, treenode.val)
            treenode = treenode.parent
        # Create a new base for this item
        if len(pattern) > 0:
            new_base = CondPatternBase(pattern, listnode.count)
            CondBase[item].append(new_base)
            # print('item = %s, count = %d: ' % (item, listnode.count), end = '\t')
            # print(pattern)
        # Reach the end of the list of this item
        if listnode.next == None:
            break
        # Continue to next node in the list, and no need to traverse the leaf node
        listnode = listnode.next
        treenode = listnode.parent



In [18]:
CondBase

{'c': [],
 'f': [<__main__.CondPatternBase at 0x1174a08d0>],
 'm': [<__main__.CondPatternBase at 0x117473410>],
 'a': [<__main__.CondPatternBase at 0x117473c10>],
 'p': [<__main__.CondPatternBase at 0x117473890>,
  <__main__.CondPatternBase at 0x1174736d0>],
 'b': [<__main__.CondPatternBase at 0x117473290>,
  <__main__.CondPatternBase at 0x1174735d0>,
  <__main__.CondPatternBase at 0x117473f50>]}

In [19]:
# Accumulate the count for each item in the base
freq = {}
for item1 in L1:
    freq[item1] = {}
    for item2 in L1:
        freq[item1][item2] = 0
    for base in CondBase[item1]:
        for item3 in base.pattern:
            freq[item1][item3] += base.freq    
    # print(item1)
    print(freq[item1])



{'c': 0, 'f': 0, 'm': 0, 'a': 0, 'p': 0, 'b': 0}
{'c': 3, 'f': 0, 'm': 0, 'a': 0, 'p': 0, 'b': 0}
{'c': 3, 'f': 3, 'm': 0, 'a': 0, 'p': 0, 'b': 0}
{'c': 3, 'f': 3, 'm': 3, 'a': 0, 'p': 0, 'b': 0}
{'c': 3, 'f': 2, 'm': 2, 'a': 2, 'p': 0, 'b': 0}
{'c': 2, 'f': 2, 'm': 1, 'a': 1, 'p': 1, 'b': 0}


In [20]:
# Conditional FP-tree (not a tree actually)
condFPtree = {}
for item1 in L1:
    tmp_pattern = []
    for item2 in L1:
        if freq[item1][item2] >= MINSUP:
            tmp_pattern.append(item2)
    if len(tmp_pattern) > 0:
        condFPtree[item1] = tmp_pattern

In [21]:
condFPtree

{'f': ['c'], 'm': ['c', 'f'], 'a': ['c', 'f', 'm'], 'p': ['c']}

In [22]:
# Generate frequent patterns
FreqPat2 = []
for key in condFPtree:
    x = condFPtree[key]
    for L in range(1, len(x)+1):
        for subset in itertools.combinations(x, L):
            pat = list(subset)
            pat.append(key)
            FreqPat2.append(pat)
# Add L1            
for item in L1:
    FreqPat2.append([item])

In [23]:
FreqPat2

[['c', 'f'],
 ['c', 'm'],
 ['f', 'm'],
 ['c', 'f', 'm'],
 ['c', 'a'],
 ['f', 'a'],
 ['m', 'a'],
 ['c', 'f', 'a'],
 ['c', 'm', 'a'],
 ['f', 'm', 'a'],
 ['c', 'f', 'm', 'a'],
 ['c', 'p'],
 ['c'],
 ['f'],
 ['m'],
 ['a'],
 ['p'],
 ['b']]

### Rule Generation

In [24]:
# Rule generation
def rule_gen(FreqPat, FreqPat_freq, minconf):
    rules = []
    for pattern in FreqPat:
        pattern_len = len(pattern)
        if pattern_len == 1:
            continue
        for length in range(2, pattern_len):
            for subset in itertools.combinations(pattern, length):
                # print(pattern, subset)
                conf = float(FreqPat_freq[pattern]) / FreqPat_freq[subset]
                if conf >= minconf:
                    rhs = set(pattern).difference(set(subset))
                    rules.append('%s -> %s' % (list(subset), list(rhs)))
    return rules

In [25]:
rule_gen(FreqPat, FreqPat_freq, MINCONF)

["['c', 'f'] -> ['m']",
 "['c', 'm'] -> ['f']",
 "['f', 'm'] -> ['c']",
 "['a', 'c'] -> ['f']",
 "['a', 'f'] -> ['c']",
 "['c', 'f'] -> ['a']",
 "['a', 'c'] -> ['m']",
 "['a', 'm'] -> ['c']",
 "['c', 'm'] -> ['a']",
 "['a', 'f'] -> ['m']",
 "['a', 'm'] -> ['f']",
 "['f', 'm'] -> ['a']",
 "['a', 'c'] -> ['f', 'm']",
 "['a', 'f'] -> ['m', 'c']",
 "['a', 'm'] -> ['f', 'c']",
 "['c', 'f'] -> ['a', 'm']",
 "['c', 'm'] -> ['a', 'f']",
 "['f', 'm'] -> ['a', 'c']",
 "['a', 'c', 'f'] -> ['m']",
 "['a', 'c', 'm'] -> ['f']",
 "['a', 'f', 'm'] -> ['c']",
 "['c', 'f', 'm'] -> ['a']"]

## Dataset 1

Dataset source: https://www.kaggle.com/abcsds/pokemon

In [26]:
df1 = pd.read_csv('data/Pokemon.csv')

In [27]:
df1.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [28]:
df1.describe()

Unnamed: 0,#,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,362.81375,435.1025,69.25875,79.00125,73.8425,72.82,71.9025,68.2775,3.32375
std,208.343798,119.96304,25.534669,32.457366,31.183501,32.722294,27.828916,29.060474,1.66129
min,1.0,180.0,1.0,5.0,5.0,10.0,20.0,5.0,1.0
25%,184.75,330.0,50.0,55.0,50.0,49.75,50.0,45.0,2.0
50%,364.5,450.0,65.0,75.0,70.0,65.0,70.0,65.0,3.0
75%,539.25,515.0,80.0,100.0,90.0,95.0,90.0,90.0,5.0
max,721.0,780.0,255.0,190.0,230.0,194.0,230.0,180.0,6.0


In [29]:
def tmp(x):
    if x > 515: return 'high Total'
    elif x > 450: return 'medium Total'
    else: return 'low Total'

df1['Total'] = df1['Total'].apply(tmp)

def tmp(x):
    if x > 80: return 'high HP'
    elif x > 65: return 'medium HP'
    else: return 'low HP'

df1['HP'] = df1['HP'].apply(tmp)

def tmp(x):
    if x > 100: return 'high Attack'
    elif x > 75: return 'medium Attack'
    else: return 'low Attack'

df1['Attack'] = df1['Attack'].apply(tmp)

def tmp(x):
    if x > 90: return 'high Defense'
    elif x > 70: return 'medium Defense'
    else: return 'low Defense'

df1['Defense'] = df1['Defense'].apply(tmp)

def tmp(x):
    if x > 95: return 'high Sp. Atk'
    elif x > 65: return 'medium Sp. Atk'
    else: return 'low Sp. Atk'

df1['Sp. Atk'] = df1['Sp. Atk'].apply(tmp)

def tmp(x):
    if x > 90: return 'high Sp. Def'
    elif x > 70: return 'medium Sp. Def'
    else: return 'low Sp. Def'

df1['Sp. Def'] = df1['Sp. Def'].apply(tmp)

def tmp(x):
    if x > 90: return 'high Speed'
    elif x > 65: return 'medium Speed'
    else: return 'low Speed'

df1['Speed'] = df1['Speed'].apply(tmp)

In [30]:
df1['Type 2'].fillna('Does not have Type 2', inplace=True)

In [31]:
df1 = df1.astype({'Generation': str, 'Legendary': str})
df1.drop(['#', 'Name'], axis=1, inplace=True)

### Before Getting Started

In [33]:
# Parameters
MINSUP = 200
MINCONF = 0.8

In [34]:
# Candidate 1-itemset
cols_names = ['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary']
C1_series = df1['Type 1'].value_counts()
tmp = df1['Type 2'].value_counts()
C1_series = C1_series.add(tmp, fill_value=0)
C1_series = C1_series.astype(int)
for col in cols_names:
    tmp = df1[col].value_counts()
    C1_series = pd.concat([C1_series, tmp])

# Frequent 1-itemset
L1_series = C1_series.loc[C1_series.values >= MINSUP]
L1 = L1_series.index.values.tolist()
tmp = L1_series.values.tolist()
L1_freq = {key: value for key, value in zip(L1, tmp)}

In [35]:
L1_freq

{'Does not have Type 2': 386,
 'low Total': 404,
 'medium Total': 201,
 'low HP': 405,
 'medium HP': 202,
 'low Attack': 409,
 'medium Attack': 221,
 'low Defense': 438,
 'low Sp. Atk': 411,
 'medium Sp. Atk': 208,
 'low Sp. Def': 433,
 'low Speed': 410,
 'medium Speed': 204,
 'False': 735}

In [36]:
# Extract info from df to dictionary
cols_names = df1.columns.to_list()
transaction_db = {}
for index, row in df1.iterrows():
    new_index = index + 1
    transaction_db[new_index] = []
    for col in cols_names:
        transaction_db[new_index].append(row[col])

In [37]:
transaction_db

{1: ['Grass',
  'Poison',
  'low Total',
  'low HP',
  'low Attack',
  'low Defense',
  'low Sp. Atk',
  'low Sp. Def',
  'low Speed',
  '1',
  'False'],
 2: ['Grass',
  'Poison',
  'low Total',
  'low HP',
  'low Attack',
  'low Defense',
  'medium Sp. Atk',
  'medium Sp. Def',
  'low Speed',
  '1',
  'False'],
 3: ['Grass',
  'Poison',
  'high Total',
  'medium HP',
  'medium Attack',
  'medium Defense',
  'high Sp. Atk',
  'high Sp. Def',
  'medium Speed',
  '1',
  'False'],
 4: ['Grass',
  'Poison',
  'high Total',
  'medium HP',
  'medium Attack',
  'high Defense',
  'high Sp. Atk',
  'high Sp. Def',
  'medium Speed',
  '1',
  'False'],
 5: ['Fire',
  'Does not have Type 2',
  'low Total',
  'low HP',
  'low Attack',
  'low Defense',
  'low Sp. Atk',
  'low Sp. Def',
  'low Speed',
  '1',
  'False'],
 6: ['Fire',
  'Does not have Type 2',
  'low Total',
  'low HP',
  'low Attack',
  'low Defense',
  'medium Sp. Atk',
  'low Sp. Def',
  'medium Speed',
  '1',
  'False'],
 7: ['Fire

### Apply Apriori to Dataset 1

In [38]:
FreqPat, FreqPat_freq = Apriori(transaction_db, L1_freq, MINSUP)
FreqPat

[('Does not have Type 2',),
 ('low Total',),
 ('medium Total',),
 ('low HP',),
 ('medium HP',),
 ('low Attack',),
 ('medium Attack',),
 ('low Defense',),
 ('low Sp. Atk',),
 ('medium Sp. Atk',),
 ('low Sp. Def',),
 ('low Speed',),
 ('medium Speed',),
 ('False',),
 ('Does not have Type 2', 'low Total'),
 ('Does not have Type 2', 'low HP'),
 ('Does not have Type 2', 'low Attack'),
 ('Does not have Type 2', 'low Defense'),
 ('Does not have Type 2', 'low Sp. Atk'),
 ('Does not have Type 2', 'low Sp. Def'),
 ('Does not have Type 2', 'low Speed'),
 ('Does not have Type 2', 'False'),
 ('low HP', 'low Total'),
 ('low Attack', 'low Total'),
 ('low Defense', 'low Total'),
 ('low Sp. Atk', 'low Total'),
 ('low Sp. Def', 'low Total'),
 ('low Speed', 'low Total'),
 ('False', 'low Total'),
 ('False', 'medium Total'),
 ('low Attack', 'low HP'),
 ('low Defense', 'low HP'),
 ('low HP', 'low Sp. Atk'),
 ('low HP', 'low Sp. Def'),
 ('low HP', 'low Speed'),
 ('False', 'low HP'),
 ('low Attack', 'low Defen

### Apply FP-Growth to Dataset 1

### Rule Generation for Dataset 1

In [47]:
rule_gen(FreqPat, FreqPat_freq, MINCONF)

["['Does not have Type 2', 'low Total'] -> ['False']",
 "['Does not have Type 2', 'low HP'] -> ['False']",
 "['Does not have Type 2', 'low Attack'] -> ['False']",
 "['Does not have Type 2', 'low Defense'] -> ['False']",
 "['Does not have Type 2', 'low Sp. Atk'] -> ['False']",
 "['Does not have Type 2', 'low Sp. Def'] -> ['False']",
 "['Does not have Type 2', 'low Speed'] -> ['False']",
 "['low Attack', 'low HP'] -> ['low Total']",
 "['low Attack', 'low Total'] -> ['low HP']",
 "['low HP', 'low Total'] -> ['low Attack']",
 "['low Defense', 'low HP'] -> ['low Total']",
 "['low HP', 'low Total'] -> ['low Defense']",
 "['low HP', 'low Sp. Atk'] -> ['low Total']",
 "['low Sp. Atk', 'low Total'] -> ['low HP']",
 "['low HP', 'low Sp. Def'] -> ['low Total']",
 "['low HP', 'low Total'] -> ['low Sp. Def']",
 "['low HP', 'low Speed'] -> ['low Total']",
 "['False', 'low HP'] -> ['low Total']",
 "['low HP', 'low Total'] -> ['False']",
 "['low Attack', 'low Defense'] -> ['low Total']",
 "['low Attac

## Generate csv for WEKA

In [39]:
df = pd.read_csv('data/data_1_3_20_10_5.csv')

In [40]:
# Init dictionary for every transaction
trans_num = df['TransID'].max()
di = {}
for i in range(1, trans_num + 1):
    di[i] = []
# Extract info from df to dictionary
df_num = len(df)
for i in range(df_num):
    index = df.iloc[i][0]
    item = df.iloc[i][2]    
    di[index] += [item]

In [41]:
outputfile = open('data/weka_data_1_3_20_10_5.csv', 'w')

In [42]:
outputfile.write('TransID')

title = list(df['Item'].unique())
for item in title:
    outputfile.write(',%s' % item)
outputfile.write('\n')

1

In [43]:
trans_num = df['TransID'].max()

In [44]:
for i in range(1, trans_num + 1):
    outputfile.write('%d' % i)
    for item in title:
        if item in di[i]:
            outputfile.write(',1')
        else:
            outputfile.write(',0')
    outputfile.write('\n')

In [45]:
outputfile.close()

In [46]:
df_weka = pd.read_csv('data/weka_data_1_3_20_10_5.csv')
df_weka.head()

Unnamed: 0,TransID,4089,8704,9205,9430,12679,12779,18927,19970,38,...,17679,15854,16381,9914,7711,14049,19119,17637,5085,11480
0,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Find and answer

What are rules with
* High support, high confidence ? 
* High support, low confidence ? 
* Low support, low confidence ? 
* Low support, high confidence ?