In [1]:
import numpy as np
from fpdf import FPDF
from collections import Counter
from itertools import combinations

# load data

In [2]:
def load_data (adr='msnbc990928.seq'):
    """Read a sequence file and return its sequences together with the legend.

    The third line of the file (index 2) is split on whitespace to obtain the
    legend entries. Every line from the eighth one (index 7) onwards is parsed
    as a whitespace-separated list of integers; a blank line therefore yields
    an empty list.

    Parameters
    ----------
    adr : str
        Path of the file to read.

    Returns
    -------
    tuple
        ``(Sequences, encode)``: ``Sequences`` is a list with one list of int
        per data line, ``encode`` maps the 1-based position of each legend
        entry to the entry itself.
    """
    # The context manager closes the handle even when parsing below fails;
    # the file used to be left open.
    with open(adr) as DataFile:
        TextLines = DataFile.readlines()
    Legend = TextLines[2].split()
    encode = dict(enumerate(Legend, start=1))
    Sequences = [[int(Entry) for Entry in Line.split()] for Line in TextLines[7:]]
    return Sequences,encode

In [3]:
def build_matrix (seq,i_numbers):
    """Build the binary transaction/item incidence matrix.

    Parameters
    ----------
    seq : list of list of int
        Transactions; item code ``q`` is written to column ``q - 1``.
    i_numbers : int
        Number of columns of the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(seq), i_numbers)`` created with ``np.zeros``,
        holding 1 where a transaction contains the item and 0 elsewhere.
    """
    # The earlier version also scanned every transaction through
    # extract_items() and threw the result away; that pass is gone.
    matrix = np.zeros((len(seq), i_numbers))
    for row, transaction in enumerate(seq):
        for item in transaction:
            matrix[row, item - 1] = 1
    return matrix

In [4]:
def extract_items (seq):
    """Return the distinct items found in ``seq`` in ascending order.

    Parameters
    ----------
    seq : iterable of iterables
        Transactions whose elements are the (hashable, orderable) items.

    Returns
    -------
    list
        Sorted list containing each item once.
    """
    # A set makes the duplicate check constant-time; testing membership in
    # a growing list was quadratic in the number of distinct items.
    items = set()
    for transaction in seq:
        items.update(transaction)
    return sorted(items)

In [5]:
def generate_c1 (matrix):
    """Return the per-column sums of ``matrix`` as a list of integers.

    Each entry is the sum of one column, cast with ``astype(int)``; for a 0/1
    matrix that is the support count of the corresponding single item.
    """
    column_totals = np.sum(matrix, axis=0)
    supports = column_totals.astype(int)
    return list(supports)

In [6]:
def prune_c1 (c1,min_s):
    """Keep the single items whose support reaches ``min_s``.

    ``c1[i]`` is taken as the support of item code ``i + 1``. The result is a
    ``Counter`` keyed by one-element frozensets ``{item code}`` whose values
    are the supports of the items that passed the threshold.
    """
    frequent = Counter()
    for item_code, support in enumerate(c1, start=1):
        if support >= min_s:
            frequent[frozenset((item_code,))] += support
    return frequent

In [7]:
def generate_ck (lk,time,matrix):
    """Build the size-``time`` candidates from ``lk`` and count them.

    The candidates come from ``combined_set(lk, time)``; the returned value is
    whatever ``frequency`` produces for those candidates over ``matrix``.
    """
    return frequency(combined_set(lk, time), matrix)

In [8]:
def prune_ck (ck,min_s):
    """Drop the candidates of ``ck`` whose count is below ``min_s``.

    ``ck`` maps itemsets to counts; the survivors are returned in a new
    ``Counter`` with their counts, in the iteration order of ``ck``.
    """
    kept = Counter()
    for itemset, support in ck.items():
        if support >= min_s:
            kept[itemset] += support
    return kept

In [9]:
def k_freq (maps,min_s,l1,matrix=None):
    """Grow frequent itemsets level by level, starting from ``l1``.

    Beginning with ``k = 2``, candidates are generated from the previous level
    with ``generate_ck`` and filtered with ``prune_ck``; the loop stops at the
    first level that comes back empty.

    Parameters
    ----------
    maps : dict
        Itemset -> support mapping; updated in place with every frequent
        itemset found here.
    min_s : number
        Minimum support count handed to ``prune_ck``.
    l1 : mapping
        The level the search starts from (the frequent 1-itemsets).
    matrix : optional
        Transaction matrix forwarded to ``generate_ck``. When omitted, the
        module-level name ``matrix`` is used, which is what this function
        always relied on; passing it explicitly removes that hidden
        dependency.

    Returns
    -------
    tuple
        ``(acc_itemset, maps)``: the list of non-empty levels found (one
        mapping per ``k``) and the same ``maps`` object that was passed in.

    Raises
    ------
    NameError
        If ``matrix`` is not passed and no module-level ``matrix`` exists
        when it is first needed.
    """
    lk = l1
    acc_itemset = []
    k = 2
    while len(lk) != 0:
        if matrix is None:
            # Legacy call path: resolve the global lazily so that an empty
            # starting level still returns without needing it.
            try:
                matrix = globals()['matrix']
            except KeyError:
                raise NameError("name 'matrix' is not defined") from None
        ck = generate_ck(lk, k, matrix)
        lk = prune_ck(ck, min_s)
        if len(lk) == 0:
            break
        maps.update(lk)
        acc_itemset.append(lk)
        k += 1
    return acc_itemset,maps

In [10]:
def apriori (matrix,min_s,min_c):
    """Mine association rules from a binary transaction matrix.

    Parameters
    ----------
    matrix : numpy.ndarray
        One row per transaction, one column per item.
    min_s : number
        Minimum support as a percentage of the number of rows; it is
        converted to an absolute count before pruning.
    min_c : number
        Minimum confidence, compared directly with the ratio computed in
        ``strong_rules``.

    Returns
    -------
    list
        The value returned by ``strong_rules``.
    """
    t_number = len(matrix)
    min_count = (t_number * min_s) / 100
    c1 = generate_c1(matrix)
    l1 = prune_c1(c1, min_count)
    maps = dict(l1)
    # NOTE(review): k_freq is not handed `matrix` here and looks the name up
    # at module level, so the global `matrix` has to be this same data.
    acc_itemset, maps = k_freq(maps, min_count, l1)
    # strong_rules divides by its first argument to get relative support, so
    # it needs the number of transactions; the matrix itself used to be
    # passed, leaving t_number unused.
    return strong_rules(t_number, maps, acc_itemset, min_c)

In [11]:
def strong_rules (t_number,maps,acc_itemset,min_c):
    """Derive the rules whose confidence reaches ``min_c``.

    For every itemset in every level of ``acc_itemset``, each non-empty proper
    subset is tried as the left-hand side and the remaining items form the
    right-hand side. Support is ``maps[itemset] / t_number`` and confidence is
    ``maps[itemset] / maps[left-hand side]``.

    Returns a list of ``(lhs, rhs, support %, confidence %)`` tuples, the
    percentages rounded to two decimals, sorted by ascending support.
    """
    rules = []
    for level in acc_itemset:
        for itemset in level:
            members = list(itemset)
            for antecedent in subset(members):
                # Only proper subsets leave something for the right-hand side.
                if len(antecedent) != len(members):
                    consequent = subtract(itemset, antecedent)
                    sup = maps[frozenset(itemset)] / t_number
                    conf = maps[frozenset(itemset)] / maps[frozenset(antecedent)]
                    if conf >= min_c:
                        rules.append(
                            (antecedent, consequent, round(sup * 100, 2), round(conf * 100, 2))
                        )
    rules.sort(key=lambda rule: rule[2])
    return rules

In [12]:
def subset (m):
    """List every non-empty combination of the elements of ``m``.

    Combinations are ordered by size (1 up to ``len(m)``) and, within a size,
    in the order ``itertools.combinations`` yields them. Each one is a list.
    """
    return [
        list(combo)
        for size in range(1, len(m) + 1)
        for combo in combinations(m, size)
    ]

In [13]:
def subtract (l1,l2):
    """Return the elements of ``l1`` that do not occur in ``l2``.

    The order of ``l1`` is kept and repeated elements are kept as well.
    """
    remaining = []
    for element in l1:
        if element in l2:
            continue
        remaining.append(element)
    return remaining

In [14]:
def frequency (nc, matrix):
    """Count in how many rows of ``matrix`` each candidate of ``nc`` occurs.

    A row contains item code ``j + 1`` when its entry at index ``j`` equals 1.
    A candidate is counted for a row when all of its items are in that row.
    Candidates never matched by any row do not appear in the returned
    ``Counter``.
    """
    counts = Counter()
    for row in matrix:
        present = set(np.where(row == 1)[0] + 1)
        # One increment per candidate fully contained in this row.
        counts.update(cand for cand in nc if present.issuperset(cand))
    return counts

In [15]:
def combined_set (lk,time):
    """Join the itemsets of ``lk`` pairwise into candidates of size ``time``.

    Every unordered pair of itemsets is united; a union is kept only when it
    has exactly ``time`` elements. Duplicates are collapsed and the distinct
    candidates are returned as a list.
    """
    unions = (left.union(right) for left, right in combinations(list(lk), 2))
    return list({candidate for candidate in unions if len(candidate) == time})

In [16]:
def show_result (result,encode):
    """Write the rules to ``Rule.pdf``, one rule per line.

    Parameters
    ----------
    result : iterable of tuple
        ``(lhs, rhs, support, confidence)`` entries; ``lhs`` and ``rhs`` are
        iterables of item codes.
    encode : mapping
        Item code -> display name; every code appearing in a rule must be a
        key, otherwise a ``KeyError`` is raised.

    The file is written by ``pdf.output("Rule.pdf")``, i.e. relative to the
    current working directory.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Times', style = '', size = 15)
    for antecedent, consequent, sup, conf in result:
        en_i = [encode[a] for a in antecedent]
        en_j = [encode[a] for a in consequent]
        text = str(en_i) + "----->" + str(en_j) + ": sup=" + str(sup) + ": conf=" + str(conf)
        # `ln` says where the cursor goes after the cell (1 = start of the
        # next line); it is not a line counter. A running counter used to be
        # passed, which gives values of 2 and above.
        pdf.cell(200, 7, txt = text, ln = 1, align = 'L')
    pdf.output("Rule.pdf")

In [17]:
if __name__ == '__main__':
    # Driver: load the sequences, build the transaction matrix, mine the
    # rules and write them to a PDF.
    seq,encode = load_data()
    # Optional: uncomment the next line to run on the first 10000 sequences only.
#     seq = seq[0:10000]
    items = extract_items(seq)
    # The matrix gets one column per distinct item found in the data.
    i_numbers = len(items)
    # NOTE: this must stay a module-level variable called `matrix`; k_freq
    # reads that global name instead of receiving the matrix as an argument.
    matrix = build_matrix (seq,i_numbers)
    # min_s is a percentage of the transactions (apriori divides it by 100);
    # min_c is compared directly with the confidence ratio.
    result = apriori (matrix, min_s =0.1, min_c = 0.5)
    show_result (result,encode)

5
