In [1]:
import numpy as np
import time
from itertools import combinations
from itertools import permutations

def read_data(file_name):
    """Load a tab-separated file as a list of labelled transactions.

    Every line is split on tabs. The last column (collected as ``disease`` in
    the original code and never used) is dropped, and the value in the
    remaining column ``i`` (1-based) becomes the item ``"G<i>_<value>"``.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        labelled items built from that line.
    """
    data = []
    # The context manager closes the handle even if parsing raises; the
    # original opened the file and never closed it.
    with open(file_name, 'r') as file:
        for line in file:
            genes = line.replace("\n","").split("\t")
            genes.pop()  # discard the trailing label column
            data.append(
                ["G" + str(column) + "_" + value
                 for column, value in enumerate(genes, start=1)]
            )

    return data

#Get the support count of particular item.
def support_count(data,klist):
    """Count the transactions that contain every item of ``klist``.

    Args:
        data: Iterable of transactions, each an iterable of hashable items.
        klist: Iterable of items that must all be present in a transaction.

    Returns:
        The number of transactions in ``data`` of which ``klist`` is a subset.
        An empty ``klist`` matches every transaction.
    """
    # Build the lookup set once instead of once per transaction.
    wanted = set(klist)
    return sum(1 for transaction in data if wanted.issubset(transaction))

#Helper functions
def check_unique(ilist):
    """Return True when no two positions of ``ilist`` hold equal values.

    Elements are compared pairwise with ``==``, so they do not have to be
    hashable. An empty or single-element sequence is unique.
    """
    size = len(ilist)
    has_repeat = any(
        ilist[left] == ilist[right]
        for left in range(size)
        for right in range(left + 1, size)
    )
    return not has_repeat

def check_dup(iset,ilist):
    """Return False if ``iset`` is contained in any entry of ``ilist``.

    Both ``iset`` and each entry of ``ilist`` are compared as sets. The result
    is True when no entry is a superset of ``iset`` (including when ``ilist``
    is empty).
    """
    return not any(
        set(iset).issubset(set(candidate)) for candidate in ilist
    )
    

#Initialize k=1
def initialize_one_level(data,freq_map,min_sup):
    """Build the frequent 1-itemsets.

    Args:
        data: Iterable of transactions, each an iterable of hashable items.
        freq_map: Ignored. It is kept only so existing calls keep working;
            the result is always a new dict.
        min_sup: Minimum number of transactions an item must occur in.

    Returns:
        A dict mapping each item whose support is at least ``min_sup`` to
        that support, keyed in order of first appearance in ``data``.
    """
    # Single counting pass. The original re-scanned the whole data set for
    # every occurrence of an item that was not (yet) frequent.
    counts = {}
    for transaction in data:
        # dict.fromkeys de-duplicates while keeping order, so an item repeated
        # inside one transaction still counts once for that transaction.
        for gene in dict.fromkeys(transaction):
            counts[gene] = counts.get(gene, 0) + 1
    return {gene: sup for gene, sup in counts.items() if sup >= min_sup}

# Find all frequent items from the data.
def k_level(data,freq_map,level,min_sup):
    """Build the frequent itemsets of size ``level`` from those of size ``level - 1``.

    Two previous-level itemsets are joined when their first ``level - 2``
    items agree; the candidate is the first itemset plus the last item of the
    second. A candidate is kept when ``support_count`` reports at least
    ``min_sup`` transactions for it.

    Args:
        data: Transactions, passed through to ``support_count``.
        freq_map: Previous-level frequent itemsets. For ``level == 2`` this is
            an iterable of single item labels (iterating a dict yields its
            keys); for higher levels it is a sequence of item sequences such
            as the list this function returns.
        level: Size of the itemsets to produce.
        min_sup: Minimum support count for a candidate to be kept.

    Returns:
        A list of item lists of length ``level``, in the order they were
        generated. Support counts are not returned.
    """
    if level == 2:
        # Wrap every label unchanged. The original used str.split() for this,
        # which breaks a label containing whitespace into several items.
        previous = [[item] for item in freq_map]
    else:
        previous = [list(item) for item in freq_map]

    prefix_len = level - 2
    new_fm = []
    for idx, item1 in enumerate(previous):
        prefix = item1[:prefix_len]
        for position in range(idx + 1, len(previous)):
            item2 = previous[position]
            # Same early exit as the original: the first prefix mismatch ends
            # the scan for item1. This assumes itemsets sharing a prefix are
            # adjacent in freq_map, which holds for lists produced by this
            # function from an ordered level-1 input.
            if item2[:prefix_len] != prefix:
                break
            new_set = item1 + [item2[-1]]
            if support_count(data, new_set) >= min_sup:
                new_fm.append(new_set)

    return new_fm

In [2]:
data = read_data("association-rule-test-data.txt")

# Minimum support as an absolute transaction count (it is compared directly
# with the counts returned by support_count).
min_sup = 50
start = time.time()
# initialize_one_level ignores its second argument and returns a new dict.
freq_map = initialize_one_level(data,{},min_sup)
# rule_map[k] holds the frequent itemsets of size k.
rule_map = {1: list(freq_map.keys())}
print("Support Level: " + str(min_sup))
print("Number of length - 1 Frequent Itemset: " + str(len(freq_map)))

# Grow the itemsets one level at a time until a level comes back empty.
# The original stopped at a hard-coded range(2,10), which would silently drop
# any frequent itemset longer than nine items.
level = 2
while True:
    freq_map = k_level(data,freq_map,level,min_sup)
    if len(freq_map) == 0:
        break
    rule_map[level] = list(freq_map)
    print("Number of length - " + str(level)+ " Frequent Itemset: " + str(len(freq_map)))
    level += 1

print(time.time()-start)

Support Level: 50
Number of length - 1 Frequent Itemset: 109
Number of length - 2 Frequent Itemset: 63
Number of length - 3 Frequent Itemset: 2
4.03591251373291


In [3]:
def confidence(data,rule,head):
    """Return the confidence of a rule as a percentage.

    Args:
        data: Transactions, passed through to ``support_count``.
        rule: All items of the rule (head and body together).
        head: The items of the rule's head only.

    Returns:
        ``100 * support_count(rule) / support_count(head)``, or ``0.0`` when
        no transaction contains ``head`` (the original raised
        ZeroDivisionError in that case).
    """
    sup_bot = support_count(data,head)
    if sup_bot == 0:
        # A head that never occurs cannot support any rule.
        return 0.0
    sup_top = support_count(data,rule)
    return (sup_top/sup_bot)*100

In [4]:
def generate_rules(rule_map,min_confidence,transactions=None):
    """Derive the association rules that reach ``min_confidence``.

    For every frequent itemset of size two or more, each non-empty proper
    subset is tried as a rule head, larger heads first; the remaining items
    form the body.

    Args:
        rule_map: Dict mapping itemset size ``k`` to the list of frequent
            itemsets of that size. Keys are expected to be ``1..len(rule_map)``
            because sizes are visited with ``range(len(rule_map), 1, -1)``.
        min_confidence: Minimum confidence in percent, compared with the value
            returned by ``confidence``.
        transactions: Transactions used to compute confidence. When omitted,
            the notebook-level ``data`` is used, which is what the original
            function read implicitly.

    Returns:
        A 2-D object array with four columns: head (set), body (set),
        ``len(head)`` and ``len(body)``. Row 0 is a placeholder
        (``"", "", "0", "0"``) kept for compatibility with callers that skip
        it via ``rules[1:]``; real rules start at row 1. The array is 2-D even
        when no rule qualifies.
    """
    if transactions is None:
        transactions = data

    kept = []
    for i in range(len(rule_map),1,-1):
        for items in rule_map[i]:
            itemset = set(items)
            # Heads that failed for THIS itemset only. If X -> I-X is below
            # the threshold, every subset X' of X fails for the same itemset I
            # as well (support(X') >= support(X)). That argument does not
            # carry over to another itemset, so the list must not be shared
            # between itemsets as it was originally.
            pruned_heads = []
            for j in range(len(items)-1,0,-1):
                for combo in combinations(items,j):
                    head = set(combo)
                    if any(head.issubset(failed) for failed in pruned_heads):
                        continue
                    conf = confidence(transactions,list(itemset),list(head))
                    if conf < min_confidence:
                        # Single-item heads have no smaller heads to prune.
                        if len(head) > 1:
                            pruned_heads.append(head)
                    else:
                        body = itemset - head
                        kept.append((head,body,len(head),len(body)))

    # Build the result once instead of re-stacking the array per rule, and
    # fill it cell by cell so the sets are stored as single objects.
    keep_rule = np.empty((len(kept) + 1, 4), dtype=object)
    for column, value in enumerate(("", "", "0", "0")):
        keep_rule[0, column] = value
    for row, rule in enumerate(kept, start=1):
        for column, value in enumerate(rule):
            keep_rule[row, column] = value
    return keep_rule


In [5]:
def template1(T1,T2,T3,rules):
    """Select rules by how many of the items in ``T3`` occur in part of the rule.

    Args:
        T1: Which part of the rule to inspect: ``"RULE"`` (head and body),
            ``"HEAD"`` or ``"BODY"``.
        T2: ``"ANY"`` (at least one item of ``T3`` occurs), ``"NONE"`` (no
            item of ``T3`` occurs) or an ``int`` (exactly that many items of
            ``T3`` occur).
        T3: Iterable of items to look for.
        rules: Rule table whose row 0 is a placeholder; in every later row,
            column 0 is the head and column 1 the body.

    Returns:
        The list of matching rule positions, counted from 0 at ``rules[1]``,
        or ``None`` after printing ``"Error inputs!"`` when ``T1`` or ``T2``
        is not recognised. The number of matches is also printed.

    Changes from the original implementation:
        * ``"NONE"`` now returns the rules that do NOT contain the items; it
          used to return the rules that did, while printing the right count.
        * ``"ANY"`` now means at least one item of ``T3``; it used to require
          all of ``T3`` inside the head or inside the body.
    """
    if T1 == "RULE":
        columns = (0, 1)
    elif T1 == "HEAD":
        columns = (0,)
    elif T1 == "BODY":
        columns = (1,)
    else:
        print("Error inputs!")
        return None

    if T2 == "ANY":
        accept = lambda hits: hits >= 1
    elif T2 == "NONE":
        accept = lambda hits: hits == 0
    elif type(T2) == int:
        accept = lambda hits: hits == T2
    else:
        print("Error inputs!")
        return None

    wanted = set(T3)
    retVal = []
    # Row 0 is the placeholder, so positions are relative to rules[1:].
    for index, rule in enumerate(rules[1:]):
        hits = sum(len(wanted.intersection(rule[column])) for column in columns)
        if accept(hits):
            retVal.append(index)

    print(T1,T2,T3,len(retVal))
    return retVal
        

In [6]:
min_confidence = 70
keep_rule = generate_rules(rule_map,min_confidence)
# Row 0 of keep_rule is a placeholder row, so this prints one more than the
# number of rules.
print(len(keep_rule))

# Template-1 queries as (part, quantifier, items), evaluated in listed order;
# each result is the list of matching rule positions.
t1_a, t1_b, t1_c, t1_d, t1_e, t1_f, t1_g, t1_h, t1_i = [
    template1(part, quantifier, items, keep_rule)
    for part, quantifier, items in (
        ("RULE", "ANY", ['G59_Up']),
        ("RULE", "NONE", ['G59_Up']),
        ("RULE", 1, ['G59_Up', 'G10_Down']),
        ("HEAD", "ANY", ['G59_Up']),
        ("HEAD", "NONE", ['G59_Up']),
        ("HEAD", 1, ['G59_Up', 'G10_Down']),
        ("BODY", "ANY", ['G59_Up']),
        ("BODY", "NONE", ['G59_Up']),
        ("BODY", 1, ['G59_Up', 'G10_Down']),
    )
]

118
RULE ANY ['G59_Up'] 26
RULE NONE ['G59_Up'] 91
RULE 1 ['G59_Up', 'G10_Down'] 39
HEAD ANY ['G59_Up'] 9
HEAD NONE ['G59_Up'] 108
HEAD 1 ['G59_Up', 'G10_Down'] 17
BODY ANY ['G59_Up'] 17
BODY NONE ['G59_Up'] 100
BODY 1 ['G59_Up', 'G10_Down'] 24


In [7]:
def template2(T1,T2,rules):
    """Select rules whose size is at least ``T2``.

    Args:
        T1: Which size to test: ``"RULE"`` (column 2 plus column 3),
            ``"HEAD"`` (column 2) or ``"BODY"`` (column 3).
        T2: Minimum size, compared with ``>=``.
        rules: Rule table whose row 0 is a placeholder; in every later row,
            columns 2 and 3 hold the head and body item counts.

    Returns:
        The list of matching rule positions, counted from 0 at ``rules[1]``,
        or ``None`` after printing ``"Error inputs!"`` when ``T1`` is not
        recognised (the original returned ``None`` without any message). The
        number of matches is also printed.
    """
    if T1 == "RULE":
        size_of = lambda r: r[2] + r[3]
    elif T1 == "HEAD":
        size_of = lambda r: r[2]
    elif T1 == "BODY":
        size_of = lambda r: r[3]
    else:
        print("Error inputs!")
        return None

    # Row 0 is the placeholder, so positions are relative to rules[1:].
    retVal = [index for index, r in enumerate(rules[1:]) if size_of(r) >= T2]

    print(T1,T2,len(retVal))
    return retVal

In [8]:
# Template-2 queries as (part, minimum size), evaluated in listed order.
t2_a, t2_b, t2_c = [
    template2(part, minimum, keep_rule)
    for part, minimum in (("RULE", 3), ("HEAD", 2), ("BODY", 1))
]

RULE 3 9
HEAD 2 6
BODY 1 117


In [27]:
def template3(T1,T2,T3,T4,T5,T6,T7,rules):
    """Combine two template queries with a set union or intersection.

    Args:
        T1: Query layout, written ``"<a><op><b>"`` where ``<a>`` and ``<b>``
            are ``"1"`` (a ``template1`` query) or ``"2"`` (a ``template2``
            query) and ``<op>`` is ``"or"`` or ``"and"``, e.g. ``"1and2"``.
            ``"2or1"`` and ``"2and1"`` are accepted in addition to the six
            layouts the original handled.
        T2, T3, T4, T5, T6, T7: Arguments of the two queries, consumed left to
            right: three per ``template1`` query, two per ``template2`` query.
            Slots that are not consumed are ignored.
        rules: Rule table passed through to ``template1`` / ``template2``.

    Returns:
        The sorted list of rule positions in the union (``"or"``) or
        intersection (``"and"``) of the two query results. ``None`` is
        returned, after printing ``"Error inputs!"``, when ``T1`` is malformed
        or a sub-query returns ``None``. The original printed the count and
        returned nothing; in a notebook the returned list is echoed if the
        call is the last expression of a cell.
    """
    layout = None
    if isinstance(T1, str):
        for operator in ("and", "or"):
            first, found, second = T1.partition(operator)
            if found and first in ("1", "2") and second in ("1", "2"):
                layout = (first, operator, second)
                break
    if layout is None:
        print("Error inputs!")
        return None
    first, operator, second = layout

    args = (T2,T3,T4,T5,T6,T7)
    used = 0
    results = []
    # Run the queries in order so their own summary lines print first.
    for kind in (first, second):
        width = 3 if kind == "1" else 2
        query = args[used:used + width]
        used += width
        if kind == "1":
            results.append(template1(*query, rules))
        else:
            results.append(template2(*query, rules))

    if results[0] is None or results[1] is None:
        # A sub-query rejected its arguments; the original crashed on
        # set(None) here.
        print("Error inputs!")
        return None

    left, right = set(results[0]), set(results[1])
    combined = left | right if operator == "or" else left & right
    print("Tempalte3__= (", T1, *args[:used], ") == ", len(combined))
    return sorted(combined)

In [28]:
# Argument layout: (layout, <first query>, <second query>, padding..., rules).
# A template-1 query takes three slots (part, quantifier, items); a template-2
# query takes two (part, minimum size). template3 always takes six query
# slots, so the slots a layout does not use are passed as None.
template3("1or1", "HEAD", "ANY",['G10_Down'], "BODY", 1, ['G59_Up'], keep_rule)
template3("1and1", "HEAD", "ANY",['G10_Down'], "BODY", 1, ['G59_Up'], keep_rule)
template3("1or2", "HEAD", "ANY",['G10_Down'], "BODY", 2, None, keep_rule)
template3("1and2", "HEAD", "ANY",['G10_Down'], "BODY", 2, None, keep_rule)
template3("2or2", "HEAD", 1,"BODY", 2, None, None, keep_rule)
template3("2and2", "HEAD", 1,"BODY", 2, None, None, keep_rule)

HEAD ANY ['G10_Down'] 8
BODY 1 ['G59_Up'] 17
Tempalte3__= ( 1or1 HEAD ANY ['G10_Down'] BODY 1 ['G59_Up'] ) ==  24
HEAD ANY ['G10_Down'] 8
BODY 1 ['G59_Up'] 17
Tempalte3__= ( 1and1 HEAD ANY ['G10_Down'] BODY 1 ['G59_Up'] ) ==  1
HEAD ANY ['G10_Down'] 8
BODY 2 3
Tempalte3__= ( 1or2 HEAD ANY ['G10_Down'] BODY 2 ) ==  11
HEAD ANY ['G10_Down'] 8
BODY 2 3
Tempalte3__= ( 1and2 HEAD ANY ['G10_Down'] BODY 2 ) ==  0
HEAD 1 117
BODY 2 3
Tempalte3__= ( 2or2 HEAD 1 BODY 2 ) ==  117
HEAD 1 117
BODY 2 3
Tempalte3__= ( 2and2 HEAD 1 BODY 2 ) ==  3


In [25]:
# NOTE(review): this repeats the last "2and2" query of the previous cell, and
# its execution count (25) is lower than that cell's (28). The extra set in
# the saved output presumably comes from the debug print that is now commented
# out inside template3 - confirm, then re-run or remove this cell.
template3("2and2", "HEAD", 1,"BODY", 2, None, None, keep_rule)

HEAD 1 117
BODY 2 3
{8, 3, 4}
Tempalte3__= ( 2and2 HEAD 1 BODY 2 ) ==  3
