In [1]:
import math
from numpy import *
import csv
import pandas as pd
import random
import numpy as np

# read files
def load_files(method):
    """Load the train and test objects stored as ``.npy`` files under the global ``path``.

    Parameters
    ----------
    method : str
        Prefix of the training file; ``path + method + '_train_data.npy'`` is read.

    Returns
    -------
    tuple
        ``(train_data, test_data)``. The test file is always
        ``path + 'user_test_data.npy'``, independent of ``method``.
    """
    # `.item()` unwraps a single stored object (presumably a dict -- callers index
    # it by cluster). Such object arrays are pickled, and NumPy >= 1.16.3 refuses
    # to load them unless allow_pickle=True is given explicitly.
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    train_data = np.load(path + method + '_train_data.npy', allow_pickle=True).item()
    test_data = np.load(path + 'user_test_data.npy', allow_pickle=True).item()
    return train_data, test_data

## Item-CF

In [2]:
def get_distance_matrix(name):
    """Read ``path + name + '.csv'`` and return it indexed by its ``'Unnamed: 0'`` column."""
    csv_file = path + name + '.csv'
    frame = pd.read_csv(csv_file)
    indexed = frame.set_index('Unnamed: 0')
    return indexed

class ItemCF:
    """Item-based collaborative filtering backed by a precomputed item-item matrix."""

    def __init__(self, metric='jaccard'):
        # Metric name; only used to build the matrix file name in fit().
        self.metric = metric

    def fit(self, cluster, train_data):
        """Remember the training data and load the matching similarity matrix.

        Parameters
        ----------
        cluster : any
            Converted with ``str`` and appended to the matrix file name.
        train_data : dict
            Mapping ``user_id -> {item: value}`` as consumed by ``predict``.

        Returns
        -------
        The matrix returned by ``get_distance_matrix``.
        """
        self.train_data = train_data
        self.distance_matrix = get_distance_matrix('item_' + self.metric + '_matrix_' + str(cluster))
        return self.distance_matrix

    def predict(self, user_id, k=10):
        """Score items the user has not interacted with yet.

        For every item in the user's history the ``k`` highest-valued neighbours
        are read from the matrix column ``str(item)``; each neighbour that is not
        already in the history accumulates ``similarity * value``.

        Parameters
        ----------
        user_id : hashable
            Key into the training data given to ``fit``.
        k : int
            Number of neighbours per history item and maximum number of results.

        Returns
        -------
        dict
            Up to ``k`` entries ``{item: score}`` ordered by descending score;
            empty when the user is unknown.
        """
        if user_id not in self.train_data:
            return {}
        history = self.train_data[user_id]
        predictions = {}
        for item, value in history.items():
            # Columns are addressed by str(item), rows by the item itself; the
            # item's own row is dropped so it cannot be its own neighbour.
            neighbours = self.distance_matrix[str(item)].drop(item).sort_values(ascending=False)[:k]
            # Series.min() skips NaN and tolerates an empty series, unlike the builtin min().
            neighbours = neighbours.fillna(neighbours.min())
            for neighbour, similarity in neighbours.items():
                if neighbour not in history:
                    # Credit the unseen neighbour, not the history item that led to it;
                    # otherwise only already-known items could ever be returned.
                    predictions[neighbour] = predictions.get(neighbour, 0) + similarity * value
        ranked = sorted(predictions.items(), key=lambda e: e[1], reverse=True)[:k]
        return dict(ranked)

## Apriori

In [3]:
from numpy import *

def dict2list(data):
    """Return one list per value of ``data``, holding that inner dict's keys."""
    return [list(inner.keys()) for inner in data.values()]

def loadDataSet():
    """Return a fixed four-transaction sample data set."""
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions

def createC1(dataSet):
    """Build the sorted list of single-item candidate sets.

    Parameters
    ----------
    dataSet : iterable of iterables
        Transactions; every distinct item becomes one candidate.

    Returns
    -------
    list of frozenset
        One ``frozenset([item])`` per distinct item, in ascending item order.
        Frozensets are used so the candidates can serve as dict keys.
    """
    # Collect distinct items with a set instead of scanning a growing list for
    # every item, which made de-duplication quadratic in the number of items.
    distinct = set()
    for transaction in dataSet:
        distinct.update(transaction)
    return [frozenset([item]) for item in sorted(distinct)]

def scanD(D, Ck, minSupport):
    """Count candidate occurrences and keep those meeting ``minSupport``.

    Parameters
    ----------
    D : sequence of sets
        Transactions.
    Ck : iterable of frozenset
        Candidate itemsets.
    minSupport : float
        Minimum fraction of transactions a candidate must be contained in.

    Returns
    -------
    tuple
        ``(retList, supportData)``: the candidates reaching ``minSupport``
        (most recently first-seen candidate first) and the support of every
        candidate that occurred at least once.
    """
    occurrences = {}
    for transaction in D:
        for candidate in Ck:
            if candidate.issubset(transaction):
                occurrences[candidate] = occurrences.get(candidate, 0) + 1

    total = float(len(D))
    supportData = {candidate: hits / total for candidate, hits in occurrences.items()}

    # The result lists candidates in reverse order of first occurrence.
    frequent = [candidate for candidate, share in supportData.items() if share >= minSupport]
    frequent.reverse()
    return frequent, supportData

def aprioriGen(Lk, k): #creates Ck
    """Join itemsets that share their first ``k-2`` items into candidates.

    Parameters
    ----------
    Lk : sequence of frozenset
        Itemsets to be joined pairwise.
    k : int
        Size of the candidates to create.

    Returns
    -------
    list of frozenset
        The union of every pair whose sorted ``k-2``-item prefixes are equal.
    """
    # Sort BEFORE slicing: a frozenset has no defined iteration order, so slicing
    # first compares arbitrary elements and misses valid joins.
    prefixes = [sorted(itemset)[:k-2] for itemset in Lk]
    candidates = []
    count = len(Lk)
    for i in range(count):
        for j in range(i + 1, count):
            if prefixes[i] == prefixes[j]:
                candidates.append(Lk[i] | Lk[j])  # set union
    return candidates

def apriori(dataSet, minSupport = 0.5):
    """Run the level-wise Apriori search.

    Parameters
    ----------
    dataSet : sequence of iterables
        Transactions.
    minSupport : float
        Minimum support passed to ``scanD``.

    Returns
    -------
    tuple
        ``(L, supportData)`` where ``L[i]`` holds the frequent itemsets found at
        level ``i`` (the last entry is empty) and ``supportData`` merges the
        support dictionaries from every level.
    """
    singles = createC1(dataSet)
    transactions = [set(transaction) for transaction in dataSet]
    frequent, supportData = scanD(transactions, singles, minSupport)
    L = [frequent]
    size = 2
    # Keep joining the most recent level until it yields nothing.
    while len(L[-1]) > 0:
        candidates = aprioriGen(L[-1], size)
        frequent, level_support = scanD(transactions, candidates, minSupport)
        supportData.update(level_support)
        L.append(frequent)
        size += 1
    return L, supportData

def generateRules(L, supportData, minConf=0.7, minKulc=0.7, maxIR=0.5):  #supportData is a dict coming from scanD
    """Derive association rules from frequent itemsets.

    Parameters
    ----------
    L : list of lists of frozenset
        Frequent itemsets per level; level 0 (single items) is skipped.
    supportData : dict
        Support per itemset.
    minConf, minKulc, maxIR : float
        Thresholds on confidence, Kulczynski measure and imbalance ratio. They
        are applied to itemsets of every size.

    Returns
    -------
    list of tuple
        ``(antecedent, consequent, conf, kulc, imbalance)`` for each accepted rule.
    """
    bigRuleList = []
    for i in range(1, len(L)):  # only itemsets with two or more items
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Forward ALL thresholds; previously only minConf was passed, so
                # larger itemsets were filtered with the default minKulc/maxIR.
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf, minKulc, maxIR)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf, minKulc, maxIR)
    return bigRuleList

def calcConf(freqSet, H, supportData, brl, minConf=0.7, minKulc=0.7, maxIR=0.5):
    """Evaluate the rules ``freqSet - conseq -> conseq`` for every ``conseq`` in ``H``.

    Accepted rules are appended to ``brl`` as
    ``(antecedent, consequent, conf, kulc, imbalance)``.

    Returns
    -------
    list
        The consequents whose rule passed all three thresholds.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        # Skip rules whose parts have no recorded support.
        if antecedent not in supportData or conseq not in supportData:
            continue
        support_rule = supportData[freqSet]
        support_left = supportData[antecedent]
        support_right = supportData[conseq]
        conf = support_rule / support_left
        kulc = (conf + support_rule / support_right) / 2
        imbalance = abs(support_left - support_right) / (support_left + support_right - support_rule)
        if conf >= minConf and kulc >= minKulc and imbalance <= maxIR:
            brl.append((antecedent, conseq, conf, kulc, imbalance))
            prunedH.append(conseq)
    return prunedH

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7, minKulc=0.7, maxIR=0.5):
    """Recursively evaluate consequents of growing size for ``freqSet``.

    The consequents in ``H`` are evaluated first, then the accepted ones are
    merged into consequents one item larger. Evaluating ``H`` itself matters:
    the previous version merged before evaluating, so single-item consequents
    were never tested for itemsets of three or more items.

    ``minKulc`` and ``maxIR`` are new optional parameters whose defaults match
    ``calcConf``, so existing five-argument calls keep working.
    """
    if not H:
        return
    m = len(H[0])
    if len(freqSet) <= m:  # the antecedent would be empty
        return
    accepted = calcConf(freqSet, H, supportData, brl, minConf, minKulc, maxIR)
    if len(accepted) > 1:  # need at least two sets to merge
        merged = aprioriGen(accepted, m + 1)
        rulesFromConseq(freqSet, merged, supportData, brl, minConf, minKulc, maxIR)
            
def pntRules(ruleList, itemMeaning):
    """Print every rule in readable form, translating items through ``itemMeaning``.

    For each rule the antecedent items, an arrow line, the consequent items and
    the confidence (third tuple element) are printed, followed by a spacer line.
    """
    def show(items):
        # One translated item per output line.
        for entry in items:
            print(itemMeaning[entry])

    for rule in ruleList:
        show(rule[0])
        print("           -------->")
        show(rule[1])
        print("confidence: %f" % rule[2])
        print(' ')      # spacer line between rules

def mine_rules(train_data, minSup=0.15, minConf=0.9, minKulc=0.7, maxIR=0.5):
    """Mine frequent itemsets and association rules from ``train_data``.

    Returns
    -------
    tuple
        ``(freq_items, support, ass_rules)``: the per-level frequent itemsets
        and the support dict from ``apriori``, plus the rule tuples from
        ``generateRules``.
    """
    transactions = dict2list(train_data)
    freq_items, support = apriori(transactions, minSup)
    ass_rules = generateRules(
        freq_items,
        support,
        minConf=minConf,
        minKulc=minKulc,
        maxIR=maxIR,
    )
    return freq_items, support, ass_rules

def filter_rules_by_conf(ass_rules, minConf):
    """Keep the rules whose confidence is at least ``minConf``."""
    return [
        (left, rightSet, conf, kulc, ir)
        for left, rightSet, conf, kulc, ir in ass_rules
        if conf >= minConf
    ]

def filter_rules_by_kulc(ass_rules, minKulc):
    """Keep the rules whose Kulczynski value is at least ``minKulc``."""
    return [
        (left, rightSet, conf, kulc, ir)
        for left, rightSet, conf, kulc, ir in ass_rules
        if kulc >= minKulc
    ]

def filter_rules_by_IR(ass_rules, maxIR):
    """Keep the rules whose imbalance ratio is at most ``maxIR``."""
    return [
        (left, rightSet, conf, kulc, ir)
        for left, rightSet, conf, kulc, ir in ass_rules
        if ir <= maxIR
    ]

## CF-AROLS

In [4]:
def recommendOnCombinationV1(factor, N, algo, rules, support, trainDict, testData, th=0.7):
    """Evaluate plain CF and the CF + association-rule combination on sampled users.

    Parameters
    ----------
    factor : int
        ``algo.predict`` is asked for ``factor * N`` items so the combination
        step has more candidates than the ``N`` that are scored for plain CF.
    N : int
        Number of recommendations evaluated per user.
    algo : object
        Recommender exposing ``predict(user, k)`` returning ``{item: score}``.
    rules : iterable of tuple
        ``(left, rightSet, conf, kulc, ir)`` association rules.
    support : dict
        Support per itemset; a rule only fires if its ``rightSet`` is a key.
    trainDict, testData : dict
        ``user -> {item: value}`` for training history and held-out items.
    th : float
        Fraction of ``avgComb * avgCF`` a combined score must reach to count.

    Returns
    -------
    tuple
        ``(precisionCF, recallCF, precisionComb, recallComb)`` in percent. A
        metric whose denominator is zero is reported as ``0.0``.
    """
    def _percentage(hits, total):
        # No sampled user, or nothing above the threshold, leaves a zero
        # denominator; report 0.0 instead of raising ZeroDivisionError.
        return hits * 100 / (total * 1.0) if total else 0.0

    hitCF = 0
    hitComb = 0
    allR = 0
    allPCF = 0
    allPComb = 0

    # At most 10% of the test users are evaluated; each user is additionally
    # kept only with probability ~0.1 (draws from NumPy's global RNG).
    students = int(len(testData) * 0.1)
    for user, itemdict in testData.items():
        if user not in trainDict:
            continue
        if np.random.random() > 0.1:
            continue
        if students == 0:
            break
        students -= 1
        items = itemdict.keys()

        # Use the recommender passed in as `algo` rather than a notebook global.
        recommendDictCF = algo.predict(user, k=factor * N)  # fetch extra candidates
        for recItem in list(recommendDictCF)[:N]:
            if recItem in items:
                hitCF += 1

        allR += len(items)
        allPCF += N

        # Midpoint of the CF score range; stands in for items CF did not score.
        avgCF = 1
        if recommendDictCF:
            cf_scores = recommendDictCF.values()
            avgCF = (max(cf_scores) + min(cf_scores)) / 2

        # Association rules: fire every rule whose left side is in the history
        # and whose right side would add at least one new item.
        history = trainDict[user].keys()
        recommendDictComb = {}
        for left, rightSet, conf, kulc, ir in rules:
            if not set(left).issubset(history):
                continue
            if not set(rightSet).issubset(history) and rightSet in support:
                recommendDictComb[rightSet] = recommendDictComb.get(rightSet, 0) + conf

        # Midpoint of the rule score range; weights the CF-only contribution.
        avgComb = 1
        if recommendDictComb:
            rule_scores = recommendDictComb.values()
            avgComb = (max(rule_scores) + min(rule_scores)) / 2

        # Hybrid score: rule score times CF score (or avgCF when CF has no
        # score for the item), plus CF score times avgComb.
        combined = {}
        for itemSet, score in recommendDictComb.items():
            for item in itemSet:
                combined[item] = combined.get(item, 0) + score * recommendDictCF.get(item, avgCF)
        for item, score in recommendDictCF.items():
            combined[item] = combined.get(item, 0) + score * avgComb

        threshold = avgComb * avgCF * th
        ranked = sorted(combined.items(), key=lambda x: x[1], reverse=True)[:N]
        for item, score in ranked:
            if score < threshold:
                continue
            allPComb += 1
            if item in items:
                hitComb += 1

    precisionCF = _percentage(hitCF, allPCF)
    recallCF = _percentage(hitCF, allR)
    precisionComb = _percentage(hitComb, allPComb)
    recallComb = _percentage(hitComb, allR)

    return precisionCF, recallCF, precisionComb, recallComb

## Evaluate

In [5]:
# Seed the RNGs up front. `random` is bound to the stdlib module or to
# numpy.random depending on which import ran last (`from numpy import *`
# rebinds it), while the user sampling in recommendOnCombinationV1 draws from
# np.random -- so seed NumPy explicitly as well.
random.seed(42)
np.random.seed(42)

path='./birch6_final/'

cluster=0
N=10
minSup_list = [0.13, 0.11, 0, 0.13, 0.17, 0.21]
param_name = 'th'
param_list = range(0, 110, 10)  # threshold in percent; divided by 100 below

results = pd.DataFrame(columns=['cluster', 'param', 'n_rules', 'precisionCF', 'recallCF', 'f1scoreCF', 'precisionComb', 'recallComb', 'f1scoreComb'])
train_data, test_data = load_files('user')

for cluster in range(0, 6):
    print('cluster', cluster)

    if cluster == 2: continue
    recommender = ItemCF(metric='cosine')
    distance_matrix = recommender.fit(cluster, train_data[cluster])

    freq_items, support, ass_rules = mine_rules(train_data[cluster], minSup=minSup_list[cluster], minConf=0.84, minKulc=0.88, maxIR=0.15)

    for param in param_list:
        print('param', param)
        minParam = float(param)/100
        new_ass_rules = ass_rules
        n_rules = len(new_ass_rules)
        precisionCF, recallCF, precisionComb, recallComb = recommendOnCombinationV1(2, N, recommender, new_ass_rules, support,
                                                                            train_data[cluster], test_data[cluster], th=minParam)
        # F1 is undefined when precision + recall is 0; record 0.0 instead of
        # aborting the whole sweep with ZeroDivisionError.
        totalCF = precisionCF + recallCF
        f1scoreCF = 2*precisionCF*recallCF/totalCF if totalCF else 0.0
        totalComb = precisionComb + recallComb
        f1scoreComb = 2*precisionComb*recallComb/totalComb if totalComb else 0.0
        results.loc[len(results)] = [cluster, minParam, n_rules, precisionCF, recallCF, f1scoreCF, precisionComb, recallComb, f1scoreComb]

results.to_csv(path+'birch_cf_arols_'+param_name+'.csv', index=False)

cluster 0
param 0
param 10
param 20
param 30
param 40
param 50
param 60
param 70
param 80
param 90
param 100
cluster 1
param 0
param 10
param 20
param 30
param 40
param 50
param 60
param 70
param 80
param 90
param 100
cluster 2
cluster 3
param 0
param 10
param 20
param 30
param 40
param 50
param 60
param 70
param 80
param 90
param 100
cluster 4


KeyboardInterrupt: 

In [7]:
# Seed the RNGs up front. `random` is bound to the stdlib module or to
# numpy.random depending on which import ran last (`from numpy import *`
# rebinds it), while the user sampling in recommendOnCombinationV1 draws from
# np.random -- so seed NumPy explicitly as well.
random.seed(42)
np.random.seed(42)

path='./birch6_validate/'

cluster=0
N=10
minSup_list = [0.13, 0.11, 0, 0.13, 0.17, 0.21]
param_name = 'N'
param_list = range(5, 65, 5)  # number of recommendations per user

# Score threshold handed to recommendOnCombinationV1. This cell used to read
# `minParam`, a variable left behind by another cell's loop, which is a
# NameError on a fresh kernel and otherwise depends on where that loop stopped.
# NOTE(review): 0.7 is the function's default -- replace it with the value
# selected from the threshold sweep if a different one was intended.
th_value = 0.7

results = pd.DataFrame(columns=['cluster', 'param', 'n_rules', 'precisionCF', 'recallCF', 'f1scoreCF', 'precisionComb', 'recallComb', 'f1scoreComb'])
# Train data comes from the validation split, test data from the final split.
# allow_pickle=True is required on NumPy >= 1.16.3 to load the stored objects.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
train_data = np.load('./birch6_validate/user_train_data.npy', allow_pickle=True).item()
test_data = np.load('./birch6_final/user_test_data.npy', allow_pickle=True).item()

for cluster in range(0, 6):
    print('cluster', cluster)

    if cluster == 2: continue
    recommender = ItemCF(metric='cosine')
    distance_matrix = recommender.fit(cluster, train_data[cluster])

    freq_items, support, ass_rules = mine_rules(train_data[cluster], minSup=minSup_list[cluster], minConf=0.84, minKulc=0.88, maxIR=0.15)

    for param in param_list:
        print('param', param)
        new_ass_rules = ass_rules
        n_rules = len(new_ass_rules)
        precisionCF, recallCF, precisionComb, recallComb = recommendOnCombinationV1(2, param, recommender, new_ass_rules, support,
                                                                            train_data[cluster], test_data[cluster], th=th_value)
        # F1 is undefined when precision + recall is 0; record 0.0 instead of
        # aborting the whole sweep with ZeroDivisionError.
        totalCF = precisionCF + recallCF
        f1scoreCF = 2*precisionCF*recallCF/totalCF if totalCF else 0.0
        totalComb = precisionComb + recallComb
        f1scoreComb = 2*precisionComb*recallComb/totalComb if totalComb else 0.0
        results.loc[len(results)] = [cluster, param, n_rules, precisionCF, recallCF, f1scoreCF, precisionComb, recallComb, f1scoreComb]

results.to_csv(path+'birch_cf_arols_'+param_name+'_final.csv', index=False)

cluster 0
param 5
param 10
param 15
param 20
param 25
param 30
param 35
param 40
param 45
param 50
param 55
param 60
cluster 1
param 5
param 10
param 15
param 20
param 25
param 30
param 35
param 40
param 45
param 50
param 55
param 60
cluster 2
cluster 3
param 5
param 10
param 15
param 20
param 25
param 30
param 35
param 40
param 45
param 50
param 55
param 60
cluster 4
param 5
param 10
param 15
param 20
param 25
param 30
param 35
param 40
param 45
param 50
param 55
param 60
cluster 5
param 5
param 10
param 15
param 20
param 25
param 30
param 35
param 40
param 45
param 50
param 55
param 60
