# In [None]:
import random
import math
import pprint
class UserBasedCF:
    """
    User-based collaborative filtering recommender.

    Workflow: load (user, item, rating) records with readData, split them
    into train/test dictionaries with splitData, build the user-user
    similarity table with userSimilarityBest, then call recommend or one of
    the evaluation helpers (recallAndPrecision, coverage, popularity).

    Attributes set by the methods:
        data        -- list of (userid, itemid, rating) tuples
        traindata   -- {user: {item: rating}} used for training
        testdata    -- {user: {item: rating}} held out for evaluation
        userSimBest -- {user: {other_user: similarity}}, created by
                       userSimilarityBest
    """

    def __init__(self, datafile=None):
        """
        Create the recommender and, when a data file is given, load it and
        split it with k=3, seed=47.

        Without a data file the instance starts empty so that readData()
        and splitData() can be called later.
        """
        self.datafile = datafile
        self.data = []
        self.testdata = {}
        self.traindata = {}
        if datafile is not None:
            self.readData()
            self.splitData(3, 47)

    def readData(self, datafile=None):
        """
        Read data

        Each non-blank line must hold exactly four comma-separated fields:
        user id, item id, rating and one trailing field that is ignored.
        The ids are kept as strings, the rating is converted to float.
        self.data is replaced only after the whole file was parsed.

        datafile -- path to read; falls back to the path stored on the
                    instance when omitted.
        """
        self.datafile = datafile or self.datafile
        records = []
        # The context manager guarantees the handle is closed even when a
        # malformed line raises.
        with open(self.datafile) as handle:
            for line in handle:
                if not line.strip():
                    continue
                userid, itemid, record, _ = line.split(',')
                records.append((userid, itemid, float(record)))
        self.data = records

    def splitData(self, k, seed, data=None, M=8):
        """
        Randomly split dataset to test data and training data

        For every record an integer is drawn with random.randint(0, M),
        which is inclusive on both ends (M + 1 possible values); records
        whose draw equals k go to the test set, all others to the training
        set.  So roughly test data : training data = 1 : M.

        k    -- bucket number that selects the test records
        seed -- seed passed to random.seed so the split is reproducible
        data -- iterable of (user, item, rating); defaults to self.data
        M    -- upper bound (inclusive) of the random draw
        """
        self.testdata = {}
        self.traindata = {}
        data = self.data if data is None else data
        random.seed(seed)
        for user, item, record in data:
            if random.randint(0, M) == k:
                target = self.testdata
            else:
                target = self.traindata
            target.setdefault(user, {})[item] = record

    def userSimilarityBest(self, train=None):
        """
        Build self.userSimBest, the user-user similarity table.

        similarity(u, v) = (number of items both u and v have) /
                           sqrt(items of u * items of v)

        Only pairs of users sharing at least one item are stored, so a user
        without any such neighbour has no entry at all.

        train -- {user: {item: rating}}; defaults to self.traindata
        """
        train = self.traindata if train is None else train

        self.userSimBest = {}
        # build inverse table for item_users
        item_users = {}
        for u, items in train.items():
            for i in items:
                item_users.setdefault(i, set()).add(u)
        # calculate co-rated items between users
        user_item_count = {}
        count = {}
        for users in item_users.values():
            for u in users:
                user_item_count[u] = user_item_count.get(u, 0) + 1
                for v in users:
                    if u == v:
                        continue
                    shared = count.setdefault(u, {})
                    shared[v] = shared.get(v, 0) + 1
        # calculate final similarity matrix
        for u, related_users in count.items():
            row = self.userSimBest.setdefault(u, {})
            for v, cuv in related_users.items():
                # math.sqrt returns a float, so this is a true division on
                # every Python version.
                row[v] = cuv / math.sqrt(user_item_count[u] * user_item_count[v])

    def recommend(self, user, train=None, k=8, nitem=10):
        """
        Recommend items to a user.

        The k most similar users are looked up in self.userSimBest; every
        item they have that the user does not have yet is scored with the
        sum of the similarities of the neighbours holding it (the rating
        values themselves are not used).

        user  -- id of the user to recommend for
        train -- {user: {item: rating}}; defaults to self.traindata
        k     -- number of nearest neighbours to consider
        nitem -- maximum number of items returned

        Returns a dict {item: score} with the nitem best-scoring items; it
        is empty when the user has no neighbours.  userSimilarityBest must
        have been called before.
        """
        train = self.traindata if train is None else train
        rank = {}
        interacted_items = train.get(user, {})
        # A user who shares no item with anybody is missing from
        # userSimBest; treat that as "no neighbours" instead of failing.
        similar = self.userSimBest.get(user, {})
        neighbours = sorted(similar.items(), key=lambda x: x[1], reverse=True)[0:k]
        for v, wuv in neighbours:
            for i in train.get(v, {}):
                if i in interacted_items:
                    continue
                rank[i] = rank.get(i, 0) + wuv
        best = sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:nitem]
        return dict(best)

    def recallAndPrecision(self, train=None, test=None, k=8, nitem=10):
        """
        Evaluate the recommendations against the test set.

        For every user of the training set the recommended items are
        compared with that user's test items.  recall = hits / total number
        of test items of those users; precision = hits / (nitem * number of
        training users).

        Returns the tuple (recall, precision); a ratio whose denominator is
        zero is reported as 0.0.
        """
        train = self.traindata if train is None else train
        test = self.testdata if test is None else test
        hit = 0
        relevant = 0
        recommended = 0
        for user in train:
            tu = test.get(user, {})
            rank = self.recommend(user, train=train, k=k, nitem=nitem)
            hit += sum(1 for item in rank if item in tu)
            relevant += len(tu)
            recommended += nitem
        recall = hit / float(relevant) if relevant else 0.0
        precision = hit / float(recommended) if recommended else 0.0
        return (recall, precision)

    def coverage(self, train=None, test=None, k=8, nitem=10):
        """
        Return the share of training items that get recommended to at least
        one training user (0.0 when there are no training items).

        test is accepted for signature compatibility and is not used.
        """
        train = self.traindata if train is None else train
        recommend_items = set()
        all_items = set()
        for user in train:
            all_items.update(train[user])
            recommend_items.update(self.recommend(user, train, k=k, nitem=nitem))
        if not all_items:
            return 0.0
        return len(recommend_items) / float(len(all_items))

    def popularity(self, train=None, test=None, k=8, nitem=10):
        """
        Return the mean of log(1 + popularity) over all recommended items,
        where the popularity of an item is the number of training users
        having it (0.0 when nothing is recommended).

        test is accepted for signature compatibility and is not used.
        """
        train = self.traindata if train is None else train
        item_popularity = {}
        for items in train.values():
            for item in items:
                item_popularity[item] = item_popularity.get(item, 0) + 1
        ret = 0
        n = 0
        for user in train:
            rank = self.recommend(user, train, k=k, nitem=nitem)
            for item in rank:
                ret += math.log(1 + item_popularity[item])
                n += 1
        if not n:
            return 0.0
        return ret / float(n)
    
#Test recommendation algorithm
def testRecommend():
    """
    Demo: print the recommendations for user "344".

    Loads 'ratings.data', reads it again and re-splits it with k=4 and
    seed=100, builds the similarity table and asks for recommendations based
    on the 3 most similar users.
    """
    target_user = "344"

    model = UserBasedCF('ratings.data')
    model.readData()
    model.splitData(4, 100)
    model.userSimilarityBest()

    recommendations = model.recommend(target_user, k=3)
    print(u'Recommend 10 movies to users with id 344:\n')
    pprint.pprint(recommendations)


def testUserBasedCF():
    """
    Evaluate the recommender for several neighbourhood sizes K.

    For each K the precision, recall, coverage and popularity are printed as
    a table row and appended as one space-separated line to
    'result_ubcf.data'.
    """
    cf = UserBasedCF('ratings.data')
    cf.userSimilarityBest()
    # Mode 'w' truncates results of a previous run; the with-block makes sure
    # the file is flushed and closed, also when an evaluation step raises.
    with open('result_ubcf.data', 'w') as result:
        print(u'The indexes (precision, recall, coverage, popularity) of the recommendation algorithm under different K values\n')
        print("%3s%20s%20s%20s%20s" % ('K', 'precision', 'recall', 'coverage', 'popularity'))
        for k in [5, 10, 20, 40, 80, 160]:
            # recallAndPrecision returns the pair in (recall, precision) order.
            recall, precision = cf.recallAndPrecision(k=k)
            coverage = cf.coverage(k=k)
            popularity = cf.popularity(k=k)
            print("%3d%19.2f%%%19.2f%%%19.2f%%%20.6f" % (k, precision * 100, recall * 100, coverage * 100, popularity))
            result.write("%d %2.2f %2.2f %2.2f %2.6f\n" % (k, precision * 100, recall * 100, coverage * 100, popularity))
if __name__ == "__main__":
    # Script entry point: run the single-user demo first, then the
    # evaluation over several K values.
    for entry_point in (testRecommend, testUserBasedCF):
        entry_point()