In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import matplotlib
import pandas
import numpy
print (matplotlib.__version__)
print (pandas.__version__)
print (numpy.__version__)

1.5.3
0.18.1
1.11.1


In [3]:
#Reading ratings file:
u1 = pd.read_csv('/home/serge/Desktop/Kanye/Hubert/Memoir/data/data.csv')
u1

Unnamed: 0,UserId,JobId,Rating
0,1513,30,2
1,531,134,5
2,170,140,5
3,913,22,3
4,2007,86,3
5,702,91,3
6,1704,16,5
7,1575,84,2
8,506,44,4
9,1375,76,2


In [4]:
#from sklearn.cross_validation import train_test_split
from sklearn import cross_validation as cv
u1.base, u1.test = cv.train_test_split(u1,test_size=0.30, random_state=0)


In [5]:
from __future__ import division
from math import sqrt
import time


class UserBasedCF:

    def __init__(self, train_file, test_file):
        self.train_file = train_file
        self.test_file = test_file
        self.readData()
        self.userSimilarity()

    def readData(self):
        self.train, self.test = {}, {}
        for line in open(self.train_file):
            user, item, score, _ = line.strip().split("\t")
            self.train.setdefault(user, {})
            self.train[user][item] = int(score)
        for line in open(self.test_file):
            user, item, score, _ = line.strip().split("\t")
            self.test.setdefault(user, {})
            self.test[user][item] = int(score)

    def userSimilarity(self):
        # Build invese table for item_users
        self.item_users = {}
        for u, items in self.train.iteritems():
            for i in items.keys():
                if i not in self.item_users:
                    self.item_users[i] = set()
                self.item_users[i].add(u)

        C, N = {}, {}
        for i, users in self.item_users.iteritems():
            for u in users:
                N.setdefault(u, 0)
                N[u] += 1
                C.setdefault(u, {})
                for v in users:
                    if u == v:
                        continue
                    C[u].setdefault(v, 0)
                    C[u][v] += 1

        self.W = {}
        # Calculate finial similarity matrix W
        for u, related_users in C.iteritems():
            self.W.setdefault(u, {})
            for v, cuv in related_users.iteritems():
                self.W[u][v] = cuv / sqrt(N[u] * N[v])

    def recommend(self, user, K=3, N=10):
        rank = {}
        interacted_items = self.train[user]
        for v, wuv in sorted(self.W[user].items(),
                             key=lambda x: x[1], reverse=True)[0:K]:
            for i, rvi in self.train[v].items():
                if i in interacted_items:
                    # filter items user interacted before
                    continue
                rank.setdefault(i, 0)
                rank[i] += wuv * rvi

        return dict(sorted(rank.items(),
                           key=lambda x: x[1], reverse=True)[0:N])

    def evaluate(self, train=None, test=None, K=3, N=10):
        train = self.train
        test = self.test
        hit, recall, precision = 0, 0, 0
        for user in train.keys():
            tu = test.get(user, {})
            rank = self.recommend(user, K=K, N=N)
            for i, _ in rank.items():
                if i in tu:
                    hit += 1
            recall += len(tu)
            precision += N
        recall = hit / recall
        precision = hit / precision
        f = 2 * recall * precision / (precision + recall)
        return (recall, precision, f)


class ItemBasedCF:

    def __init__(self, train_file, test_file):
        self.train_file = train_file
        self.test_file = test_file
        self.readData()
        self.itemSimilarity()

    def readData(self):
        self.train, self.test = {}, {}
        for line in open(self.train_file):
            user, item, score, _ = line.strip().split("\t")
            self.train.setdefault(user, {})
            self.train[user][item] = int(score)
        for line in open(self.test_file):
            user, item, score, _ = line.strip().split("\t")
            self.test.setdefault(user, {})
            self.test[user][item] = int(score)

    def itemSimilarity(self):
        C = dict()  
        N = dict()
        for user, items in self.train.items():
            for i in items.keys():
                N.setdefault(i, 0)
                N[i] += 1
                C.setdefault(i, {})
                for j in items.keys():
                    if i == j:
                        continue
                    C[i].setdefault(j, 0)
                    C[i][j] += 1
        self.W = {}
        for i, related_items in C.items():
            self.W.setdefault(i, {})
            for j, cij in related_items.items():
                self.W[i][j] = cij / (sqrt(N[i] * N[j]))
        return self.W

    def recommend(self, user, K=3, N=10):
        rank = dict()
        interacted_items = self.train[user]
        for item, score in interacted_items.iteritems():
            for j, wj in sorted(self.W[item].iteritems(),
                                key=lambda x: x[1], reverse=True)[0:K]:
                if j in interacted_items.keys():
                    continue
                rank.setdefault(j, 0)
                rank[j] += score * wj
        return dict(sorted(rank.items(),
                           key=lambda x: x[1], reverse=True)[0:N])

    def evaluate(self, train=None, test=None, K=3, N=10):
        train = self.train
        test = self.test
        hit, recall, precision = 0, 0, 0
        for user in train.keys():
            tu = test.get(user, {})
            rank = self.recommend(user, K=K, N=N)
            for i, _ in rank.items():
                if i in tu:
                    hit += 1
            recall += len(tu)
            precision += N
        recall = hit / recall
        precision = hit / precision
        f = 2 * recall * precision / (precision + recall)
        return (recall, precision, f)

if __name__ == '__main__':
    start_time = time.time()

    # choose which method to use
    # model = UserBasedCF('u1.base', 'u1.test')
    model = ItemBasedCF('u1.base', 'u1.test')

    # The performance of model under various K
    klst = [5, 10, 15, 20, 25]
    print("recall", "precision", "f1")
    for k in klst:
        print model.evaluate(train='u1.base', test='u1.test', K=k)
    print("--- %s seconds ---" % (time.time() - start_time))

IOError: [Errno 2] No such file or directory: 'u1.base'