In [1]:
#!/usr/bin/env python
import pandas as pd
from math import sqrt
import numpy as np
from collections import defaultdict


class RecommenderSystem(object):
    """Neighbourhood-based collaborative filter evaluated by cross-validation.

    The rating file is read with ``pandas.read_csv``; the first three columns
    are used as (user id, item id, rating).  With ``filtering_type == 1`` the
    similarity matrix is built between users, with any other value it is
    built between items.  Ids on the similarity axis are used directly as
    matrix indices, so they must be non-negative integers.

    After :meth:`data2dict` has run, ``general_mae`` maps every neighbourhood
    size in ``K_NEIGHBORS`` to the mean absolute error over all folds.
    """

    # Number of cross-validation folds.
    FOLD = 20
    # Neighbourhood sizes evaluated by calculate_error.
    K_NEIGHBORS = (5, 10, 20, 30, 40)
    # Prediction used when no neighbour contributes a positive weighted rating
    # (presumably the midpoint of a 1-5 rating scale -- confirm for other data).
    DEFAULT_RATING = 3

    # Class-level defaults kept for backward compatibility.  The mutable ones
    # are replaced by per-instance objects in __init__ so that instances do
    # not share state.
    file_name = None
    sep = '\t'
    column_names = []
    size = 0
    user_len = 0
    item_len = 0
    matrix_size = 0
    file_size = 0
    u2u_corr = None
    general_mae = {}

    def __init__(self, file_name, sep, column_names, filtering_type=1):
        """Store the data source description.

        :param file_name: path or file-like object accepted by ``read_csv``.
        :param sep: column separator of the rating file.
        :param column_names: list of column names for the rating file.
        :param filtering_type: 1 for user-based, anything else for item-based.
        """
        self.file_name = file_name
        self.sep = sep
        self.column_names = column_names
        self.filtering_type = filtering_type
        # Fresh containers per instance (the class-level dict would be shared).
        self.general_mae = {}
        self.dic = defaultdict(dict)

    def get_data(self):
        """Read the rating file and return its rows in random order.

        Also records the number of rows in ``file_size``.
        """
        assert isinstance(self.column_names, list)
        df = pd.read_csv(self.file_name, sep=self.sep, names=self.column_names)
        # len(df) is the row count; df.size would be rows * columns.
        self.file_size = len(df)
        return df.reindex(np.random.permutation(df.index))

    def data2dict(self):
        """Run the cross-validation and store the averaged MAE per k.

        For every fold the training rows are turned into a nested dict
        (``self.dic``), the similarity matrix is rebuilt and the fold is
        scored.  ``general_mae`` ends up holding the row-weighted average of
        the per-fold errors.
        """
        data = self.get_data()
        if not self.file_size:
            return

        if self.filtering_type == 1:
            key_col, value_col = 0, 1
        else:
            key_col, value_col = 1, 0

        weighted_error = defaultdict(float)
        evaluated = 0
        n_folds = max(1, min(self.FOLD, self.file_size))
        for positions in np.array_split(np.arange(self.file_size), n_folds):
            test_data = data.iloc[positions]
            train_data = data.drop(test_data.index)

            self.dic = defaultdict(dict)
            for row in train_data.values:
                # The outer key doubles as a matrix index, hence the int cast.
                self.dic[int(row[key_col])][row[value_col]] = row[2]

            self.matrix_size = len(self.dic)
            if self.filtering_type == 1:
                self.user_len = self.matrix_size
            else:
                self.item_len = self.matrix_size

            self.create_matrix(self.dic)
            self.nan2zero()

            fold_mae = self.calculate_error(test_data)
            fold_len = len(test_data)
            for k, error in fold_mae.items():
                weighted_error[k] += error * fold_len
            evaluated += fold_len

        if evaluated:
            self.general_mae = dict(
                (k, total / evaluated) for k, total in weighted_error.items()
            )

    def create_matrix(self, item_list):
        """Fill ``u2u_corr`` with pairwise Pearson correlations.

        :param item_list: dict mapping an integer id to a ``{other_id: rating}``
            dict.  The matrix is sized ``max(id) + 1`` so ids index it directly.
        """
        assert isinstance(item_list, dict)
        keys = list(item_list)
        size = int(max(keys)) + 1 if keys else 0
        self.u2u_corr = np.zeros([size, size])

        # The matrix is symmetric, so every unordered pair is visited once.
        for position, first in enumerate(keys):
            first_ratings = item_list[first]
            for second in keys[position + 1:]:
                second_ratings = item_list[second]
                common = list(set(first_ratings).intersection(second_ratings))
                if not common:
                    continue
                corr_value = self.pearson_corr(
                    [first_ratings[c] for c in common],
                    [second_ratings[c] for c in common],
                )
                self.u2u_corr[first, second] = corr_value
                self.u2u_corr[second, first] = corr_value

    def nan2zero(self):
        """Replace every NaN in ``u2u_corr`` with 0 in place."""
        self.u2u_corr[np.isnan(self.u2u_corr)] = 0

    def calculate_error(self, test_data):
        """Compute the mean absolute error of one test set for every k.

        :param test_data: DataFrame whose first three columns are
            (user id, item id, rating).
        :returns: dict mapping each k in ``K_NEIGHBORS`` to its MAE; empty when
            ``test_data`` has no rows.  The result is also merged into
            ``general_mae``.
        """
        rows = test_data.values
        if not len(rows):
            return {}

        if self.filtering_type == 1:
            key_col, value_col = 0, 1
        else:
            # Item-based: similarity rows are items, neighbours' ratings are
            # looked up by user id.
            key_col, value_col = 1, 0

        # One accumulator per k so the reported errors are independent.
        abs_error = dict((k, 0.0) for k in self.K_NEIGHBORS)
        for row in rows:
            rating = row[2]
            key = int(row[key_col])
            value = row[value_col]
            for k in self.K_NEIGHBORS:
                abs_error[k] += abs(rating - self.predict(key, value, rating, k))

        fold_mae = dict((k, total / len(rows)) for k, total in abs_error.items())
        self.general_mae.update(fold_mae)
        return fold_mae

    def predict(self, key, value, rating, k):
        """Predict the rating of ``value`` for ``key`` from ``k`` neighbours.

        :param key: id on the similarity axis (user id when user-based).
        :param value: id on the other axis (item id when user-based).
        :param rating: unused; kept for interface compatibility.
        :param k: number of most-correlated neighbours to consult.
        :returns: mean of the positive ``rating * correlation`` products, or
            ``DEFAULT_RATING`` when there is none.
        """
        # NOTE(review): the result is not divided by the sum of the weights,
        # so it is not a normalised weighted average -- confirm this is wanted.
        weighted_value = []
        if value:
            for neighbour, corr in self.get_max_val(key, k):
                # Neighbours missing from the training data contribute 0.
                neighbour_rating = self.dic.get(neighbour, {}).get(value, 0)
                weighted_value.append(neighbour_rating * corr)
        mean = self.mean(weighted_value)
        if mean:
            return mean
        return self.DEFAULT_RATING

    def get_max_val(self, item, max_len):
        """Return the ``max_len`` most correlated neighbours of ``item``.

        :returns: list of ``(neighbour_id, correlation)`` tuples in ascending
            order of correlation; empty when ``item`` has no matrix row.
        """
        if self.u2u_corr is None or max_len <= 0:
            return []
        if not 0 <= item < len(self.u2u_corr):
            return []
        row_item = self.u2u_corr[item]
        # Matrix columns are indexed by id, so the positions are the ids.
        top_positions = row_item.argsort()[-max_len:]
        return [(int(pos), row_item[pos]) for pos in top_positions]

    @staticmethod
    def mean(item_list):
        """Return the average of the strictly positive entries, 0 if none."""
        assert isinstance(item_list, list)
        positives = [x for x in item_list if x > 0]
        if not positives:
            return 0
        return sum(positives) / float(len(positives))

    @staticmethod
    def pearson_corr(a, b):
        """Return the Pearson correlation of two equally long sequences.

        Returns 0.0 when either sequence has zero variance, where the
        coefficient is undefined.
        """
        assert len(a) == len(b)
        n = len(a)
        assert n > 0
        avg_a = float(sum(a)) / n
        avg_b = float(sum(b)) / n
        diffprod = adiff2 = bdiff2 = 0.0
        for x, y in zip(a, b):
            adiff = x - avg_a
            bdiff = y - avg_b
            diffprod += adiff * bdiff
            adiff2 += adiff * adiff
            bdiff2 += bdiff * bdiff
        denominator = sqrt(adiff2 * bdiff2)
        if not denominator:
            return 0.0
        return diffprod / denominator
