In [2]:
#!/usr/bin/env python
import pandas as pd
from math import sqrt
import numpy as np
from collections import defaultdict

import matplotlib.pyplot as plt
import random
class RecommenderSystem(object):

    self.FOLD = 20
    self.file_name = None
    self.sep = '\t'
    self.column_names = []
    self.size = 0
    self.user_len = 0
    self.item_len = 0
    self.matrix_size = 0
    self.general_mae = {}

    def __init__(self, file_name, sep, column_names):
        self.file_name = file_name
        self.sep = sep
        self.column_names = column_names
        self.filtering_type = 1

    def get_data(self): 
        assert isinstance(self.column_names, list)
        df = pd.read_csv(self.file_name, sep=self.sep, names=self.column_names)
        self.file_size = df.size
        return df.reindex(np.random.permutation(df.index))

    def data2dict():
        data = self.get_data()
        split_size = self.size / self.FOLD
        for dataset in xrange(0, size, split_size):
            self.dic = defaultdict(dict)
            train_data, test_data = data[dataset: dataset+split_size], data[dataset+split_size:]
            for data in train_data.values:
                user_id = data[0]
                movie_id = data[1]
                rating = data[2]
                if self.filtering_type == 1:
                    self.dic[user_id].update({movie_id: rating})
                    self.matrix_size = self.user_len = len(dic.keys())
                else:
                    self.dic[movie_id].update({user_id: rating})
                    self.matrix_size = self.item_len = len(dic.keys())
            self.create_matrix(dic)
            self.calculate_error(test_data)

    def create_matrix(self, item_list):
        assert isinstance(item_list, dict)
        self.u2u_corr = numpy.empty([self.matrix_size, self.matrix_size])
        for item in item_list.keys():
            for item2 in item_list.keys():
                if item != item2:
                    intersect = list(set(item.keys()).intersection(item2.keys()))
                    if intersect:
                        item_rating = [rating for item[rating] in intersect]
                        item2_rating = [rating for item2[rating] in intersect]
                        if not u2u_corr[item][item2]:
                            corr_value = pearson_corr(item_rating, item2_rating)
                            self.u2u_corr[item][item2] = corr_value
                            self.u2u_corr[item2][item] = corr_value

    def calculate_error(test_data):
        mae = 0.0
        k_neighbor = [5, 10, 20, 30, 40]
        
        for item in test_data.values:
            user_id = item[0]
            movie_id = item[1]
            rating = item[2]
            for k in k_neighbor:
                mae += abs(rating - self.predict(item, rating))
                self.general_mae.update({k: mae/test_data.size})

    def predict(item, rating):
        if self.filtering_type == 1:
            # user-based filtering
            pass
        else:
            # item-based filtering
            pass

    def get_max_val(item, max_len):
        return item.flatten().sort()[-1*max_len:]

    def mean(item_list):
        assert isinstance(item_list, list)
        item_list = filter(lambda x: x>0, item_list)
        assert len(item_list) > 0
        return sum(item_list) / len(item_list)

    @staticmethod
    def pearson_corr(a,b):
         def average(x):
             assert len(x) > 0
             return float(sum(x)) / len(x)

         assert len(a) == len(b)
         n = len(a)
         assert n > 0
         avg_a = average(a)
         avg_b = average(b)
         diffprod = adiff2 = bdiff2 = 0
         for idx in range(n):
             adiff = a[idx] - avg_a
             bdiff = b[idx] - avg_b
             diffprod += xdiff * ydiff
             adiff2 += adiff * adiff
             bdiff2 += bdiff * bdiff
         import math
         return diffprod / sqrt(xdiff2 * ydiff2)




NameError: name 'self' is not defined