In [3]:
import math
import numpy as np
import sklearn.metrics
import sklearn.metrics.pairwise

In [4]:
train_files = ["../ml-100k/u" + str(i) + ".base" for i in range(1, 6)]
test_files  = ["../ml-100k/u" + str(i) + ".test" for i in range(1, 6)]

"""
    data[i] -> user id | item id | rating | timestamp
"""

"""
    -> 943 users
    -> 1682 movies
"""

numUsers = 943
numMovies = 1682

### 1. No bias removal, pearson metric

In [45]:
errors = []

for t in range(5):
    print 'Fold', t + 1,

    """ Load Train Data, fold = t """
    with open(train_files[t], 'r') as fp:
        trainData = fp.readlines()
        trainData = map(lambda x : x.strip('\r\n'), trainData)
        trainData = map(lambda x : map(int, x.split('\t')[:3]), trainData)

    """ Prepare rating matrix """
    ratingMatrix = np.zeros((numUsers, numMovies))
    for u, m, r in trainData:
        ratingMatrix[u - 1][m - 1] = r

    """ Compute similarity matrix """
    itemSimMatrix = sklearn.metrics.pairwise.pairwise_distances(ratingMatrix.T, metric = 'correlation')
    itemSimMatrix = np.nan_to_num(itemSimMatrix)

    """ Load Test Data, fold = t """
    with open(test_files[t], 'r') as fp:
        testData = fp.readlines()
        testData = map(lambda x : x.strip('\r\n'), testData)
        testData = map(lambda x : map(int, x.split('\t')[:3]), testData)


    """ Test Model """

    true = []
    pred = []
    unable = 0

    for u, m, trueRating in testData:

        """ Get common raters """
        movies = np.argwhere(ratingMatrix[u - 1, :] != 0)

        if movies.shape[0] == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue

        """ Make rating prediction """
        num = itemSimMatrix[m - 1, movies].T.dot(ratingMatrix[u - 1, movies]).flatten()[0]
        den = itemSimMatrix[m - 1, movies].sum()
        if den == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue

        predRating = num / den
        true.append(trueRating)
        pred.append(predRating)


    """ Compute Error """
    MSE = sklearn.metrics.mean_squared_error(true, pred)
    MAE = sklearn.metrics.mean_absolute_error(true, pred)
    print 'MSE:', MSE,
    print 'MAE:', MAE,
    print 'NMAE:', MAE / 5.
    print 'Unable to predict', unable, '\n'

    errors.append([MSE, MAE])

    del ratingMatrix
    del itemSimMatrix

errors = np.array(errors)
print 'MSE across 5 folds:', errors[:, 0].mean()
print 'MAE across 5 folds:', errors[:, 1].mean()
print 'NMAE across 5 folds :', errors[:, 1].mean() / 5.


Fold 1 MSE: 1.15484958213 MAE: 0.860813356437 NMAE: 0.172162671287
Unable to predict 32 

Fold 2 MSE: 1.11902441293 MAE: 0.848449749898 NMAE: 0.16968994998
Unable to predict 36 

Fold 3 MSE: 1.08692761242 MAE: 0.835797112255 NMAE: 0.167159422451
Unable to predict 36 

Fold 4 MSE: 1.0955291352 MAE: 0.840280127619 NMAE: 0.168056025524
Unable to predict 27 

Fold 5 MSE: 1.10178479954 MAE: 0.844754685189 NMAE: 0.168950937038
Unable to predict 36 

MSE across 5 folds: 1.11162310844
MAE across 5 folds: 0.84601900628
NMAE across 5 folds : 0.169203801256


### 2. No bias removal, cosine metric

In [46]:
errors = []

for t in range(5):
    print 'Fold', t + 1,

    """ Load Train Data, fold = t """
    with open(train_files[t], 'r') as fp:
        trainData = fp.readlines()
        trainData = map(lambda x : x.strip('\r\n'), trainData)
        trainData = map(lambda x : map(int, x.split('\t')[:3]), trainData)

    """ Prepare rating matrix """
    ratingMatrix = np.zeros((numUsers, numMovies))
    for u, m, r in trainData:
        ratingMatrix[u - 1][m - 1] = r

    """ Compute similarity matrix """
    itemSimMatrix = sklearn.metrics.pairwise.pairwise_distances(ratingMatrix.T, metric = 'cosine')
    itemSimMatrix = np.nan_to_num(itemSimMatrix)

    """ Load Test Data, fold = t """
    with open(test_files[t], 'r') as fp:
        testData = fp.readlines()
        testData = map(lambda x : x.strip('\r\n'), testData)
        testData = map(lambda x : map(int, x.split('\t')[:3]), testData)


    """ Test Model """

    true = []
    pred = []
    unable = 0

    for u, m, trueRating in testData:

        """ Get common raters """
        movies = np.argwhere(ratingMatrix[u - 1, :] != 0)

        if movies.shape[0] == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue

        """ Make rating prediction """
        num = itemSimMatrix[m - 1, movies].T.dot(ratingMatrix[u - 1, movies]).flatten()[0]
        den = itemSimMatrix[m - 1, movies].sum()
        if den == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue

        predRating = num / den
        true.append(trueRating)
        pred.append(predRating)


    """ Compute Error """
    MSE = sklearn.metrics.mean_squared_error(true, pred)
    MAE = sklearn.metrics.mean_absolute_error(true, pred)
    print 'MSE:', MSE,
    print 'MAE:', MAE,
    print 'NMAE:', MAE / 5.
    print 'Unable to predict', unable, '\n'

    errors.append([MSE, MAE])

    del ratingMatrix
    del itemSimMatrix

errors = np.array(errors)
print 'MSE across 5 folds:', errors[:, 0].mean()
print 'MAE across 5 folds:', errors[:, 1].mean()
print 'NMAE across 5 folds :', errors[:, 1].mean() / 5.


Fold 1 MSE: 1.15787063596 MAE: 0.862724729324 NMAE: 0.172544945865
Unable to predict 0 

Fold 2 MSE: 1.12072462933 MAE: 0.849811367697 NMAE: 0.169962273539
Unable to predict 0 

Fold 3 MSE: 1.09112749587 MAE: 0.837959458105 NMAE: 0.167591891621
Unable to predict 0 

Fold 4 MSE: 1.0987667342 MAE: 0.842253134949 NMAE: 0.16845062699
Unable to predict 0 

Fold 5 MSE: 1.10526358739 MAE: 0.846702744688 NMAE: 0.169340548938
Unable to predict 0 

MSE across 5 folds: 1.11475061655
MAE across 5 folds: 0.847890286953
NMAE across 5 folds : 0.169578057391


### 3. With bias removal, pearson metric

In [51]:
errors = []

for t in range(5):
    print 'Fold', t + 1,

    """ Load Train Data, fold = t """
    with open(train_files[t], 'r') as fp:
        trainData = fp.readlines()
        trainData = map(lambda x : x.strip('\r\n'), trainData)
        trainData = map(lambda x : map(int, x.split('\t')[:3]), trainData)

    """ Prepare rating matrix """
    ratingMatrix = np.zeros((numUsers, numMovies))
    for u, m, r in trainData:
        ratingMatrix[u - 1][m - 1] = r

    """ Compute similarity matrix """
    itemSimMatrix = sklearn.metrics.pairwise.pairwise_distances(ratingMatrix.T, metric = 'correlation')
    itemSimMatrix = np.nan_to_num(itemSimMatrix)

    """ Compute mean matrix """
    meanMatrix = ratingMatrix.T.sum(axis = 1) / (ratingMatrix.T != 0).sum(axis = 1)
    meanMatrix = np.nan_to_num(meanMatrix)
    
    """ Load Test Data, fold = t """
    with open(test_files[t], 'r') as fp:
        testData = fp.readlines()
        testData = map(lambda x : x.strip('\r\n'), testData)
        testData = map(lambda x : map(int, x.split('\t')[:3]), testData)


    """ Test Model """

    true = []
    pred = []
    unable = 0

    for u, m, trueRating in testData:

        """ Get common raters """
        movies = np.argwhere(ratingMatrix[u - 1, :] != 0)

        if movies.shape[0] == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue

        """ Make rating prediction """
        num = itemSimMatrix[m - 1, movies].T.dot(ratingMatrix[u - 1, movies] - meanMatrix[movies]).flatten()[0]
        den = itemSimMatrix[m - 1, movies].sum()
        
        if den == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue

        predRating = meanMatrix[m - 1] + (num / den)
        true.append(trueRating)
        pred.append(predRating)


    """ Compute Error """
    MSE = sklearn.metrics.mean_squared_error(true, pred)
    MAE = sklearn.metrics.mean_absolute_error(true, pred)
    print 'MSE:', MSE,
    print 'MAE:', MAE,
    print 'NMAE:', MAE / 5.
    print 'Unable to predict', unable, '\n'

    errors.append([MSE, MAE])

    del ratingMatrix
    del itemSimMatrix

errors = np.array(errors)
print 'MSE across 5 folds:', errors[:, 0].mean()
print 'MAE across 5 folds:', errors[:, 1].mean()
print 'NMAE across 5 folds :', errors[:, 1].mean() / 5.


Fold 1



 MSE: 0.932337849667 MAE: 0.761957281758 NMAE: 0.152391456352
Unable to predict 32 

Fold 2 MSE: 0.912561243273 MAE: 0.751290451581 NMAE: 0.150258090316
Unable to predict 36 

Fold 3 MSE: 0.906050464195 MAE: 0.750111406373 NMAE: 0.150022281275
Unable to predict 36 

Fold 4 MSE: 0.902011490583 MAE: 0.74820330163 NMAE: 0.149640660326
Unable to predict 27 

Fold 5 MSE: 0.901360833777 MAE: 0.752773642065 NMAE: 0.150554728413
Unable to predict 36 

MSE across 5 folds: 0.910864376299
MAE across 5 folds: 0.752867216681
NMAE across 5 folds : 0.150573443336


### 4. With bias removal, cosine metric

In [53]:
errors = []

for t in range(5):
    print 'Fold', t + 1,

    """ Load Train Data, fold = t """
    with open(train_files[t], 'r') as fp:
        trainData = fp.readlines()
        trainData = map(lambda x : x.strip('\r\n'), trainData)
        trainData = map(lambda x : map(int, x.split('\t')[:3]), trainData)

    """ Prepare rating matrix """
    ratingMatrix = np.zeros((numUsers, numMovies))
    for u, m, r in trainData:
        ratingMatrix[u - 1][m - 1] = r

    """ Compute similarity matrix """
    itemSimMatrix = sklearn.metrics.pairwise.pairwise_distances(ratingMatrix.T, metric = 'cosine')
    itemSimMatrix = np.nan_to_num(itemSimMatrix)

    """ Compute mean matrix """
    meanMatrix = ratingMatrix.T.sum(axis = 1) / (ratingMatrix.T != 0).sum(axis = 1)
    meanMatrix = np.nan_to_num(meanMatrix)
    
    """ Load Test Data, fold = t """
    with open(test_files[t], 'r') as fp:
        testData = fp.readlines()
        testData = map(lambda x : x.strip('\r\n'), testData)
        testData = map(lambda x : map(int, x.split('\t')[:3]), testData)


    """ Test Model """

    true = []
    pred = []
    unable = 0

    for u, m, trueRating in testData:

        """ Get common raters """
        movies = np.argwhere(ratingMatrix[u - 1, :] != 0)

        if movies.shape[0] == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue

        """ Make rating prediction """
        num = itemSimMatrix[m - 1, movies].T.dot(ratingMatrix[u - 1, movies] - meanMatrix[movies]).flatten()[0]
        den = itemSimMatrix[m - 1, movies].sum()
        
        if den == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue

        predRating = meanMatrix[m - 1] + (num / den)
        true.append(trueRating)
        pred.append(predRating)


    """ Compute Error """
    MSE = sklearn.metrics.mean_squared_error(true, pred)
    MAE = sklearn.metrics.mean_absolute_error(true, pred)
    print 'MSE:', MSE,
    print 'MAE:', MAE,
    print 'NMAE:', MAE / 5.
    print 'Unable to predict', unable, '\n'

    errors.append([MSE, MAE])

    del ratingMatrix
    del itemSimMatrix

errors = np.array(errors)
print 'MSE across 5 folds:', errors[:, 0].mean()
print 'MAE across 5 folds:', errors[:, 1].mean()
print 'NMAE across 5 folds :', errors[:, 1].mean() / 5.


Fold 1



 MSE: 0.943622446023 MAE: 0.764559060694 NMAE: 0.152911812139
Unable to predict 0 

Fold 2 MSE: 0.922731602966 MAE: 0.75391849993 NMAE: 0.150783699986
Unable to predict 0 

Fold 3 MSE: 0.920637826219 MAE: 0.75331665141 NMAE: 0.150663330282
Unable to predict 0 

Fold 4 MSE: 0.911472989817 MAE: 0.750693503031 NMAE: 0.150138700606
Unable to predict 0 

Fold 5 MSE: 0.918004061 MAE: 0.756939654367 NMAE: 0.151387930873
Unable to predict 0 

MSE across 5 folds: 0.923293785205
MAE across 5 folds: 0.755885473886
NMAE across 5 folds : 0.151177094777
