In [1]:
import math
import numpy as np
import sklearn.metrics
import sklearn.metrics.pairwise

In [2]:
# Paths of the five pre-split folds: u1..u5, each with a ".base" file
# (used for training) and a ".test" file (used for evaluation).
train_files = ["../ml-100k/u{}.base".format(i) for i in range(1, 6)]
test_files = ["../ml-100k/u{}.test".format(i) for i in range(1, 6)]

# Layout of one record in the data files:
#     data[i] -> user id | item id | rating | timestamp

# Size of the data set:
#     -> 943 users
#     -> 1682 movies
numUsers, numMovies = 943, 1682

### 1. No bias removal, Pearson metric

In [3]:
errors = []

for t in range(5):
    
    print 'Fold', t + 1,
    
    """ Load Train Data, fold = t """
    with open(train_files[t], 'r') as fp:
        trainData = fp.readlines()
        trainData = map(lambda x : x.strip('\r\n'), trainData)
        trainData = map(lambda x : map(int, x.split('\t')[:3]), trainData)

    """ Prepare rating matrix """
    ratingMatrix = np.zeros((numUsers, numMovies))
    for u, m, r in trainData:
        ratingMatrix[u - 1][m - 1] = r

    """ Compute similarity matrix """
    userSimMatrix = sklearn.metrics.pairwise.pairwise_distances(ratingMatrix, metric = 'correlation')

    """ Load Test Data, fold = t """
    with open(test_files[t], 'r') as fp:
        testData = fp.readlines()
        testData = map(lambda x : x.strip('\r\n'), testData)
        testData = map(lambda x : map(int, x.split('\t')[:3]), testData)

        
    """ Test Model """

    true = []
    pred = []
    unable = 0

    for u, m, trueRating in testData:
        
        """ Get common raters """
        raters = np.argwhere(ratingMatrix[:, m - 1] != 0)
        if raters.shape[0] == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue
            
        """ Make rating prediction """
        num = userSimMatrix[u - 1, raters].T.dot(ratingMatrix[raters, m - 1]).flatten()[0]
        den = userSimMatrix[u - 1, raters].sum()
        predRating = num / den

        true.append(trueRating)
        pred.append(predRating)
    
    """ Compute Error """
    MSE = sklearn.metrics.mean_squared_error(true, pred)
    MAE = sklearn.metrics.mean_absolute_error(true, pred)
    print 'MSE:', MSE,
    print 'MAE:', MAE,
    print 'NMAE:', MAE / 5.
    print 'Unable to predict', unable, '\n'

    errors.append([MSE, MAE])
    
    del ratingMatrix
    del userSimMatrix

errors = np.array(errors)
print 'MSE across 5 folds:', errors[:, 0].mean()
print 'MAE across 5 folds:', errors[:, 1].mean()
print 'NMAE across 5 folds :', errors[:, 1].mean() / 5.


Fold 1 MSE: 1.07043669626 MAE: 0.828916372426 NMAE: 0.165783274485
Unable to predict 32 

Fold 2 MSE: 1.064463043 MAE: 0.822155046072 NMAE: 0.164431009214
Unable to predict 36 

Fold 3 MSE: 1.04290987643 MAE: 0.813339184809 NMAE: 0.162667836962
Unable to predict 36 

Fold 4 MSE: 1.03777222428 MAE: 0.81290298727 NMAE: 0.162580597454
Unable to predict 27 

Fold 5 MSE: 1.04859689394 MAE: 0.817329227969 NMAE: 0.163465845594
Unable to predict 36 

MSE across 5 folds: 1.05283574678
MAE across 5 folds: 0.818928563709
NMAE across 5 folds : 0.163785712742


### 2. No bias removal, cosine metric

In [4]:
errors = []

for t in range(5):
    
    print 'Fold', t + 1,
    
    """ Load Train Data, fold = t """
    with open(train_files[t], 'r') as fp:
        trainData = fp.readlines()
        trainData = map(lambda x : x.strip('\r\n'), trainData)
        trainData = map(lambda x : map(int, x.split('\t')[:3]), trainData)

    """ Prepare rating matrix """
    ratingMatrix = np.zeros((numUsers, numMovies))
    for u, m, r in trainData:
        ratingMatrix[u - 1][m - 1] = r

    """ Compute similarity matrix """
    userSimMatrix = sklearn.metrics.pairwise.pairwise_distances(ratingMatrix, metric = 'cosine')

    """ Load Test Data, fold = t """
    with open(test_files[t], 'r') as fp:
        testData = fp.readlines()
        testData = map(lambda x : x.strip('\r\n'), testData)
        testData = map(lambda x : map(int, x.split('\t')[:3]), testData)

        
    """ Test Model """

    true = []
    pred = []
    unable = 0

    for u, m, trueRating in testData:
        
        """ Get common raters """
        raters = np.argwhere(ratingMatrix[:, m - 1] != 0)
        if raters.shape[0] == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue
            
        """ Make rating prediction """
        num = userSimMatrix[u - 1, raters].T.dot(ratingMatrix[raters, m - 1]).flatten()[0]
        den = userSimMatrix[u - 1, raters].sum()
        predRating = num / den

        true.append(trueRating)
        pred.append(predRating)
    
    """ Compute Error """
    MSE = sklearn.metrics.mean_squared_error(true, pred)
    MAE = sklearn.metrics.mean_absolute_error(true, pred)
    print 'MSE :', MSE,
    print 'MAE :', MAE,
    print 'NMAE :', MAE / 5.
    print 'Unable to predict', unable
    print '\n'
    
    errors.append([MSE, MAE])
    
    del ratingMatrix
    del userSimMatrix

errors = np.array(errors)
print 'MSE across 5 folds :', errors[:, 0].mean()
print 'MAE across 5 folds :', errors[:, 1].mean()
print 'NMAE across 5 folds :', errors[:, 1].mean() / 5.


Fold 1 MSE : 1.07068704412 MAE : 0.828937943624 NMAE : 0.165787588725
Unable to predict 32


Fold 2 MSE : 1.06463450735 MAE : 0.822128649795 NMAE : 0.164425729959
Unable to predict 36


Fold 3 MSE : 1.04278173253 MAE : 0.813268520439 NMAE : 0.162653704088
Unable to predict 36


Fold 4 MSE : 1.03788066449 MAE : 0.812913368633 NMAE : 0.162582673727
Unable to predict 27


Fold 5 MSE : 1.04873436228 MAE : 0.817361760036 NMAE : 0.163472352007
Unable to predict 36


MSE across 5 folds : 1.05294366215
MAE across 5 folds : 0.818922048505
NMAE across 5 folds : 0.163784409701


### 3. With bias removal, Pearson metric

In [5]:
errors = []

for t in range(5):
    
    print 'Fold', t + 1,
    
    """ Load Train Data, fold = t """
    with open(train_files[t], 'r') as fp:
        trainData = fp.readlines()
        trainData = map(lambda x : x.strip('\r\n'), trainData)
        trainData = map(lambda x : map(int, x.split('\t')[:3]), trainData)

    """ Prepare rating matrix """
    ratingMatrix = np.zeros((numUsers, numMovies))
    for u, m, r in trainData:
        ratingMatrix[u - 1][m - 1] = r

    """ Compute similarity matrix """
    userSimMatrix = sklearn.metrics.pairwise.pairwise_distances(ratingMatrix, metric = 'correlation')

    """ Compute mean matrix """
    meanMatrix = ratingMatrix.sum(axis = 1) / (ratingMatrix != 0).sum(axis = 1)
    meanMatrix = np.nan_to_num(meanMatrix)
    
    """ Load Test Data, fold = t """
    with open(test_files[t], 'r') as fp:
        testData = fp.readlines()
        testData = map(lambda x : x.strip('\r\n'), testData)
        testData = map(lambda x : map(int, x.split('\t')[:3]), testData)

        
    """ Test Model """

    true = []
    pred = []
    unable = 0

    for u, m, trueRating in testData:
        
        """ Get common raters """
        raters = np.argwhere(ratingMatrix[:, m - 1] != 0)
        if raters.shape[0] == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue
            
        """ Make rating prediction """
        num = userSimMatrix[u - 1, raters].T.dot(ratingMatrix[raters, m - 1] - meanMatrix[raters]).flatten()[0]
        den = userSimMatrix[u - 1, raters].sum()
        predRating = meanMatrix[u - 1] + num / den

        true.append(trueRating)
        pred.append(predRating)
    
    """ Compute Error """
    MSE = sklearn.metrics.mean_squared_error(true, pred)
    MAE = sklearn.metrics.mean_absolute_error(true, pred)
    print 'MSE :', MSE,
    print 'MAE :', MAE,
    print 'NMAE :', MAE / 5.
    print 'Unable to predict', unable, '\n'
    
    errors.append([MSE, MAE])
    
    del ratingMatrix
    del userSimMatrix

errors = np.array(errors)
print 'MSE across 5 folds :', errors[:, 0].mean()
print 'MAE across 5 folds :', errors[:, 1].mean()
print 'NMAE across 5 folds :', errors[:, 1].mean() / 5.


Fold 1 MSE : 0.958949254942 MAE : 0.772556301997 NMAE : 0.154511260399
Unable to predict 32 

Fold 2 MSE : 0.941550419681 MAE : 0.763390265381 NMAE : 0.152678053076
Unable to predict 36 

Fold 3 MSE : 0.925356138732 MAE : 0.757617776582 NMAE : 0.151523555316
Unable to predict 36 

Fold 4 MSE : 0.920122425236 MAE : 0.75616612709 NMAE : 0.151233225418
Unable to predict 27 

Fold 5 MSE : 0.919883207201 MAE : 0.75994893923 NMAE : 0.151989787846
Unable to predict 36 

MSE across 5 folds : 0.933172289158
MAE across 5 folds : 0.761935882056
NMAE across 5 folds : 0.152387176411


### 4. With bias removal, cosine metric

In [6]:
errors = []

for t in range(5):
    
    print 'Fold', t + 1,
    
    """ Load Train Data, fold = t """
    with open(train_files[t], 'r') as fp:
        trainData = fp.readlines()
        trainData = map(lambda x : x.strip('\r\n'), trainData)
        trainData = map(lambda x : map(int, x.split('\t')[:3]), trainData)

    """ Prepare rating matrix """
    ratingMatrix = np.zeros((numUsers, numMovies))
    for u, m, r in trainData:
        ratingMatrix[u - 1][m - 1] = r

    """ Compute similarity matrix """
    userSimMatrix = sklearn.metrics.pairwise.pairwise_distances(ratingMatrix, metric = 'cosine')

    """ Compute mean matrix """
    meanMatrix = ratingMatrix.sum(axis = 1) / (ratingMatrix != 0).sum(axis = 1)
    meanMatrix = np.nan_to_num(meanMatrix)
    
    """ Load Test Data, fold = t """
    with open(test_files[t], 'r') as fp:
        testData = fp.readlines()
        testData = map(lambda x : x.strip('\r\n'), testData)
        testData = map(lambda x : map(int, x.split('\t')[:3]), testData)

        
    """ Test Model """

    true = []
    pred = []
    unable = 0

    for u, m, trueRating in testData:
        
        """ Get common raters """
        raters = np.argwhere(ratingMatrix[:, m - 1] != 0)
        if raters.shape[0] == 0:
            unable += 1
            true.append(trueRating)
            pred.append(3.)
            continue
            
        """ Make rating prediction """
        num = userSimMatrix[u - 1, raters].T.dot(ratingMatrix[raters, m - 1] - meanMatrix[raters]).flatten()[0]
        den = userSimMatrix[u - 1, raters].sum()
        predRating = meanMatrix[u - 1] + num / den

        true.append(trueRating)
        pred.append(predRating)
    
    """ Compute Error """
    MSE = sklearn.metrics.mean_squared_error(true, pred)
    MAE = sklearn.metrics.mean_absolute_error(true, pred)
    print 'MSE :', MSE,
    print 'MAE :', MAE,
    print 'NMAE :', MAE / 5.
    print 'Unable to predict', unable, '\n'

    errors.append([MSE, MAE])
    
    del ratingMatrix
    del userSimMatrix

errors = np.array(errors)
print 'MSE across 5 folds :', errors[:, 0].mean()
print 'MAE across 5 folds :', errors[:, 1].mean()
print 'NMAE across 5 folds :', errors[:, 1].mean() / 5.


Fold 1 MSE : 0.959177915422 MAE : 0.772697394507 NMAE : 0.154539478901
Unable to predict 32 

Fold 2 MSE : 0.941718631157 MAE : 0.763558712141 NMAE : 0.152711742428
Unable to predict 36 

Fold 3 MSE : 0.925389706631 MAE : 0.757718806207 NMAE : 0.151543761241
Unable to predict 36 

Fold 4 MSE : 0.920239716968 MAE : 0.756305125376 NMAE : 0.151261025075
Unable to predict 27 

Fold 5 MSE : 0.920004297656 MAE : 0.760097512853 NMAE : 0.152019502571
Unable to predict 36 

MSE across 5 folds : 0.933306053567
MAE across 5 folds : 0.762075510217
NMAE across 5 folds : 0.152415102043
