In [156]:
import math

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from surprise import Dataset
from surprise import SVD
from surprise import accuracy
from surprise.reader import Reader

In [157]:
# Dimensions used for the dense (user x item) and (item x item) arrays built
# in the cells below. Presumably the user/item counts of the "ml-100k" dataset
# that load_data() reads — TODO confirm if the dataset is ever swapped.
USERS_COUNT = 943
ITEMS_COUNT = 1682
# Presumably the minimum item-item similarity for a neighbour to contribute to
# an artificial rating. NOTE(review): confirm every cell that applies a cutoff
# reads this constant rather than a leftover lowercase `threshold` variable.
THRESHOLD = 0.05

In [158]:
def load_data():
    """Load the built-in "ml-100k" ratings as a three-column DataFrame.

    Returns:
        DataFrame with integer column labels: 0 = user id and 1 = item id
        (both converted to numbers and shifted down by one), 2 = rating.
        Column 3 of the raw ratings is dropped.
    """
    raw_ratings = Dataset.load_builtin("ml-100k").raw_ratings
    frame = pd.DataFrame(raw_ratings)
    # ids are shifted down by one so they can index zero-based arrays
    for id_column in (0, 1):
        frame[id_column] = pd.to_numeric(frame[id_column]) - 1
    return frame.drop(columns=3)

In [159]:
def load_similarities(path="artificial_ratings.csv", items_count=None):
    """Read item-item similarities from a CSV into a dense square matrix.

    The CSV must have columns named '0' and '1' (1-based item ids) and '2'
    (the similarity of that pair). Pairs where both ids are equal are ignored,
    so the diagonal of the result stays 0, as do all pairs missing from the file.

    Args:
        path: CSV file to read; defaults to the file the notebook has always used.
        items_count: side length of the matrix; defaults to ITEMS_COUNT.

    Returns:
        (items_count, items_count) float ndarray indexed by zero-based item id.
    """
    if items_count is None:
        items_count = ITEMS_COUNT

    pairs = pd.read_csv(path)
    # shift to zero-based ids first, then truncate to int
    first = (pairs['0'] - 1).to_numpy().astype(int)
    second = (pairs['1'] - 1).to_numpy().astype(int)
    values = pairs['2'].to_numpy(dtype=float)

    # one vectorised assignment instead of a Python loop over every CSV row
    off_diagonal = first != second
    similarities_arr = np.zeros((items_count, items_count))
    similarities_arr[first[off_diagonal], second[off_diagonal]] = values[off_diagonal]
    return similarities_arr

In [160]:
# Shared inputs for the cells below: the ratings frame (columns 0 = user,
# 1 = item, 2 = rating) and the dense item-item similarity matrix.
data = load_data()
similarities = load_similarities()

In [161]:
def compute_artificial_ratings(ratings, similarities, threshold, users_count, items_count):
    """Predict a rating for every (user, item) pair from item-item similarity.

    For user u and item i the artificial rating is the similarity-weighted mean
    of u's known ratings, using only the rated items j whose similarity
    similarities[i, j] is not below `threshold`.

    Args:
        ratings: DataFrame with columns 0 = user id, 1 = item id, 2 = rating
            (zero-based ids).
        similarities: (items_count, items_count) array of item-item similarities.
        threshold: similarities below this value are ignored.
        users_count: number of rows of the result.
        items_count: number of columns of the result.

    Returns:
        (users_count, items_count) float array; entries for which the retained
        similarities sum to zero are set to -1.
    """
    user_ids = ratings[0].to_numpy().astype(int)
    item_ids = ratings[1].to_numpy().astype(int)

    # Dense rating matrix plus a count matrix marking which cells hold a rating.
    # np.add.at accumulates, so a repeated (user, item) pair contributes once
    # per occurrence, exactly like a row-by-row loop would.
    rating_sums = np.zeros((users_count, items_count))
    rating_counts = np.zeros((users_count, items_count))
    np.add.at(rating_sums, (user_ids, item_ids), ratings[2].to_numpy(dtype=float))
    np.add.at(rating_counts, (user_ids, item_ids), 1.0)

    kept = np.where(similarities < threshold, 0.0, similarities)
    weighted = rating_sums @ kept.T   # [u, i] = sum_j rating[u, j] * sim[i, j]
    weights = rating_counts @ kept.T  # [u, i] = sum_j sim[i, j] over items j rated by u

    artificial = np.full((users_count, items_count), -1.0)
    np.divide(weighted, weights, out=artificial, where=weights != 0)
    return artificial


# Compare three predictors at several sparsity levels: SVD fitted on the real
# ratings, SVD fitted on the artificial ratings, and the average of the two.
# result maps sparsity -> [combined RMSE, real-data RMSE, artificial-data RMSE].
# NOTE(review): neither train_test_split nor SVD is seeded, so the numbers
# change from run to run.
result = {}
for split in [0.95, 0.80, 0.60, 0.40, 0.20]:
    print("Split: %.2f" % split)
    # discard `split` of the ratings to simulate a sparser dataset
    train_set, _ = train_test_split(data, test_size=split)
    sparcity = 1 - (len(train_set) / (USERS_COUNT * ITEMS_COUNT))
    print("Sparcitiy: %.4f" % sparcity)
    # the reported sparsity is measured before this 20% hold-out is removed
    train_set, test_set = train_test_split(train_set, test_size=.20)

    train_dataset = Dataset.load_from_df(train_set, reader=Reader())

    algo = SVD()
    algo.fit(train_dataset.build_full_trainset())

    print("On Actual Train data")
    predictions = algo.test(train_set.values)
    accuracy.rmse(predictions)

    print("On Actual Test data")
    predictions = algo.test(test_set.values)
    accuracy.rmse(predictions)

    actual = [prediction.r_ui for prediction in predictions]
    est_actual = [prediction.est for prediction in predictions]

    print("Calculating artificial ratings")
    # THRESHOLD is the config constant; a lowercase `threshold` only exists if
    # another cell happened to leave it in the kernel.
    artificial = compute_artificial_ratings(
        train_set, similarities, THRESHOLD, USERS_COUNT, ITEMS_COUNT)

    # keep every (user, item) that received an artificial rating, item-major
    item_idx, user_idx = np.nonzero(artificial.T != -1)
    artificial_dataset = pd.DataFrame(
        {0: user_idx, 1: item_idx, 2: artificial[user_idx, item_idx]})
    artificial_train_dataset = Dataset.load_from_df(artificial_dataset, reader=Reader())

    algo = SVD(verbose=True)
    algo.fit(artificial_train_dataset.build_full_trainset())

    predictions = algo.test(test_set.values)
    accuracy.rmse(predictions)

    est_artificial = [x.est for x in predictions]

    # blend: plain average of the two models' estimates for each test rating
    new_est = [(real_est + artificial_est) / 2
               for real_est, artificial_est in zip(est_actual, est_artificial)]

    combined_rmse = math.sqrt(mean_squared_error(actual, new_est))
    result[sparcity] = [combined_rmse,
                        math.sqrt(mean_squared_error(actual, est_actual)),
                        math.sqrt(mean_squared_error(actual, est_artificial))]
    print("Combined RMSE: %.3f" % combined_rmse)
    print()

Split: 0.95
Sparcitiy: 0.9968
On Actual Train data
RMSE: 0.7388
On Actual Test data
RMSE: 1.0541
Calculating artificial ratings
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
RMSE: 1.1913
Combined RMSE: 1.085

Split: 0.80
Sparcitiy: 0.9874
On Actual Train data
RMSE: 0.7184
On Actual Test data
RMSE: 1.0016
Calculating artificial ratings
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processin

In [149]:
%%time
# Threshold sweep for the artificial-rating model. Thresholds tried earlier:
# 0.8, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.02, 0.01, 0.005.
# Relies on train_set, test_set, actual and est_actual left in the kernel by
# the experiment loop, and on the module-level `similarities` matrix
# (`similarities_arr` is only a local name inside load_similarities).

# The rating matrices do not depend on the threshold, so build them once.
# np.add.at accumulates, so repeated (user, item) pairs each contribute.
user_ids = train_set[0].to_numpy().astype(int)
item_ids = train_set[1].to_numpy().astype(int)
rating_sums = np.zeros((USERS_COUNT, ITEMS_COUNT))
rating_counts = np.zeros((USERS_COUNT, ITEMS_COUNT))
np.add.at(rating_sums, (user_ids, item_ids), train_set[2].to_numpy(dtype=float))
np.add.at(rating_counts, (user_ids, item_ids), 1.0)

for threshold in [0.05]:
    # similarity-weighted mean of each user's ratings, ignoring similarities
    # below the threshold; -1 marks pairs with no usable neighbour
    kept = np.where(similarities < threshold, 0.0, similarities)
    weighted = rating_sums @ kept.T   # [u, i] = sum_j rating[u, j] * sim[i, j]
    weights = rating_counts @ kept.T  # [u, i] = sum_j sim[i, j] over items j rated by u
    result = np.full((USERS_COUNT, ITEMS_COUNT), -1.0)
    np.divide(weighted, weights, out=result, where=weights != 0)

    # [user, item, artificial rating] rows in item-major order
    test = [[int(user), int(item), result[user, item]]
            for item, user in zip(*np.nonzero(result.T != -1))]
    artificial_dataset = pd.DataFrame(test)
    artificial_train_dataset = Dataset.load_from_df(artificial_dataset, reader=Reader())

    algo = SVD(verbose=True)
    algo.fit(artificial_train_dataset.build_full_trainset())

    predictions = algo.test(test_set.values)
    print("THRESHOLD: %.3f" % threshold)
    accuracy.rmse(predictions)

    est_artificial = [x.est for x in predictions]

    # blend: plain average of the two models' estimates for each test rating
    new_est = [(real_est + artificial_est) / 2
               for real_est, artificial_est in zip(est_actual, est_artificial)]
    print("Combined RMSE: %.3f" % math.sqrt(mean_squared_error(actual, new_est)))
    print()

KeyboardInterrupt: 

In [148]:
# Refit SVD on the artificial ratings (biased=True passed explicitly) and
# re-score the held-out ratings. Relies on artificial_train_dataset, test_set,
# est_actual and actual left in the kernel by earlier cells.
algo = SVD(verbose=True, biased=True)
algo.fit(artificial_train_dataset.build_full_trainset())
# algo.test is given the rows of test_set (test_set.values), not the DataFrame itself.


predictions = algo.test(test_set.values)
accuracy.rmse(predictions)

est_artificial = [x.est for x in predictions]

# Average the two models' estimates per test rating and score the blend
# against the true ratings.
new_est = [(actual + artificial) / 2 for actual, artificial in zip(est_actual, est_artificial)]
print("Combined RMSE: %.3f" % math.sqrt(mean_squared_error(actual, new_est)))
print()

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
RMSE: 1.0342
Combined RMSE: 1.015



In [110]:
# Score the current `algo` on `test`. In the visible cells `test` is the list
# of [user, item, artificial rating] rows — presumably the same rows the model
# was fitted on, so this is a training-fit figure; confirm against run order.
predictions = algo.test(test)
accuracy.rmse(predictions)

RMSE: 0.2904


0.2903710500767628

In [72]:
# Average the real-data and artificial-data estimates for each test rating and
# report the RMSE of that blend against the true ratings.
# Relies on est_actual, est_artificial and actual from other cells.
new_est = [(actual + artificial) / 2 for actual, artificial in zip(est_actual, est_artificial)]
print("Combined RMSE: %.3f" % math.sqrt(mean_squared_error(actual, new_est)))
print()

Combined RMSE: 1.090



In [416]:
# Number of entries of `result` that are exactly 5.
np.sum(result == 5)

53823

In [417]:
# Number of entries of `result` that are not -1 (-1 marks pairs with no artificial rating).
np.sum(result != -1)

1149336

In [399]:
# Persist the `result` array as comma-separated text.
np.savetxt('artificial_ratings.out', result, delimiter=',')

In [418]:
# Empty three-column frame. NOTE(review): no visible cell reads `dataframe` — likely a leftover.
dataframe = pd.DataFrame(columns=[0, 1, 2])

In [36]:
# Flatten `result` into [user, item, rating] rows (item-major order),
# leaving out the -1 placeholders.
test = [
    [user, item, result[user, item]]
    for item in range(ITEMS_COUNT)
    for user in range(USERS_COUNT)
    if result[user, item] != -1
]

In [37]:
# One row per [user, item, rating] entry of `test`; columns are labelled 0, 1, 2.
artificial_dataset = pd.DataFrame(test)

In [421]:
# Wrap the artificial ratings frame as a surprise Dataset, using a default Reader.
artificial_train_dataset = Dataset.load_from_df(artificial_dataset, reader=Reader())

In [405]:
# Estimated ratings from the current `predictions`; presumably those of the
# model fitted on artificial ratings — depends on which cell ran last, confirm.
est_artificial = [x.est for x in predictions]

In [65]:
# Estimated ratings from the current `predictions`; presumably those of the
# model fitted on real ratings — depends on which cell ran last, confirm.
est_actual = [x.est for x in predictions]

In [406]:
# Blend: element-wise average of the two estimate lists.
new_est = [(actual + artificial) / 2 for actual, artificial in zip(est_actual, est_artificial)]

In [70]:
# r_ui of each entry in the current `predictions` (the rating supplied to algo.test).
actual = [x.r_ui for x in predictions]

In [67]:
from sklearn.metrics import mean_squared_error
import math

In [407]:
# RMSE of the blended estimates against `actual`.
math.sqrt(mean_squared_error(actual, new_est))

1.052369084226506

In [93]:
# Daniel Alabi and Cody Wang
# ======================================
# SvdMatrix:
# generates matrices U and V such that
# U * V^T closely approximates
# the original matrix (in this case, the utility
# matrix M)
# =======================================


import math
import random
import time

"""
Rating class. 
Store every rating associated with a particular
userid and movieid.
================Optimization======================
"""
class Rating:
    def __init__(self, userid, movieid, rating):
        # to accomodate zero-indexing for matrices
        self.uid = userid-1 
        self.mid = movieid-1

        self.rat = rating


class SvdMatrix:
    """
    trainfile -> name of file to train data against
    nusers -> number of users in dataset
    nmovies -> number of movies in dataset
    r -> rank of approximation (for U and V)
    lrate -> learning rate
    regularizer -> regularizer
    typefile -> 0 if for smaller MovieLens dataset
                1 if for medium or larger MovieLens dataset
    """
    def __init__(self, trainfile, nusers, nmovies, r=100, lrate=0.005, regularizer=0.1, typefile=0):
        self.trainrats = []
        self.testrats = []
                
        self.nusers = nusers
        self.nmovies = nmovies

        if typefile == 0:
            self.readtrainsmaller(trainfile)
        elif typefile == 1:
            self.readtrainlarger(trainfile)

        # get average rating
        avg = self.averagerating()
        # set initial values in U, V using square root
        # of average/rank
        initval = math.sqrt(avg/r)
        
        # U matrix
        self.U = [[initval]*r for i in range(nusers)]
        # V matrix -- easier to store and compute than V^T
        self.V = [[initval]*r for i in range(nmovies)]

        self.r = r
        self.lrate = lrate
        self.regularizer = regularizer
        self.minimprov = 0.001
        self.maxepochs = 5            

    """
    Returns the dot product of v1 and v2
    """
    def dotproduct(self, v1, v2):
        return sum([v1[i]*v2[i] for i in range(len(v1))])

    """
    Returns the estimated rating corresponding to userid for movieid
    Ensures returns rating is in range [1,5]
    """
    def calcrating(self, uid, mid):
        p = self.dotproduct(self.U[uid], self.V[mid])
        if p > 5:
            p = 5
        elif p < 1:
            p = 1
        return p

    """
    Returns the average rating of the entire dataset
    """
    def averagerating(self):
        avg = 0
        n = 0
        for i in range(len(self.trainrats)):
            avg += self.trainrats[i].rat
            n += 1
        return float(avg/n)

    """
    Predicts the estimated rating for user with id i
    for movie with id j
    """
    def predict(self, i, j):
        return self.calcrating(i, j)

    """
    Trains the kth column in U and the kth row in
    V^T
    See docs for more details.
    """
    def train(self, k):
        sse = 0.0
        n = 0
        for i in range(len(self.trainrats)):
            # get current rating
            crating = self.trainrats[i]
            err = crating.rat - self.predict(crating.uid, crating.mid)
            sse += err**2
            n += 1

            uTemp = self.U[crating.uid][k]
            vTemp = self.V[crating.mid][k]

            self.U[crating.uid][k] += self.lrate * (err*vTemp - self.regularizer*uTemp)
            self.V[crating.mid][k] += self.lrate * (err*uTemp - self.regularizer*vTemp)
        return math.sqrt(sse/n)

    """
    Trains the entire U matrix and the entire V (and V^T) matrix
    """
    def trainratings(self):        
        # stub -- initial train error
        oldtrainerr = 1000000.0
       
        for k in range(self.r):
            print("k=", k)
            for epoch in range(self.maxepochs):
                trainerr = self.train(k)
                
                # check if train error is still changing
                if abs(oldtrainerr-trainerr) < self.minimprov:
                    break
                oldtrainerr = trainerr
                print("epoch=", epoch, "; train error=", trainerr)
                
    """
    Calculates the RMSE using between arr
    and the estimated values in (U * V^T)
    """
    def calcrmse(self, arr):
        nusers = self.nusers
        nmovies = self.nmovies
        sse = 0.0
        total = 0
        for i in range(len(arr)):
            crating = arr[i]
            sse += (crating.rat - self.calcrating(crating.uid, crating.mid))**2
            total += 1
        return math.sqrt(sse/total)

    """
    Read in the ratings from fname and put in arr
    Use splitter as delimiter in fname
    """
    def readinratings(self, fname, arr, splitter="\t"):
        f = open(fname)

        for line in f:
            newline = [int(float(each)) for each in line.split(splitter)]
            userid, movieid, rating = newline[0], newline[1], newline[2]
            arr.append(Rating(userid, movieid, rating))

        arr = sorted(arr, key=lambda rating: (rating.uid, rating.mid))
        return len(arr)
        
    """
    Read in the smaller train dataset
    """
    def readtrainsmaller(self, fname):
        return self.readinratings(fname, self.trainrats, splitter="\t")
        
    """
    Read in the large train dataset
    """
    def readtrainlarger(self, fname):
        return self.readinratings(fname, self.trainrats, splitter="::")
        
    """
    Read in the smaller test dataset
    """
    def readtestsmaller(self, fname):
        return self.readinratings(fname, self.testrats, splitter="\t")
                
    """
    Read in the larger test dataset
    """
    def readtestlarger(self, fname):
        return self.readinratings(fname, self.testrats, splitter="::")


# Demo driver: train on "ua.base", then report train and test RMSE and the
# elapsed time. The recorded output below shows it also runs when the cell executes.
if __name__ == "__main__":
    #========= test SvdMatrix class on smallest MovieLENS dataset =========
    init = time.time()  # start time for the elapsed-time print at the end
    # 943 users, 1682 movies; typefile defaults to 0, so the file is read as tab-separated
    svd = SvdMatrix("ua.base", 943, 1682)
    svd.trainratings()
    print("rmsetrain: ", svd.calcrmse(svd.trainrats))
    svd.readtestsmaller("ua.test")  # appends the test ratings to svd.testrats
    print("rmsetest: ", svd.calcrmse(svd.testrats))
    print("time: ", time.time()-init)


k= 0
epoch= 0 ; train error= 0.6741144397347321
epoch= 1 ; train error= 0.5352607878468978
epoch= 2 ; train error= 0.5237354856749674
k= 1
epoch= 0 ; train error= 0.5213615802595712
epoch= 1 ; train error= 0.5166406387804316
epoch= 2 ; train error= 0.5105357860276206
epoch= 3 ; train error= 0.504803967573186
epoch= 4 ; train error= 0.5010907963666331
k= 2
k= 3
epoch= 0 ; train error= 0.4999686069454351
k= 4
k= 5
k= 6
epoch= 0 ; train error= 0.4987896712923216
k= 7
k= 8
k= 9
k= 10
k= 11
k= 12
k= 13
epoch= 0 ; train error= 0.497766722365444
k= 14
k= 15
k= 16
k= 17
k= 18
k= 19
k= 20
k= 21
k= 22
k= 23
k= 24
k= 25
k= 26
k= 27
k= 28
k= 29
epoch= 0 ; train error= 0.49675714420205164
k= 30
k= 31
k= 32
k= 33
k= 34
k= 35
k= 36
k= 37
k= 38
k= 39
k= 40
k= 41
k= 42
k= 43
k= 44
k= 45
k= 46
k= 47
k= 48
k= 49
k= 50
k= 51
k= 52
k= 53
k= 54
k= 55
k= 56
k= 57
k= 58
k= 59
k= 60
k= 61
k= 62
k= 63
k= 64
k= 65
k= 66
k= 67
k= 68
k= 69
k= 70
k= 71
k= 72
k= 73
k= 74
k= 75
k= 76
k= 77
k= 78
k= 79
k= 80
k= 81
k= 

In [92]:
# Display the item-item similarity matrix. NOTE(review): no visible cell assigns
# `similarities_arr` at module level (it is a local of load_similarities; the
# loaded matrix is bound to `similarities`) — confirm where this name comes from.
similarities_arr

array([[0.        , 0.0326087 , 0.04054054, ..., 0.        , 0.05454545,
        0.01666667],
       [0.0326087 , 0.        , 0.02702703, ..., 0.        , 0.01785714,
        0.01694915],
       [0.04054054, 0.02702703, 0.        , ..., 0.        , 0.05405405,
        0.02439024],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.05454545, 0.01785714, 0.05405405, ..., 0.        , 0.        ,
        0.0952381 ],
       [0.01666667, 0.01694915, 0.02439024, ..., 0.        , 0.0952381 ,
        0.        ]])