In [None]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import sys
import numpy as np
import sklearn as sklearn
from math import sqrt


def rmse(prediction, ground_truth):
    """Root-mean-square error restricted to the non-zero entries of ``ground_truth``.

    Entries where ``ground_truth`` is exactly 0 are treated as "no rating" and
    are left out of the score, so a genuine rating of 0 is not evaluated.

    Parameters
    ----------
    prediction : numpy.ndarray
        Predicted values; indexed with the non-zero positions of ``ground_truth``.
    ground_truth : numpy.ndarray
        Observed values, with 0 marking positions that must be ignored.

    Returns
    -------
    float
        sqrt(mean((prediction - ground_truth) ** 2)) over the selected entries.

    Raises
    ------
    ValueError
        If ``ground_truth`` has no non-zero entry (nothing to score).
    """
    # Compute the mask once and reuse it for both arrays.
    rated = ground_truth.nonzero()
    predicted = np.asarray(prediction[rated], dtype=float).ravel()
    actual = np.asarray(ground_truth[rated], dtype=float).ravel()
    if actual.size == 0:
        raise ValueError("rmse: ground_truth has no non-zero entries to score")
    return float(np.sqrt(np.mean((predicted - actual) ** 2)))

def loadTrainFile(path="train.csv"):
    """Read (user, item, rating) triples from a comma-separated training file.

    The first line is skipped as a header. Columns 0 and 1 are parsed as floats
    and rounded to integer IDs; column 2 is the rating. Rows whose rating equals
    99 are dropped (presumably a "not rated" marker -- confirm against the
    dataset description). Blank lines are ignored.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to "train.csv" in the working directory.

    Returns
    -------
    tuple of numpy.ndarray
        ``(userID, itemID, rating)``: two integer arrays and one float array of
        equal length.
    """
    userID = []
    itemID = []
    rating = []
    with open(path) as f:
        next(f, None)  # header row
        for line in f:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line at the end of the file
                continue
            row = line.split(',')
            value = float(row[2])
            # Compare numerically so "99", "99.0" and "99.00" are all filtered;
            # a text comparison against '99.0' only catches one spelling.
            if value == 99.0:
                continue
            userID.append(round(float(row[0])))
            itemID.append(round(float(row[1])))
            rating.append(value)

    # Explicit dtypes keep the ID arrays usable as indices even when empty.
    return (np.array(userID, dtype=int),
            np.array(itemID, dtype=int),
            np.array(rating, dtype=float))

def loadTestFile(path="test.csv"):
    """Read (user, item) pairs from a comma-separated test file.

    The first line is skipped as a header. Columns 0 and 1 are parsed as floats
    and rounded to integer IDs. Blank lines are ignored.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to "test.csv" in the working directory.

    Returns
    -------
    tuple of list
        ``(userID, itemID)``: two lists of Python ints of equal length, in file
        order.
    """
    userID = []
    itemID = []
    with open(path) as f:
        next(f, None)  # header row
        for line in f:
            line = line.strip()
            if not line:
                # float('') would raise on an empty trailing line
                continue
            row = line.split(',')
            userID.append(round(float(row[0])))
            itemID.append(round(float(row[1])))

    return userID, itemID

#############Data#############
# Shape of the dense user x item matrix. IDs in the files are 1-based, hence the
# "-1" whenever an ID is used as an index below.
N_USERS, N_ITEMS = 10000, 100

train_userID, train_itemID, train_rating = loadTrainFile()
test_userID, test_itemID = loadTestFile()

# randomize dataset (fixed seed so the validation split is the same on every run)
indices = np.random.RandomState(0).permutation(len(train_userID))
print(indices)

validate=False
if(validate):
    vn = 200
    train_idx, validate_idx = indices[:-vn], indices[-vn:]
    # Carve out the validation rows BEFORE rebinding the train_* names. Slicing
    # the already-shortened training arrays with indices of the full dataset
    # would pick the wrong rows or run past the end of the array.
    validate_userID = train_userID[validate_idx]
    validate_itemID = train_itemID[validate_idx]
    validate_rating = train_rating[validate_idx]
    train_userID = train_userID[train_idx]
    train_itemID = train_itemID[train_idx]
    train_rating = train_rating[train_idx]
    print("train size:", len(train_userID), "validate size:", len(validate_userID))

#############User-User#############
# Dense rating matrix plus a boolean mask of which cells hold a real rating
# (a rating may legitimately be 0, so the mask cannot be derived from the values).
user_item_ratings = np.zeros([N_USERS, N_ITEMS])
user_item_has_ratings = np.full([N_USERS, N_ITEMS], False)
user_item_ratings[train_userID-1, train_itemID-1] = train_rating
user_item_has_ratings[train_userID-1, train_itemID-1] = True

# Per-user mean over rated items only; users without any rating get a mean of 0
# instead of a 0/0 division.
rating_counts = user_item_has_ratings.sum(axis=1)
user_means = np.divide(user_item_ratings.sum(axis=1), rating_counts,
                       out=np.zeros(N_USERS), where=rating_counts > 0)
# Centre the rated cells on the user's mean; unrated cells stay at 0.
user_item_ratings = np.where(user_item_has_ratings,
                             user_item_ratings - user_means[:, np.newaxis],
                             0.0)
print(user_item_ratings)

if(validate):
    test_user_item_ratings = np.zeros([N_USERS, N_ITEMS])
    test_user_item_ratings[validate_userID-1, validate_itemID-1] = validate_rating

#get SVD components from train matrix. Choose k.
u, s, vt = svds(user_item_ratings, k = 20)
s_diag_matrix=np.diag(s)
# The factorisation was fitted on mean-centred ratings, so the user means have to
# be added back to obtain predictions on the original rating scale.
X_pred = np.dot(np.dot(u, s_diag_matrix), vt) + user_means[:, np.newaxis]

if(validate):
    # rmse() only scores cells where the held-out matrix is non-zero, so a
    # held-out rating of exactly 0 is not part of this number.
    print ('User-based CF RMSE: ', str(rmse(X_pred, test_user_item_ratings)))



In [None]:
# Write one "user-item,rating" row per test pair, using the predictions in X_pred.
test_output=[['user_id-item_id','rating']]
for user_id, item_id in zip(test_userID, test_itemID):
    # IDs are 1-based in the files, X_pred is 0-based.
    value = X_pred[user_id-1, item_id-1]
    entry = str(user_id)+'-'+str(item_id)
    test_output.append([entry,value])
# np.str was only an alias of the builtin str and no longer exists in
# NumPy >= 1.24; the builtin gives the same string array.
np.savetxt("output.csv", np.array(test_output, dtype=str), fmt='%s,%s', delimiter=",")

In [None]:
from surprise import Reader, Dataset
from surprise import SVD, KNNBasic
# surprise.evaluate and Dataset.split were removed in Surprise 1.1;
# cross_validate is their documented replacement.
from surprise.model_selection import cross_validate
import pandas as pd

# Reload so this cell sees the full training set even if an earlier cell
# carved a validation split out of the train_* arrays.
train_userID, train_itemID, train_rating = loadTrainFile()

# Define the format
ratings_dict = {'userID': train_userID,
                'itemID': train_itemID,
                'rating': train_rating}
df = pd.DataFrame(ratings_dict)

reader = Reader(rating_scale=(-10.0,10.0))
# Load the data from the DataFrame using the reader format
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader=reader)
print(data)

algo = SVD()

# 5-fold cross-validation; verbose=True prints the per-fold RMSE/MAE table.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [1]:
# NOTE: this cell uses loadTrainFile, loadTestFile and np from the first cell;
# run that cell first, otherwise this one stops with a NameError.
from surprise import Reader, Dataset
# `evaluate` is not used here and no longer exists in Surprise >= 1.1, where
# importing it raises ImportError.
from surprise import SVD, KNNBasic
import pandas as pd

train_userID, train_itemID, train_rating = loadTrainFile()

# Define the format
ratings_dict = {'userID': train_userID,
                'itemID': train_itemID,
                'rating': train_rating}
df = pd.DataFrame(ratings_dict)
reader = Reader(rating_scale=(-10.0,10.0))
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader=reader)
trainset = data.build_full_trainset()

algo = KNNBasic()
algo.fit(trainset)

test_userID, test_itemID = loadTestFile()
test_output=[['user_id-item_id','rating']]

for user_id, item_id in zip(test_userID, test_itemID):
    entry=str(user_id)+'-'+str(item_id)
    # The trainset was built from integer IDs, so predict() must receive
    # integers too. String IDs would not match any known user or item and every
    # pair would get the fallback estimate.
    # verbose=False: one printed line per test pair would flood the output.
    value = algo.predict(uid=user_id, iid=item_id, verbose=False).est
    test_output.append([entry,value])

# np.str was only an alias of the builtin str and no longer exists in
# NumPy >= 1.24; the builtin gives the same string array.
np.savetxt("output.csv", np.array(test_output, dtype=str), fmt='%s,%s', delimiter=",")


NameError: name 'loadTrainFile' is not defined