# Recommender System with Scipy linalg svds SVD Model (Option 2)

### Elizabeth Hanley
### Uniqname: hanleyel
### Kaggle ID: hanleyel

In [None]:
import pandas as pd
import json
import gzip
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
import sklearn
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import csv
from sparsesvd import sparsesvd
import math
import operator
import random
import decimal

In [None]:
# Unzips gzip data.
def unzip_json(filename):
    """Load a gzip-compressed JSON-lines file into a pandas DataFrame.

    Args:
        filename: Path to a gzip file holding one JSON record per line.

    Returns:
        The DataFrame produced by ``pd.read_json(..., lines=True)`` on the
        decompressed stream.
    """
    print('Unzipping json file...')

    # Use a context manager so the gzip handle is closed as soon as parsing
    # finishes instead of being left for the garbage collector.
    with gzip.open(filename) as gz_file:
        return pd.read_json(gz_file, lines=True)

In [None]:
# Parse the gzipped training reviews. The result is not assigned, so in a
# notebook it is only shown as the cell output.
unzip_json('reviews.training.json.gz')

In [None]:
# Outputs json training data as a Pandas dataframe.
def json_to_df(file_name):
    """Read a JSON-lines file into a pandas DataFrame.

    Args:
        file_name: Path of the JSON-lines file to read.

    Returns:
        The parsed DataFrame, or ``None`` (after printing a hint) when the
        file cannot be opened or parsed.
    """
    print('Converting json file to dataframe...')

    try:
        return pd.read_json(file_name, lines=True)
    except (OSError, ValueError):
        # OSError covers missing/unreadable files, ValueError covers malformed
        # JSON. Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        print('Please try another file name.')
        return None

In [None]:
# Load the uncompressed training reviews; training_df is None if the file
# could not be read.
training_df = json_to_df('reviews.training.json')

In [None]:
# dev_df = json_to_df('reviews.dev.json')

In [None]:
# Converts dataframe to csv.
def convert_to_csv(dataframe, desired_filename):
    """Write a DataFrame to a CSV file without the index column.

    Args:
        dataframe: Object exposing ``to_csv`` (a pandas DataFrame).
        desired_filename: Destination path; must be a string because it is
            concatenated into the progress message.

    Returns:
        Whatever ``dataframe.to_csv`` returns (``None`` when a path is
        given), or ``None`` after printing a hint when the write fails.
    """
    print('Converting dataframe to csv: ' + desired_filename + '...')

    try:
        return dataframe.to_csv(desired_filename, index=False)
    except (AttributeError, OSError, TypeError, ValueError):
        # AttributeError/TypeError: not a usable dataframe; OSError: bad path.
        # Interrupts and unrelated errors are no longer swallowed.
        print('Please try another dataframe or file name.')
        return None

In [None]:
# Export the reviewerID/asin/overall columns of the first 90000 training
# reviews to the shortened training CSV.
convert_to_csv(training_df[['reviewerID', 'asin', 'overall']].head(90000), 'reviews.training.shortened.csv')

In [None]:
# Development file to CSV.
# convert_to_csv(dev_df[['reviewerID', 'asin', 'overall']], 'reviews.dev.csv')

In [None]:
# Returns dictionaries with unique users and products as keys and unique ints as values.
def create_user_product_dicts(filename):
    """Index the distinct users and products found in a ratings CSV.

    The first line of the file is skipped as a header. In every remaining
    row, column 0 is treated as the user key and column 1 as the product key.
    Each new key receives the next free integer, in order of first appearance.

    Args:
        filename: Path of the CSV file to scan.

    Returns:
        ``(user_dict, product_dict, user_count, product_count)`` where the
        dictionaries map keys to integer indices and the counts are the
        number of distinct keys.
    """
    print('Creating dictionaries from CSV for unique users and products...')

    user_dict, product_dict = {}, {}

    with open(filename, 'r') as csv_file:
        records = csv.reader(csv_file, delimiter=',')
        next(records, None)  # drop the header line

        for record in records:
            # setdefault only inserts unseen keys; len() is the next free index.
            user_dict.setdefault(record[0], len(user_dict))
            product_dict.setdefault(record[1], len(product_dict))

    return user_dict, product_dict, len(user_dict), len(product_dict)

In [None]:
# Build the user/product index dictionaries (and their sizes) from the
# shortened training CSV.
user_dict, product_dict, user_count, product_count = create_user_product_dicts('reviews.training.shortened.csv')

In [None]:
# Creates a dense matrix from training data.
def training_mtx(filename, user_dict, product_dict):
    """Fill a dense user-by-product float32 matrix from a ratings CSV.

    The header line is skipped. Each row stores ``float(column 2)`` at the
    position given by the user index of column 0 and the product index of
    column 1. Cells that never appear stay 0; if a (user, product) pair
    repeats, the later row overwrites the earlier one.

    Args:
        filename: Path of the ratings CSV.
        user_dict: Mapping from user key to row index.
        product_dict: Mapping from product key to column index.

    Returns:
        ``np.ndarray`` of shape ``(len(user_dict), len(product_dict))``.
    """
    print('Creating a dense matrix from training data...')

    shape = (len(user_dict), len(product_dict))
    ratings = np.zeros(shape, dtype=np.float32)

    with open(filename, 'r') as csv_file:
        records = csv.reader(csv_file, delimiter=',')
        next(records, None)  # drop the header line
        for record in records:
            rating = float(record[2])
            user_idx = user_dict[record[0]]
            product_idx = product_dict[record[1]]
            ratings[user_idx, product_idx] = rating

    return ratings

In [None]:
# Dense user x product rating matrix for the shortened training set.
training_matrix = training_mtx('reviews.training.shortened.csv', user_dict, product_dict)

In [None]:
# Notebook display of the dense training matrix.
training_matrix

In [None]:
# training_matrix.shape

In [None]:
# Returns copies of the training user/product dictionaries, extended with any new users and products found in the test CSV.
def get_test_users_products(filename, training_user_dict, training_product_dict):
    """Extend the training index dictionaries with keys seen in a test CSV.

    The inputs are copied, not modified. After skipping the header line,
    every row contributes column 0 as a user key and column 1 as a product
    key. Unseen keys are numbered consecutively, starting from the size of
    the corresponding training dictionary.

    Args:
        filename: Path of the test CSV.
        training_user_dict: Existing user-key -> index mapping.
        training_product_dict: Existing product-key -> index mapping.

    Returns:
        ``(test_user_dict, test_product_dict)``: the extended copies.
    """
    print('Importing test users and products...')

    merged_users = training_user_dict.copy()
    merged_products = training_product_dict.copy()

    with open(filename, 'r') as csv_file:
        records = csv.reader(csv_file, delimiter=',')
        next(records, None)  # drop the header line

        for record in records:
            # Only unseen keys are inserted; len() is the next free index.
            merged_users.setdefault(record[0], len(merged_users))
            merged_products.setdefault(record[1], len(merged_products))

    return merged_users, merged_products

In [None]:
# Index dictionaries covering the training keys plus any new keys in the dev CSV.
test_user_dict, test_product_dict = get_test_users_products('reviews.dev.csv', user_dict, product_dict)

In [None]:
# Spot check: index assigned to one reviewer ID (KeyError if it is absent).
test_user_dict['A16NGP74HECTI9']

In [None]:
# Number of distinct users across training and dev data.
len(test_user_dict)

In [None]:
# Number of distinct products across training and dev data.
len(test_product_dict)

In [None]:
# Creates a new matrix with unknown products on the x axis.
# def merged_mtx_products(filename, user_dict, test_product_dict):
    
#         print('Creating a matrix with new products on the x axis...')

#         num_user_ids = len(user_dict)
#         num_product_ids = len(test_product_dict)

#         dense_matrix = np.zeros(shape=(num_user_ids, num_product_ids), dtype=np.float32)

#         with open(filename, 'r') as train_file:
#             matrix_reader = csv.reader(train_file, delimiter=',')
#             next(matrix_reader, None)
#             for row in matrix_reader:
#                 dense_matrix[user_dict[row[0]], product_dict[row[1]]] = float(row[2])

#         return dense_matrix

In [None]:
# merged_matrix_product_rows = merged_mtx_products('reviews.training.shortened.csv', user_dict, test_product_dict)

In [None]:
# merged_matrix_product_rows.shape

In [None]:
# merged_matrix_product_rows

In [None]:
# sum(merged_matrix_product_rows[-1,:])

In [None]:
# merged_matrix_user_rows.shape

In [None]:
# merged_matrix_user_rows

In [None]:
# sum(merged_matrix_user_rows[-1, :])

In [None]:
# Converts dense matrices to sparse.
def to_sparse(filename_prefix, matrix):
    """Return a float32 CSR version of ``matrix``, cached on disk.

    The cache lives at ``filename_prefix + '.npz'``. If that file can be
    loaded, its contents are returned and ``matrix`` is ignored, so a stale
    cache must be deleted by hand. Otherwise the sparse matrix is built from
    ``matrix`` and written to the cache path.

    Args:
        filename_prefix: Cache path without the ``.npz`` extension.
        matrix: Dense matrix to convert when no usable cache exists.

    Returns:
        ``scipy.sparse.csr_matrix`` with dtype float32.
    """
    print('Creating a sparse matrix...')

    # Single source of truth for the cache location. The original saved to
    # prefix + 'npz' but loaded prefix + '.npz', so the cache never hit.
    cache_path = filename_prefix + '.npz'

    try:
        # The context manager closes the archive even if a key is missing.
        with np.load(cache_path) as loader:
            return csr_matrix(
                (loader['data'], loader['indices'], loader['indptr']),
                shape=loader['shape'],
                dtype=np.float32,
            )
    except Exception:
        # Deliberately broad: a missing, truncated or foreign cache file must
        # fall back to recomputation (KeyboardInterrupt still propagates).
        sparse_matrix = scipy.sparse.csr_matrix(matrix, dtype=np.float32)
        scipy.sparse.save_npz(cache_path, sparse_matrix)
        return sparse_matrix

In [None]:
# sparse_merged_matrix_pr = to_sparse('sparse.merged.matrix.pr', merged_matrix_product_rows)

In [None]:
# Sparse (CSR) copy of the training matrix, using 'merged.matrix' as the
# on-disk cache prefix.
sparse_matrix = to_sparse('merged.matrix', training_matrix)

In [None]:
# Notebook display of the sparse matrix dimensions.
sparse_matrix.shape

In [None]:
# sparse_merged_matrix_ur.shape

In [None]:
# Calculate global, row, and column means
def calculate_means(sparse_matrix):
    """Compute global, per-row and per-column means over non-zero entries.

    Each mean is the sum of the values divided by the number of entries that
    are ``!= 0``, so zeros are treated as "no rating" rather than as a
    rating of zero. A row or column with no non-zero entry divides by zero.

    Side effects: the row means are written to ``row.mean.csv`` and the
    transposed column means to ``column.mean.csv`` (one value per line), and
    progress plus intermediate values are printed.

    Args:
        sparse_matrix: Sparse rating matrix supporting ``sum`` and ``!= 0``.

    Returns:
        ``(global_mean, matrix_row_mean, matrix_column_mean)``.
    """
    print('Calculating global mean...')
    # Boolean pattern of rated cells; its sums are the rating counts.
    rated = sparse_matrix != 0
    total_ratings = rated.sum()
    global_mean = np.true_divide(sparse_matrix.sum(), total_ratings, dtype=np.float32)
    print(global_mean)

    print('Calculating row mean...')
    row_totals = sparse_matrix.sum(1)
    row_counts = rated.sum(1)
    matrix_row_mean = np.true_divide(row_totals, row_counts, dtype=np.float32)
    print(matrix_row_mean[-1])

    np.savetxt("row.mean.csv", matrix_row_mean, delimiter=",")

    print('Calculating column mean...')
    column_totals = sparse_matrix.sum(0)
    column_counts = rated.sum(0)
    matrix_column_mean = np.true_divide(column_totals, column_counts, dtype=np.float32)
    print(matrix_column_mean)

    # Transpose so the file holds one column mean per line.
    np.savetxt("column.mean.csv", matrix_column_mean.T, delimiter=",")

    return global_mean, matrix_row_mean, matrix_column_mean

In [None]:
# Means over rated cells; this call also writes row.mean.csv and
# column.mean.csv as a side effect.
global_mean, matrix_row_mean, matrix_column_mean = calculate_means(sparse_matrix)

In [None]:
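# Builds a dense matrix of baseline offsets (row-mean and column-mean deviations from the global mean), sized to include test users and products.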
def normalize_matrix(sparse_matrix, global_mean, matrix_row_mean, matrix_column_mean):
    """Build a dense float32 matrix of row/column mean offsets.

    The matrix has one more row than ``len(test_user_dict)`` and one more
    column than ``len(test_product_dict)``; both are module-level names, not
    parameters. Row ``i`` is set to half the difference between line ``i`` of
    ``row.mean.csv`` and ``global_mean``. Every column ``j`` then has half the
    difference between line ``j`` of ``column.mean.csv`` and ``global_mean``
    added. Rows and columns beyond the lines in those files keep only the
    other component (or 0).

    NOTE(review): ``sparse_matrix``, ``matrix_row_mean`` and
    ``matrix_column_mean`` are accepted but never read; the means come from
    the CSV files instead.

    Args:
        sparse_matrix: Unused.
        global_mean: Value subtracted from each row/column mean.
        matrix_row_mean: Unused.
        matrix_column_mean: Unused.

    Returns:
        The dense offset matrix.
    """
    print('Creating a new matrix for merging test data scores...')

    shape = (len(test_user_dict) + 1, len(test_product_dict) + 1)
    dense_merged_matrix = np.zeros(shape, dtype=np.float32)

    with open('row.mean.csv', 'r') as row_file:
        for row_idx, record in enumerate(csv.reader(row_file, delimiter=',')):
            dense_merged_matrix[row_idx, :] = (float(record[0]) - global_mean) / 2

    print(dense_merged_matrix)

    with open('column.mean.csv', 'r') as column_file:
        for col_idx, record in enumerate(csv.reader(column_file, delimiter=',')):
            dense_merged_matrix[:, col_idx] += (float(record[0]) - global_mean) / 2

    return dense_merged_matrix

In [None]:
# Dense matrix of row/column mean offsets; relies on row.mean.csv and
# column.mean.csv written by calculate_means.
normalized_matrix = normalize_matrix(sparse_matrix, global_mean, matrix_row_mean, matrix_column_mean)

In [None]:
# Notebook display of the offset matrix dimensions.
normalized_matrix.shape

In [None]:
# Notebook display of the offset matrix.
normalized_matrix

In [None]:
# normalized_matrix_users, matrix_row_mean_users, global_mean = normalize_merged_matrix(sparse_merged_matrix_ur, merged_matrix_user_rows)

In [None]:
# Implements an SVD model.
def compute_svd_from_normalized(normalized_matrix, k=100):
    """Compute a truncated SVD with ``scipy.sparse.linalg.svds``.

    Args:
        normalized_matrix: Matrix to factorise (dense or sparse, floating
            point).
        k: Number of singular values/vectors to compute. Defaults to 100,
            the value that used to be hard-coded. ``svds`` restricts ``k``
            relative to the matrix dimensions (see the SciPy documentation),
            so smaller matrices need a smaller ``k``.

    Returns:
        ``(U, S, Vt)`` where ``S`` is the ``k x k`` diagonal matrix of the
        singular values, so ``U @ S @ Vt`` is the rank-``k`` approximation.
    """
    print('Computing svd from de-meaned matrix...')

    U, sigma, Vt = svds(normalized_matrix, k=k)

    # svds returns the singular values as a 1-D vector; expand to a diagonal
    # matrix so callers can multiply the three factors directly.
    return U, np.diag(sigma), Vt

In [None]:
# Factorise the offset matrix; astype(float) converts it to float64 first.
U, S, Vt = compute_svd_from_normalized(normalized_matrix.astype(float))

In [None]:
# Low-rank reconstruction of the offset matrix from its SVD factors.
normalized_predictions = np.dot(np.dot(U, S), Vt)

In [None]:
# Notebook display of the reconstructed offsets.
normalized_predictions

In [None]:
# Add the global mean back so the values are on the rating scale.
predictions_matrix = normalized_predictions+global_mean

In [None]:
# Notebook display of the predicted ratings.
predictions_matrix

In [None]:
# Notebook display of the prediction matrix dimensions.
predictions_matrix.shape

In [None]:
# Spot check: index assigned to one product ID (KeyError if it is absent).
test_product_dict['B003F3NE1Q']

In [None]:
def query_matrix(infile, outfile, predictions_matrix):
    """Write the prediction for every (user, product) row of a CSV.

    The header line of ``infile`` is skipped. For each remaining row,
    column 0 is looked up in the module-level ``test_user_dict`` and column 1
    in ``test_product_dict``; the matching cell of ``predictions_matrix`` is
    written with column 0 and column 2 of the row (presumably the reviewer ID
    and the actual rating; confirm against the input file). Rows whose keys
    or indices cannot be resolved are reported with ``'Error'`` and skipped.

    Args:
        infile: Path of the CSV to read.
        outfile: Path of the CSV to write (overwritten).
        predictions_matrix: 2-D array indexed ``[user_index, product_index]``.

    Returns:
        None.
    """
    print(predictions_matrix.shape)

    # newline='' is what the csv module requires for files it writes; without
    # it, platforms that translate newlines get blank rows.
    with open(infile, 'r') as test_file, open(outfile, 'w', newline='') as out_handle:
        test_reader = csv.reader(test_file, delimiter=',')
        next(test_reader, None)  # drop the header line

        writer = csv.writer(out_handle, delimiter=',')
        # Three names for the three columns written per row; the header used
        # to list only two.
        writer.writerow(['datapointID', 'overall', 'prediction'])

        for row in test_reader:
            try:
                prediction = predictions_matrix[test_user_dict[row[0]], test_product_dict[row[1]]]
                actual = row[2]
            except (KeyError, IndexError):
                # Unknown user/product, index outside the matrix, or short row.
                print('Error')
                continue

            writer.writerow([row[0], actual, prediction])
            print(actual, prediction)

    return None

In [None]:
# Write the dev-set predictions to reviews.test.labeled.csv.
query_matrix('reviews.dev.csv', 'reviews.test.labeled.csv', predictions_matrix)

In [None]:
# Queries the prediction matrix.
def query_normalized_matrix_test(test_file, prediction_file, test_user_dict, test_product_dict,
                                predictions_product_rows, predictions_user_rows, global_mean):
    """Write one prediction per test row, falling back to the global mean.

    The header line of ``test_file`` is skipped. In each remaining row,
    column 0 is the datapoint ID, column 1 is looked up in ``test_user_dict``
    and column 2 in ``test_product_dict``. The prediction is read as
    ``predictions_product_rows[user_index][0, product_index]``, so the row
    object must accept 2-D indexing (e.g. a sparse or ``np.matrix`` row).
    When the lookup fails, ``global_mean`` is written instead. Such rows used
    to be dropped silently, leaving the output without a line for them.

    Args:
        test_file: Path of the CSV to read.
        prediction_file: Path of the CSV to write (overwritten).
        test_user_dict: Mapping from user key to row index.
        test_product_dict: Mapping from product key to column index.
        predictions_product_rows: Prediction matrix indexed by user row.
        predictions_user_rows: Unused; kept for interface compatibility.
        global_mean: Fallback prediction for rows that cannot be resolved.

    Returns:
        None.
    """
    print('Reconstructing matrix and making predictions...')

    # newline='' is what the csv module requires for files it writes.
    with open(test_file, 'r') as in_handle, open(prediction_file, 'w', newline='') as out_handle:
        test_reader = csv.reader(in_handle, delimiter=',')
        next(test_reader, None)  # drop the header line

        writer = csv.writer(out_handle, delimiter=',')
        writer.writerow(['datapointID', 'overall'])

        for row in test_reader:
            if not row:
                # Blank line: there is no datapoint ID to report.
                continue

            try:
                # Query by user (new products on x axis).
                user_query = predictions_product_rows[test_user_dict[row[1]]]
                prediction = user_query[0, test_product_dict[row[2]]]
            except (KeyError, IndexError):
                # Unknown user/product or index outside the matrix.
                prediction = global_mean

            writer.writerow([row[0], prediction])

    print('Done.')
    return None

In [None]:
# NOTE(review): predictions_product_rows and predictions_user_rows are not
# assigned in any visible cell, so this call raises NameError unless they are
# defined elsewhere; confirm before running.
# The function returns None, so predicted_ratings is None; the call also
# overwrites the file written by the query_matrix cell above.
predicted_ratings = query_normalized_matrix_test('reviews.dev.csv', 'reviews.test.labeled.csv',
                                                test_user_dict, test_product_dict, predictions_product_rows,
                                                predictions_user_rows, global_mean)

In [None]:
# RMSE analysis
def rmse(analysis_file):
    """Compute the root-mean-square error stored in a labelled CSV.

    The header line is skipped. In each remaining row, column 1 is the
    target and column 2 the prediction; both are parsed as floats. The
    original accumulated the raw strings, so the subtraction raised a type
    error.

    Args:
        analysis_file: Path of the CSV to analyse.

    Returns:
        The RMSE as a float, or ``nan`` when the file has no data rows.
    """
    print('Checking RMSE...')

    targets = []
    predictions = []

    with open(analysis_file, 'r') as handle:
        analysis_reader = csv.reader(handle, delimiter=',')
        next(analysis_reader, None)  # drop the header line
        for row in analysis_reader:
            if not row:
                continue  # ignore blank lines
            targets.append(float(row[1]))
            predictions.append(float(row[2]))

    if not targets:
        # Mean of an empty set is undefined; return nan without a warning.
        return float('nan')

    errors = np.asarray(predictions, dtype=float) - np.asarray(targets, dtype=float)
    return np.sqrt(np.mean(errors ** 2))

In [None]:
# rmse_val = rmse('reviews.dev.labeled.2.csv')
# print("rms error is: " + str(rmse_val))