In [3]:
import pandas as pd
import json
import gzip
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
import sklearn
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import csv
from sparsesvd import sparsesvd
import math
import operator

In [4]:
def unzip_json(filename):
    
    print('Unzipping json file...')
    
    unzipped_data = pd.read_json(gzip.open(filename))
    
    return unzipped_data

In [5]:
# Output json training data as a Pandas dataframe.
def json_to_df(file_name):

    print('Converting json file to dataframe...')

    try:
        training_data = pd.read_json(file_name, lines=True)
        return training_data
    except:
        print('Please try another file name.')

        return None

In [6]:
def convert_to_csv(dataframe, desired_filename):

    print('Converting dataframe to csv: ' + desired_filename + '...')

    try:
        return dataframe.to_csv(desired_filename, index=False)
    except:
        print('Please try another dataframe or file name.')

    return None

In [7]:
# Returns dictionaries with unique users and products as keys and unique ints as values.
def create_user_product_dicts(filename):

    print('Creating dictionaries from CSV for unique users and products...')

    user_dict = {}
    product_dict = {}
    user_count = 0
    product_count = 0

    with open(filename, 'r') as train_file:
        file_reader=csv.reader(train_file, delimiter=',')
        next(file_reader, None)

        for row in file_reader:
            if row[0] not in user_dict:
                user_dict[row[0]] = user_count
                user_count += 1
            if row[1] not in product_dict:
                product_dict[row[1]] = product_count
                product_count += 1

    return user_dict, product_dict, user_count, product_count

In [8]:
user_dict, product_dict, user_count, product_count = create_user_product_dicts('reviews.training.csv')

Creating dictionaries from CSV for unique users and products...


In [9]:
def training_mtx(filename, user_dict, product_dict):

        print('Creating a dense matrix from training data...')

        num_user_ids = len(user_dict)
        num_product_ids = len(product_dict)

        dense_matrix = np.zeros(shape=(num_user_ids, num_product_ids), dtype=np.float32)

        with open(filename, 'r') as train_file:
            matrix_reader = csv.reader(train_file, delimiter=',')
            next(matrix_reader, None)
            for row in matrix_reader:
                dense_matrix[user_dict[row[0]], product_dict[row[1]]] = float(row[2])

        return dense_matrix

In [10]:
training_matrix = training_mtx('reviews.training.csv', user_dict, product_dict)

Creating a dense matrix from training data...


In [11]:
training_matrix

array([[4., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
training_matrix.shape

(123952, 50050)

In [13]:
# Outputs dictionaries with unique test users and test products.
def get_test_users_products(filename, training_user_dict, training_product_dict):

    print('Importing test users and products...')

    test_user_count = len(training_user_dict)
    test_product_count = len(training_product_dict)
    test_user_dict = training_user_dict.copy()
    test_product_dict = training_product_dict.copy()

    with open(filename, 'r') as test_file:
        test_reader = csv.reader(test_file, delimiter=',')
        next(test_reader, None)

        for row in test_reader:
            # Add unique users to test_user dictionary.
            # print(row[1])
            if row[1] not in test_user_dict:
                test_user_dict[row[1]] = test_user_count
                test_user_count += 1
            # Add unique products to test_product dictionary.
            # print(row[2])
            if row[2] not in test_product_dict:
                test_product_dict[row[2]] = test_product_count
                test_product_count += 1

    return test_user_dict, test_product_dict

In [14]:
test_user_dict, test_product_dict = get_test_users_products('reviews.test.unlabeled.csv', user_dict, product_dict)

Importing test users and products...


In [15]:
# print(test_user_dict)

In [16]:
# # Merging new users OR new products into the matrix along the x axis
# def merged_mtx(test_file, train_file, test_y_axis, test_x_axis, train_y_axis,
#                train_x_axis, test_y_axis_row_num, test_x_axis_row_num, train_y_axis_row_num, train_x_axis_row_num):

#     print('Merging training and test data for ratings imputation...')

#     num_user_ids = len(user_dict) # Training users only
#     num_product_ids = len(test_x_axis) # Training and test products

#     merged_matrix = np.zeros(shape=(num_user_ids, num_product_ids), dtype=np.float32)
    
#     print('Merged matrix shape: ')
#     print(merged_matrix.shape)

#     with open(test_file, 'r') as test_file:
#         file_reader=csv.reader(test_file, delimiter=',')
#         next(file_reader, None)

#         for row in file_reader:
            
#             # print(row[1])
#             # print(test_product_dict[row[2]])
            
#             merged_matrix[:, test_x_axis[row[test_x_axis_row_num]]] = float(0)

#     with open(train_file, 'r') as train_file:
#         file_reader=csv.reader(train_file, delimiter=',')
#         next(file_reader, None)

#         for row in file_reader:
#             merged_matrix[train_y_axis[row[train_y_axis_row_num]], train_x_axis[row[train_x_axis_row_num]]] = float(row[2])

#     return merged_matrix

In [17]:
# Pad the matrix to add extra products along the x axis

def merged_mtx_products(filename, user_dict, test_product_dict):
    
        print('Creating a matrix with new products on the x axis...')

        num_user_ids = len(user_dict)
        num_product_ids = len(test_product_dict)

        dense_matrix = np.zeros(shape=(num_user_ids, num_product_ids), dtype=np.float32)

        with open(filename, 'r') as train_file:
            matrix_reader = csv.reader(train_file, delimiter=',')
            next(matrix_reader, None)
            for row in matrix_reader:
                dense_matrix[user_dict[row[0]], product_dict[row[1]]] = float(row[2])

        return dense_matrix

In [18]:
merged_matrix_product_rows = merged_mtx_products('reviews.training.csv', user_dict, test_product_dict)

Creating a matrix with new products on the x axis...


In [19]:
merged_matrix_product_rows.shape

(123952, 51744)

In [20]:
merged_matrix_product_rows

array([[4., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [21]:
sum(merged_matrix_product_rows[-1,:])

5.0

In [22]:
def merged_mtx_users(filename, product_dict, test_user_dict):

        print('Creating a matrix with new users on the x axis...')

        num_product_ids = len(product_dict)
        num_user_ids = len(test_user_dict)

        dense_matrix = np.zeros(shape=(num_product_ids, num_user_ids), dtype=np.float32)

        with open(filename, 'r') as train_file:
            matrix_reader = csv.reader(train_file, delimiter=',')
            next(matrix_reader, None)
            for row in matrix_reader:
                dense_matrix[product_dict[row[1]], user_dict[row[0]]] = float(row[2])

        return dense_matrix

In [23]:
merged_matrix_user_rows = merged_mtx_users('reviews.training.csv', product_dict, test_user_dict)

Creating a matrix with new users on the x axis...


In [24]:
merged_matrix_user_rows.shape

(50050, 123960)

In [25]:
merged_matrix_user_rows

array([[4., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [26]:
def to_sparse(filename_prefix, matrix):
    print('Creating a sparse matrix...')
    try:
        # Try loading previously saved sparse matrix from file (becaues I keep crashing my kernel)
        loader = np.load('sparse.merged.matrix.pr' + '.npz')
        sparse_matrix = csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
        loader.close()
    except:
        # Create sparse matrix from dense matrix, write to file as backup
        sparse_matrix = scipy.sparse.csr_matrix(merged_matrix_product_rows)
        scipy.sparse.save_npz((filename_prefix + 'npz'), sparse_merged_matrix_pr)
    return sparse_matrix

In [27]:
sparse_merged_matrix_pr = to_sparse('sparse.merged.matrix.pr', merged_matrix_product_rows)

Creating a sparse matrix...


In [28]:
sparse_merged_matrix_pr.shape

(123952, 51744)

In [29]:
sparse_merged_matrix_ur = to_sparse('sparse.merged.matrix.ur', merged_matrix_user_rows)

Creating a sparse matrix...


In [30]:
sparse_merged_matrix_ur.shape

(123952, 51744)

In [33]:
def normalize_merged_matrix(sparse_matrix, dense_merged_matrix):

    print('Calculating global mean...')
    global_mean = sparse_matrix.sum()/(sparse_matrix != 0).sum()
    print(global_mean)
    
    print('Calculating row mean...')
    matrix_row_mean = sparse_matrix.sum(1)/(sparse_matrix != 0).sum(1)
    # print(matrix_row_mean)

    print('Normalizing the data...')
    # count = 0
    # normalized_matrix = np.zeros(shape=(len(dense_merged_matrix[:, 0]), len(dense_merged_matrix[0, :])))
    normalized_matrix = sparse_matrix.copy()
    normalized_matrix -= matrix_row_mean.reshape(-1, 1)
    print(normalized_matrix.shape)

#     # Seems kind of hacky, but normalizing in batches due to memory shortage
#     # I should compress this to a function
#     print('First normalization batch...')
#     for row in range(0, 30000):
#         try:
#             # print(count)
#             normalized_matrix[row, :] = sparse_matrix[row, :] - matrix_row_mean[row, :].reshape(-1, 1)
#             # count += 1
#         except:
#             pass
#     print('Second normalization batch...')
#     for row in range(30000, 60000):
#         try:
#             # print(count)
#             normalized_matrix[row, :] = sparse_matrix[row, :] - matrix_row_mean[row, :].reshape(-1, 1)
#             # count += 1
#         except:
#             pass
#     print('Third normalization batch...')
#     for row in range(60000, 90000):
#         try:
#             # print(count)
#             normalized_matrix[row, :] = sparse_matrix[row, :] - matrix_row_mean[row, :].reshape(-1, 1)
#             # count += 1
#         except:
#             pass
#     print('Fourth normalization batch...')
#     for row in range(90000, 120000):
#         try:
#             # print(count)
#             normalized_matrix[row, :] = sparse_matrix[row, :] - matrix_row_mean[row, :].reshape(-1, 1)
#             # count += 1
#         except:
#             pass
#     print('Fifth normalization batch...')
#     for row in range(120000, row_mean.shape[0]-1):
#         try:
#             # print(count)
#             normalized_matrix[row, :] = sparse_matrix[row, :] - matrix_row_mean[row, :].reshape(-1, 1)
#             # count += 1
#         except:
#             pass
    
    print('Normalized matrix: ')
    print(normalized_matrix.shape)
    print(normalized_matrix)

    return normalized_matrix, matrix_row_mean, global_mean

In [None]:

normalized_matrix_products, matrix_row_mean_products, global_mean = normalize_merged_matrix(sparse_merged_matrix_pr,
                                                                                            merged_matrix_product_rows)


Calculating global mean...
4.110994929404886
Calculating row mean...
Normalizing the data...


In [None]:

normalized_matrix_users, matrix_row_mean_users, global_mean = normalize_merged_matrix(sparse_merged_matrix_ur)


In [None]:
def compute_svd_from_demeaned(urm_demeaned):

    print('Computing svd from de-meaned matrix...')

    U, sigma, Vt = svds(urm_demeaned, k = 100)
    S = np.diag(sigma)

    return U, S, Vt

In [None]:
U_products, S_products, Vt_products = compute_svd_from_demeaned(normalized_matrix_products)

In [None]:
U_users, S_users, Vt_users = compute_svd_from_demeaned(normalized_matrix_users)

In [None]:
normalized_matrix_products.shape

In [None]:
np.dot(np.dot(U_products, S_products), Vt_products)

In [None]:
# U_products

In [None]:
# S_products

In [None]:
# Vt_products

In [None]:
normalized_matrix_users.shape

In [None]:
np.dot(np.dot(U_users, S_users), Vt_users)

In [None]:
# U_users

In [None]:
# S_users

In [None]:
# Vt_users

In [None]:
def reconstruct_demeaned_matrix(test_file, prediction_file, test_user_dict, test_product_dict,
                                U_products, S_products, Vt_products, U_users, S_users, Vt_users,
                                matrix_row_mean_products, matrix_row_mean_users, global_mean):

    print('Reconstructing matrix and making predictions...')
    
#     print(len(test_user_dict))
#     print(len(test_product_dict))
    
    
    right_term_products = np.dot(S_products, Vt_products)
    right_term_users = np.dot(S_users, Vt_users)

    with open(test_file, 'r') as test_file:
        test_reader = csv.reader(test_file, delimiter=',')
        next(test_reader, None)
        with open(prediction_file, 'w') as outfile:
            outfile_reader = csv.writer(outfile, delimiter=',')
            outfile_reader.writerow(['datapointID', 'overall'])

            for row in test_reader:

                try: 
                    # Query by user.
                    user_query = np.dot(right_term_products.T, U_products[test_user_dict[row[0]], :].T)
                    prediction = -user_query[test_product_dict[row[1]]]
                    outfile_reader.writerow([row[0], prediction])
#                     print('Query by USER')
#                     print(row[1])
#                     print(prediction)
                except:
                    try:
                        # Query by product.
                        product_query = np.dot(right_term_users.T, U_users[test_product_dict[row[1]], :].T)
                        prediction = -product_query[test_user_dict[row[0]]]
                        outfile_reader.writerow([row[0], prediction])
#                         print('Query by PRODUCT')
#                         print(row[1])
#                         print(prediction)
                    except:
                        # If no matching users or products are found, make prediction based on global mean.
                        prediction = global_mean
                        outfile_reader.writerow([row[0], prediction])
#                         print('No matching query: GLOBAL MEAN')
#                         print(row[1])
#                         print(prediction)

    print('Done.')
    return None

In [None]:
predicted_ratings = reconstruct_demeaned_matrix('reviews.test.unlabeled.csv', 'reviews.test.labeled.csv',
                                                test_user_dict, test_product_dict,
                                                U_products, S_products, Vt_products, U_users, S_users, Vt_users, 
                                                matrix_row_mean_products, matrix_row_mean_users,
                                                global_mean)