In [51]:
import pandas as pd
import json
import gzip
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
import sklearn
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import csv
from sparsesvd import sparsesvd
import math
import operator

In [25]:
def unzip_json(filename):
    
    print('Unzipping json file...')
    
    unzipped_data = pd.read_json(gzip.open(filename))
    
    return unzipped_data

In [26]:
# Output json training data as a Pandas dataframe.
def json_to_df(file_name):

    print('Converting json file to dataframe...')

    try:
        training_data = pd.read_json(file_name, lines=True)
        return training_data
    except:
        print('Please try another file name.')

        return None

In [27]:
def convert_to_csv(dataframe, desired_filename):

    print('Converting dataframe to csv: ' + desired_filename + '...')

    try:
        return dataframe.to_csv(desired_filename, index=False)
    except:
        print('Please try another dataframe or file name.')

    return None

In [28]:
# Returns dictionaries with unique users and products as keys and unique ints as values.
def create_user_product_dicts(filename):

    print('Creating dictionaries from CSV for unique users and products...')

    user_dict = {}
    product_dict = {}
    user_count = 0
    product_count = 0

    with open(filename, 'r') as train_file:
        file_reader=csv.reader(train_file, delimiter=',')
        next(file_reader, None)

        for row in file_reader:
            if row[0] not in user_dict:
                user_dict[row[0]] = user_count
                user_count += 1
            if row[1] not in product_dict:
                product_dict[row[1]] = product_count
                product_count += 1

    return user_dict, product_dict, user_count, product_count

In [29]:
user_dict, product_dict, user_count, product_count = create_user_product_dicts('reviews.test.shortened.csv')

Creating dictionaries from CSV for unique users and products...


In [30]:
def training_mtx(filename, user_dict, product_dict):

        print('Creating a dense matrix from training data...')

        num_user_ids = len(user_dict)
        num_product_ids = len(product_dict)

        dense_matrix = np.zeros(shape=(num_user_ids, num_product_ids), dtype=np.float32)

        with open(filename, 'r') as train_file:
            matrix_reader = csv.reader(train_file, delimiter=',')
            next(matrix_reader, None)
            for row in matrix_reader:
                dense_matrix[user_dict[row[0]], product_dict[row[1]]] = float(row[2])

        return dense_matrix

In [31]:
training_matrix = training_mtx('reviews.test.shortened.csv', user_dict, product_dict)

Creating a dense matrix from training data...


In [32]:
training_matrix

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 2., 0., 0.],
       [0., 0., 0., ..., 0., 5., 0.]], dtype=float32)

In [33]:
training_matrix.shape

(969, 944)

In [34]:
# Outputs dictionaries with unique test users and test products.
def get_test_users_products(filename, training_user_dict, training_product_dict):

    print('Importing test users and products...')

    test_user_count = len(training_user_dict)
    test_product_count = len(training_product_dict)
    test_user_dict = training_user_dict.copy()
    test_product_dict = training_product_dict.copy()

    with open(filename, 'r') as test_file:
        test_reader = csv.reader(test_file, delimiter=',')
        next(test_reader, None)

        for row in test_reader:
            # Add unique users to test_user dictionary.
            # print(row[1])
            if row[1] not in test_user_dict:
                test_user_dict[row[1]] = test_user_count
                test_user_count += 1
            # Add unique products to test_product dictionary.
            # print(row[2])
            if row[2] not in test_product_dict:
                test_product_dict[row[2]] = test_product_count
                test_product_count += 1

    return test_user_dict, test_product_dict

In [35]:
test_user_dict, test_product_dict = get_test_users_products('reviews.test.unlabeled.shortened.csv', user_dict, product_dict)

Importing test users and products...


In [36]:
# print(test_user_dict)

In [37]:
# # Merging new users OR new products into the matrix along the x axis
# def merged_mtx(test_file, train_file, test_y_axis, test_x_axis, train_y_axis,
#                train_x_axis, test_y_axis_row_num, test_x_axis_row_num, train_y_axis_row_num, train_x_axis_row_num):

#     print('Merging training and test data for ratings imputation...')

#     num_user_ids = len(user_dict) # Training users only
#     num_product_ids = len(test_x_axis) # Training and test products

#     merged_matrix = np.zeros(shape=(num_user_ids, num_product_ids), dtype=np.float32)
    
#     print('Merged matrix shape: ')
#     print(merged_matrix.shape)

#     with open(test_file, 'r') as test_file:
#         file_reader=csv.reader(test_file, delimiter=',')
#         next(file_reader, None)

#         for row in file_reader:
            
#             # print(row[1])
#             # print(test_product_dict[row[2]])
            
#             merged_matrix[:, test_x_axis[row[test_x_axis_row_num]]] = float(0)

#     with open(train_file, 'r') as train_file:
#         file_reader=csv.reader(train_file, delimiter=',')
#         next(file_reader, None)

#         for row in file_reader:
#             merged_matrix[train_y_axis[row[train_y_axis_row_num]], train_x_axis[row[train_x_axis_row_num]]] = float(row[2])

#     return merged_matrix

In [38]:
# Pad the matrix to add extra products along the x axis

def merged_mtx_products(filename, user_dict, test_product_dict):
    
        print('Creating a matrix with new products on the x axis...')

        num_user_ids = len(user_dict)
        num_product_ids = len(test_product_dict)

        dense_matrix = np.zeros(shape=(num_user_ids, num_product_ids), dtype=np.float32)

        with open(filename, 'r') as train_file:
            matrix_reader = csv.reader(train_file, delimiter=',')
            next(matrix_reader, None)
            for row in matrix_reader:
                dense_matrix[user_dict[row[0]], product_dict[row[1]]] = float(row[2])

        return dense_matrix

In [39]:
merged_matrix_product_rows = merged_mtx_products('reviews.test.shortened.csv', user_dict, test_product_dict)

Creating a matrix with new products on the x axis...


In [40]:
merged_matrix_product_rows.shape

(969, 1040)

In [41]:
merged_matrix_product_rows

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [42]:
sum(merged_matrix_product_rows[-1,:])

5.0

In [43]:
def merged_mtx_users(filename, product_dict, test_user_dict):

        print('Creating a matrix with new users on the x axis...')

        num_product_ids = len(product_dict)
        num_user_ids = len(test_user_dict)

        dense_matrix = np.zeros(shape=(num_product_ids, num_user_ids), dtype=np.float32)

        with open(filename, 'r') as train_file:
            matrix_reader = csv.reader(train_file, delimiter=',')
            next(matrix_reader, None)
            for row in matrix_reader:
                dense_matrix[product_dict[row[1]], user_dict[row[0]]] = float(row[2])

        return dense_matrix

In [44]:
merged_matrix_user_rows = merged_mtx_users('reviews.test.shortened.csv', product_dict, test_user_dict)

Creating a matrix with new users on the x axis...


In [45]:
merged_matrix_user_rows.shape

(944, 1065)

In [46]:
merged_matrix_user_rows

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [93]:
def to_sparse(matrix):
    sparse_matrix = scipy.sparse.csr_matrix(merged_matrix_product_rows)
    return sparse_matrix

In [94]:
sparse_merged_matrix_pr = to_sparse(merged_matrix_product_rows)

In [95]:
sparse_merged_matrix_ur = to_sparse(merged_matrix_user_rows)

In [152]:
def normalize_merged_matrix(sparse_matrix):

    print('Calculating global mean...')
    global_mean = sparse_matrix.sum()/(sparse_matrix != 0).sum()
    print(global_mean)
    
    print('Calculating row mean...')
    matrix_row_mean = sparse_matrix.sum(1)/(sparse_matrix != 0).sum(1)
    # print(matrix_row_mean)

    print('Normalizing the data...')
    normalized_matrix = sparse_matrix - matrix_row_mean.reshape(-1, 1)
    
    print('Normalized matrix: ')
    print(normalized_matrix.shape)
    print(normalized_matrix)

    return normalized_matrix, matrix_row_mean, global_mean

In [153]:

normalized_matrix_products, matrix_row_mean_products, global_mean_products = normalize_merged_matrix(sparse_merged_matrix_pr)


Calculating global mean...
4.128
Calculating row mean...
Normalizing the data...
Normalized matrix: 
(969, 1040)
[[ 0. -5. -5. ... -5. -5. -5.]
 [-2.  0. -2. ... -2. -2. -2.]
 [-5. -5.  0. ... -5. -5. -5.]
 ...
 [-5. -5. -5. ... -5. -5. -5.]
 [-2. -2. -2. ... -2. -2. -2.]
 [-5. -5. -5. ... -5. -5. -5.]]


In [154]:

normalized_matrix_users, matrix_row_mean_users, global_mean_users = normalize_merged_matrix(sparse_merged_matrix_ur)


Calculating global mean...
4.128
Calculating row mean...
Normalizing the data...
Normalized matrix: 
(969, 1040)
[[ 0. -5. -5. ... -5. -5. -5.]
 [-2.  0. -2. ... -2. -2. -2.]
 [-5. -5.  0. ... -5. -5. -5.]
 ...
 [-5. -5. -5. ... -5. -5. -5.]
 [-2. -2. -2. ... -2. -2. -2.]
 [-5. -5. -5. ... -5. -5. -5.]]


In [155]:
def compute_svd_from_demeaned(urm_demeaned):

    print('Computing svd from de-meaned matrix...')

    U, sigma, Vt = svds(urm_demeaned, k = 100)
    S = np.diag(sigma)

    return U, S, Vt

In [156]:
U_products, S_products, Vt_products = compute_svd_from_demeaned(normalized_matrix_products)

Computing svd from de-meaned matrix...


In [157]:
U_users, S_users, Vt_users = compute_svd_from_demeaned(normalized_matrix_users)

Computing svd from de-meaned matrix...


In [158]:
normalized_matrix_products.shape

(969, 1040)

In [159]:
np.dot(np.dot(U_products, S_products), Vt_products)

array([[-4.24445586, -4.99878483, -4.92037002, ..., -4.99998847,
        -4.99998847, -4.99998847],
       [-1.99701946, -1.99956519, -1.99701946, ..., -2.00002854,
        -2.00002854, -2.00002854],
       [-4.92037002, -4.99878483, -4.63998896, ..., -4.99998847,
        -4.99998847, -4.99998847],
       ...,
       [-5.0318666 , -4.99878483, -5.02709448, ..., -4.99998847,
        -4.99998847, -4.99998847],
       [-1.99701946, -1.99956519, -1.99701946, ..., -2.00002854,
        -2.00002854, -2.00002854],
       [-5.06782099, -4.99878483, -5.02507599, ..., -4.99998847,
        -4.99998847, -4.99998847]])

In [160]:
# U_products

In [161]:
# S_products

In [162]:
# Vt_products

In [163]:
normalized_matrix_users.shape

(969, 1040)

In [164]:
np.dot(np.dot(U_users, S_users), Vt_users)

array([[-4.30122229, -4.99878483, -5.09301256, ..., -4.99998847,
        -4.99998847, -4.99998847],
       [-1.99701946, -1.99956519, -1.99701946, ..., -2.00002854,
        -2.00002854, -2.00002854],
       [-5.09301256, -4.99878483, -4.45486786, ..., -4.99998847,
        -4.99998847, -4.99998847],
       ...,
       [-5.05644411, -4.99878483, -4.99258145, ..., -4.99998847,
        -4.99998847, -4.99998847],
       [-1.99701946, -1.99956519, -1.99701946, ..., -2.00002854,
        -2.00002854, -2.00002854],
       [-4.99515639, -4.99878483, -5.02676742, ..., -4.99998847,
        -4.99998847, -4.99998847]])

In [165]:
# U_users

In [166]:
# S_users

In [167]:
# Vt_users

In [168]:
def reconstruct_demeaned_matrix(test_file, prediction_file, test_user_dict, test_product_dict,
                                U_products, S_products, Vt_products, U_users, S_users, Vt_users,
                                matrix_row_mean_products, matrix_row_mean_users, global_mean_products, global_mean_users):

    print('Reconstructing matrix and making predictions...')
    
#     print(len(test_user_dict))
#     print(len(test_product_dict))
    
    right_term_products = np.dot(S_products, Vt_products)
    right_term_users = np.dot(S_users, Vt_users)

    with open(test_file, 'r') as test_file:
        test_reader = csv.reader(test_file, delimiter=',')
        next(test_reader, None)
        with open(prediction_file, 'w') as outfile:
            outfile_reader = csv.writer(outfile, delimiter=',')
            outfile_reader.writerow(['datapointID', 'overall'])

            for row in test_reader:

                try: 
                    # Query by user.
                    user_query = np.dot(right_term_products.T, U_products[test_user_dict[row[0]], :].T)
                    prediction = -user_query[test_product_dict[row[1]]]
                    outfile_reader.writerow([row[0], prediction])
#                     print('Query by USER')
#                     print(row[1])
#                     print(prediction)
                except:
                    try:
                        # Query by product.
                        product_query = np.dot(right_term_users.T, U_users[test_product_dict[row[1]], :].T)
                        prediction = -product_query[test_user_dict[row[0]]]
                        outfile_reader.writerow([row[0], prediction])
#                         print('Query by PRODUCT')
#                         print(row[1])
#                         print(prediction)
                    except:
                        # If no matching users or products are found, make prediction based on global mean.
                        prediction = ((global_mean_products + global_mean_users)/2)
                        outfile_reader.writerow([row[0], prediction])
#                         print('No matching query: GLOBAL MEAN')
#                         print(row[1])
#                         print(prediction)

    print('Done.')
    return None

In [144]:
predicted_ratings = reconstruct_demeaned_matrix('reviews.test.unlabeled.csv', 'reviews.test.labeled.csv',
                                                test_user_dict, test_product_dict,
                                                U_products, S_products, Vt_products, U_users, S_users, Vt_users, 
                                                matrix_row_mean_products, matrix_row_mean_users,
                                                global_mean_products, global_mean_users)

Reconstructing matrix and making predictions...
Done.
