In [1]:
import pandas as pd
import json
import gzip
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import csv
from sparsesvd import sparsesvd
import math
import operator

In [2]:
def unzip_json(filename):
    print('Unzipping json file...')
    unzipped_data = pd.read_json(gzip.open(filename))
    return unzipped_data

In [3]:
# Output json training data as a Pandas dataframe.
def json_to_df(file_name):

    print('Converting json file to dataframe...')

    try:
        training_data = pd.read_json(file_name, lines=True)
        return training_data
    except:
        print('Please try another file name.')

        return None

In [4]:
def convert_to_csv(dataframe, desired_filename):

    print('Converting dataframe to csv: ' + desired_filename + '...')

    try:
        return dataframe.to_csv(desired_filename, index=False)
    except:
        print('Please try another dataframe or file name.')

    return None

In [5]:
# Returns dictionaries with unique users and products as keys and unique ints as values.
def create_user_product_dicts(filename):

    print('Creating dictionaries from CSV for unique users and products...')

    user_dict = {}
    product_dict = {}
    user_count = 0
    product_count = 0

    with open(filename, 'r') as train_file:
        file_reader=csv.reader(train_file, delimiter=',')
        next(file_reader, None)

        for row in file_reader:
            if row[0] not in user_dict:
                user_dict[row[0]] = user_count
                user_count += 1
            if row[1] not in product_dict:
                product_dict[row[1]] = product_count
                product_count += 1

    print('Len user and product dicts')
    print(len(user_dict))
    print(len(product_dict))
    # print(product_dict)

    print('Getting largest values')
    print('Users: ')
    lg_user = max(user_dict.items(), key=operator.itemgetter(1))[0]
    print(lg_user)
    print(user_dict[lg_user])
    print('Products: ')
    lg_prod = max(product_dict.items(), key=operator.itemgetter(1))[0]
    print(lg_prod)
    print(product_dict[lg_prod])

    print('User and items end counts')
    print('users:')
    print(user_count)
    print('products: ')
    print(product_count)

    print('dict lengths:')
    print('user')
    print(len(user_dict))
    print('products: ')
    print(len(product_dict))

    return user_dict, product_dict, user_count, product_count

In [6]:
user_dict, product_dict, user_count, product_count = create_user_product_dicts('reviews.test.shortened.csv')

Creating dictionaries from CSV for unique users and products...
Len user and product dicts
969
944
Getting largest values
Users: 
A9TG2NFA614S9
968
Products: 
B0000DKDUR
943
User and items end counts
users:
969
products: 
944
dict lengths:
user
969
products: 
944


In [7]:
print(len(user_dict))

969


In [8]:
def training_mtx(filename, user_dict, product_dict):

        print('Creating a dense matrix from training data...')

        num_user_ids = len(user_dict)
        num_product_ids = len(product_dict)

        dense_matrix = np.zeros(shape=(num_user_ids, num_product_ids), dtype=np.float32)
        
        print(dense_matrix.shape)

        with open(filename, 'r') as train_file:
            matrix_reader = csv.reader(train_file, delimiter=',')
            next(matrix_reader, None)
            for row in matrix_reader:
                dense_matrix[user_dict[row[0]], product_dict[row[1]]] = float(row[2])

        print('training matrix shape: ')
        print(dense_matrix.shape)
        print(dense_matrix)
        
        print(dense_matrix.shape)

        return dense_matrix

In [9]:
training_mtx('reviews.test.shortened.csv', user_dict, product_dict)

Creating a dense matrix from training data...
(969, 944)
training matrix shape: 
(969, 944)
[[5. 0. 0. ... 0. 0. 0.]
 [0. 2. 0. ... 0. 0. 0.]
 [0. 0. 5. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 2. 0. 0.]
 [0. 0. 0. ... 0. 5. 0.]]
(969, 944)


array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 2., 0., 0.],
       [0., 0., 0., ..., 0., 5., 0.]], dtype=float32)

In [10]:
# Outputs dictionaries with unique test users and test products.
def get_test_users_products(filename, training_user_dict, training_product_dict, user_count, product_count):
    
    print(len(training_user_dict))
    print(len(training_product_dict))

    print('Importing test users and products...')

    test_user_count = user_count
    test_product_count = product_count
    test_user_dict = training_user_dict.copy()
    test_product_dict = training_product_dict.copy()

    with open(filename, 'r') as test_file:
        test_reader = csv.reader(test_file, delimiter=',')
        next(test_reader, None)

        for row in test_reader:
            # Add unique users to test_user dictionary.
            if row[1] not in test_user_dict:
                test_user_dict[row[1]] = test_user_count
                test_user_count += 1
            # Add unique products to test_product dictionary.
            if row[2] not in test_product_dict:
                test_product_dict[row[2]] = test_product_count
                test_product_count += 1

    print('Len test dicts, user and product: ')
    print(len(test_user_dict))
    print(len(test_product_dict))
    print('Len test user and product counts: ')
    print(test_user_count)
    print(test_product_count)
    # print(test_user_dict)
    # print(test_product_dict)
    
    print('Training user and product dict lengths: ')
    print(len(training_user_dict))
    print(len(training_product_dict))

    return test_user_dict, test_product_dict

In [11]:
test_user_dict, test_product_dict = get_test_users_products('reviews.dev.shortened.csv', user_dict, product_dict,
                                                            user_count, product_count)

969
944
Importing test users and products...
Len test dicts, user and product: 
1925
949
Len test user and product counts: 
1925
949
Training user and product dict lengths: 
969
944


In [12]:
print(test_product_dict)

{'B000CDSS22': 0, '6305186774': 1, 'B004LWZW24': 2, 'B00001U0DM': 3, 'B00005JOZI': 4, 'B0002ZMHWM': 5, '6305426678': 6, 'B005ER6SNM': 7, 'B005LAIIJY': 8, 'B004EPYZUS': 9, 'B0001NBNJ8': 10, 'B00BBAQD6S': 11, 'B003EYVXSW': 12, '6304431856': 13, '6305364664': 14, 'B000HEVZ7G': 15, 'B00005JLZN': 16, 'B00005UQ9W': 17, 'B00006C7G9': 18, 'B0050N0UR0': 19, '6302038308': 20, 'B000FDECEM': 21, 'B00003CXKM': 22, 'B000VY1EYG': 23, 'B00B52F278': 24, '6302098440': 25, 'B002WIDRLC': 26, 'B002BWP3UC': 27, '6305499071': 28, '6305426651': 29, '0767813871': 30, 'B000055WAO': 31, '078323211X': 32, 'B0002Y4PPK': 33, '0800141660': 34, 'B00019G3BK': 35, 'B00118VEJ6': 36, 'B0006FFRB6': 37, 'B00005JPS8': 38, '6304697961': 39, '6302636779': 40, 'B008JFUOWM': 41, 'B00005Y6XI': 42, 'B00008W64E': 43, 'B0002I84JO': 44, '6302919509': 45, '0792154649': 46, '079284615X': 47, 'B0070B9SKU': 48, 'B000A6T1J0': 49, '6300248569': 50, 'B00000F3SU': 51, 'B006X040NY': 52, '630126939X': 53, 'B001KZIRKY': 54, 'B0018CNNV2': 55, '

In [13]:
# Merging (ONLY) new movies (not new users) into the matrix
def merged_mtx(test_file, train_file, test_user_dict, test_product_dict, user_dict,
               product_dict):

    print('Merging training and test data for ratings imputation...')

    num_user_ids = len(user_dict) # Training users
    num_product_ids = len(test_product_dict) # Training and test products

    merged_matrix = np.zeros(shape=(num_user_ids, num_product_ids), dtype=np.float32)
    
    print(merged_matrix.shape)

    with open(test_file, 'r') as test_file:
        file_reader=csv.reader(test_file, delimiter=',')
        next(file_reader, None)

        for row in file_reader:
            
            # print(row[1])
            # print(test_product_dict[row[2]])
            
            merged_matrix[:, test_product_dict[row[2]]] = float(0)

    with open(train_file, 'r') as train_file:
        file_reader=csv.reader(train_file, delimiter=',')
        next(file_reader, None)

        for row in file_reader:
            merged_matrix[user_dict[row[0]], product_dict[row[1]]] = float(row[2])
    
    print(merged_matrix.shape)
    print(merged_matrix)

#     print('Merged matrix tests: ')
#     print(merged_matrix[[test_user_dict['A34DNO6UAH67Z0']],[test_product_dict['B000CDSS22']]]) # Should be 5
#     print(merged_matrix[[test_user_dict['A3APW42N5MRVWT']], [test_product_dict['6305186774']]]) # Should be 2
#     print(merged_matrix[[test_user_dict['A2M03PQV8R826Z']], [test_product_dict['B0000DKDUR']]]) # Should be 5

    return merged_matrix

In [14]:
merged_matrix = merged_mtx('reviews.dev.shortened.csv', 'reviews.test.shortened.csv', test_user_dict,
                           test_product_dict, user_dict, product_dict)

Merging training and test data for ratings imputation...
(969, 949)
(969, 949)
[[5. 0. 0. ... 0. 0. 0.]
 [0. 2. 0. ... 0. 0. 0.]
 [0. 0. 5. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
def normalize_merged_matrix(matrix):

    print('Calculating ratings mean...')
    # matrix_mean = np.mean(matrix, axis=1)
    matrix_row_mean = np.true_divide(matrix.sum(1), (matrix != 0).sum(1))
    print('Matrix row mean: ')
    print(matrix_row_mean[0])
    matrix_column_mean = np.true_divide(matrix.sum(0), (matrix != 0).sum(0))
    print('matrix column mean: ')
    print(matrix_column_mean[0])

    # matrix_mean_test = np.mean(matrix[matrix > 0])
    # print(matrix_mean_test)
    # matrix_mean = matrix.sum(1)/(matrix!=0).sum(1).astype(float)

    global_mean = np.mean(matrix)
    print('Global mean: ')
    print(global_mean)

    print('Normalizing the data...')
    normalized_matrix = matrix - matrix_row_mean.reshape(-1, 1)
    print('Normalized matrix: ')
    print(normalized_matrix)

    print('Test normalized mean value: ')
    print(matrix_row_mean[0])

    print('Fill nan with 0')
    normalized_matrix = np.nan_to_num(normalized_matrix)
    print(normalized_matrix)

    return normalized_matrix, matrix_row_mean, matrix_column_mean, global_mean

In [16]:
normalized_matrix, matrix_row_mean, matrix_column_mean, global_mean = normalize_merged_matrix(merged_matrix)

Calculating ratings mean...
Matrix row mean: 
5.0
matrix column mean: 
5.0
Global mean: 
0.004489001
Normalizing the data...
Normalized matrix: 
[[ 0. -5. -5. ... -5. -5. -5.]
 [-2.  0. -2. ... -2. -2. -2.]
 [-5. -5.  0. ... -5. -5. -5.]
 ...
 [-5. -5. -5. ... -5. -5. -5.]
 [-2. -2. -2. ... -2. -2. -2.]
 [-5. -5. -5. ... -5. -5. -5.]]
Test normalized mean value: 
5.0
Fill nan with 0
[[ 0. -5. -5. ... -5. -5. -5.]
 [-2.  0. -2. ... -2. -2. -2.]
 [-5. -5.  0. ... -5. -5. -5.]
 ...
 [-5. -5. -5. ... -5. -5. -5.]
 [-2. -2. -2. ... -2. -2. -2.]
 [-5. -5. -5. ... -5. -5. -5.]]


  


In [17]:
def compute_svd_from_demeaned(urm_demeaned):

    print('Computing svd from de-meaned matrix...')

    U, sigma, Vt = svds(urm_demeaned, k = 100)
    S = np.diag(sigma)

    return U, S, Vt

In [18]:
U, S, Vt = compute_svd_from_demeaned(normalized_matrix)

Computing svd from de-meaned matrix...


In [19]:
U.shape

(969, 100)

In [20]:
S.shape

(100, 100)

In [21]:
Vt.shape

(100, 949)

In [22]:
np.dot(U, S).shape

(969, 100)

In [23]:
np.dot(S, Vt).T.shape

(949, 100)

In [24]:
right_term = np.dot(S, Vt)
right_term

array([[-1.37205650e-01, -5.89805982e-16, -4.20318367e-01, ...,
        -3.81639165e-16, -4.51028104e-16, -5.20417043e-16],
       [-3.81277566e-01,  9.89808821e-16,  2.66869126e-01, ...,
         1.15958810e-15,  1.21471301e-15,  1.25730183e-15],
       [ 1.02816145e-02, -6.22332047e-16, -7.37814147e-01, ...,
         4.05491613e-16,  3.61039323e-16,  3.32850067e-16],
       ...,
       [-1.83510544e-02, -1.18028604e-02, -1.83510544e-02, ...,
        -1.09573015e-02, -1.09573015e-02, -1.09573015e-02],
       [-1.30626992e-02, -1.16084588e-03, -1.30626992e-02, ...,
         5.37102953e-04,  5.37102952e-04,  5.37102952e-04],
       [ 1.33719043e+02,  1.33875875e+02,  1.33719043e+02, ...,
         1.33905748e+02,  1.33905748e+02,  1.33905748e+02]])

In [25]:
reconstructed_user_query = np.dot(right_term.T, U[0, :].T)
reconstructed_user_query

array([-4.53999667, -4.99878382, -5.01956388, -4.99725138, -4.99599273,
       -4.97408451, -5.01079028, -5.00499575, -5.00188509, -4.91127262,
       -4.99502001, -4.90725531, -4.99725138, -4.99968802, -5.05458758,
       -4.99502001, -4.99502001, -5.00188509, -5.00121525, -4.84908835,
       -5.00121525, -4.99725138, -4.99725138, -4.99502001, -4.97543973,
       -4.99502001, -4.9012458 , -4.89994935, -4.99725138, -5.0074009 ,
       -5.14554059, -4.83743051, -4.99725138, -4.99502001, -5.13173357,
       -5.00174976, -5.0336907 , -4.99878382, -4.99725138, -5.07768594,
       -5.0013573 , -5.02412163, -5.08411467, -4.99502001, -4.99968802,
       -5.09605656, -5.00121525, -4.86294563, -5.04170124, -4.93178321,
       -5.03431382, -4.92073637, -5.00121525, -5.04919716, -4.99502001,
       -4.99502001, -4.85783127, -4.86727461, -5.01788884, -4.99502001,
       -4.92059394, -5.00121525, -5.00157283, -4.92983148, -4.99878382,
       -5.0312819 , -4.99878382, -5.10004367, -4.93043905, -4.99

In [26]:
# re_meaned_prediction = reconstructed_user_query + matrix_row_mean.reshape(-1,1)
# re_meaned_prediction

In [27]:
right_term = np.dot(S, Vt)

In [28]:
user_query = np.dot(right_term.T, U[user_dict['A34DNO6UAH67Z0'], :].T)
user_query[test_product_dict['B000CDSS22']]

-4.539996673721209

In [39]:
def reconstruct_demeaned_matrix(U, S, Vt, urm_mean, testing_file, outfile, test_user_dict, test_product_dict):

    print('Reconstructing matrix and making predictions...')
    
    print(len(test_user_dict))
    print(len(test_product_dict))
    
    right_term = np.dot(S, Vt)

    with open(testing_file, 'r') as test_file:
        test_reader = csv.reader(test_file, delimiter=',')
        next(test_reader, None)
        with open(outfile, 'w') as outfile:
            outfile_reader = csv.writer(outfile, delimiter=',')
            outfile_reader.writerow(['datapointID', 'overall'])

            for row in test_reader:

                try: 
                    # user_query = np.dot(right_term.T, U[test_user_dict[row[0]], :].T)
                    user_query = np.dot(right_term.T, U[test_user_dict['A1JPGO1PST2QBA'], :].T)
                    prediction = -user_query[test_product_dict[row[1]]]
                    print('Making a prediction based on SVD')
                except:
                    print(row[1])
                    prediction = matrix_column_mean[test_product_dict[row[1]]]
                    print('Making a prediction based on mean.')               

                outfile_reader.writerow([row[0], row[2], prediction])

    print('Done.')
    return None

In [40]:
predicted_ratings = reconstruct_demeaned_matrix(U, S, Vt, matrix_row_mean, 'reviews.dev.shortened.csv',
                                                'reviews.test.labeled.csv', test_user_dict, test_product_dict)

Reconstructing matrix and making predictions...
1925
949
6300210685


KeyError: '6300210685'

In [43]:
test_df = pd.read_csv('reviews.dev.shortened.csv')
test_df.head()

Unnamed: 0,reviewerID,asin,overall
0,A16NGP74HECTI9,6300210685,5
1,A2Z2W192L0WN9B,6305892865,5
2,A2U1SWUB5HDQ5J,B008JFUN50,5
3,A23NSSJMEISOO2,B004LWZW24,1
4,A14YZVQTMRO7NE,B000MNP2K8,5


In [46]:
test_df.shape

(1000, 3)

In [44]:
training_df = pd.read_csv('reviews.test.shortened.csv')
training_df.head()

Unnamed: 0,reviewerID,asin,overall
0,A34DNO6UAH67Z0,B000CDSS22,5
1,A3APW42N5MRVWT,6305186774,2
2,A20D9VGCF3P13L,B004LWZW24,5
3,A82LIVYSX6WZ9,B00001U0DM,3
4,A3LRKDF5WU4ZDO,B00005JOZI,3


In [47]:
training_df.shape

(1000, 3)