In [1]:
import pandas as pd
import json
import gzip
import numpy as np
import scipy
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import csv
from sparsesvd import sparsesvd
import math
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
from scipy.sparse.linalg import *

### Unzip json file.

In [2]:
def unzip_json(filename):
    print('Unzipping json file...')
    unzipped_data = pd.read_json(gzip.open(filename))
    return unzipped_data

### Load and write json data to csv.

In [3]:
# Output json training data as a Pandas dataframe.
def json_to_df(file_name):

    print('Converting json file to dataframe...')

    try:
        training_data = pd.read_json(file_name, lines=True)
        return training_data
    except:
        print('Please try another file name.')

        return None


# Convert Pandas dataframe to csv file for storage purposes.
# NOTE: Don't run this with the actual training data. This was just for saving a small version of the file for time
# saving purposes while I was setting up my dataframe and matrices.
def convert_to_csv(dataframe, desired_filename):

    print('Converting dataframe to csv: ' + desired_filename + '...')

    try:
        return dataframe.to_csv(desired_filename, index=False)
    except:
        print('Please try another dataframe or file name.')

    return None

In [4]:
reviews_df = pd.read_csv('reviews.dev.csv')
shortened_reviews = reviews_df.head(1000)
shortened_reviews.to_csv('reviews.test.shortened.csv', index = False)
shortened_reviews.head()

Unnamed: 0,reviewerID,asin,overall
0,A34DNO6UAH67Z0,B000CDSS22,5
1,A3APW42N5MRVWT,6305186774,2
2,A20D9VGCF3P13L,B004LWZW24,5
3,A82LIVYSX6WZ9,B00001U0DM,3
4,A3LRKDF5WU4ZDO,B00005JOZI,3


### Load and store CSV data as sparse matrix.

In [5]:
# Returns dictionaries with unique users and products as keys and unique ints as values.
def create_user_product_dicts(filename):

    print('Creating dictionaries from CSV for unique users and products...')

    user_dict = {}
    product_dict = {}
    user_count = 0
    product_count = 0

    with open(filename, 'r') as train_file:
        file_reader=csv.reader(train_file, delimiter=',')
        next(file_reader, None)

        for row in file_reader:
            if row[0] not in user_dict:
                user_dict[row[0]] = user_count
                user_count += 1
            if row[1] not in product_dict:
                product_dict[row[1]] = product_count
                product_count += 1

    return user_dict, product_dict

In [6]:
user_dict, product_dict = create_user_product_dicts('reviews.test.shortened.csv')

Creating dictionaries from CSV for unique users and products...


### Retrieve test users and products.

In [7]:
# Outputs dictionaries with unique test users and test products.
def get_test_users_products(filename, training_user_dict, training_product_dict):

    print('Importing test users and products...')

    test_user_count = len(training_user_dict)
    test_product_count = len(training_product_dict)
    # test_user_count = 0
    # test_product_count = 0
    test_user_dict = {}
    test_product_dict = {}

    with open(filename, 'r') as test_file:
        test_reader = csv.reader(test_file, delimiter=',')
        next(test_reader, None)

        for row in test_reader:
            # Add unique users to test_user dictionary.
            if row[1] in training_user_dict and row[1 not in test_user_dict]:
                test_user_dict[row[1]] = training_user_dict[row[1]]
            elif row[1] not in test_user_dict:
                test_user_count += 1
                test_user_dict[row[1]] = test_user_count
            # Add unique products to test_product dictionary.
            if row[2] in training_product_dict and row[2 not in test_product_dict]:
                test_product_dict[row[2]] = training_product_dict[row[2]]
            elif row[2] not in test_product_dict:
                test_product_count += 1
                test_product_dict[row[2]] = test_product_count

    return test_user_dict, test_product_dict

In [8]:
test_user_dict, test_product_dict = get_test_users_products('reviews.test.shortened.csv', user_dict, product_dict)

Importing test users and products...


In [9]:
df = pd.read_csv('reviews.test.shortened.csv')
df.head()

Unnamed: 0,reviewerID,asin,overall
0,A34DNO6UAH67Z0,B000CDSS22,5
1,A3APW42N5MRVWT,6305186774,2
2,A20D9VGCF3P13L,B004LWZW24,5
3,A82LIVYSX6WZ9,B00001U0DM,3
4,A3LRKDF5WU4ZDO,B00005JOZI,3


In [10]:
def readUrm(filename, user_dict, product_dict):

    print('Creating a first dense matrix from rating data...')

    num_user_ids = len(user_dict)
    # print(num_user_ids)
    num_product_ids = len(product_dict)
    # print(num_product_ids)

    urm = np.zeros(shape=(num_user_ids, num_product_ids), dtype=np.float32)

    with open(filename, 'r') as train_file:
        urmReader = csv.reader(train_file, delimiter=',')
        next(urmReader, None)
        for row in urmReader:
            urm[user_dict[row[0]], product_dict[row[1]]] = float(row[2])

    # print('Creating a sparse CSR matrix from dense rating matrix data...')
    # urm_sparse_csr = scipy.sparse.csr_matrix(urm, dtype=np.float32)

    print('Creating a sparse CSC matrix from dense rating matrix data...')
    urm_sparse_csc = scipy.sparse.csc_matrix(urm, dtype=np.float32)

    return urm_sparse_csc, num_user_ids, num_product_ids, urm

In [11]:
sparse_matrix, num_user_ids, num_product_ids, urm = readUrm('reviews.test.shortened.csv', user_dict, product_dict)

Creating a first dense matrix from rating data...
Creating a sparse CSC matrix from dense rating matrix data...


In [12]:
urm

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 2., 0., 0.],
       [0., 0., 0., ..., 0., 5., 0.]], dtype=float32)

In [13]:
urm.shape

(969, 944)

### Implement SVD matrix from sparse matrix.

In [14]:
def computeSVD(sparse_matrix, K):

    print('Computing SVD matrix...')

    U, s, Vt = sparsesvd(sparse_matrix, K) #csr --> csc
    S = np.diag(s)

    # dim = (len(s), len(s))
    # S = np.zeros(dim, dtype=np.float32)
    # for i in range(0, len(s)):
    #     S[i,i] = math.sqrt(s[i])

    return U.T, s, Vt

In [15]:
# U, S, Vt = computeSVD(sparse_matrix, 100)

In [16]:
# U.shape

In [17]:
# S.shape

In [18]:
# Vt.shape

In [19]:
# np.dot(U, np.dot(np.diag(S), Vt))

### Implement SVD from dense matrix with np.linalg

In [20]:
u, s, vh = np.linalg.svd(urm, full_matrices=False)

In [21]:
u.shape

(969, 944)

In [22]:
s.shape

(944,)

In [23]:
vh.shape

(944, 944)

In [24]:
np.dot(u * s, vh)

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 5., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 2., 0., 0.],
       [0., 0., 0., ..., 0., 5., 0.]], dtype=float32)

### Implement SVD from dense matrix with svds

In [25]:
U, sigma, Vt = svds(urm, k = 50)

In [26]:
U.shape

(969, 50)

In [27]:
sigma.shape

(50,)

In [29]:
s = np.diag(sigma)

In [30]:
s.shape

(50, 50)

In [28]:
Vt.shape

(50, 944)

In [32]:
rightTerm = np.dot(s, Vt)

In [33]:
rightTerm.shape

(50, 944)

In [37]:
test_prod = rightTerm[:, product_dict['B000CDSS22']]

In [None]:
U[:, ]

### Make ratings predictions from SVD matrix.

In [215]:
def recompose_matrix(U, S, Vt, user_dict, product_dict):

    rightTerm = np.dot(S, Vt)

    print('Right Term')
    print(rightTerm.shape)
    print(rightTerm)

    estimated_ratings = np.zeros(shape=(len(user_dict), len(product_dict)), dtype=np.float16)

    with open('reviews.training.csv', 'r') as test_file:
        test_reader = csv.reader(test_file, delimiter=',')
        next(test_reader, None)
        with open('reviews.test.labeled.csv', 'w') as outfile:
            outfile_reader = csv.writer(outfile, delimiter=',')
            outfile_reader.writerow(['userID', 'actual overall', 'predicted'])

            for row in test_reader:
                pass

                print('U queried shape')
                u_queried = U[:, user_dict[row[0]]]
                # print(u_queried.shape)
                print(u_queried)
                #
                print('Right term queried shape')
                rt_queried = rightTerm[:, product_dict[row[1]]]
                # print(rt_queried.shape)
                print(rt_queried)
                #
                print('Product of u queried by rt queried')
                prod = np.dot(u_queried, rt_queried)
                print(prod.shape)
                print(prod)

                # print('Estimated ratings')
                # estimated_ratings[:, user_dict[row[0]]] = prod.todense()
                # print(estimated_ratings)

                # estimated_ratings[user_dict[row[0]], :] = prod.todense()
                # predicted_rating = (estimated_ratings[user_dict[row[0]], product_dict[row[1]]])
                outfile_reader.writerow([row[0], row[2], prod])

    return estimated_ratings

In [216]:
estimated_ratings = recompose_matrix(U, S, Vt, user_dict, product_dict)

Right Term
(3, 944)
[[ 1.95178935e-20 -1.79320799e-17 -3.62921679e-17 ...  7.25561708e-17
   1.26387935e-16  1.14096283e-16]
 [-1.75554369e-20  2.02293626e-16  3.78484008e-17 ... -7.97219254e-16
  -1.31921309e-16 -1.34773945e-15]
 [ 3.28741732e-20 -1.41293686e-17 -4.56704885e-16 ... -3.69167201e-17
   2.10447054e-16 -3.42716586e-16]]
U queried shape


KeyError: 'AMFIPCYDYWGVT'