In [5]:
import pandas as pd
import json
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Loading and Storing the Data

In [6]:
def get_training_data(file_name):
    """Load a JSON Lines file of reviews into a pandas DataFrame.

    Parameters
    ----------
    file_name : str or path-like
        Location of the file to read. It is parsed with ``lines=True``,
        i.e. one JSON object per line.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or ``None`` (after printing a hint) when the file
        cannot be read or parsed.
    """
    try:
        # Read the file the caller asked for rather than a hard-coded path.
        return pd.read_json(file_name, lines=True)
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        print('Please try another file name.')
        return None

In [7]:
# Load the training reviews. get_training_data prints a hint and returns None if loading fails,
# so a None here means the cells below will not work.
training_data = get_training_data('reviews.training.json')

In [8]:
# (rows, columns) of the loaded reviews.
training_data.shape

(1358026, 9)

In [9]:
# Preview the first few reviews to check the available columns.
training_data.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B0090SI56Y,"[0, 0]",4,The movie was entertaining and falls in line w...,"12 22, 2013",AMFIPCYDYWGVT,AMRDxn,Entertaining,1387670400
1,B00005JL99,"[3, 6]",5,One thing to be said about Japanese horror/ext...,"11 19, 2003",A3G602Z4DWDZKS,Matthew King,Absolutely stunning! A battle not to be missed!,1069200000
2,B00109KN0M,"[27, 28]",5,Michelle Pfeiffer gives a brilliant performanc...,"02 13, 2008",A33BOYMVG3U58Y,TV Critic,Michelle Pfeiffer shines,1202860800
3,B00005JMPT,"[3, 4]",5,I wanted to see &quot;The Missing&quot; when i...,"06 18, 2004",ANEDXRFDZDL18,Serene Night,Hard Gritty Western,1087516800
4,B00005AAA9,"[0, 0]",4,I think &#34;Miss Congeniality&#34; is a light...,"10 22, 2013",A1VN7IS16PY024,Amazon Customer,Very entertaining but.....,1382400000


In [10]:
# NOTE: Don't run this with the actual training data. This was just for saving a small version of the file for time
# saving purposes while I was setting up my dataframe and matrices.
def convert_to_csv(dataframe, desired_filename):
    """Write a pandas DataFrame to a CSV file without the index column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to store.
    desired_filename : str or path-like
        Destination passed straight to ``DataFrame.to_csv``.

    Returns
    -------
    Whatever ``DataFrame.to_csv`` returns for the given destination, or
    ``None`` (after printing a hint) when the export fails.
    """
    try:
        return dataframe.to_csv(desired_filename, index=False)
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        print('Please try another dataframe or file name.')
        return None

In [11]:
# convert_to_csv(shortened_training_data, 'shortened_training_data.csv')

In [12]:
# Per-product review counts: one row per asin, with the other columns holding non-null counts.
asin_data = training_data.groupby('asin').count().reset_index()

In [13]:
# Preview the per-product counts.
asin_data.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,5019281,82,82,82,82,82,82,82,82
1,5119367,73,73,73,73,73,73,73,73
2,307141985,4,4,4,4,4,4,4,4
3,307142469,31,31,31,31,31,31,31,31
4,307142477,5,5,5,5,5,5,5,5


In [14]:
# The row count is the number of distinct products (one row per asin after the groupby).
asin_data.shape

(50050, 9)

In [15]:
# Per-reviewer review counts: one row per reviewerID, with the other columns holding non-null counts.
reviewer_data = training_data.groupby('reviewerID').count().reset_index()

In [16]:
# Preview the per-reviewer counts.
reviewer_data.head()

Unnamed: 0,reviewerID,asin,helpful,overall,reviewText,reviewTime,reviewerName,summary,unixReviewTime
0,A00295401U6S2UG3RAQSZ,5,5,5,5,5,5,5,5
1,A00348066Q1WEW5BMESN,5,5,5,5,5,5,5,5
2,A0040548BPHKXMHH3NTI,8,8,8,8,8,8,8,8
3,A00438023NNXSDBGXK56L,4,4,4,4,4,4,4,4
4,A0048168OBFNFN7WW8XC,7,7,7,7,7,7,7,7


In [17]:
# The row count is the number of distinct reviewers (one row per reviewerID after the groupby).
reviewer_data.shape

(123952, 9)

# Converting Dataframe to a CSR Matrix

In [18]:
def create_reviewer_product_matrix(dataframe):
    """Build a reviewer x product ratings table and its sparse counterpart.

    The input is pivoted so that each row is a unique ``reviewerID``, each
    column is a unique ``asin``, and each cell holds the ``overall`` rating.
    Reviewer/product pairs without a rating come out of the pivot as NaN and
    are replaced with 0, so neither returned object contains NaN values.

    NOTE(review): the pivot materialises the full dense table before it is
    converted to CSR; with many reviewers and products this may need a lot of
    memory. ``DataFrame.pivot`` also raises ``ValueError`` if the same
    reviewer/product pair occurs more than once.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``reviewerID``, ``asin`` and ``overall``.

    Returns
    -------
    tuple
        ``(sparse_matrix, pivoted_dataframe)``: a ``scipy.sparse.csr_matrix``
        of the ratings and the zero-filled DataFrame it was built from.
    """
    ratings_wide = dataframe.pivot(index='reviewerID', columns='asin', values='overall')
    ratings_wide = ratings_wide.fillna(0)

    # The zero-filled cells are not stored explicitly in the CSR representation.
    ratings_sparse = csr_matrix(ratings_wide.values)

    return ratings_sparse, ratings_wide

In [1]:
# Build the reviewer x product ratings; the result is a (sparse matrix, dataframe) tuple.
# The cell defining create_reviewer_product_matrix must have been run in this kernel first,
# otherwise this raises NameError.
reviewer_product_matrix = create_reviewer_product_matrix(training_data)

NameError: name 'create_reviewer_product_matrix' is not defined

In [None]:
# First tuple element: the CSR sparse ratings matrix.
reviewer_product_sparse = reviewer_product_matrix[0]

In [None]:
# Second tuple element: the pivoted reviewer x product dataframe.
reviewer_product_dataframe = reviewer_product_matrix[1]

# Implementing K Nearest Neighbors

In [6]:
# NOTE: In the future, this should be switched to a centered cosine.
def k_nn(matrix):
    """Fit a brute-force nearest-neighbours model that uses cosine distance.

    Parameters
    ----------
    matrix : array-like or sparse matrix
        Data to fit; each row is one sample.

    Returns
    -------
    sklearn.neighbors.NearestNeighbors or None
        The fitted model, or ``None`` (after printing a hint) when the model
        cannot be built or fitted on ``matrix``.
    """
    try:
        model = NearestNeighbors(metric='cosine', algorithm='brute')
        model.fit(matrix)
    except Exception:
        # `Exception` (not a bare except) so interrupting a long fit with
        # Ctrl-C actually stops it instead of printing the hint below.
        print('Please try another matrix.')
        return None
    return model

# Making Recommendations

In [7]:
def make_recommendations(dataframe, model=None, n_neighbors=6, query_index=None):
    """Print the nearest neighbours of one row of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose rows are the items to compare; the row labels
        (``dataframe.index``) are what gets printed.
    model : object with a ``kneighbors`` method, optional
        Fitted nearest-neighbours model. When omitted, the module-level
        ``model_knn`` is used, which keeps existing one-argument calls working.
    n_neighbors : int, optional
        Number of neighbours requested from the model (default 6). The first
        returned neighbour is treated as the query row itself and is not
        listed, so at most ``n_neighbors - 1`` lines follow the header.
    query_index : int, optional
        Positional index of the row to query. When omitted, a row is picked
        at random with ``np.random.choice``; pass a value for reproducible
        output.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if model is None:
        # Backward-compatible fallback to the notebook-level model.
        model = model_knn
    if query_index is None:
        query_index = np.random.choice(dataframe.shape[0])

    # kneighbors expects a 2-D input, so the single row becomes shape (1, n_columns).
    query_vector = dataframe.iloc[query_index, :].values.reshape(1, -1)
    distances, indices = model.kneighbors(query_vector, n_neighbors=n_neighbors)
    distances = distances.flatten()
    indices = indices.flatten()

    print('Nearest neighbors of {0}:\n'.format(dataframe.index[query_index]))
    # Position 0 is skipped: it is assumed to be the query row itself.
    for rank in range(1, len(distances)):
        print('{0}: {1}, with distance of {2}:'.format(rank, dataframe.index[indices[rank]], distances[rank]))
    return None

# Running Functions

In [None]:
# End-to-end run. The commented-out lines are the earlier pipeline steps, kept for reference;
# this cell expects reviewer_product_sparse and reviewer_product_dataframe to already exist
# from the cells above.
# training_data = get_training_data('reviews.training.json')
# convert_to_csv(shortened_training_data, 'shortened_training_data.csv')
# reviewer_product_sparse = create_reviewer_product_matrix(training_data)[0]
# reviewer_product_dataframe = create_reviewer_product_matrix(training_data)[1]
# The fitted model must be bound to the name model_knn: make_recommendations looks it up as a global.
model_knn = k_nn(reviewer_product_sparse)
make_recommendations(reviewer_product_dataframe)