<a href="https://colab.research.google.com/github/jtsou/Amazon-Recommendation-Engine/blob/main/ISYE_6740.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project - Build a recommendation engine using the Amazon electronics products dataset



In [None]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
import seaborn
from matplotlib import pyplot as plt
import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
# import wget

## 1. Import data and build user/items dataframe - *Jennifer Tsou*

In [None]:
# !wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Electronics.csv

--2022-04-18 04:41:41--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Electronics.csv
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 876247561 (836M) [application/octet-stream]
Saving to: ‘Electronics.csv.2’


2022-04-18 04:42:21 (20.8 MB/s) - ‘Electronics.csv.2’ saved [876247561/876247561]



In [None]:
# Load the ratings export from the current user's Downloads folder. The home
# directory is resolved at runtime instead of being hard-coded to one machine.
# header=None makes pandas treat the first line of the file as data.
ratings_csv_path = os.path.join(os.path.expanduser('~'), 'Downloads', 'Electronics_filtered.csv')
electronics_rating = pd.read_csv(ratings_csv_path, header=None)

FileNotFoundError: ignored

In [None]:
# Name the four columns of the ratings frame, then preview the first rows.
electronics_rating.columns = [
    'item',
    'user',
    'rating',
    'timestamp',
]
electronics_rating.head(n=5)

Unnamed: 0,item,user,rating,timestamp
0,60009810,A1N070NS9CJQ2I,5.0,1026864000
1,60009810,A3P0KRKOBQK1KN,5.0,1025913600
2,60009810,A192HO2ICJ75VU,5.0,1025654400
3,60009810,A2T278FKFL3BLT,4.0,1025395200
4,60009810,A2ZUXVTW8RXBXW,5.0,1025222400


In [None]:
# NOTE (Jennifer): please upload the dataset to the shared drive so this cell can ingest it.
electronics_rating = pd.read_csv(filepath_or_buffer='Electronics_filtered.csv')
electronics_rating.head(n=5)

NameError: ignored

# Data formatting and exploration

In [None]:
# Interpret the raw timestamp values as seconds and convert them to datetimes.
electronics_rating['timestamp'] = pd.to_datetime(
    electronics_rating['timestamp'],
    unit='s',
)


In [None]:
# Print the frame summary (row count, column names and dtypes, memory usage).
electronics_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20994353 entries, 0 to 20994352
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   item       object        
 1   user       object        
 2   rating     float64       
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 640.7+ MB


In [None]:
# Summary statistics of the review timestamps.
electronics_rating['timestamp'].describe()

  """Entry point for launching an IPython kernel.


count                20994353
unique                   7015
top       2016-03-01 00:00:00
freq                    25244
first     1997-12-04 00:00:00
last      2018-10-05 00:00:00
Name: timestamp, dtype: object

In [None]:
# Summary statistics of the item identifiers (count, distinct values, most frequent).
electronics_rating['item'].describe()

count       20994353
unique        756489
top       B010OYASRG
freq           28539
Name: item, dtype: object

In [None]:
# Summary statistics of the user identifiers (count, distinct values, most frequent).
electronics_rating['user'].describe()

count          20994353
unique          9838676
top       A680RUE1FDO8B
freq                633
Name: user, dtype: object

In [None]:
# Count how many reviews fall on each rating value.
electronics_rating.groupby(by='rating').size()

In [None]:
# Histogram of the rating column using five bins and no grid lines.
electronics_rating.hist(
    column="rating",
    grid=False,
    bins=5,
)

# Create user_id and item_id columns - Ming Lu

In [None]:
# Build lookup tables that give every distinct user and item a dense integer
# index (0, 1, 2, ...) in order of first appearance.
user_list = (
    pd.DataFrame({"user": electronics_rating["user"].unique()})
    .rename_axis("user_index")
    .reset_index()
)
item_list = (
    pd.DataFrame({"item": electronics_rating["item"].unique()})
    .rename_axis("item_index")
    .reset_index()
)

In [None]:
# Attach the integer user_index and item_index columns to every rating row.
electronics_rating = (
    electronics_rating
    .merge(user_list, on='user')
    .merge(item_list, on='item')
)

In [None]:
# Preview the frame, now including the user_index and item_index columns.
electronics_rating.head(n=5)

# Split train and test data - Ming Lu

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings for testing, stratifying on the user column and
# fixing the random seed so the split is repeatable.
train, test = train_test_split(
    electronics_rating,
    test_size=0.2,
    random_state=42,
    stratify=electronics_rating['user'],
)

In [None]:
# Sanity check: number of distinct users in the train and test splits.
# This compares counts only; it does not check that the user sets are identical.
len(train['user'].unique()), len(test['user'].unique())

## 2. Build similarity function to calculate distance between users - *Jennifer Tsou*

In [None]:
# KNN model with cosine
from sklearn.neighbors import NearestNeighbors

# Dimensions of the user-item matrix: distinct users by distinct items.
n_users = len(electronics_rating['user'].unique())
n_items = len(electronics_rating['item'].unique())

NameError: ignored

In [None]:
# create user-item matrix for training dataset
# Rows are addressed by `user_index` and columns by `item_index`. The columns are
# looked up by name (not by tuple position), so the cell keeps working if the
# column order of `train` changes, and the fill is one vectorized assignment
# instead of a Python loop over every row.
# Duplicate (user, item) pairs are reduced to their last occurrence first, which
# matches the "last write wins" result of the former row-by-row loop.
train_pairs = train.drop_duplicates(subset=['user_index', 'item_index'], keep='last')
data_matrix_train = np.zeros((n_users, n_items))
data_matrix_train[
    train_pairs['user_index'].to_numpy(),
    train_pairs['item_index'].to_numpy(),
] = train_pairs['rating'].to_numpy()

In [None]:
# create user-item matrix for testing dataset
# Rows are addressed by `user_index` and columns by `item_index`, looked up by
# column name rather than tuple position, and filled with a single vectorized
# assignment instead of a Python loop over every row.
# Duplicate (user, item) pairs are reduced to their last occurrence first, which
# matches the "last write wins" result of the former row-by-row loop.
test_pairs = test.drop_duplicates(subset=['user_index', 'item_index'], keep='last')
data_matrix_test = np.zeros((n_users, n_items))
data_matrix_test[
    test_pairs['user_index'].to_numpy(),
    test_pairs['item_index'].to_numpy(),
] = test_pairs['rating'].to_numpy()

In [None]:
# calculate similarity
from sklearn.metrics.pairwise import pairwise_distances

# pairwise_distances(metric='cosine') returns the cosine *distance*
# (1 - cosine similarity). The values are used as similarity weights when
# predicting, so convert them back to similarities.
user_similarity = 1 - pairwise_distances(data_matrix_test, metric='cosine')
item_similarity = 1 - pairwise_distances(data_matrix_test.T, metric='cosine')

## 3. (optional) build function to make predictions based on similarity - *Jennifer Tsou*

In [None]:
# build function to make predictions based on similarity
def predict(ratings, similarity, type='user'):
    """Predict ratings as a similarity-weighted combination of known ratings.

    Parameters
    ----------
    ratings : 2-D numpy array
        Rating matrix; rows are averaged with ``mean(axis=1)`` in 'user' mode.
    similarity : 2-D numpy array
        Square weight matrix. In 'user' mode it is multiplied from the left
        (``similarity.dot(...)``), so its size must match the rows of
        ``ratings``; in 'item' mode it is multiplied from the right, so its
        size must match the columns of ``ratings``.
    type : {'user', 'item'}
        Which prediction rule to apply. The name shadows the builtin but is
        kept so existing keyword calls (``type='user'``) continue to work.

    Returns
    -------
    2-D numpy array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither 'user' nor 'item'.
    """
    if type == 'user':
        # Per-row mean over every column of the row (zero entries included).
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the means into a column so they broadcast across the row.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        weight_totals = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        return mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / weight_totals
    if type == 'item':
        # Row vector of total absolute weight, one entry per column of the result.
        weight_totals = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        return ratings.dot(similarity) / weight_totals
    # Previously an unknown mode fell through to `return pred` and failed with
    # an UnboundLocalError; report the actual problem instead.
    raise ValueError("type must be 'user' or 'item', got %r" % (type,))
# Produce user-based and item-based predictions from the test matrix.
user_prediction = predict(
    data_matrix_test,
    user_similarity,
    type='user',
)
item_prediction = predict(
    data_matrix_test,
    item_similarity,
    type='item',
)

NameError: ignored

## 4. Build utility functions for training the models - MSE, SGD, etc. - *Ming Lu*

In [None]:
# Full user rating-item matrix
def full_matrix(b,user_bias,item_bias,P,Q):
    """Return the full matrix of predicted ratings.

    Entry ``[i, j]`` equals ``b + user_bias[i] + item_bias[j] + P[i] . Q[j]``.

    Parameters
    ----------
    b : scalar added to every entry.
    user_bias : 1-D numpy array, one value per row of ``P``.
    item_bias : 1-D numpy array, one value per row of ``Q``.
    P, Q : 2-D numpy arrays with the same number of columns.

    Returns
    -------
    2-D numpy array of shape ``(P.shape[0], Q.shape[0])``.
    """
    # user_bias is broadcast down the rows and item_bias across the columns.
    # (The previous index `item_bias[np.newaxis:,]` was a malformed slice that
    # only produced the right shape through implicit broadcasting.)
    return b + user_bias[:, np.newaxis] + item_bias[np.newaxis, :] + P.dot(Q.T)

In [None]:
# Ratings for user i and item j
def get_rating(b,user_bias,item_bias,P,Q,i,j):
    """Return the predicted rating of user ``i`` for item ``j``.

    The prediction is ``b`` plus the user's bias, the item's bias and the dot
    product of row ``i`` of ``P`` with row ``j`` of ``Q``.
    """
    latent_term = np.dot(P[i, :], Q[j, :])
    return b + user_bias[i] + item_bias[j] + latent_term

In [None]:
# Computing total mean squared error
def mse(data_matrix,b,user_bias,item_bias,P,Q):
    """Mean squared error of the model over the observed (non-zero) ratings.

    Parameters
    ----------
    data_matrix : 2-D numpy array of ratings; zero entries are skipped.
    b : scalar added to every prediction.
    user_bias, item_bias : 1-D numpy arrays of per-row / per-column biases.
    P, Q : 2-D numpy arrays of latent factors (one row per user / item).

    Returns
    -------
    float
        Mean of the squared differences between observed and predicted
        ratings, or 0.0 when ``data_matrix`` has no non-zero entry.

    Notes
    -----
    The previous implementation returned the square root of the *sum* of
    squared errors, which is neither the MSE nor the RMSE and grows with the
    number of ratings.
    """
    xs, ys = data_matrix.nonzero()
    if xs.size == 0:
        return 0.0
    # Predict only the observed cells instead of materializing the full dense
    # prediction matrix; the einsum computes one P[x] . Q[y] per observed pair.
    predicted = b + user_bias[xs] + item_bias[ys] + np.einsum('ij,ij->i', P[xs], Q[ys])
    residuals = data_matrix[xs, ys] - predicted
    return float(np.mean(residuals ** 2))

In [None]:
# Stochastic gradient descent to get optimized P and Q matrix
def sgd(b,user_bias,item_bias,P,Q,samples,alpha,beta):
    """Run one pass of stochastic gradient descent over ``samples``.

    Parameters
    ----------
    b : scalar baseline added to every prediction (not updated here).
    user_bias, item_bias : 1-D numpy arrays, updated in place.
    P, Q : 2-D numpy arrays of latent factors, updated in place.
    samples : iterable of ``(i, j, r)`` triples: row index, column index and
        observed rating.
    alpha : learning rate.
    beta : regularization strength applied to the bias terms and to the rows
        of ``P`` and ``Q``.

    Returns
    -------
    tuple ``(user_bias, item_bias, P, Q)`` -- the same objects that were passed in.
    """
    for i, j, r in samples:
        e = r - get_rating(b,user_bias,item_bias,P,Q,i,j)

        # Update the bias terms
        user_bias[i] += alpha * (e - beta * user_bias[i])
        item_bias[j] += alpha * (e - beta * item_bias[j])

        # Snapshot the user's factors so the item update below uses the same
        # pre-update values the error was computed from. Previously Q was
        # updated with the already-modified P row.
        p_i = P[i, :].copy()
        P[i, :] += alpha * (e * Q[j, :] - beta * P[i, :])
        Q[j, :] += alpha * (e * p_i - beta * Q[j, :])
    return user_bias, item_bias, P, Q

## 5. Build training function - *Jonhua Qin*

In [None]:
def train_model(data_matrix, K, alpha, beta, iterations=100):
    """Fit a biased matrix-factorization model with stochastic gradient descent.

    Parameters
    ----------
    data_matrix : 2-D numpy array, the user-item rating matrix. Entries greater
        than zero are used as training samples.
    K : number of latent features.
    alpha : learning rate passed to ``sgd``.
    beta : regularization strength passed to ``sgd``.
    iterations : number of passes over the training samples.

    Returns
    -------
    tuple ``(training_process, b, user_bias, item_bias, P, Q)`` where
    ``training_process`` is a list of ``(iteration, error)`` pairs, with the
    error taken from ``mse`` after each pass.
    """
    # Define number of users and items
    n_users = data_matrix.shape[0]
    n_items = data_matrix.shape[1]

    # Initialize the user-feature (P) and item-feature (Q) matrices
    P = np.random.normal(scale=1./K, size=(n_users, K))
    Q = np.random.normal(scale=1./K, size=(n_items, K))

    # Initialize bias terms; b is the mean of the non-zero ratings
    user_bias = np.zeros(n_users)
    item_bias = np.zeros(n_items)
    b = np.mean(data_matrix[np.where(data_matrix != 0)])

    # Training samples (row, column, rating) for every positive entry.
    # np.nonzero yields the indices in row-major order, the same order the
    # former nested Python loops produced, without visiting every empty cell.
    rows, cols = np.nonzero(data_matrix > 0)
    samples = list(zip(rows.tolist(), cols.tolist(), data_matrix[rows, cols]))

    # Stochastic gradient descent for the given number of iterations
    training_process = []
    for i in range(iterations):
        np.random.shuffle(samples)
        user_bias, item_bias, P, Q = sgd(b,user_bias,item_bias,P,Q,samples,alpha,beta)
        error = mse(data_matrix,b,user_bias,item_bias,P,Q)
        training_process.append((i, error))
        if (i+1) % 20 == 0:
            print("Iteration: %d ; error = %.4f" % (i+1, error))

    return training_process,b,user_bias,item_bias,P,Q

## 6. Make predictions using the model - *Jonhua Qin*

In [None]:
# Model 1: train with K=20 latent features, alpha=0.001, beta=0.01
training_process,b,user_bias,item_bias,P,Q = train_model(
    data_matrix_train,
    K=20,
    alpha=0.001,
    beta=0.01,
    iterations=100,
)

In [None]:
# Model 2: train with K=50 latent features, alpha=0.001, beta=0.01
training_process_2,b_2,user_bias_2,item_bias_2,P_2,Q_2 = train_model(
    data_matrix_train,
    K=50,
    alpha=0.001,
    beta=0.01,
    iterations=100,
)

In [None]:
# Model 3: train with K=20 latent features, alpha=0.01, beta=0.01
training_process_3,b_3,user_bias_3,item_bias_3,P_3,Q_3 = train_model(
    data_matrix_train,
    K=20,
    alpha=0.01,
    beta=0.01,
    iterations=100,
)

In [None]:
# Model 4: train with K=20 latent features, alpha=0.001, beta=0.1
training_process_4,b_4,user_bias_4,item_bias_4,P_4,Q_4 = train_model(
    data_matrix_train,
    K=20,
    alpha=0.001,
    beta=0.1,
    iterations=100,
)

In [None]:
# Model 5: train with K=20 latent features, alpha=0.1, beta=0.01
training_process_5,b_5,user_bias_5,item_bias_5,P_5,Q_5 = train_model(
    data_matrix_train,
    K=20,
    alpha=0.1,
    beta=0.01,
    iterations=100,
)

Convert the best model into a prediction matrix

In [None]:
# Build the full prediction matrix from model 1 and round every entry to the
# nearest whole rating (the values stay floating point).
prediction = np.round(
    full_matrix(b,user_bias,item_bias,P,Q),
    decimals=0,
)

Validate the prediction results with the test dataset - **to be completed**

## 7. Visualize Results - *Ming Lu*

In [None]:
# Each training history is a list of (iteration, error) tuples, which cannot be
# sliced with [:, 0]; convert it to a 2-D array first. The histories of models
# 2 and 3 are stored as training_process_2 / training_process_3.
for label, history in (
    ("model 1", training_process),
    ("model 2", training_process_2),
    ("model 3", training_process_3),
):
    history = np.asarray(history)
    plt.plot(history[:, 0], history[:, 1], label=label)
plt.title("MSE Cost vs. Iteration")
plt.xlabel("Iterations")
plt.ylabel("MSE")
plt.legend()
plt.show()

# **Sample Code**
### Case study using MovieLens data

In [None]:
# Read data
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
# The code below works on a single `ratings` frame, which was never defined.
# Build it from both splits (presumably ua.base and ua.test together cover the
# full rating set -- confirm against the dataset README).
ratings = pd.concat([ratings_train, ratings_test], ignore_index=True)
# get number of users and items
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]
# create user-item matrix
# NOTE: indexing with id - 1 assumes user and movie ids are contiguous and start at 1.
data_matrix = np.zeros((n_users, n_items))
for row in ratings.itertuples(index=False):
    data_matrix[row.user_id - 1, row.movie_id - 1] = row.rating

In [None]:
# calculate similarity
from sklearn.metrics.pairwise import pairwise_distances

# pairwise_distances(metric='cosine') returns the cosine *distance*
# (1 - cosine similarity). The values are used as similarity weights when
# predicting, so convert them back to similarities.
user_similarity = 1 - pairwise_distances(data_matrix, metric='cosine')
item_similarity = 1 - pairwise_distances(data_matrix.T, metric='cosine')

In [None]:
# build function to make predictions based on similarity
def predict(ratings, similarity, type='user'):
    """Predict ratings as a similarity-weighted combination of known ratings.

    Parameters
    ----------
    ratings : 2-D numpy array
        Rating matrix; rows are averaged with ``mean(axis=1)`` in 'user' mode.
    similarity : 2-D numpy array
        Square weight matrix. In 'user' mode it is multiplied from the left
        (``similarity.dot(...)``), so its size must match the rows of
        ``ratings``; in 'item' mode it is multiplied from the right, so its
        size must match the columns of ``ratings``.
    type : {'user', 'item'}
        Which prediction rule to apply. The name shadows the builtin but is
        kept so existing keyword calls (``type='user'``) continue to work.

    Returns
    -------
    2-D numpy array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither 'user' nor 'item'.
    """
    if type == 'user':
        # Per-row mean over every column of the row (zero entries included).
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the means into a column so they broadcast across the row.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        weight_totals = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        return mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / weight_totals
    if type == 'item':
        # Row vector of total absolute weight, one entry per column of the result.
        weight_totals = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        return ratings.dot(similarity) / weight_totals
    # Previously an unknown mode fell through to `return pred` and failed with
    # an UnboundLocalError; report the actual problem instead.
    raise ValueError("type must be 'user' or 'item', got %r" % (type,))

In [None]:
# Produce user-based and item-based predictions from the rating matrix.
user_prediction = predict(
    data_matrix,
    user_similarity,
    type='user',
)
item_prediction = predict(
    data_matrix,
    item_similarity,
    type='item',
)

### Building a recommendation engine using matrix factorization. The inputs to this class are:

- R – The user-movie rating matrix 
- K – Number of latent features
- alpha – Learning rate for stochastic gradient descent
- beta – Regularization parameter applied to the bias terms and the latent factors
- iterations – Number of iterations to perform stochastic gradient descent

In [None]:
class MF():
    """Biased matrix-factorization recommender trained with stochastic gradient descent.

    A rating for user ``i`` and item ``j`` is modelled as
    ``b + b_u[i] + b_i[j] + P[i] . Q[j]``.
    """

    def __init__(self, R, K, alpha, beta, iterations):
        """Store the rating matrix and the hyper-parameters.

        R          -- 2-D numpy array, the user-movie rating matrix (zeros are skipped)
        K          -- number of latent features
        alpha      -- learning rate for stochastic gradient descent
        beta       -- regularization strength for the biases and latent factors
        iterations -- number of passes of stochastic gradient descent
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        """Fit the model and return a list of ``(iteration, error)`` pairs."""
        # Initialize the user-feature and movie-feature matrices
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initialize the bias terms; b is the mean of the non-zero ratings
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Training samples (row, column, rating) for every positive entry, in
        # row-major order, without looping over every empty cell in Python.
        rows, cols = np.nonzero(self.R > 0)
        self.samples = list(zip(rows.tolist(), cols.tolist(), self.R[rows, cols]))

        # Stochastic gradient descent for the given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            error = self.mse()
            training_process.append((i, error))
            if (i+1) % 20 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, error))

        return training_process

    def mse(self):
        """Mean squared error over the non-zero ratings (0.0 if there are none).

        The previous version returned the square root of the *sum* of squared
        errors, which is neither the MSE nor the RMSE.
        """
        xs, ys = self.R.nonzero()
        if xs.size == 0:
            return 0.0
        # Predict only the observed cells rather than the full dense matrix.
        predicted = (
            self.b
            + self.b_u[xs]
            + self.b_i[ys]
            + np.einsum('ij,ij->i', self.P[xs], self.Q[ys])
        )
        residuals = self.R[xs, ys] - predicted
        return float(np.mean(residuals ** 2))

    def sgd(self):
        """Run one pass of stochastic gradient descent over ``self.samples``."""
        for i, j, r in self.samples:
            e = r - self.get_rating(i, j)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Snapshot the user's factors so the item update uses the same
            # pre-update values the error was computed from.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        """Return the predicted rating of user ``i`` for movie ``j``."""
        return self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)

    def full_matrix(self):
        """Return the full user-movie matrix of predicted ratings."""
        # Uses this instance's parameters; the previous version read them from
        # a global variable named `mf`.
        return (
            self.b
            + self.b_u[:, np.newaxis]
            + self.b_i[np.newaxis, :]
            + self.P.dot(self.Q.T)
        )

In [None]:
# convert the user item rating to matrix form
# `ratings` is not defined by the loading cell, so build the combined frame from
# the two splits that cell does define (presumably they cover the full rating
# set -- confirm).
ratings_all = pd.concat([ratings_train, ratings_test], ignore_index=True)
R = np.array(ratings_all.pivot(index='user_id', columns='movie_id', values='rating').fillna(0))

In [None]:
# Train the matrix-factorization model, then print the full matrix of predicted ratings.
mf = MF(
    R,
    K=20,
    alpha=0.001,
    beta=0.01,
    iterations=100,
)
training_process = mf.train()

print()
print("P x Q:")
print(mf.full_matrix())
print()