# HW4: implementing item-based CF with cosine
First, run recommenderDemo.ipynb and be familar with the code and data.
Second, implement item-based CF with cosine

In [315]:
import gzip
from collections import defaultdict
from scipy import spatial
import scipy.optimize
import numpy as np
import random

1. load the data, and convert integer-valued fields as we go. Note that here we use the same "Musical Instruments" dataset. Download the date from here: https://web.cs.wpi.edu/~kmlee/cs547/amazon_reviews_us_Musical_Instruments_v1_00_small.tsv.gz
The dataset contains 20K user-item reviews.

In [316]:
# From https://web.cs.wpi.edu/~kmlee/cs547/amazon_reviews_us_Musical_Instruments_v1_00_small.tsv.gz
#----------------------------------------------
# Your code starts here
#   Please add comments or text cells in between to explain the general idea of each block of the code.
#   Please feel free to add more cells below this cell if necessary
path = "/home/InfoRetrieval/hw4/data/amazon_reviews_us_Musical_Instruments_v1_00_small.tsv.gz"
f = gzip.open(path, 'rt', encoding="utf8")

2. now store the loaded data into a matrix -- you may use numpy array/matrix to store the untility matrix or use sparse matrix (advanced approach)

In [317]:
#----------------------------------------------
# Your code starts here
#   Please add comments or text cells in between to explain the general idea of each block of the code.
#   Please feel free to add more cells below this cell if necessary
header = f.readline()
header = header.strip().split('\t')
dataset = []
for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    dataset.append(d)

itemNames = {}

usersPerItem = defaultdict()
itemsPerUser = defaultdict()

for d in dataset:
    user,item = d['customer_id'], d['product_id']
    usersPerItem[item] = (user)
    itemsPerUser[user] = (item)
    itemNames[item] = d['product_title']

reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

itemNames = {}

for d in dataset:
    user,item = d['customer_id'], d['product_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)

3. Implement cosine function and rating prediction function by using the cosine function. If a hasn't rated any similar items before, then return ratingMean (i.e., global rating mean). Refer to predictRating() in hw4jaccard.ipynb

In [318]:
#----------------------------------------------
# Your code starts here
#   Please add comments or text cells in between to explain the general idea of each block of the code.
#   Please feel free to add more cells below this cell if necessary
ratingMean = sum([d['star_rating'] for d in dataset]) / len(dataset)

def Pad(arr1, arr2):
    diff = len(arr2) - len(arr1)
    if diff <0 :
        for i in range(abs(diff)):
            arr2 = np.append(arr2, ratingMean)
    elif diff > 0 :
        for i in range(abs(diff)):
            arr1 = np.append(arr1, ratingMean)

    return arr1, arr2

def Cosine(u1, u2):
    array1 = np.asarray([reviewsPerUser[u1][i]['star_rating'] for i in range(len(reviewsPerUser[u1]))])
    array2 = np.asarray([reviewsPerUser[u2][i]['star_rating'] for i in range(len(reviewsPerUser[u2]))])
    
    array1, array2 = Pad(array1, array2)
    
    return spatial.distance.cosine(array1, array2)

def predictRatingCosine(user, product):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == product: continue
        ratings.append(d['star_rating'])
        similarities.append(Cosine(usersPerItem[product],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean

labels = [d['star_rating'] for d in dataset]

4. Measure and report MSE (don't need to change the below code)

In [319]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)
print(dataset[0])
cfPredictions = [predictRatingCosine(d['customer_id'], d['product_id']) for d in dataset]
print(MSE(cfPredictions, labels))

{'marketplace': 'US', 'customer_id': '45610553', 'review_id': 'RMDCHWD0Y5OZ9', 'product_id': 'B00HH62VB6', 'product_parent': '618218723', 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection', 'product_category': 'Musical Instruments', 'star_rating': 3, 'helpful_votes': 0, 'total_votes': 1, 'vine': 'N', 'verified_purchase': 'N', 'review_headline': 'Three Stars', 'review_body': 'Works very good, but induces ALOT of noise.', 'review_date': '2015-08-31'}
[0.029854811713493357, 0.029854811713493357, 0.029854811713493357, 0.029854811713493357, 0.029854811713493357, 0.029854811713493357]
[0, 0.2985404111165405]
[0, 0.016295649291735437]
[0.0874078846521048]
[0.11924634553298619, 0, 0, 0.01466196695735944, 0, 0.014049347380350241]
[0, 0, 0, 0.20363538269243808, 0, 0, 0.060871412193709395, 0, 0, 0, 0, 0]
[0, 0, 0.026323746570075635, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0.01466196695735944]
[0.044

(optional/bonus task: you will get additional 25 points) 
download https://web.cs.wpi.edu/~kmlee/cs547/amazon_reviews_us_Musical_Instruments_v1_00_large.tsv.gz
this dataset contains over 900K user-item reviews. repeat the above process (i.e., meauring MSE with cosine). report the MSE and compare it with MSE of alwaysPredictMean. This optional task would require better data structure and implementation.

In [320]:
# From https://web.cs.wpi.edu/~kmlee/cs547/amazon_reviews_us_Musical_Instruments_v1_00_small.tsv.gz
#----------------------------------------------
# Your code starts here
#   Please add comments or text cells in between to explain the general idea of each block of the code.
#   Please feel free to add more cells below this cell if necessary
path = "/home/InfoRetrieval/hw4/data/amazon_reviews_us_Musical_Instruments_v1_00_large.tsv.gz"
f = gzip.open(path, 'rt', encoding="utf8")

In [321]:
#----------------------------------------------
# Your code starts here
#   Please add comments or text cells in between to explain the general idea of each block of the code.
#   Please feel free to add more cells below this cell if necessary
header = f.readline()
header = header.strip().split('\t')
dataset = []
for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    dataset.append(d)

itemNames = {}

usersPerItem = defaultdict()
itemsPerUser = defaultdict()

for d in dataset:
    user,item = d['customer_id'], d['product_id']
    usersPerItem[item] = (user)
    itemsPerUser[user] = (item)
    itemNames[item] = d['product_title']

reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

itemNames = {}

for d in dataset:
    user,item = d['customer_id'], d['product_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)

In [322]:
#----------------------------------------------
# Your code starts here
#   Please add comments or text cells in between to explain the general idea of each block of the code.
#   Please feel free to add more cells below this cell if necessary
ratingMean = sum([d['star_rating'] for d in dataset]) / len(dataset)

def Pad(arr1, arr2):
    diff = len(arr2) - len(arr1)
    if diff <0 :
        for i in range(abs(diff)):
            arr2 = np.append(arr2, ratingMean)
    elif diff > 0 :
        for i in range(abs(diff)):
            arr1 = np.append(arr1, ratingMean)

    return arr1, arr2

def Cosine(u1, u2):
    array1 = np.asarray([reviewsPerUser[u1][i]['star_rating'] for i in range(len(reviewsPerUser[u1]))])
    array2 = np.asarray([reviewsPerUser[u2][i]['star_rating'] for i in range(len(reviewsPerUser[u2]))])
    
    array1, array2 = Pad(array1, array2)
    
    return spatial.distance.cosine(array1, array2)

def predictRatingCosine(user, product):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product_id']
        if i2 == product: continue
        ratings.append(d['star_rating'])
        similarities.append(Cosine(usersPerItem[product],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean

labels = [d['star_rating'] for d in dataset]

In [323]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)
print(dataset[0])
cfPredictions = [predictRatingCosine(d['customer_id'], d['product_id']) for d in dataset]
print(MSE(cfPredictions, labels))

{'marketplace': 'US', 'customer_id': '45610553', 'review_id': 'RMDCHWD0Y5OZ9', 'product_id': 'B00HH62VB6', 'product_parent': '618218723', 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection', 'product_category': 'Musical Instruments', 'star_rating': 3, 'helpful_votes': 0, 'total_votes': 1, 'vine': 'N', 'verified_purchase': 'N', 'review_headline': 'Three Stars', 'review_body': 'Works very good, but induces ALOT of noise.', 'review_date': '2015-08-31'}
1.6310097627998033


*-----------------
# Done

All set! 

** What do you need to submit?**

* **hw4.ipynb Notebook File**: Save this Jupyter notebook with all output, and find the notebook file in your folder (for example, "filename.ipynb"). This is the file you need to submit. 

** How to submit: **
        Please submit through canvas.wpi.edu
