# Similarity-based recommendation

In [4]:
import gzip
from collections import defaultdict # dictionary that takes default values
import scipy
import scipy.optimize
import numpy
import random

In [2]:
path = "amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"

In [5]:
# Open the gzip-compressed TSV in text mode ('rt') so that reading yields
# UTF-8 decoded str lines rather than bytes.
# NOTE(review): this handle is not closed in the cells shown here; consider
# closing it once loading has finished.
f = gzip.open(path, 'rt', encoding='utf8')

In [6]:
header = f.readline()

In [7]:
header

'marketplace\tcustomer_id\treview_id\tproduct_id\tproduct_parent\tproduct_title\tproduct_category\tstar_rating\thelpful_votes\ttotal_votes\tvine\tverified_purchase\treview_headline\treview_body\treview_date\n'

In [9]:
# Split the raw header line into a list of column names.
# This rebinds `header` from str to list, so the cell is not idempotent:
# running it a second time fails because a list has no .strip().
header = header.strip().split('\t')

In [10]:
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [11]:
# Accumulates one dict per review row (column name -> value).
dataset = []

In [13]:
# Read the remaining lines of the file: pair each tab-separated field with its
# column name, convert the numeric columns to int, and collect the row.
for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    for key in ('star_rating', 'helpful_votes', 'total_votes'):
        d[key] = int(d[key])
    dataset.append(d)

In [14]:
dataset[0]

{'marketplace': 'US',
 'customer_id': '14640079',
 'review_id': 'RZSL0BALIYUNU',
 'product_id': 'B003LRN53I',
 'product_parent': '986692292',
 'product_title': 'Sennheiser HD203 Closed-Back DJ Headphones',
 'product_category': 'Musical Instruments',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': 'N',
 'verified_purchase': 'Y',
 'review_headline': 'Five Stars',
 'review_body': 'Nice headphones at a reasonable price.',
 'review_date': '2015-08-31'}

In [15]:
dataset[0]['customer_id']

'14640079'

In [16]:
len(dataset)

904764

In [19]:
# U_i: item id -> set of user ids that reviewed the item.
# defaultdict(set) lets us write d[x].add(y) without first checking whether
# d[x] has been initialized; note that merely reading d[x] for a missing key
# also inserts an empty set under that key.
usersPerItem = defaultdict(set)
# I_u: user id -> set of item ids the user reviewed.
itemsPerUser = defaultdict(set)

In [20]:
# Lookup table: product_id -> product_title (used to display readable results).
itemNames = {}

In [21]:
# One pass over the reviews fills the title lookup and both directions of the
# user/item relation. If an item appears in several reviews, the title from
# the last one seen is kept.
for d in dataset:
    user, item = d['customer_id'], d['product_id']
    itemNames[item] = d['product_title']
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)

In [22]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity of two sets.

    The similarity is the size of the intersection divided by the size of
    the union.

    Parameters
    ----------
    s1, s2 : set
        The two sets to compare.

    Returns
    -------
    float
        A value between 0 and 1. When both sets are empty the union is empty
        and the ratio is undefined; 0.0 is returned in that case instead of
        raising ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        # Both sets are empty: treat as "no overlap" rather than dividing by 0.
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom

In [23]:
def mostSimilar(i, N=10):
    """Return the N items most similar to item i by Jaccard similarity.

    Every other item j in usersPerItem is scored with the Jaccard similarity
    between the set of users who reviewed i and the set who reviewed j.

    Parameters
    ----------
    i : hashable
        Id of the query item (normally a key of usersPerItem).
    N : int, optional
        Maximum number of results to return (default 10).

    Returns
    -------
    list of tuple
        (similarity, item id) pairs sorted in descending order, so ties in
        similarity are ordered by descending item id.
    """
    # Look the query up with .get() instead of usersPerItem[i]: usersPerItem
    # is a defaultdict, so indexing an unknown item would silently insert an
    # empty entry that then appears as a candidate in every later query.
    users = usersPerItem.get(i, set())
    similarities = []
    for j, otherUsers in usersPerItem.items(): # For all items
        if i == j: continue # Don't compare i to itself
        sim = Jaccard(users, otherUsers)
        similarities.append((sim, j)) # Keep track of sim and the item that generated it
    similarities.sort(reverse=True) # Sort by most similar
    return similarities[:N]

Generating a recommendation

In [33]:
entry = dataset[1]

In [34]:
entry

{'marketplace': 'US',
 'customer_id': '6111003',
 'review_id': 'RIZR67JKUDBI0',
 'product_id': 'B0006VMBHI',
 'product_parent': '603261968',
 'product_title': 'AudioQuest LP record clean brush',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'Y',
 'review_headline': 'Three Stars',
 'review_body': 'removes dust. does not clean',
 'review_date': '2015-08-31'}

In [35]:
query = entry['product_id']

In [36]:
query

'B0006VMBHI'

In [37]:
m = mostSimilar(query)

In [29]:
m

[(0.005194805194805195, 'B000GBAR5G'),
 (0.004842615012106538, 'B00181T20O'),
 (0.004024144869215292, 'B0002E3FCO'),
 (0.0035714285714285713, 'B00JQYUAQU'),
 (0.0035714285714285713, 'B00IPQZWAY'),
 (0.0035714285714285713, 'B00GT01VQW'),
 (0.0035714285714285713, 'B00DZVYDEK'),
 (0.0035714285714285713, 'B005FM2DJE'),
 (0.0035714285714285713, 'B004XNRFIU'),
 (0.0035714285714285713, 'B00439I880')]

In [38]:
len(usersPerItem[query])

305

In [39]:
itemNames[query]

'AudioQuest LP record clean brush'

In [40]:
[itemNames[i] for _,i in m]

['Shure SFG-2 Stylus Tracking Force Gauge',
 'Shure M97xE High-Performance Magnetic Phono Cartridge',
 'ART Pro Audio DJPRE II Phono Turntable Preamplifier',
 'Signstek Blue LCD Backlight Digital Long-Playing LP Turntable Stylus Force Scale Gauge Tester',
 'Audio Technica AT120E/T Standard Mount Phono Cartridge',
 'Technics: 45 Adaptor for Technics 1200 (SFWE010)',
 'GruvGlide GRUVGLIDE DJ Package',
 'STANTON MAGNETICS Record Cleaner Kit',
 'Shure M97xE High-Performance Magnetic Phono Cartridge',
 'Behringer PP400 Ultra Compact Phono Preamplifier']