In [3]:
# Dataset name substituted into both input file paths below.
dataset = 'Musical_Instruments'

# Gzipped JSON inputs for the chosen dataset: the reviews file and the
# product metadata file.
reviews_filepath, metadata_filepath = (
    './data/raw_data/reviews_{0}_5.json.gz'.format(dataset),
    './data/metadata/meta_{0}.json.gz'.format(dataset),
)

In [None]:
%load modules/scripts/Load\ datasets.py

In [None]:
%load modules/scripts/Summarize\ reviews.py

In [10]:
%load modules/scripts/Trusted\ user\ groups.py

Unnamed: 0,reviewerID,trustedness
0,A17A1KTVI3DG6U,3.047815
1,A36C867ZDP30NQ,2.0
2,A5MC7LP0ZBO4Q,0.0
3,A2DG65AWX5RJ4J,0.666667
4,A2IZ3ST24HSO4H,0.0
5,ACWJDL1ZYX8RE,0.833333
6,A3AOPVQ7EZHTWA,7.267545
7,A3LOJ2QHXITCF7,0.0
8,A2CARFAX5FNQT9,0.833333
9,AX11NOUMV8G95,1.0


In [81]:
%load modules/scripts/Recommender\ system.py

The RMSE of always predicting 0.509133408363 stars is 0.491705072053
The RMSE of the recommender system is 0.389991926061


## Select a product

In [84]:
# all_metadata joined to reviews on `asin`, restricted to rows where both
# `categories` and `related` are present.
reviewed_products = (all_metadata
    .join(reviews, 'asin')
    .filter('''
        categories is not null 
        and related is not null'''))

# DataFrame.sample() takes a fraction, not a row count, so derive the fraction
# that yields roughly ten rows. The fraction is clamped to 1.0 (fewer than ten
# rows available) and falls back to 0.0 for an empty join, instead of raising
# ZeroDivisionError or passing Spark a fraction above 1.
# NOTE(review): count() triggers a full Spark job over the join.
reviewed_products_count = reviewed_products.count()
sample_fraction = (
    min(1.0, 10. / reviewed_products_count) if reviewed_products_count else 0.0)

# Preview a small, reproducible sample of products to choose from.
(reviewed_products
     .sample(
         withReplacement=False,
         fraction=sample_fraction,
         # Plain int literal: the Python-2-only `L` suffix is a SyntaxError on
         # Python 3 and the value is the same on Python 2.
         seed=4325535)
     .select('asin', 'title')
     .toPandas())

Unnamed: 0,asin,title
0,B000068NW5,Hosa Cable GTR210 Guitar Instrument Cable - 10...
1,B0002CZV82,Boss DS1 Distortion Guitar Pedal
2,B0002E1NQ4,"Neotech 5701002 Super Banjo Strap, Black"
3,B0002GW3Y8,Fingerease Guitar String Lubricant
4,B0002M6B2M,"Martin M140 Bronze Acoustic Guitar Strings, Light"
5,B0002M72JS,Electro-Harmonix 12AX7EH Preamp Tube
6,B000KIRT74,Behringer TO800 Vintage Tube-Sound Overdrive P...
7,B000L7MNUM,Mighty Bright Duet Music Stand Light
8,B000LFCXL8,Seiko SQ50-V Quartz Metronome
9,B000WS1QC6,Yamaha PA130 120 Volt Keyboard AC Power Adaptor


In [85]:
# ASIN chosen by hand from the sample shown in the previous cell's output.
selected_product = 'B000L7MNUM'

## Product negative words

In [86]:
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import explode

# Reviews of the selected product, with each review's `summary` tokenized
# into a `words` column; only the reviewer id and the tokens are kept.
selected_product_reviews = all_reviews.filter(col('asin') == selected_product)
summary_tokenizer = Tokenizer(inputCol='summary', outputCol='words')
product_words_per_reviewer = (summary_tokenizer
    .transform(selected_product_reviews)
    .select('reviewerID', 'words'))

# One row per distinct token used in those summaries.
distinct_product_words = (product_words_per_reviewer
    .select(explode(col('words')).alias('word'))
    .distinct())

# Keep only tokens present in negative_predictive_words and order them by
# neg_prob, highest first.
matches_negative_word = col('word') == negative_predictive_words.negative_word
word_ranks = (distinct_product_words
    .join(negative_predictive_words, matches_negative_word)
    .select('word', 'neg_prob')
    .sort('neg_prob', ascending=False))

# Show the ten highest-ranked words.
word_ranks.limit(10).toPandas()

Unnamed: 0,word,neg_prob
0,bummer,0.658206
1,ok,0.64858
2,not,0.609827
3,while,0.560444
4,work,0.519024
5,was,0.483148
6,light,0.482553
7,of,0.464182
8,they,0.461554
9,i,0.449433


In [87]:
# Word chosen by hand from the neg_prob ranking shown in the previous cell's output.
selected_negative_word = 'bummer'

## Trusted users that used the word

In [111]:
from pyspark.sql.functions import array_contains, udf, lit
from pyspark.sql.types import BooleanType

# Legacy Python UDF (identifier misspelled in the original). This cell no
# longer uses it; it stays defined so that any other cell still referencing
# `is_elemen_of` keeps working.
is_elemen_of = udf(lambda word, words: word in words, BooleanType())

# Reviewers whose tokenized summary of the selected product contains the
# selected negative word. array_contains is a built-in Spark SQL function, so
# the membership test no longer goes through a Python UDF, and a null `words`
# array yields null (row dropped) rather than an exception inside the UDF.
users_that_used_the_word = (product_words_per_reviewer
    .filter(array_contains(col('words'), selected_negative_word))
    .select('reviewerID'))

users_that_used_the_word.toPandas()

Unnamed: 0,reviewerID
0,A35XRT4BW4I6UD


## Suggested products in the same category

In [78]:
# Fetch at most one `categories` row for the selected product.
selected_product_rows = (reviewed_products
    .filter(col('asin') == selected_product)
    .select('categories')
    .take(1))

# First row -> its `categories` value -> first inner sequence -> last element.
# Raises IndexError when no row matched the selected product.
first_category_path = selected_product_rows[0][0][0]
product_category = first_category_path[-1]

print('Product category: {0}'.format(product_category))

Product category: Tuning Pegs


In [113]:
import pandas as pd
# Show full column contents (e.g. long product titles) in rendered tables.
# NOTE(review): -1 was deprecated in pandas 1.0 in favour of None; switch once
# the environment's pandas version is confirmed to accept None.
pd.set_option('display.max_colwidth', -1)


def _last_of_first_category_path(categories):
    """Return the last element of the first inner list of `categories`.

    Returns None when `categories` is missing or empty, or when its first
    inner list is missing or empty, so one malformed row cannot fail the whole
    Spark job with an IndexError/TypeError. A None result never equals
    `product_category`, so such rows are dropped by the filter below.
    """
    if not categories or not categories[0]:
        return None
    return categories[0][-1]


last_element = udf(_last_of_first_category_path)

# Distinct (asin, title) pairs whose category matches the selected product's.
# NOTE(review): limit() is applied before the filter, so only the first
# 100000 joined rows are searched — confirm this cap is intentional.
products_in_same_category = (reviewed_products
    .limit(100000)
    .filter(last_element(col('categories')) == product_category)
    .select('asin', 'title')
    .distinct())

products_in_same_category.limit(10).toPandas()

Unnamed: 0,asin,title
0,B0007ORM0K,"Grover 102-18C Rotomatic 18:1 3 per Side Tuners, Chrome"
1,B001L8IKLG,Fender American Series Stratocaster Guitar Tuners with Gold Hardware Set of 6 Gold
2,B000L6GD04,Fender Locking Tuners Chrome
3,B0007Y3XGW,"Grover 406C Rotomatic Mini 3 per Side Self Locking Machine Heads, Chrome"
4,B0002E2G2O,Fender Strat/Tele Vintage Machine Head set of 6


In [118]:
# Pair every same-category product with every reviewer who used the word,
# then run the pairs through `indexer` before scoring.
candidate_pairs = products_in_same_category.crossJoin(users_that_used_the_word)
indexed_products = indexer.transform(candidate_pairs)

# Score each pair with the recommender and order by prediction, highest first.
scored_pairs = recommender_system.transform(indexed_products)
alternative_products = scored_pairs.sort('prediction', ascending=False)

alternative_products.toPandas()

Unnamed: 0,asin,title,reviewerID,reviewerIndex,asinIndex,prediction
0,B000L6GD04,Fender Locking Tuners Chrome,A35XRT4BW4I6UD,76.0,30.0,0.629843
1,B001L8IKLG,Fender American Series Stratocaster Guitar Tuners with Gold Hardware Set of 6 Gold,A35XRT4BW4I6UD,76.0,825.0,0.59957
2,B0007Y3XGW,"Grover 406C Rotomatic Mini 3 per Side Self Locking Machine Heads, Chrome",A35XRT4BW4I6UD,76.0,621.0,0.597146
3,B0007ORM0K,"Grover 102-18C Rotomatic 18:1 3 per Side Tuners, Chrome",A35XRT4BW4I6UD,76.0,547.0,0.573342
4,B0002E2G2O,Fender Strat/Tele Vintage Machine Head set of 6,A35XRT4BW4I6UD,76.0,798.0,0.535599
