In [69]:
# Load the gzipped JSON reviews file into a Spark DataFrame.
reviews_path = './data/raw_data/reviews_Musical_Instruments_5.json.gz'
reviews = spark.read.json(reviews_path)

In [11]:
# Preview a few rows. Limiting on the Spark side means only 5 rows are
# collected to the driver instead of the whole DataFrame.
reviews.limit(5).toPandas()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,1384719342,"[0, 0]",5.0,"Not much to write about here, but it does exac...","02 28, 2014",A2IBPI20UZIR0U,"cassandra tu ""Yeah, well, that's just like, u...",good,1393545600
1,1384719342,"[13, 14]",5.0,The product does exactly as it should and is q...,"03 16, 2013",A14VAT5EAX3D9S,Jake,Jake,1363392000
2,1384719342,"[1, 1]",5.0,The primary job of this device is to block the...,"08 28, 2013",A195EZSQDW3E21,"Rick Bennette ""Rick Bennette""",It Does The Job Well,1377648000
3,1384719342,"[0, 0]",5.0,Nice windscreen protects my MXL mic and preven...,"02 14, 2014",A2C00NNG1ZQQG2,"RustyBill ""Sunday Rocker""",GOOD WINDSCREEN FOR THE MONEY,1392336000
4,1384719342,"[0, 0]",5.0,This pop filter is great. It looks and perform...,"02 21, 2014",A94QU4C90B1AX,SEAN MASLANKA,No more pops when I record my vocals.,1392940800


In [12]:
# Number of reviews per product (asin).
product_list = reviews.groupBy('asin').count()

# Ten most-reviewed products. `limit(10)` is applied before `toPandas()` so
# only the rows that are displayed get collected to the driver.
product_list.sort('count', ascending=False).limit(10).toPandas()

Unnamed: 0,asin,count
0,B003VWJ2K8,163
1,B0002E1G5C,143
2,B0002F7K7Y,116
3,B003VWKPHC,114
4,B0002H0A3S,93
5,B0002CZVXM,74
6,B0006NDF8A,71
7,B0009G1E0K,69
8,B0002E2KPC,68
9,B0002GLDQM,67


In [13]:
# Number of reviews written by each reviewer (columns: reviewerID, count).
reviews_per_reviewer = (reviews
    .groupBy('reviewerID')
    .count())

In [14]:
from pyspark.sql.functions import col, udf, avg
from pyspark.sql.types import DoubleType

def _usefulness_ratio(helpful):
    """Return the smoothed helpfulness ratio of one review.

    `helpful` is the two-element `helpful` array of a review, unpacked as
    (useful, out_of). The denominator is `out_of + 1`, which avoids a
    division by zero when `out_of` is 0.
    """
    # Unpack inside the body: tuple parameters in a lambda/def signature are
    # Python-2-only syntax and a SyntaxError on Python 3.
    useful, out_of = helpful
    return useful / float(out_of + 1)


usefulness_ratio = udf(_usefulness_ratio, returnType=DoubleType())

# Mean usefulness ratio per reviewer.
usefulness = (reviews
  .select('reviewerID', usefulness_ratio(col('helpful')).alias('usefulness'))
  .groupBy('reviewerID')
  .agg(avg(col('usefulness')).alias('usefulness')))

In [15]:
# Reviewer rank = mean usefulness * number of reviews; keep reviewers with rank > 1.
reviewer_rank = col('usefulness') * col('count')

rankings = (usefulness
    .join(reviews_per_reviewer, 'reviewerID')
    .select('reviewerID', reviewer_rank.alias('rank'))
    .where(col('rank') > 1))

In [16]:
# asin of the product with the most reviews (first column of the top row).
most_reviewed_product = (reviews
    .groupBy('asin')
    .count()
    .sort('count', ascending=False)
    .take(1)[0][0])

# Reviews of that product written by ranked reviewers. All review columns are
# kept here so the rating filters below can use `overall` directly instead of
# referencing a column that has already been projected away.
ranked_product_reviews = (reviews
    .filter(col('asin') == most_reviewed_product)
    .join(rankings, 'reviewerID'))

# Columns shown in the final tables.
report_columns = ['asin', 'reviewerID', 'rank', 'summary']

best_reviewers = ranked_product_reviews.select(*report_columns)

# Ratings above 3 count as good, 3 and below as bad; both are sorted by
# reviewer rank, highest first.
good_reviews = (ranked_product_reviews
    .filter(col('overall') > 3)
    .select(*report_columns)
    .sort('rank', ascending=False))

bad_reviews = (ranked_product_reviews
    .filter(col('overall') <= 3)
    .select(*report_columns)
    .sort('rank', ascending=False))

In [17]:
# Top 10 low-rated reviews by reviewer rank; limit before collecting to pandas.
bad_reviews.limit(10).toPandas()

Unnamed: 0,asin,reviewerID,rank,summary
0,B003VWJ2K8,AG3PVU7DCXZTK,3.9,"Excellent tuner, some issues with the ball joi..."
1,B003VWJ2K8,A3IRXJOT9PY6SE,3.437229,"Two were dead after 3 days, third times the ch..."
2,B003VWJ2K8,A15TYOEWBQYF0X,3.333333,I have another brand that works so much easier...
3,B003VWJ2K8,AYTKUTAP0VA53,1.833333,Broken head
4,B003VWJ2K8,A2RVY2GDMZHH4,1.5,Works great if it doesn't break


In [70]:
# NOTE(review): `meta` is assigned in a cell that appears further down in this
# notebook; on a fresh kernel that cell has to run before this one.
# Limit on the Spark side so only 10 rows are collected to the driver.
meta.limit(10).toPandas()

Unnamed: 0,_corrupt_record,asin,brand,categories,description,imUrl,price,related,salesRank,title
0,,6428320,,"[[Musical Instruments, Instrument Accessories,...",,http://ecx.images-amazon.com/images/I/41EpRmh8...,17.95,,"(None, None, None, None, None, None, None, Non...","Six Sonatas For Two Flutes Or Violins, Volume ..."
1,,14072149,,[[Musical Instruments]],Composer: J.S. Bach.Peters Edition.For two vio...,http://ecx.images-amazon.com/images/I/41m6ygCq...,18.77,"(None, [B0058DK7RA], None, [B0058DK7RA])","(None, None, None, None, None, None, None, Non...",Double Concerto in D Minor By Johann Sebastian...
2,,41291905,,"[[Musical Instruments, Instrument Accessories,...",Vivaldi's famous set of four violin concertos ...,http://ecx.images-amazon.com/images/I/41maAqSO...,,,"(None, None, None, None, None, None, None, Non...",Hal Leonard Vivaldi Four Seasons for Piano (Or...
3,,41913574,,[[Musical Instruments]],444 pages. \nReprint of corrected and revised ...,http://ecx.images-amazon.com/images/I/513kRMv%...,49.99,,,"Aida: Opera in Quattro Atti, Partitura -- Aida..."
4,,201891859,,"[[Musical Instruments, Instrument Accessories,...",,http://ecx.images-amazon.com/images/I/41SXCAzs...,,,"(None, None, None, None, None, None, None, Non...",Nocturnes
5,,577088726,,"[[Musical Instruments, Instrument Accessories,...","Sisyphus Redux for solo alto flute, , Ferneyho...",http://g-ecx.images-amazon.com/images/G/01/x-s...,,,,Sisyphus Redux for solo alto flute
6,,634029363,,"[[CDs & Vinyl, Special Interest, Instructional...",Learn to play eight of Clapton's best acoustic...,http://ecx.images-amazon.com/images/I/51TA56RV...,18.99,"([0634029355, 0634029347, 0793520843, 63044989...","(None, None, None, None, None, None, None, Non...",Hal Leonard Eric Clapton - Acoustic Classics (...
7,,634029355,,"[[CDs & Vinyl, Special Interest, Instructional...",Learn the riffs and solos of Eric Clapton's ea...,http://ecx.images-amazon.com/images/I/51XTMRTM...,13.48,"([0634029347, 0634029363, B00C5JH890, B0057CON...","(None, None, None, None, None, None, None, Non...",Hal Leonard Eric Clapton - The Early Years (DVD)
8,,634029347,,"[[CDs & Vinyl, Special Interest, Instructional...","Riffs and solos from 8 classic Clapton songs, ...",http://ecx.images-amazon.com/images/I/51N2Q8Z4...,16.19,"([0634029355, 0634029363, B008FIPNQK, B002DGEG...","(None, None, None, None, None, None, None, Non...",Hal Leonard Eric Clapton - The Solo Years DVD
9,,634029231,,"[[Movies & TV, Movies], [Musical Instruments, ...",,http://ecx.images-amazon.com/images/I/51FXTXCW...,,,"(None, None, None, None, None, None, None, Non...",Best of Stevie Ray Vaughan Signature Licks Gui...


In [42]:
# find out which other products these reviewers also reviewed

# Product metadata; rows without a `categories` value are dropped.
# Bound to `meta` only: a chained `meta = reviews = ...` assignment would
# overwrite the `reviews` DataFrame with metadata rows.
meta = (spark
    .read
    .json('./data/metadata/meta_Musical_Instruments.json.gz')
    .filter('categories is not null'))

In [74]:
# Metadata rows that carry a non-null `related` value.
meta_with_related = meta.where('related is not null')

In [67]:
from pyspark.sql.functions import udf, col

def _leaf_category(categories):
    """Return the last entry of the first category path."""
    first_path = categories[0]
    return first_path[-1]


last = udf(_leaf_category)

# Map each product (asin) to the leaf of its first category path.
product_to_category = meta.select(
    'asin',
    last(col('categories')).alias('category'))

In [136]:
# Spot-check the category assigned to a single product.
product_to_category.where('asin = "0006428320"').show()

+----------+-------------------+
|      asin|           category|
+----------+-------------------+
|0006428320|Sheet Music Folders|
+----------+-------------------+



In [144]:
# NOTE: despite its name this frame is not restricted to one category while
# the filter below stays commented out.
# The keyword must be spelled `ascending`: `sort` silently ignores unknown
# keyword arguments, so a misspelling falls back to an ascending sort.
sheet_music_folders = (product_to_category
 .sort('category', ascending=False)
 .limit(90000)
#  .filter(col('category') == 'Sheet Music Folders')
)
sheet_music_folders.count()

84892

In [162]:
# asins of the products whose category is "Saddles".
saddles = (product_to_category
    .sort('category', ascending=False)
    .limit(90000)
    .where('category = "Saddles"')
    .select('asin'))

In [165]:
# Number of reviews whose product is in the saddles list.
same_asin = reviews['asin'] == saddles['asin']
reviews.join(saddles, same_asin).count()

24

In [119]:
# Reviewers ordered by how many reviews they wrote, most active first.
(reviews
    .groupBy('reviewerID')
    .count()
    .orderBy('count', ascending=False)
    .show())

+--------------+-----+
|    reviewerID|count|
+--------------+-----+
| ADH0O8UVJOT10|   42|
|A1L7M2JXN4EZCR|   38|
|A15TYOEWBQYF0X|   38|
|A2EZWZ8MBEDOLN|   36|
|A2NYK9KWFMJV4Y|   34|
|A1MVH1WLYDHZ49|   32|
|A1SD1C8XK3Z3V1|   32|
|A1GMWTGXW682GB|   29|
|A34O0KQV4QXWNQ|   28|
|A1LQC225SE8UNI|   27|
|A22Z554ZQ8NFPC|   25|
| AJK15Q9JOEHRH|   25|
|A1DVUFG2QSJ6IK|   24|
|A3M1PLEYNDEYO8|   24|
| AKYDGCKCY7H9F|   23|
| A2F92AOWTIUIB|   23|
| A781ITP3HE2N5|   22|
|A3UXW18DP4WSD6|   21|
|A1YP96MT6W9FTB|   21|
|A3ITN3125FJETP|   21|
+--------------+-----+
only showing top 20 rows



In [134]:
# How many distinct products appear in the reviews.
reviewed_asins = reviews.select('asin').distinct()
reviewed_asins.count()

900

In [138]:
# Distinct products in the metadata vs. total number of reviews.
distinct_meta_products = meta.select('asin').distinct().count()
print(distinct_meta_products)
total_reviews = reviews.count()
print(total_reviews)

84892
10261


In [155]:
# Distinct reviewed products; the column is renamed to `asin2` so it does not
# clash with `asin` on the other side of the join.
products = reviews.select(reviews['asin'].alias('asin2')).distinct()

# Category lookup restricted to products that actually have reviews.
asin_match = products['asin2'] == sheet_music_folders['asin']
refined_dictionary = (products
    .join(sheet_music_folders, asin_match)
    .select('asin', 'category'))

In [156]:
# Print the first 20 rows (the default row count of `show`).
refined_dictionary.show(n=20)

+----------+--------------------+
|      asin|            category|
+----------+--------------------+
|B000MWWT6E|Distortion & Over...|
|B000P5NXWM|             Saddles|
|B000RNB720|Guitar & Bass Acc...|
|B002R2IUEW|               Picks|
|B002T45X1G|Condenser Microph...|
|B005MR6IHK|              Tuning|
|B0002D0Q2W|        Equalization|
|B00267OCTA|Multiple-Guitar S...|
|B004U1QDL0|Chorus, Flange & ...|
|B0002CZUUG|          Solid Body|
|B001SC4I16|            Ukuleles|
|B009MBT68U|  Sheet Music Stands|
|B0002D02IU|Steel-string Acou...|
|B0002E1O3G|Electric Guitar S...|
|B0002E1O90|Acoustic Guitar S...|
|B001E3SFKO|Windsreens & Pop ...|
|B002IC1D5E|Electric Guitar S...|
|B003BFYDBS|              Straps|
|B004PFWYL4|Electric Guitar B...|
|B000AC6DVS|Guitar & Bass Acc...|
+----------+--------------------+
only showing top 20 rows



In [None]:
# Product id together with its `related` struct.
meta.select('asin', 'related')

In [68]:


# NOTE(review): `same_category` is not defined anywhere in the visible part of
# this notebook — confirm it exists (presumably a UDF taking the `related`
# and `asin` columns) before running this cell.
products_in_same_subcategory = meta.select('asin', same_category(col('related'), col('asin')))

+---------------+----------+-----+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+
|_corrupt_record|      asin|brand|          categories|         description|               imUrl|price|             related|           salesRank|               title|
+---------------+----------+-----+--------------------+--------------------+--------------------+-----+--------------------+--------------------+--------------------+
|           null|0006428320| null|[WrappedArray(Mus...|                null|http://ecx.images...|17.95|                null|[null,null,null,n...|Six Sonatas For T...|
|           null|0014072149|     |[WrappedArray(Mus...|Composer: J.S. Ba...|http://ecx.images...|18.77|[null,WrappedArra...|[null,null,null,n...|Double Concerto i...|
|           null|0041291905| null|[WrappedArray(Mus...|Vivaldi's famous ...|http://ecx.images...| null|                null|[null,null,null,n...|Hal Leonard Vival...

In [20]:
# Look up the `related` products recorded in the metadata for one product.

is_target_product = col('asin') == 'B0002E1G5C'
related_reviews = meta.where(is_target_product).select('related')
related_reviews.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [54]:
# Show the full category paths recorded in the metadata for one product.

is_target_product = col('asin') == 'B0002E1G5C'
categories = meta.where(is_target_product).select('categories')
categories.show(truncate=False)

+------------------------------------------------------------------------------------------------------+
|categories                                                                                            |
+------------------------------------------------------------------------------------------------------+
|[WrappedArray(Musical Instruments, Instrument Accessories, Guitar & Bass Accessories, String Winders)]|
+------------------------------------------------------------------------------------------------------+



In [6]:
# building a full recommender system

# Plain integer literal: the `L` long suffix is Python-2-only syntax and a
# SyntaxError on Python 3.
seed = 1800009193

# Seeded random split with weights 6:2:2 (training / validation / test).
(split_60_df, split_a_20_df, split_b_20_df) = reviews.randomSplit([6.0, 2.0, 2.0], seed)

# Cache each split; they are counted here and shown below.
training_df = split_60_df.cache()
validation_df = split_a_20_df.cache()
test_df = split_b_20_df.cache()

print('Training: {0}, validation: {1}, test: {2}\n'.format(
  training_df.count(), validation_df.count(), test_df.count())
)
training_df.show(3)
validation_df.show(3)
test_df.show(3)

Training: 6151, validation: 2048, test: 2062

+----------+-------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342| [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342| [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342| [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
+----------+-------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
only showing top

In [13]:
# Print the column names and types of the reviews DataFrame.
reviews.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [29]:
# convert reviewerID and asin from strings to numeric indices

from pyspark.ml.feature import StringIndexer

# Review count per reviewer.
reviewer_list = reviews.groupBy('reviewerID').count()

# One indexer per string column; each adds a numeric index column.
indexer = StringIndexer(inputCol="reviewerID", outputCol="reviewerIndex")
indexer2 = StringIndexer(inputCol="asin", outputCol="asinIndex")

# Apply the indexers one after the other, giving each stage its own name so
# `indexed` is assigned exactly once.
reviews_with_reviewer_index = indexer.fit(reviews).transform(reviews)
indexed = indexer2.fit(reviews_with_reviewer_index).transform(reviews_with_reviewer_index)

# Preview only: limit on the Spark side instead of collecting every row.
indexed.limit(10).toPandas()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,reviewerIndex,asinIndex
0,1384719342,"[0, 0]",5.0,"Not much to write about here, but it does exac...","02 28, 2014",A2IBPI20UZIR0U,"cassandra tu ""Yeah, well, that's just like, u...",good,1393545600,72.0,781.0
1,1384719342,"[13, 14]",5.0,The product does exactly as it should and is q...,"03 16, 2013",A14VAT5EAX3D9S,Jake,Jake,1363392000,359.0,781.0
2,1384719342,"[1, 1]",5.0,The primary job of this device is to block the...,"08 28, 2013",A195EZSQDW3E21,"Rick Bennette ""Rick Bennette""",It Does The Job Well,1377648000,436.0,781.0
3,1384719342,"[0, 0]",5.0,Nice windscreen protects my MXL mic and preven...,"02 14, 2014",A2C00NNG1ZQQG2,"RustyBill ""Sunday Rocker""",GOOD WINDSCREEN FOR THE MONEY,1392336000,1216.0,781.0
4,1384719342,"[0, 0]",5.0,This pop filter is great. It looks and perform...,"02 21, 2014",A94QU4C90B1AX,SEAN MASLANKA,No more pops when I record my vocals.,1392940800,1137.0,781.0
5,B00004Y2UT,"[0, 0]",5.0,So good that I bought another one. Love the h...,"12 21, 2012",A2A039TZMZHH9Y,"Bill Lewey ""blewey""",The Best Cable,1356048000,54.0,629.0
6,B00004Y2UT,"[0, 0]",5.0,"I have used monster cables for years, and with...","01 19, 2014",A1UPZM995ZAH90,Brian,Monster Standard 100 - 21' Instrument Cable,1390089600,348.0,629.0
7,B00004Y2UT,"[0, 0]",3.0,I now use this cable to run from the output of...,"11 16, 2012",AJNFQI3YR6XJ5,"Fender Guy ""Rick""",Didn't fit my 1996 Fender Strat...,1353024000,324.0,629.0
8,B00004Y2UT,"[0, 0]",5.0,Perfect for my Epiphone Sheraton II. Monster ...,"07 6, 2008",A3M1PLEYNDEYO8,"G. Thomas ""Tom""",Great cable,1215302400,12.0,629.0
9,B00004Y2UT,"[0, 0]",5.0,Monster makes the best cables and a lifetime w...,"01 8, 2014",AMNTZU1YQN1TH,Kurt Robair,Best Instrument Cables On The Market,1389139200,185.0,629.0


In [30]:
# Plain integer literal: the `L` long suffix is Python-2-only syntax and a
# SyntaxError on Python 3.
seed = 1800009193

# Seeded random split of the indexed reviews with weights 6:2:2
# (training / validation / test).
(split_60_df, split_a_20_df, split_b_20_df) = indexed.randomSplit([6.0, 2.0, 2.0], seed)

# Cache each split; they are counted here and reused by the model cells.
training_df = split_60_df.cache()
validation_df = split_a_20_df.cache()
test_df = split_b_20_df.cache()

print('Training: {0}, validation: {1}, test: {2}\n'.format(
  training_df.count(), validation_df.count(), test_df.count())
)

# Preview only: limit on the Spark side instead of collecting the whole
# training set to the driver.
training_df.limit(10).toPandas()

Training: 6151, validation: 2048, test: 2062



Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,reviewerIndex,asinIndex
0,1384719342,"[0, 0]",5.0,Nice windscreen protects my MXL mic and preven...,"02 14, 2014",A2C00NNG1ZQQG2,"RustyBill ""Sunday Rocker""",GOOD WINDSCREEN FOR THE MONEY,1392336000,1216.0,781.0
1,1384719342,"[0, 0]",5.0,"Not much to write about here, but it does exac...","02 28, 2014",A2IBPI20UZIR0U,"cassandra tu ""Yeah, well, that's just like, u...",good,1393545600,72.0,781.0
2,1384719342,"[1, 1]",5.0,The primary job of this device is to block the...,"08 28, 2013",A195EZSQDW3E21,"Rick Bennette ""Rick Bennette""",It Does The Job Well,1377648000,436.0,781.0
3,1384719342,"[13, 14]",5.0,The product does exactly as it should and is q...,"03 16, 2013",A14VAT5EAX3D9S,Jake,Jake,1363392000,359.0,781.0
4,B00004Y2UT,"[0, 0]",5.0,Monster makes the best cables and a lifetime w...,"01 8, 2014",AMNTZU1YQN1TH,Kurt Robair,Best Instrument Cables On The Market,1389139200,185.0,629.0
5,B00004Y2UT,"[0, 0]",5.0,So good that I bought another one. Love the h...,"12 21, 2012",A2A039TZMZHH9Y,"Bill Lewey ""blewey""",The Best Cable,1356048000,54.0,629.0
6,B00005ML71,"[0, 0]",4.0,I got it to have it if I needed it. I have fou...,"04 22, 2014",A35QFQI0M46LWO,Christopher C,It works great but I hardly use it.,1398124800,425.0,870.0
7,B00005ML71,"[0, 0]",5.0,I bought this to use in my home studio to cont...,"12 31, 2012",A17SLR18TUMULM,Straydogger,It works!,1356912000,651.0,870.0
8,B00005ML71,"[0, 0]",5.0,"I love it, I used this for my Yamaha ypt-230 a...","06 16, 2013",A1C0O09LOLVI39,Michael,awesome,1371340800,55.0,870.0
9,B000068NSX,"[0, 0]",4.0,This Fender cable is the perfect length for me...,"08 13, 2013",AKSFZ4G1AXYFC,"C.E. ""Frank""",Durable Instrument Cable,1376352000,93.0,538.0


In [33]:
# Train one ALS model per candidate rank and keep the one with the lowest
# validation RMSE.
# This step is broken in ML Pipelines: https://issues.apache.org/jira/browse/SPARK-14489
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# ALS learner over the indexed reviewer/product columns, predicting `overall`.
als = ALS()
als.setMaxIter(5)\
    .setSeed(seed)\
    .setRegParam(0.1)\
    .setUserCol('reviewerIndex')\
    .setItemCol('asinIndex')\
    .setRatingCol('overall')

# RMSE evaluator comparing the `prediction` column against the `overall` label.
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="overall", metricName="rmse")

tolerance = 0.03  # not used in this cell
ranks = [20, 24, 28]
errors = []   # validation RMSE per entry of `ranks`
models = []   # fitted model per entry of `ranks`
min_error = float('inf')
best_rank = -1  # index into `ranks` / `models`, not the rank value itself
for rank_index, rank in enumerate(ranks):
    als.setRank(rank)
    model = als.fit(training_df)

    # Predict against the validation split.
    predict_df = model.transform(validation_df)

    # Remove NaN values from prediction (due to SPARK-14489). Spark SQL treats
    # NaN as equal to NaN, so this comparison does drop the NaN rows.
    predicted_ratings_df = predict_df.filter(predict_df.prediction != float('nan'))

    error = reg_eval.evaluate(predicted_ratings_df)
    errors.append(error)
    models.append(model)
    # Single-argument print call: valid on both Python 2 and Python 3.
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank_index

als.setRank(ranks[best_rank])
print('The best model was trained with rank %s' % ranks[best_rank])
my_model = models[best_rank]

For rank 20 the RMSE is 2.18499637444
For rank 24 the RMSE is 1.91118127249
For rank 28 the RMSE is 1.9408043433
The best model was trained with rank 24


In [35]:
# Score the selected model on the held-out test split.
# In ML Pipelines, transform() has a bug that produces unwanted NaN values, so
# they are filtered out. See https://issues.apache.org/jira/browse/SPARK-14489
predict_df = my_model.transform(test_df)

# Keep only the rows whose prediction is not NaN.
prediction_is_not_nan = predict_df.prediction != float('nan')
predicted_test_df = predict_df.where(prediction_is_not_nan)

# RMSE of the remaining predictions, using the evaluator created earlier.
test_RMSE = reg_eval.evaluate(predicted_test_df)

print('The model had a RMSE on the test set of {0}'.format(test_RMSE))

The model had a RMSE on the test set of 1.98658845486


In [37]:
# Baseline: predict the training-set mean rating for every test review.
import pyspark.sql.functions as F

# One-row, one-column DataFrame holding the mean of `overall`.
avg_rating_df = training_df.agg(F.avg('overall'))

# Pull the scalar out of that single row.
training_avg_rating = avg_rating_df.first()[0]

print('The average rating for movies in the training set is {0}'.format(training_avg_rating))

# Attach the constant mean as the `prediction` column.
constant_prediction = F.lit(training_avg_rating)
test_for_avg_df = test_df.withColumn('prediction', constant_prediction)

# RMSE of the constant predictor, using the evaluator created earlier.
test_avg_RMSE = reg_eval.evaluate(test_for_avg_df)

print("The RMSE on the average set is {0}".format(test_avg_RMSE))

The average rating for movies in the training set is 4.4999187124
The RMSE on the average set is 0.894400260161
