In [268]:
import pandas as pd
import pyspark
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SQLContext
from batcave import get_amazon_list_ids
import pyspark.sql.functions as F
from pyspark.sql.types import BooleanType

Importing Amazon reviews through Spark

In [2]:
spark = (pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate())

In [6]:
movies_df = spark.read.json('data/reviews_Movies_and_TV.json')

In [6]:
books_df = spark.read.json('data/reviews_Books.json')

In [6]:
metadata = spark.read.json('data/metadata.json')

These are complete datasets from a project by [Julien McAuley.](http://jmcauley.ucsd.edu/data/amazon/links.html) To help narrow my results, I started by scraping the top 100 DC & Marvel Graphic Novels for their ASIN, so that I could start with an understanding of where to find the reviews I want to target. 

In [10]:
#DC Comics - top ID's
dc_url_1 = "https://www.amazon.com/gp/bestsellers/books/193766/ref=pd_zg_hrsr_books"
dc_url_2 = "https://www.amazon.com/Best-Sellers-Books-DC-Comics-Graphic-Novels/zgbs/books/193766/ref=zg_bs_pg_2?_encoding=UTF8&pg=2"
dc_ids_1 = get_amazon_list_ids(dc_url_1)
dc_ids_2 = get_amazon_list_ids(dc_url_2)

In [9]:
# Marvel Comics - top ID's
marvel_url_1 = "https://www.amazon.com/gp/bestsellers/books/4400/ref=pd_zg_hrsr_books"
marvel_url_2 = "https://www.amazon.com/Best-Sellers-Books-Marvel-Comics-Graphic-Novels/zgbs/books/4400/ref=zg_bs_pg_2?_encoding=UTF8&pg=2"
marvel_ids_1 = get_amazon_list_ids(marvel_url_1)
marvel_ids_2 = get_amazon_list_ids(marvel_url_2)

In [11]:
all_ids = dc_ids_1 + dc_ids_2 + marvel_ids_1 + marvel_ids_2

Saving these id's temporarily to a csv, to preserve them.

In [None]:
import csv

with open('data/top_dc_marvel_comics.csv', 'w') as f:
    wr = csv.writer(f)
    wr.writerow(all_ids)

In [66]:
top_comic_meta = metadata.filter(col("asin").isin(all_ids)).toPandas()
top_comic_meta.shape

KeyboardInterrupt: 

In [362]:
top_comic_meta.head(20)

Unnamed: 0,_corrupt_record,asin,brand,categories,description,imUrl,price,related,salesRank,title
0,,014038572X,,[[Books]],"According to Ponyboy, there are two kinds of p...",http://ecx.images-amazon.com/images/I/61Xc9GWp...,4.99,"([0078205409, 0440932211, 0547534264, 01403896...","(None, None, None, None, None, 3106, None, Non...",The Outsiders
1,,0785117210,,[[Books]],,http://ecx.images-amazon.com/images/I/51upWJjH...,10.49,"([0785123202, 0785157050, 078512179X, 07851329...","(None, None, None, None, None, 19065, None, No...",House of M
2,,0785121056,,[[Books]],,http://ecx.images-amazon.com/images/I/61iW2SXT...,11.49,"([0785131272, 0785131280, 0785166432, 07851565...","(None, None, None, None, None, 95214, None, No...",Infinity War
3,,140122427X,,[[Books]],,http://ecx.images-amazon.com/images/I/81g0BQDw...,20.0,"([1401228798, 1401230970, 1401233902, 14012349...","(None, None, None, None, None, 793, None, None...",Fables: The Deluxe Edition Book One
4,,1401229697,,[[Books]],"Starred Review. A stunning, moving story about...",http://ecx.images-amazon.com/images/I/71%2BCEZ...,11.01,"([1935429000, 160309038X, 1603090746, 17704604...","(None, None, None, None, None, 6414, None, Non...",Daytripper
5,,1401230067,,[[Books]],,http://ecx.images-amazon.com/images/I/81IQpmRp...,11.49,"([1401233023, 1401235190, 1401236901, 14012380...","(None, None, None, None, None, 11659, None, No...","John Constantine, Hellblazer, Vol. 1: Original..."
6,,1401233791,,[[Books]],Dennis O'Neil is the influential writer of com...,http://ecx.images-amazon.com/images/I/81hLFMDQ...,16.09,"([1401235360, 1401237215, 1401232744, 14012338...","(None, None, None, None, None, 9262, None, Non...","Batman: Knightfall, Vol. 1"
7,,1401233384,,[[Books]],"Geoff Johns, a Detroit native, brings a Hollyw...",http://ecx.images-amazon.com/images/I/81LZ1mW0...,10.0,"([1401234054, 1401230016, 1401234488, 14012343...","(None, None, None, None, None, 21061, None, No...",Flashpoint
8,,1401237789,,[[Books]],Praise for Scott Snyder'sBatman: Court of Owls...,http://ecx.images-amazon.com/images/I/91DDC3%2...,9.36,"([1401235425, 1401246028, 1401242529, 14012402...","(None, None, None, None, None, 1361, None, Non...",Batman Vol. 2: The City of Owls (The New 52)
9,,1401242383,,[[Books]],Warren Ellis is a prolific writer whose works ...,http://ecx.images-amazon.com/images/I/81adYxKE...,47.73,"([1401238998, 1401242758, 1616552379, 07851656...","(None, None, None, None, None, 18519, None, No...",The Planetary Omnibus


In [189]:
top_comic_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 10 columns):
_corrupt_record    0 non-null object
asin               16 non-null object
brand              0 non-null object
categories         16 non-null object
description        9 non-null object
imUrl              16 non-null object
price              16 non-null float64
related            16 non-null object
salesRank          15 non-null object
title              15 non-null object
dtypes: float64(1), object(9)
memory usage: 1.3+ KB


In [123]:
top_meta = top_comic_meta.drop([0,12,13,14, 15])

In [233]:
top_meta['related'].head()

1    ([0785123202, 0785157050, 078512179X, 07851329...
2    ([0785131272, 0785131280, 0785166432, 07851565...
3    ([1401228798, 1401230970, 1401233902, 14012349...
4    ([1935429000, 160309038X, 1603090746, 17704604...
5    ([1401233023, 1401235190, 1401236901, 14012380...
Name: related, dtype: object

In [239]:
more_comic_ids = list(set([val for meta in top_meta.related.tolist() for val in meta[0]]))

In [240]:
more_comic_meta = metadata.filter(col("asin").isin(more_comic_ids)).toPandas()

In [232]:
more_comic_meta['related'].head()

0    ([0375714545, 0618871713, 1570614598, 18918304...
1    ([0316107093, 0316107107, 0316107298, 03161073...
2    ([1606904388, 0345506391, 1606901605, 16069021...
3    ([1932664165, 1620101130, 1620100045, 14424659...
4    ([1560974273, 0375404538, 0375714545, 03073773...
Name: related, dtype: object

In [258]:
more_related = list(set([val for meta in top_meta.related.tolist() for val in meta[0] if meta[0] is not None]))

In [260]:
more_meta = metadata.filter(col("asin").isin(more_related)).toPandas()

In [342]:
more_meta['related'].head()

0    ([0375714545, 0618871713, 1570614598, 18918304...
1    ([0316107093, 0316107107, 0316107298, 03161073...
2    ([1606904388, 0345506391, 1606901605, 16069021...
3    ([1932664165, 1620101130, 1620100045, 14424659...
4    ([1560974273, 0375404538, 0375714545, 03073773...
Name: related, dtype: object

In [308]:
def asin_starts_with(asin, digit='1607'):
    """Filter based on beginning ASIN numeric value"""
    if type(asin) == str:
        if asin.startswith(digit):
            return True
    return False
    

asin_starts_udf = F.udf(asin_starts_with, BooleanType())

In [309]:
image_meta = metadata.filter(asin_starts_udf(metadata['asin'], F.lit('1607')))

In [310]:
image_meta.show(5)

KeyboardInterrupt: 

In [319]:
query = """
SELECT 
    asin,
    title,
    categories
    FROM metadata
    WHERE asin LIKE '160706%'
"""

In [320]:
metadata.createOrReplaceTempView('metadata')

In [321]:
image_meta = spark.sql(query).toPandas()

In [327]:
image_meta.head()

Unnamed: 0,asin,title,categories
0,1607060051,Screamland,[[Books]]
1,1607060191,Ted McKeever Library Book 3: Metropol (Bk. 3),[[Books]]
2,1607060086,Bruce: The Little Blue Spruce,[[Books]]
3,1607060043,"Outlaw Territory, Vol. 1",[[Books]]
4,160706023X,Zombie Cop,[[Books]]


In [328]:
query_2 = """
SELECT 
    asin,
    title,
    categories
    FROM metadata
    WHERE asin LIKE '07851%'
"""

In [329]:
metadata.createOrReplaceTempView('metadata')

In [330]:
marvel_meta = spark.sql(query_2).toPandas()

In [363]:
query_3 = """
SELECT 
    asin,
    title,
    categories
    FROM metadata
    WHERE asin LIKE '14012%'
"""

In [364]:
metadata.createOrReplaceTempView('metadata')

In [365]:
dc_meta = spark.sql(query_3).toPandas()

In [367]:
dc_meta.head()

Unnamed: 0,asin,title,categories
0,1401200168,Green Lantern: Emerald Dawn II,[[Books]]
1,1401200141,"Atom, The - Archives, Volume 2 (DC Archive Edi...",[[Books]]
2,1401200338,High Roads (Cliffhanger!),[[Books]]
3,1401200346,Batman/Deathblow: After the Fire (Batman Beyon...,[[Books]]
4,1401200370,Batman: Absolution,[[Books]]


In [343]:
top_merge = top_comic_meta[['asin', 'title', 'categories']]
m1_merge = more_comic_meta[['asin', 'title', 'categories']]
m2_merge = more_meta[['asin', 'title', 'categories']]

In [380]:
all_comic_df = pd.concat([top_merge, m1_merge, m2_merge, image_meta, marvel_meta, dc_meta])
all_comic_df['categories'] = all_comic_df['categories'].astype(str)
all_comic_df.head()

Unnamed: 0,asin,title,categories
0,014038572X,The Outsiders,[['Books']]
1,0785117210,House of M,[['Books']]
2,0785121056,Infinity War,[['Books']]
3,140122427X,Fables: The Deluxe Edition Book One,[['Books']]
4,1401229697,Daytripper,[['Books']]


In [381]:
all_comics_df = all_comic_df.drop_duplicates()

In [382]:
all_comics_df[all_comics_df['categories'] != '[[Books]]']

Unnamed: 0,asin,title,categories
0,014038572X,The Outsiders,[['Books']]
1,0785117210,House of M,[['Books']]
2,0785121056,Infinity War,[['Books']]
3,140122427X,Fables: The Deluxe Edition Book One,[['Books']]
4,1401229697,Daytripper,[['Books']]
5,1401230067,"John Constantine, Hellblazer, Vol. 1: Original...",[['Books']]
6,1401233791,"Batman: Knightfall, Vol. 1",[['Books']]
7,1401233384,Flashpoint,[['Books']]
8,1401237789,Batman Vol. 2: The City of Owls (The New 52),[['Books']]
9,1401242383,The Planetary Omnibus,[['Books']]


In [383]:
all_comics_df = all_comics_df.drop([13,14,15,380,369,777,1092], axis=0)

In [387]:
all_comics_df.to_csv('data/all_comic_asin.csv')

In [385]:
comic_reviews = books_df.filter(col('asin').isin(all_comics_df.asin.tolist())).toPandas()

Py4JJavaError: An error occurred while calling o11514.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 86 in stage 63.0 failed 1 times, most recent failure: Lost task 86.0 in stage 63.0 (TID 2389, localhost, executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3257)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3254)
	at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3364)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3363)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3254)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [291]:
metadata.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- description: string (nullable = true)
 |-- imUrl: string (nullable = true)
 |-- price: double (nullable = true)
 |-- related: struct (nullable = true)
 |    |-- also_bought: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- also_viewed: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- bought_together: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- buy_after_viewing: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- salesRank: struct (nullable = true)
 |    |-- Appliances: long (nullable = true)
 |    |-- Arts, Crafts & Sewing: long (nullable = true)
 |    |-- Automotive: long (nullable = true)
 |    |-- Baby: long (nullable = true)
 |    |-- Beauty: long (nullable = true)


In [85]:
marvel_reco = marvel_meta['related'].tolist()
temp = [also[0] for also in marvel_reco]

In [86]:
all_marvel_recos = [item for sublist in temp for item in sublist]

In [88]:
all_marvel_reco_df = metadata.filter(col('asin').isin(all_marvel_recos))

In [90]:
all_marvel_reco_df.count()

426

In [246]:
metadata_split = metadata.withColumn('bought', metadata.related.also_bought)

In [253]:
testing = metadata_split.limit(5).toPandas()

[None,
 None,
 None,
 ['0000032050',
  'B00D0DJAEG',
  '0000032042',
  'B00D0F450I',
  'B00D2JTMS2',
  'B00D0FDUAY',
  'B00D2JSRFQ',
  '0000032034',
  'B00D0D5F6S',
  'B00D2JRWWA',
  'B00D0FIIJM',
  'B00D0FCQQI',
  'B00EXVN9PU',
  'B0041EOTJO',
  'B004PYEE8G',
  'B001GTKPDQ',
  'B00EON0SJ2',
  'B005HMHOQ4',
  'B002XZMGGQ'],
 ['B002BZX8Z6',
  'B00JHONN1S',
  '0000031895',
  'B00D2K1M3O',
  '0000031852',
  'B00D0WDS9A',
  'B00D10CLVW',
  'B00D103F8U',
  'B003AVEU6G',
  'B00D2K0PA0',
  'B002GZGI4E',
  'B00D0ZF44Y',
  'B008F0SMUC',
  'B00D0GCI8S',
  'B008F0SU0Y',
  'B002YSCPZY',
  '0448408775',
  'B002R0FABA',
  'B008GHWNWC',
  'B002R0FA24',
  'B001GTKPEK',
  'B006XA7KZO',
  'B001GZUQ9S',
  'B00613VNL0',
  'B003IEDM9Q',
  'B003LTOZK8',
  'B003AVNY6I',
  'B008UBQZKU',
  'B001AQD8VQ',
  'B003ILA0L2',
  'B00AFDOPDA',
  'B002R0F7FE']]

In [174]:
metadata.dtypes

[('_corrupt_record', 'string'),
 ('asin', 'string'),
 ('brand', 'string'),
 ('categories', 'array<array<string>>'),
 ('description', 'string'),
 ('imUrl', 'string'),
 ('price', 'double'),
 ('related',
  'struct<also_bought:array<string>,also_viewed:array<string>,bought_together:array<string>,buy_after_viewing:array<string>>'),
 ('salesRank',
  'struct<Appliances:bigint,Arts, Crafts & Sewing:bigint,Automotive:bigint,Baby:bigint,Beauty:bigint,Books:bigint,Camera &amp; Photo:bigint,Cell Phones & Accessories:bigint,Clothing:bigint,Computers & Accessories:bigint,Electronics:bigint,Gift Cards Store:bigint,Grocery & Gourmet Food:bigint,Health & Personal Care:bigint,Home &amp; Kitchen:bigint,Home Improvement:bigint,Industrial & Scientific:bigint,Jewelry:bigint,Kitchen & Dining:bigint,Magazines:bigint,Movies & TV:bigint,Music:bigint,Musical Instruments:bigint,Office Products:bigint,Patio, Lawn & Garden:bigint,Pet Supplies:bigint,Prime Pantry:bigint,Shoes:bigint,Software:bigint,Sports &amp; Outd

In [79]:
more_df = metadata.filter(col('asin').startswith('0785'))

In [28]:
titles = meta_df.select('asin','title').collect()

In [45]:
fant = metadata.filter(col('asin').startswith('B07'))

In [80]:
more_df.count()

6806

In [182]:
meta_df.count()

2077

In [44]:
len(titles)

2077

In [43]:
import csv

with open('all_dc.csv', 'w') as f:
    w = csv.writer(f)
    for val in titles:
        w.writerow((val[0], val[1]))

In [177]:
dc_reviews.to_csv('dc_comic_top100.csv')

In [140]:
dc_reviews.drop(['helpful','reviewText','reviewTime','reviewerName','summary','unixReviewTime'], axis=1, inplace=True)

In [141]:
dc_reviews['asin'].value_counts()

1401232590    307
1401220347    294
1401223176    284
0345391810    178
1401229697     84
1401232051     81
1401229352     72
1401235360     52
1401238963     37
1401242383     31
140123884X     13
1401247113      8
B00CU06XFY      3
Name: asin, dtype: int64

In [36]:
comic_readers = list(set(dc_reviews['reviewerID'].tolist()))

In [32]:
movies_df.show(3)

+----------+-------+-------+--------------------+-----------+--------------+------------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|      reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------+------------------+--------------------+--------------+
|0000143502| [0, 0]|    5.0|This has some gre...|01 17, 2013|A3R5OBKS7OM2IR|Rebecca L. Johnson| Alton... nough said|    1358380800|
|0000143529| [0, 0]|    5.0|This is a great p...| 10 2, 2013|A3R5OBKS7OM2IR|Rebecca L. Johnson|         Ah Alton...|    1380672000|
|0000143561| [2, 4]|    2.0|I have to admit t...|07 17, 2008| AH3QC2PC1VTGP|   Great Home Cook|Don't waste your ...|    1216252800|
+----------+-------+-------+--------------------+-----------+--------------+------------------+--------------------+--------------+
only showing top 3 rows



In [38]:
movie_reviews = movies_df.filter(col("reviewerID").isin(comic_readers)).toPandas()

In [39]:
movie_reviews.shape

(31343, 9)

In [50]:
movie_reviews.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,5119367,"[0, 0]",5.0,I've watched this movie every six months or so...,"06 11, 2012",AE62TIY3899YL,"Daniel J. Silva ""Superman_242""",Best of the Bible Collection AND best telling ...,1339372800
1,307141950,"[4, 4]",5.0,I suppose giving this video a 5-star is rather...,"01 24, 2003",A2XVRCU5DQBULH,"L. Varnau ""nerff20""",From the voice of Little Critter...,1043366400
2,307142493,"[3, 3]",5.0,It is rather ironic to think that the elfin Mi...,"12 3, 2003",A2NJO6YE954DBH,Lawrance M. Bernabo,The charming story of how Kris Kringle became ...,1070409600
3,307514161,"[1, 1]",5.0,"In just telling the story in the song ""Rudolph...","12 25, 2005",A2NJO6YE954DBH,Lawrance M. Bernabo,The deservedly beloved 1964 Bass-Rakin televis...,1135468800
4,307514161,"[0, 0]",5.0,I bought a bunch of these old holiday classics...,"12 19, 2010",A3PZ88WU7RPTP6,TeamQuinnHQ,Mine!,1292716800


In [51]:
movie_reviews.drop(['helpful','reviewText','reviewTime','reviewerName','summary','unixReviewTime'], axis=1, inplace=True)

In [55]:
dc_reviews.head()

Unnamed: 0,asin,overall,reviewerID
0,014038572X,5.0,A2INGRA4M7D0QH
1,014038572X,5.0,AC1MHHZVWJE1P
2,014038572X,5.0,A38Z66H58RIJ64
3,014038572X,1.0,A77UMI1QYXJF9
4,014038572X,5.0,A3BVJTBO17JTZE


In [93]:
def new_id_column(df, column, suffix_val, new_name):
    """Take in column with unique indexes, return new index values"""
    unique_vals = list(set(df[column].tolist()))
    new_ids = [int(str(i) + suffix_val) for i in range(1,len(unique_vals)+1)]
    new_id_dict = {k:v for k,v in zip(unique_vals, new_ids)}
    df[new_name] = df[column].apply(lambda x: new_id_dict[x])

In [77]:
asins = list(set(dc_reviews['asin'].tolist()))
len(asins)

14

In [78]:
dc_id = [int(str(i) + '00') for i in range(1,len(asins)+1)]
len(dc_id)

14

In [79]:
new_id_dict = {k:v for k,v in zip(asins, dc_id)}
dc_reviews['amazon_id'] = dc_reviews['asin'].apply(lambda x: new_id_dict[x])
dc_reviews.head()

Unnamed: 0,asin,overall,reviewerID,amazon_id
0,014038572X,5.0,A2INGRA4M7D0QH,1300
1,014038572X,5.0,AC1MHHZVWJE1P,1300
2,014038572X,5.0,A38Z66H58RIJ64,1300
3,014038572X,1.0,A77UMI1QYXJF9,1300
4,014038572X,5.0,A3BVJTBO17JTZE,1300


In [80]:
movie_asins = list(set(movie_reviews['asin'].tolist()))
len(movie_asins)

15964

In [81]:
m_id = [int(str(i) + '99') for i in range(1,len(movie_asins)+1)]
len(m_id)

15964

In [83]:
m_id_dict = {k:v for k,v in zip(movie_asins, m_id)}
movie_reviews['amazon_id'] = movie_reviews['asin'].apply(lambda x: m_id_dict[x])
movie_reviews.head()

Unnamed: 0,asin,overall,reviewerID,amazon_id
0,5119367,5.0,AE62TIY3899YL,1065299
1,307141950,5.0,A2XVRCU5DQBULH,1174199
2,307142493,5.0,A2NJO6YE954DBH,621099
3,307514161,5.0,A2NJO6YE954DBH,839799
4,307514161,5.0,A3PZ88WU7RPTP6,839799


In [84]:
all_reviews = pd.concat([dc_reviews, movie_reviews], axis=0)

In [87]:
all_reviews.drop('asin', axis=1, inplace=True)

In [88]:
all_reviews.head()

Unnamed: 0,overall,reviewerID,amazon_id
0,5.0,A2INGRA4M7D0QH,1300
1,5.0,AC1MHHZVWJE1P,1300
2,5.0,A38Z66H58RIJ64,1300
3,1.0,A77UMI1QYXJF9,1300
4,5.0,A3BVJTBO17JTZE,1300


In [94]:
new_id_column(all_reviews, 'reviewerID', suffix_val='88', new_name='amazon_user_id')

In [96]:
all_reviews.drop('reviewerID', axis=1, inplace=True)

In [169]:
all_reviews.head()

Unnamed: 0,overall,amazon_id,amazon_user_id
0,5.0,1300,73188
1,5.0,1300,57288
2,5.0,1300,53288
3,1.0,1300,22088
4,5.0,1300,33688


In [107]:
all_reviews_spark = spark.createDataFrame(all_reviews)

In [108]:
all_reviews_spark.persist()

DataFrame[overall: double, amazon_id: bigint, amazon_user_id: bigint]

In [158]:
(train, test) = all_reviews_spark.randomSplit([.8,.2])

In [190]:
# Build the recommendation model using ALS
als = ALS(userCol='amazon_user_id', itemCol='amazon_id', ratingCol='overall', nonnegative=True)

als_model = als.fit(train)

In [191]:
pred = als_model.transform(test)

In [192]:
pred_df = pred.toPandas()

In [193]:
pred_df = pred_df.dropna(axis=0)

In [194]:
pred = spark.createDataFrame(pred_df)

In [195]:
al = als_model.extractParamMap

In [196]:
print(al)

<bound method Params.extractParamMap of ALS_c17a9a3dcb6b>


In [197]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall",
                                predictionCol="prediction")

rmse = evaluator.evaluate(pred)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.3328492850220897


In [None]:
& (pred_df['amazon_id'].astype(str).str.endswith('00'))

In [231]:
pred_df.loc[pred_df['amazon_user_id'] == 94688]

Unnamed: 0,overall,amazon_id,amazon_user_id,prediction
529,5.0,300,94688,2.584244


In [122]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


als_model =  ALS(userCol="amazon_user_id", itemCol="amazon_id", ratingCol="overall")

                 
params = ParamGridBuilder().addGrid(als_model.regParam, [0.01,0.001,0.1]).addGrid(als_model.rank, [4,10,50]).build()


## instantiating crossvalidator estimator
cv = CrossValidator(estimator=als_model, estimatorParamMaps=params,evaluator=evaluator,parallelism=4)
best_model = cv.fit(train)    

# We see the best model has a rank of 50, so we will use that in our future models with this dataset
best_model.bestModel.rank

50

In [None]:
query = """
SELECT 
    asin,
    title
FROM metadta
"""

In [None]:
metadta.createOrReplaceTempView('metadta')

In [None]:
met_df = spark.sql(query)

In [None]:
books = met_df.select("asin", met_df.salesRank['Books']
                     .alias('book_ranks'), "title").filter('book_ranks is not null').collect()

In [None]:
print(image_comics)

In [None]:
image_comics = []
for i in range(len(books)):
    if books[i]['book_ranks'] == 4395:
        image_comics.append(books[i])

* non-negative = TRUE
* search latent features
* See the cluster
* Clusters that containt both 
* Combine clusters 
* Reduce ranks for increased 