In [92]:
import pandas as pd
import csv
import re

import pyspark
from pyspark.sql.functions import col
from pyspark.sql import SQLContext

from batcave import get_amazon_list_ids

### Importing Amazon reviews through Spark

These are complete datasets from a project by [Julien McAuley.](http://jmcauley.ucsd.edu/data/amazon/links.html) 

In [2]:
spark = (pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate())

In [3]:
movies_df = spark.read.json('data/reviews_Movies_and_TV.json')

In [4]:
books_df = spark.read.json('data/reviews_Books.json')

In [5]:
metadata = spark.read.json('data/metadata.json')

### Scraping Amazon for ASIN starting point
To help narrow my results, I started by scraping the top 100 DC & Marvel Graphic Novels for their ASIN, so that I could start with an understanding of where to find the reviews I want to target. 

In [10]:
#DC Comics - top ID's
dc_url_1 = "https://www.amazon.com/gp/bestsellers/books/193766/ref=pd_zg_hrsr_books"
dc_url_2 = "https://www.amazon.com/Best-Sellers-Books-DC-Comics-Graphic-Novels/zgbs/books/193766/ref=zg_bs_pg_2?_encoding=UTF8&pg=2"
dc_ids_1 = get_amazon_list_ids(dc_url_1)
dc_ids_2 = get_amazon_list_ids(dc_url_2)

In [9]:
# Marvel Comics - top ID's
marvel_url_1 = "https://www.amazon.com/gp/bestsellers/books/4400/ref=pd_zg_hrsr_books"
marvel_url_2 = "https://www.amazon.com/Best-Sellers-Books-Marvel-Comics-Graphic-Novels/zgbs/books/4400/ref=zg_bs_pg_2?_encoding=UTF8&pg=2"
marvel_ids_1 = get_amazon_list_ids(marvel_url_1)
marvel_ids_2 = get_amazon_list_ids(marvel_url_2)

In [11]:
all_ids = dc_ids_1 + dc_ids_2 + marvel_ids_1 + marvel_ids_2

Saving these id's temporarily to a csv, to preserve and make for easier refreshing of this notebook without re-scraping.

In [None]:
with open('data/top_dc_marvel_comics.csv', 'w') as f:
    wr = csv.writer(f)
    wr.writerow(all_ids)

# Getting the right reviews
My thought process for getting down to the reviews of those who have scored both comic books and movies is as follows:
* Find comic book ASIN's in the metadata dataset through exploration and looking at related items. Call it "all_comic_ids".
* Find any reviews for "all_comics_ids" in the book reviews dataset. This subset is "comic_reviews"
* Take reviewer ids from "comic_reviews" and find them in movie/tv reviews dataset. This subset is "movie_reviews"
* Take the reviewer ids from "movie_reviews" and filter out any missing from "comic_reviews", calling this subset "comic_reviews_both"

In [8]:
with open("data/top_dc_marvel_comics.csv") as f:
    reader = csv.reader(f)
    all_ids = []
    for row in reader:
        all_ids.append(row)

### 1. Finding ASINs
After reading back in list of ASINs, finding them in metadata and reviewing the data. I am loading it out to a Pandas dataframe for now as it will most likely be smaller, so faster to work with in this way:

In [13]:
top_comic_meta = metadata.filter(col("asin").isin(all_ids[0])).toPandas()

In [14]:
top_comic_meta.shape

(16, 10)

In [15]:
top_comic_meta

Unnamed: 0,_corrupt_record,asin,brand,categories,description,imUrl,price,related,salesRank,title
0,,014038572X,,[[Books]],"According to Ponyboy, there are two kinds of p...",http://ecx.images-amazon.com/images/I/61Xc9GWp...,4.99,"([0078205409, 0440932211, 0547534264, 01403896...","(None, None, None, None, None, 3106, None, Non...",The Outsiders
1,,0785117210,,[[Books]],,http://ecx.images-amazon.com/images/I/51upWJjH...,10.49,"([0785123202, 0785157050, 078512179X, 07851329...","(None, None, None, None, None, 19065, None, No...",House of M
2,,0785121056,,[[Books]],,http://ecx.images-amazon.com/images/I/61iW2SXT...,11.49,"([0785131272, 0785131280, 0785166432, 07851565...","(None, None, None, None, None, 95214, None, No...",Infinity War
3,,140122427X,,[[Books]],,http://ecx.images-amazon.com/images/I/81g0BQDw...,20.0,"([1401228798, 1401230970, 1401233902, 14012349...","(None, None, None, None, None, 793, None, None...",Fables: The Deluxe Edition Book One
4,,1401229697,,[[Books]],"Starred Review. A stunning, moving story about...",http://ecx.images-amazon.com/images/I/71%2BCEZ...,11.01,"([1935429000, 160309038X, 1603090746, 17704604...","(None, None, None, None, None, 6414, None, Non...",Daytripper
5,,1401230067,,[[Books]],,http://ecx.images-amazon.com/images/I/81IQpmRp...,11.49,"([1401233023, 1401235190, 1401236901, 14012380...","(None, None, None, None, None, 11659, None, No...","John Constantine, Hellblazer, Vol. 1: Original..."
6,,1401233791,,[[Books]],Dennis O'Neil is the influential writer of com...,http://ecx.images-amazon.com/images/I/81hLFMDQ...,16.09,"([1401235360, 1401237215, 1401232744, 14012338...","(None, None, None, None, None, 9262, None, Non...","Batman: Knightfall, Vol. 1"
7,,1401233384,,[[Books]],"Geoff Johns, a Detroit native, brings a Hollyw...",http://ecx.images-amazon.com/images/I/81LZ1mW0...,10.0,"([1401234054, 1401230016, 1401234488, 14012343...","(None, None, None, None, None, 21061, None, No...",Flashpoint
8,,1401237789,,[[Books]],Praise for Scott Snyder'sBatman: Court of Owls...,http://ecx.images-amazon.com/images/I/91DDC3%2...,9.36,"([1401235425, 1401246028, 1401242529, 14012402...","(None, None, None, None, None, 1361, None, Non...",Batman Vol. 2: The City of Owls (The New 52)
9,,1401242383,,[[Books]],Warren Ellis is a prolific writer whose works ...,http://ecx.images-amazon.com/images/I/81adYxKE...,47.73,"([1401238998, 1401242758, 1616552379, 07851656...","(None, None, None, None, None, 18519, None, No...",The Planetary Omnibus


In review, see that some of the ASINs that were scraped aren't aligning to relevant material. This could generally be from those numbers being adjusted at some systematically.

In [16]:
top_meta = top_comic_meta.drop([0, 12, 13, 14, 15])

To help get more information, I used the "Related" comics feature to grab more ASINs and will use those to help determine if I can find more.

In [25]:
top_meta['related'].head()

1    ([0785123202, 0785157050, 078512179X, 07851329...
2    ([0785131272, 0785131280, 0785166432, 07851565...
3    ([1401228798, 1401230970, 1401233902, 14012349...
4    ([1935429000, 160309038X, 1603090746, 17704604...
5    ([1401233023, 1401235190, 1401236901, 14012380...
Name: related, dtype: object

Putting the 'related' ASINs in a list, then creating a new dataframe for them.

In [18]:
more_comic_ids = list(set([val for meta in top_meta.related.tolist() for val in meta[0]]))

In [24]:
more_comic_meta = metadata.filter(col("asin").isin(more_comic_ids)).toPandas()

In [26]:
more_comic_meta['related'].head()

0    ([0375714545, 0618871713, 1570614598, 18918304...
1    ([0316107093, 0316107107, 0316107298, 03161073...
2    ([1606904388, 0345506391, 1606901605, 16069021...
3    ([1932664165, 1620101130, 1620100045, 14424659...
4    ([1560974273, 0375404538, 0375714545, 03073773...
Name: related, dtype: object

Repeating the process one more time to get more samples and better determine some shortcuts.

In [28]:
more_related = list(set([val for meta in top_meta.related.tolist() for val in meta[0] if meta[0] is not None]))

In [29]:
more_meta = metadata.filter(col("asin").isin(more_related)).toPandas()

In [30]:
more_meta['related'].head()

0    ([0375714545, 0618871713, 1570614598, 18918304...
1    ([0316107093, 0316107107, 0316107298, 03161073...
2    ([1606904388, 0345506391, 1606901605, 16069021...
3    ([1932664165, 1620101130, 1620100045, 14424659...
4    ([1560974273, 0375404538, 0375714545, 03073773...
Name: related, dtype: object

Based on reviewing what I was able to gather so far, I found a few patterns to help narrow down a subsection that contains comic books:
* A large section of comic books from the publisher Marvel Comics begin with '07851'
* A large section of comic books from the publisher DC Comics begin with '14012'
* A large section of comic books from the publisher Image Comics begin with '160706'

As these are three of the largest publishers, I am going to use an SQL query to find those in these ranges, review the data, and if it looks to fit what I expect, proceed with combining them with my previous findings into one set. 

In [41]:
image_query = """
SELECT 
    asin,
    title,
    categories
FROM metadata
    WHERE asin LIKE '160706%'
"""

In [42]:
metadata.createOrReplaceTempView('metadata')

In [43]:
image_meta = spark.sql(image_query).toPandas()

In [44]:
image_meta.head()

Unnamed: 0,asin,title,categories
0,1607060051,Screamland,[[Books]]
1,1607060191,Ted McKeever Library Book 3: Metropol (Bk. 3),[[Books]]
2,1607060086,Bruce: The Little Blue Spruce,[[Books]]
3,1607060043,"Outlaw Territory, Vol. 1",[[Books]]
4,160706023X,Zombie Cop,[[Books]]


In [38]:
marvel_query = """
SELECT 
    asin,
    title,
    categories
FROM 
    metadata
WHERE asin LIKE '07851%'
"""

In [39]:
metadata.createOrReplaceTempView('metadata')

In [40]:
marvel_meta = spark.sql(marvel_query).toPandas()

In [49]:
marvel_meta.head()

Unnamed: 0,asin,title,categories
0,0785100016,Wolverine: Killing,[[Books]]
1,0785100555,"The Punisher, a man named Frank",[[Books]]
2,078510027X,Spider-Man: Round Robin : The Sidekick's Revenge,[[Books]]
3,0785100245,Daredevil - Fall from Grace,[[Books]]
4,0785100717,The Best of Marvel 1994,[[Books]]


In [45]:
dc_query = """
SELECT 
    asin,
    title,
    categories
    FROM metadata
    WHERE asin LIKE '14012%'
"""

In [46]:
metadata.createOrReplaceTempView('metadata')

In [47]:
dc_meta = spark.sql(dc_query).toPandas()

In [48]:
dc_meta.head()

Unnamed: 0,asin,title,categories
0,1401200168,Green Lantern: Emerald Dawn II,[[Books]]
1,1401200141,"Atom, The - Archives, Volume 2 (DC Archive Edi...",[[Books]]
2,1401200338,High Roads (Cliffhanger!),[[Books]]
3,1401200346,Batman/Deathblow: After the Fire (Batman Beyon...,[[Books]]
4,1401200370,Batman: Absolution,[[Books]]


Awesome! All of those looked good in further exploration of each. With that, I combined all of the previous results with these new ones to have my working set of comic book ASINs.

In [50]:
top_merge = top_comic_meta[['asin', 'title', 'categories']]
m1_merge = more_comic_meta[['asin', 'title', 'categories']]
m2_merge = more_meta[['asin', 'title', 'categories']]

In [79]:
all_comic_df = pd.concat([top_merge, m1_merge, m2_merge, image_meta, marvel_meta, dc_meta])
all_comic_df['categories'] = all_comic_df['categories'].astype(str)
all_comic_df.head()

Unnamed: 0,asin,title,categories
0,014038572X,The Outsiders,[['Books']]
1,0785117210,House of M,[['Books']]
2,0785121056,Infinity War,[['Books']]
3,140122427X,Fables: The Deluxe Edition Book One,[['Books']]
4,1401229697,Daytripper,[['Books']]


Next, I did some additional cleaning of my comic books ASINs:
* Dropped any duplicate rows
* Found anything that was not categorized as a Book and removed it, as well as a couple more edge cases that I wanted to drop. 
* Exported this list to a csv to preserve my efforts and not have to extract from the metadata dataset any further.

In [80]:
all_comics_df = all_comic_df.drop_duplicates()

In [81]:
all_comics_df['categories'] = all_comics_df['categories'].apply(lambda x: re.sub("\[|\]|\'", "", x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [87]:
all_comics_df.head()

Unnamed: 0,asin,title,categories
1,0785117210,House of M,Books
2,0785121056,Infinity War,Books
3,140122427X,Fables: The Deluxe Edition Book One,Books
4,1401229697,Daytripper,Books
5,1401230067,"John Constantine, Hellblazer, Vol. 1: Original...",Books


In [83]:
all_comics_df[all_comics_df['categories'] != 'Books']

Unnamed: 0,asin,title,categories
15,B00CU06XFY,,"Books, Comics & Graphic Novels, Graphic Novels..."
380,B00CENWUYK,,"Movies & TV, Movies"
369,1401209017,,
777,1401216420,,
1092,1401220428,Vertigo Tarot Deck Set 20th Anniversary Edition,Toys & Games


In [85]:
all_comics_df = all_comics_df.drop([0,12,13,14,15,380,369,777,1092], axis=0)

In [86]:
all_comics_df.to_csv('data/all_comic_asin.csv')

### 2. Find comic book reviews in Books

In [88]:
comic_reviews = books_df.filter(col('asin').isin(all_comics_df.asin.tolist()))

In [89]:
print(f"Number of comic book reviews: {comic_reviews.count()}")

Number of comic book reviews: 57543


In [104]:
comic_reviews.show(1)

+----------+-------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|0316107255| [1, 1]|    4.0|PENGUIN DREAMS AN...|02 19, 2014|A3NQU1649SH0Q4|Allen Smalling "E...|Okay, but no coll...|    1392768000|
+----------+-------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
only showing top 1 row



In [106]:
comic_reviews_export = comic_reviews.select(['asin', 'overall', 'reviewText', 'reviewerID'])

In [139]:
# Exporting data to make easier to work with
comic_reviews_export.repartition(1).write.json('data/all_comic_reviews.json')

In [144]:
comic_reviews_df = spark.read.json("data/all_comic_reviews.json")

In [145]:
comic_reviews_df.show(1)

+----------+-------+--------------------+--------------+
|      asin|overall|          reviewText|    reviewerID|
+----------+-------+--------------------+--------------+
|0316107255|    4.0|PENGUIN DREAMS AN...|A3NQU1649SH0Q4|
+----------+-------+--------------------+--------------+
only showing top 1 row



### 3. Find movie/tv reviews by comic book reviewers

In [114]:
good_users = list(set(comic_reviews_df.select('reviewerID').collect()))

In [116]:
# Ids are still stored in Spark Rows, so this removes them into a unique list
good_user_ids = [g[0] for g in good_users]

In [117]:
mtv_reviews = movies_df.filter(col('reviewerID').isin(good_user_ids))

In [118]:
print(f"Number of unique movie/tv reviews for comic book reviewers:{mtv_reviews.count()}")

Number of unique movie/tv reviews for comic book reviewers:149189


In [119]:
movie_and_comic_reviewers = list(set(mtv_reviews.select('reviewerID').collect()))

In [120]:
print(f"Number of users who have rated both comics and movies/tv: {len(movie_and_comic_reviewers)}")

Number of users who have rated both comics and movies/tv: 10366


In [121]:
mtv_reviews_export = mtv_reviews.select(['asin', 'overall', 'reviewText', 'reviewerID'])

Saving the metadata on asociated movies/tv for later model interpretation:

In [163]:
mtv_asins = list(set(mtv_reviews.select('asin').collect()))
mtv_asins = [a[0] for a in mtv_asins]
all_mtv_asins = metadata.filter(col('asin').isin(mtv_asins))

In [165]:
all_mtv_asins.repartition(1).write.json('all_mtv_asin')

In [147]:
# Exporting data to make easier to work with
mtv_reviews_export.repartition(5).write.json('data/all_movietv_jsons')

In [148]:
mtv_reviews_df = spark.read.json('data/all_movietv_jsons/*.json')

In [149]:
mtv_reviews_df.show(1)

+----------+-------+--------------------+--------------+
|      asin|overall|          reviewText|    reviewerID|
+----------+-------+--------------------+--------------+
|0767823796|    4.0|&quot;John Carpen...|A2NJO6YE954DBH|
+----------+-------+--------------------+--------------+
only showing top 1 row



### 4. Filtering comic reviews by those who rated both comics & movies/tv

In [131]:
# Unpacking reviews from previous reviewers list
movie_and_comic_reviewers = [reviewer[0] for reviewer in movie_and_comic_reviewers]

In [135]:
# Quick visual check on these ids
movie_and_comic_reviewers[:5]

['A1R7Q24DI4OJE0',
 'A2KGYA96OKM5YH',
 'A1RK6OE88XA0TF',
 'A14VECUBFP6K7D',
 'A1ZYOFUCY9KQX7']

In [155]:
mtv_and_comic_reviews = comic_reviews_df.filter(col('reviewerID').isin(movie_and_comic_reviewers))

Comic reviews from reviewers who also reviewed movies/tv

In [156]:
mtv_and_comic_reviews.count()

30628

In [157]:
mtv_and_comic_reviews.show(1)

+----------+-------+--------------------+--------------+
|      asin|overall|          reviewText|    reviewerID|
+----------+-------+--------------------+--------------+
|0316107255|    4.0|PENGUIN DREAMS AN...|A3NQU1649SH0Q4|
+----------+-------+--------------------+--------------+
only showing top 1 row



In [159]:
# Exporting this new set of comic reviews to JSON
mtv_and_comic_reviews.repartition(1).write.json("data/comic_and_movie")

Looks like a good start! From here, I moved to the my next notebook to merge, clean, and model my data further. 

# Formal content cutoff here, move rest of materials to second notebook

In [253]:
testing = metadata_split.limit(5).toPandas()

In [174]:
metadata.dtypes

[('_corrupt_record', 'string'),
 ('asin', 'string'),
 ('brand', 'string'),
 ('categories', 'array<array<string>>'),
 ('description', 'string'),
 ('imUrl', 'string'),
 ('price', 'double'),
 ('related',
  'struct<also_bought:array<string>,also_viewed:array<string>,bought_together:array<string>,buy_after_viewing:array<string>>'),
 ('salesRank',
  'struct<Appliances:bigint,Arts, Crafts & Sewing:bigint,Automotive:bigint,Baby:bigint,Beauty:bigint,Books:bigint,Camera &amp; Photo:bigint,Cell Phones & Accessories:bigint,Clothing:bigint,Computers & Accessories:bigint,Electronics:bigint,Gift Cards Store:bigint,Grocery & Gourmet Food:bigint,Health & Personal Care:bigint,Home &amp; Kitchen:bigint,Home Improvement:bigint,Industrial & Scientific:bigint,Jewelry:bigint,Kitchen & Dining:bigint,Magazines:bigint,Movies & TV:bigint,Music:bigint,Musical Instruments:bigint,Office Products:bigint,Patio, Lawn & Garden:bigint,Pet Supplies:bigint,Prime Pantry:bigint,Shoes:bigint,Software:bigint,Sports &amp; Outd

In [36]:
comic_readers = list(set(dc_reviews['reviewerID'].tolist()))

In [38]:
movie_reviews = movies_df.filter(col("reviewerID").isin(comic_readers)).toPandas()

In [39]:
movie_reviews.shape

(31343, 9)

In [50]:
movie_reviews.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,5119367,"[0, 0]",5.0,I've watched this movie every six months or so...,"06 11, 2012",AE62TIY3899YL,"Daniel J. Silva ""Superman_242""",Best of the Bible Collection AND best telling ...,1339372800
1,307141950,"[4, 4]",5.0,I suppose giving this video a 5-star is rather...,"01 24, 2003",A2XVRCU5DQBULH,"L. Varnau ""nerff20""",From the voice of Little Critter...,1043366400
2,307142493,"[3, 3]",5.0,It is rather ironic to think that the elfin Mi...,"12 3, 2003",A2NJO6YE954DBH,Lawrance M. Bernabo,The charming story of how Kris Kringle became ...,1070409600
3,307514161,"[1, 1]",5.0,"In just telling the story in the song ""Rudolph...","12 25, 2005",A2NJO6YE954DBH,Lawrance M. Bernabo,The deservedly beloved 1964 Bass-Rakin televis...,1135468800
4,307514161,"[0, 0]",5.0,I bought a bunch of these old holiday classics...,"12 19, 2010",A3PZ88WU7RPTP6,TeamQuinnHQ,Mine!,1292716800


In [51]:
movie_reviews.drop(['helpful','reviewText','reviewTime','reviewerName','summary','unixReviewTime'], axis=1, inplace=True)

In [55]:
dc_reviews.head()

Unnamed: 0,asin,overall,reviewerID
0,014038572X,5.0,A2INGRA4M7D0QH
1,014038572X,5.0,AC1MHHZVWJE1P
2,014038572X,5.0,A38Z66H58RIJ64
3,014038572X,1.0,A77UMI1QYXJF9
4,014038572X,5.0,A3BVJTBO17JTZE


In [93]:
def new_id_column(df, column, suffix_val, new_name):
    """Take in column with unique indexes, return new index values"""
    unique_vals = list(set(df[column].tolist()))
    new_ids = [int(str(i) + suffix_val) for i in range(1,len(unique_vals)+1)]
    new_id_dict = {k:v for k,v in zip(unique_vals, new_ids)}
    df[new_name] = df[column].apply(lambda x: new_id_dict[x])

In [77]:
asins = list(set(dc_reviews['asin'].tolist()))
len(asins)

14

In [78]:
dc_id = [int(str(i) + '00') for i in range(1,len(asins)+1)]
len(dc_id)

14

In [79]:
new_id_dict = {k:v for k,v in zip(asins, dc_id)}
dc_reviews['amazon_id'] = dc_reviews['asin'].apply(lambda x: new_id_dict[x])
dc_reviews.head()

Unnamed: 0,asin,overall,reviewerID,amazon_id
0,014038572X,5.0,A2INGRA4M7D0QH,1300
1,014038572X,5.0,AC1MHHZVWJE1P,1300
2,014038572X,5.0,A38Z66H58RIJ64,1300
3,014038572X,1.0,A77UMI1QYXJF9,1300
4,014038572X,5.0,A3BVJTBO17JTZE,1300


In [80]:
movie_asins = list(set(movie_reviews['asin'].tolist()))
len(movie_asins)

15964

In [81]:
m_id = [int(str(i) + '99') for i in range(1,len(movie_asins)+1)]
len(m_id)

15964

In [83]:
m_id_dict = {k:v for k,v in zip(movie_asins, m_id)}
movie_reviews['amazon_id'] = movie_reviews['asin'].apply(lambda x: m_id_dict[x])
movie_reviews.head()

Unnamed: 0,asin,overall,reviewerID,amazon_id
0,5119367,5.0,AE62TIY3899YL,1065299
1,307141950,5.0,A2XVRCU5DQBULH,1174199
2,307142493,5.0,A2NJO6YE954DBH,621099
3,307514161,5.0,A2NJO6YE954DBH,839799
4,307514161,5.0,A3PZ88WU7RPTP6,839799


In [84]:
all_reviews = pd.concat([dc_reviews, movie_reviews], axis=0)

In [87]:
all_reviews.drop('asin', axis=1, inplace=True)

In [88]:
all_reviews.head()

Unnamed: 0,overall,reviewerID,amazon_id
0,5.0,A2INGRA4M7D0QH,1300
1,5.0,AC1MHHZVWJE1P,1300
2,5.0,A38Z66H58RIJ64,1300
3,1.0,A77UMI1QYXJF9,1300
4,5.0,A3BVJTBO17JTZE,1300


In [94]:
new_id_column(all_reviews, 'reviewerID', suffix_val='88', new_name='amazon_user_id')

In [96]:
all_reviews.drop('reviewerID', axis=1, inplace=True)

In [169]:
all_reviews.head()

Unnamed: 0,overall,amazon_id,amazon_user_id
0,5.0,1300,73188
1,5.0,1300,57288
2,5.0,1300,53288
3,1.0,1300,22088
4,5.0,1300,33688


In [107]:
all_reviews_spark = spark.createDataFrame(all_reviews)

In [108]:
all_reviews_spark.persist()

DataFrame[overall: double, amazon_id: bigint, amazon_user_id: bigint]

In [158]:
(train, test) = all_reviews_spark.randomSplit([.8,.2])

In [190]:
# Build the recommendation model using ALS
als = ALS(userCol='amazon_user_id', itemCol='amazon_id', ratingCol='overall', nonnegative=True)

als_model = als.fit(train)

In [191]:
pred = als_model.transform(test)

In [192]:
pred_df = pred.toPandas()

In [193]:
pred_df = pred_df.dropna(axis=0)

In [194]:
pred = spark.createDataFrame(pred_df)

In [195]:
al = als_model.extractParamMap

In [196]:
print(al)

<bound method Params.extractParamMap of ALS_c17a9a3dcb6b>


In [197]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="overall",
                                predictionCol="prediction")

rmse = evaluator.evaluate(pred)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.3328492850220897


In [None]:
& (pred_df['amazon_id'].astype(str).str.endswith('00'))

In [231]:
pred_df.loc[pred_df['amazon_user_id'] == 94688]

Unnamed: 0,overall,amazon_id,amazon_user_id,prediction
529,5.0,300,94688,2.584244


In [122]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


als_model =  ALS(userCol="amazon_user_id", itemCol="amazon_id", ratingCol="overall")

                 
params = ParamGridBuilder().addGrid(als_model.regParam, [0.01,0.001,0.1]).addGrid(als_model.rank, [4,10,50]).build()


## instantiating crossvalidator estimator
cv = CrossValidator(estimator=als_model, estimatorParamMaps=params,evaluator=evaluator,parallelism=4)
best_model = cv.fit(train)    

# We see the best model has a rank of 50, so we will use that in our future models with this dataset
best_model.bestModel.rank

50

In [None]:
query = """
SELECT 
    asin,
    title
FROM metadta
"""

In [None]:
metadta.createOrReplaceTempView('metadta')

* non-negative = TRUE
* search latent features
* See the cluster
* Clusters that containt both 
* Combine clusters 
* Reduce ranks for increased 