In [2]:
import pyspark
import pyspark.sql.functions as F

from batcave import get_item_image

In [3]:
# Build a Spark session against the "local" master (or pick up the one that
# already exists in this process).
spark = pyspark.sql.SparkSession.builder.master("local").getOrCreate()

In [4]:
# Read the review records from JSON into a Spark DataFrame; later cells query
# it through Spark SQL.
all_reviews = spark.read.json('data/all_reviews_fixed_titles.json')

### Getting thumbnail images for web interface
Lucky for me, the Amazon review data I have been working with has links to the thumbnail images for each product. I wrote a function using `urllib` to download those images locally for later use. I downloaded the thumbnails for the top 1,000 most frequently rated movies (these are the initial pool I will be drawing from) and the covers for all of the comic books, so the recommender has the full range of options to pull from.

In [5]:
# Spark SQL text: distinct (asin, title, imUrl, count) rows for items whose
# item_id ends in "44", highest `count` first, capped at 1000 rows.
# NOTE(review): presumably the "44" suffix tags movie items and `count` is the
# per-item review count -- confirm against the step that built item_id.
top_movie_url_query = """
SELECT
    DISTINCT asin
,   title
,   imUrl
,   count
FROM
    all_reviews
WHERE 
    item_id LIKE "%44"
ORDER BY 
    count DESC
LIMIT 1000
"""

In [6]:
# Register the DataFrame under the table name the SQL text refers to, then run
# the movie query and collect the result into a pandas DataFrame.
all_reviews.createOrReplaceTempView('all_reviews')
movie_image_urls = (
    spark.sql(top_movie_url_query)
    .toPandas()
)

In [7]:
# Split the movies by whether a thumbnail URL is present: rows with a null
# imUrl cannot be downloaded automatically, the rest can.
movie_missing_urls = movie_image_urls[movie_image_urls['imUrl'].isna()]
# .notna() is the direct inverse of .isna(); avoids comparing a mask to False.
movie_active_urls = movie_image_urls[movie_image_urls['imUrl'].notna()]

In [8]:
# Preview the first rows of the movie query result (ordered by count, descending).
movie_image_urls.head()

Unnamed: 0,asin,title,imUrl,count
0,B001KVZ6HK,Marvel's: The Avengers,http://ecx.images-amazon.com/images/I/617wvf8S...,170
1,B008JFUPFI,Man of Steel,http://ecx.images-amazon.com/images/I/51vg6U6f...,163
2,B00005JPY0,The Dark Knight [Theatrical Release],http://ecx.images-amazon.com/images/I/51pGxSkk...,141
3,B009934S5M,Star Trek Into Darkness,http://ecx.images-amazon.com/images/I/51K7eJ6I...,140
4,B0001VL0K2,The Lord of the Rings: The Motion Picture Tril...,http://ecx.images-amazon.com/images/I/51z4rwl-...,133


In [None]:
# Fetch the movie thumbnails into images/movies/ using the project's
# get_item_image helper (imported from batcave).
# The movie frame is passed here: the comic frame is not defined until a later
# cell and its covers belong in images/comics/.
# NOTE(review): presumably the helper reads the `asin` and `imUrl` columns --
# confirm against batcave.get_item_image.
get_item_image(movie_active_urls, 'images/movies/')

In [4]:
# Spark SQL text: distinct (asin, imUrl, count) rows for items whose item_id
# ends in "22", highest `count` first, with no row limit.
# NOTE(review): presumably the "22" suffix tags comic items -- confirm against
# the step that built item_id.
# Plain string rather than an f-string: there is nothing to interpolate, and a
# literal brace in the SQL would otherwise be treated as a placeholder.
top_comic_url_query = """
SELECT
    DISTINCT asin
,   imUrl
,   count
FROM
    all_reviews
WHERE 
    item_id LIKE "%22"
ORDER BY 
    count DESC
"""

In [5]:
# Register the DataFrame under the table name the SQL text refers to, then run
# the comic query and collect the result into a pandas DataFrame.
all_reviews.createOrReplaceTempView('all_reviews')
comic_image_urls = (
    spark.sql(top_comic_url_query)
    .toPandas()
)

In [6]:
# Split the comics by whether a cover URL is present: rows with a null imUrl
# cannot be downloaded automatically, the rest can.
comic_missing_urls = comic_image_urls[comic_image_urls['imUrl'].isna()]
# .notna() is the direct inverse of .isna(); avoids comparing a mask to False.
comic_active_urls = comic_image_urls[comic_image_urls['imUrl'].notna()]

In [7]:
# Preview the first rows of the comics that have a cover URL.
comic_active_urls.head()

Unnamed: 0,asin,imUrl,count
0,1607060760,http://ecx.images-amazon.com/images/I/51m-0BhI...,314
1,1607065967,http://ecx.images-amazon.com/images/I/51CnU-l0...,193
2,1401220347,http://ecx.images-amazon.com/images/I/81tnMIDL...,137
3,1401235417,http://ecx.images-amazon.com/images/I/81%2B3jj...,127
4,1607066017,http://ecx.images-amazon.com/images/I/515jeiad...,120


In [8]:
# Fetch the comic covers into images/comics/ using the project's
# get_item_image helper (imported from batcave).
# NOTE(review): presumably the helper reads the `asin` and `imUrl` columns --
# confirm against batcave.get_item_image.
get_item_image(comic_active_urls, 'images/comics/')

Only two comics had no image URL in the data (listed below), so I searched for them by ASIN on Amazon and downloaded their covers manually into the same directory:

In [10]:
# Show the comics whose imUrl is null; their covers have to be obtained by hand.
comic_missing_urls

Unnamed: 0,asin,imUrl,count
245,785142592,,18
1014,785148744,,7


### Selecting movies to use
My goal for the web app interface is to offer a diverse selection of movies for users to choose from when gathering a recommendation. To do this, I need to manually review this set of most-reviewed movies and pick a subset that gives a good mix of genres. I want to limit the number of movies that come from comic source material, so as to avoid the really obvious connections (e.g. you love Iron Man and get a recommendation for Iron Man, which is far too obvious a leap).

In [10]:
# Page through rows 100-149 of the count-ordered movie list while hand-picking titles.
movie_image_urls[100:150]

Unnamed: 0,asin,title,imUrl,count
100,B0064NTZJO,Drive,http://ecx.images-amazon.com/images/I/51eawS9K...,49
101,B00005JMXX,The Day After Tomorrow,http://ecx.images-amazon.com/images/I/51JSE1F1...,49
102,B00005JMUA,"Kill Bill, Vol. 2",http://ecx.images-amazon.com/images/I/51PEQ7JV...,49
103,B00BBAQD6S,DCU: Superman: Unbound,http://ecx.images-amazon.com/images/I/51lii-Z-...,49
104,0780622545,Dark City,http://ecx.images-amazon.com/images/I/317GJBPN...,49
105,0792833171,The Wizard of Oz,http://ecx.images-amazon.com/images/I/51GYW80G...,49
106,B00111YM5Q,30 Days of Night,http://ecx.images-amazon.com/images/I/511KbazW...,49
107,0783216084,Jaws,http://ecx.images-amazon.com/images/I/514JQY1K...,49
108,0783241038,Galaxy Quest,http://ecx.images-amazon.com/images/I/51K4TAW5...,48
109,B00005JLRT,Star Trek - Nemesis,http://ecx.images-amazon.com/images/I/519EJJQ3...,48


* Remove any titles I don't want to feature
* Do further name cleaning
* Figure out way to get featured images for both
* Hulk vs?, Iron Man: The Art of Iron Man 2
* Bad name features still in place: (DVD), (Widescreen Edition) - anything in parentheses! [Blu-ray]

In [29]:
# Look up the thumbnail URL of the movie at index label 383 in a single .loc call.
movie_image_urls.loc[383, 'imUrl']

'http://ecx.images-amazon.com/images/I/515ZHBJZdgL._SY300_.jpg'

In [47]:
# Hand-picked index labels into movie_image_urls for the titles to feature.
# Order is preserved as chosen.
# NOTE(review): label 5 appears twice, so the 55 entries cover 54 distinct
# movies -- confirm whether one of them was meant to be a different index.
movies = [
    97, 127, 208, 355, 436, 56, 92, 96, 153, 159,
    243, 230, 268, 354, 356, 445, 579, 58, 72, 149,
    115, 296, 144, 158, 349, 343, 521, 557, 558, 561,
    197, 246, 383, 376, 406, 107, 140, 176, 267, 337,
    335, 427, 460, 488, 509, 551, 5, 48, 27, 22,
    80, 78, 5, 79, 105,
]

In [48]:
# Number of entries in the hand-picked list (repeated labels are counted each time).
len(movies)

55

In [49]:
# Titles of the hand-picked movies, in the order they appear in `movies`.
movie_image_urls.loc[movies, 'title']

97                                            The Matrix
127                                      Children of Men
208                                        Ghostbusters 
355                                    Starship Troopers
436                      The Hunger Games: Catching Fire
56                                No Country for Old Men
92                                            Fight Club
96                                            Black Swan
153                                   The Social Network
159                                         Pulp Fiction
243                              It's a Wonderful Life  
230                                          Casablanca 
268                                   Brokeback Mountain
354                              Silver Linings Playbook
356                                            Enchanted
445                North by Northwest - Special Edition 
579                                  Gone With the Wind 
58                             

15.0