In [1]:
import os
import shutil

import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, IntegerType

from batcave.scrape_and_clean import get_item_image, row_to_html_card

In [2]:
# Reuse the active Spark session if there is one; otherwise start one with master "local".
spark = pyspark.sql.SparkSession.builder.master("local").getOrCreate()

In [3]:
# Read the review records from JSON into a Spark DataFrame.
all_reviews = spark.read.json('data/all_reviews_fixed_titles.json')

### Getting thumbnail images for web interface
Lucky for me, the Amazon review data I have been working with includes links to the thumbnail image for each product. I wrote a function using `urllib` to download those images locally for later use. I downloaded the thumbnails for the top 1,000 most frequently rated movies (these are the initial pool I will be drawing from) and the covers of all of the comic books, so the recommender has the full scope of options to pull from.

In [4]:
# The 1,000 most-reviewed items whose item_id ends in "44".
# NOTE(review): the "44" suffix appears to mark movie items -- confirm against
# the item_id scheme.
# `asin` is a secondary sort key: many items tie on `count`, and without a
# tie-breaker both the rows kept by LIMIT and their order (and therefore the
# pandas row labels) can change from one load to the next.
top_movie_url_query = """
SELECT
    DISTINCT asin
,   title
,   imUrl
,   item_id
,   count
FROM
    all_reviews
WHERE
    item_id LIKE "%44"
ORDER BY
    count DESC
,   asin
LIMIT 1000
"""

In [5]:
# Register the DataFrame under the view name the query selects from, then
# run the query and bring the result into pandas.
all_reviews.createOrReplaceTempView('all_reviews')
movie_image_urls = spark.sql(top_movie_url_query).toPandas()

In [6]:
# Split the movie rows by whether an image URL is present.
movie_missing_urls = movie_image_urls[movie_image_urls['imUrl'].isna()]
movie_active_urls = movie_image_urls[movie_image_urls['imUrl'].notna()]

In [17]:
# Display the first 150 rows of the movie result.
movie_image_urls.head(150)

Unnamed: 0,asin,title,imUrl,item_id,count
0,B001KVZ6HK,Marvel's: The Avengers,http://ecx.images-amazon.com/images/I/617wvf8S...,5444,170
1,B008JFUPFI,,http://ecx.images-amazon.com/images/I/51vg6U6f...,39044,163
2,B00005JPY0,The Dark Knight [Theatrical Release],http://ecx.images-amazon.com/images/I/51pGxSkk...,2077544,141
3,B009934S5M,Star Trek Into Darkness,http://ecx.images-amazon.com/images/I/51K7eJ6I...,2097544,140
4,B0001VL0K2,The Lord of the Rings: The Motion Picture Tril...,http://ecx.images-amazon.com/images/I/51z4rwl-...,2326944,133
5,B005LAIHXQ,Prometheus,http://ecx.images-amazon.com/images/I/51qVYCJy...,2934544,128
6,B0049P1VHS,The Walking Dead: Season 1,http://ecx.images-amazon.com/images/I/51d7SYfB...,3130944,116
7,B00005JPS8,Iron Man,http://ecx.images-amazon.com/images/I/515wjJQt...,567644,115
8,B004LWZWFQ,,http://ecx.images-amazon.com/images/I/61GJlemx...,449144,109
9,B00003CWT6,The Lord of the Rings: The Fellowship of the R...,http://ecx.images-amazon.com/images/I/51Dj%2BQ...,949544,106


In [None]:
# Fetch the thumbnails for the movie rows that have an image URL.
# This must be the movie frame: the comic frame belongs in images/comics/ and
# is not defined until later in the notebook.
get_item_image(movie_active_urls, 'images/movies/')

In [8]:
# Every item whose item_id ends in "22", most-reviewed first.
# NOTE(review): the "22" suffix appears to mark comic items -- confirm against
# the item_id scheme.
# Plain string on purpose: nothing is interpolated, and an f-string would
# treat any literal braces added to the SQL later as placeholders.
top_comic_url_query = """
SELECT
    DISTINCT asin
,   imUrl
,   count
,   item_id
FROM
    all_reviews
WHERE
    item_id LIKE "%22"
ORDER BY
    count DESC
"""

In [9]:
# Register the DataFrame under the view name the query selects from, then
# run the query and bring the result into pandas.
all_reviews.createOrReplaceTempView('all_reviews')
comic_image_urls = spark.sql(top_comic_url_query).toPandas()

In [10]:
# Split the comic rows by whether an image URL is present.
comic_missing_urls = comic_image_urls[comic_image_urls['imUrl'].isna()]
comic_active_urls = comic_image_urls[comic_image_urls['imUrl'].notna()]

In [11]:
# Preview the comic rows that have an image URL.
comic_active_urls.head()

Unnamed: 0,asin,imUrl,count,item_id
0,1607060760,http://ecx.images-amazon.com/images/I/51m-0BhI...,314,501122
1,1607065967,http://ecx.images-amazon.com/images/I/51CnU-l0...,193,205422
2,1401220347,http://ecx.images-amazon.com/images/I/81tnMIDL...,137,279922
3,1401235417,http://ecx.images-amazon.com/images/I/81%2B3jj...,127,303922
4,1607066017,http://ecx.images-amazon.com/images/I/515jeiad...,120,478622


In [None]:
# Fetch the covers for the comic rows that have an image URL.
get_item_image(comic_active_urls, 'images/comics/')

Since I just needed to manually find covers for these last two, I searched for them by ASIN on Amazon and downloaded to the same directory:

In [12]:
# Comic rows with no image URL.
comic_missing_urls

Unnamed: 0,asin,imUrl,count,item_id
225,785142592,,18,73222
958,785148744,,7,325822


### Selecting movies to use
My goal for the web app interface is to offer a diverse selection of movies for users to choose from when gathering a recommendation. To do this, I need to manually review this set of most-reviewed movies and pick a subset that gives a good mix of genres. I want to limit the number of movies that come from comic source material so as to avoid the really obvious connections (e.g., you love Iron Man and get a recommendation for Iron Man, which is far too obvious a leap).

* Remove any titles I don't want to feature
* Do further name cleaning
* Figure out way to get featured images for both
* Hulk vs?, Iron Man: The Art of Iron Man 2
* Bad name features still in place: (DVD), (Widescreen Edition) - anything in parentheses! [Blu-ray]

After review, I chose these 60 movies to make up my application, ranging in genre and age, but all not based on any preceding comic book material:
<br>
<em>Note: these were the indexes from my original exploration; the actual results may vary</em>

In [13]:
# Row labels (in movie_image_urls) of the hand-picked movies; the order is kept as chosen.
movies = [
    22, 5, 30, 35, 39, 48, 57, 58, 74, 78,
    79, 80, 90, 93, 95, 101, 105, 106, 112, 120,
    127, 140, 158, 161, 162, 165, 190, 194, 203, 207,
    223, 234, 235, 241, 252, 268, 270, 278, 287, 312,
    323, 325, 336, 337, 348, 350, 351, 352, 365, 370,
    392, 400, 415, 417, 429, 431, 445, 448, 463, 476,
]

In [14]:
# Count the selected row labels.
len(movies)

60

In [15]:
# The row labels are not stable between loads, so the rows these labels
# select (shown here sorted by asin) may not be the same on every run.
movie_image_urls.loc[movies].sort_values('asin')

Unnamed: 0,asin,title,imUrl,item_id,count
235,0767834739,Resident Evil: Apocalypse,http://ecx.images-amazon.com/images/I/51TtNGBA...,2570444,36
106,0783216084,Jaws,http://ecx.images-amazon.com/images/I/514JQY1K...,589444,49
223,0783221487,Dune,http://ecx.images-amazon.com/images/I/41N8H8B2...,214744,37
190,0783227434,Army of Darkness,http://ecx.images-amazon.com/images/I/51DQ0H42...,2752144,39
58,0788882988,No Country for Old Men,http://ecx.images-amazon.com/images/I/51pWB3JZ...,2448244,62
252,0790729385,The Exorcist,http://ecx.images-amazon.com/images/I/51KP0KGV...,2088644,34
463,0790743213,North by Northwest - Special Edition,http://ecx.images-amazon.com/images/I/518E4AG3...,642444,26
336,1404983082,Memoirs of a Geisha,http://ecx.images-amazon.com/images/I/51jhontI...,63544,30
203,156219464X,Grave of the Fireflies,http://ecx.images-amazon.com/images/I/41GQW3TK...,933144,38
287,6301972023,Maltese Falcon / Movie,http://ecx.images-amazon.com/images/I/21GB4JKM...,1122644,32


In [24]:
def move(src, dest):
    """Move ``src`` to ``dest`` with :func:`shutil.move`.

    The value returned by ``shutil.move`` is discarded, so this function
    returns ``None``. Any exception raised by ``shutil.move`` propagates.
    """
    shutil.move(src, dest)

In [32]:
# Move the thumbnails of the selected movies from images/movies/ into
# images/final_movies/.
# Adjust image_root to match the directory you're working in.
# '~' is expanded explicitly: shutil.move does no shell-style expansion, so a
# path starting with a literal '~' would never match the downloaded files.
image_root = os.path.expanduser('~/Documents/intocomics/images')
for index, item in movie_image_urls.loc[movies].iterrows():
    try:
        # File name is the ASIN plus the last four characters of the URL
        # (presumably the extension, e.g. ".jpg" -- verify against get_item_image).
        file = item['asin'] + item['imUrl'][-4:]
        src = os.path.join(image_root, 'movies', file)
        dest = os.path.join(image_root, 'final_movies', file)
        move(src, dest)
    except (TypeError, OSError, shutil.Error) as err:
        # Best effort: a row without a URL (TypeError) or a file that cannot be
        # moved is reported and skipped so the remaining files still move.
        print(f"Skipping row {index}: {err!r}")

### Web design helper function
This will give me 15 nice rows of content for my webpage. Ideally, the page would be linked to a database and populated from it. Unfortunately, I don't have the time for that level of web development, but I can write a function to make the individual card images I need with Python! The function used below takes in a row of the dataframe and returns the HTML for its card; the loop writes the cards for all of the selected movies into a single text document:

In [10]:
# Write one HTML card per selected movie, in list order, to a single file.
with open('all_movie_cards.html', 'w+') as cards_file:
    cards_file.writelines(
        row_to_html_card(movie_image_urls.loc[label]) for label in movies
    )