In [102]:
import os
import shutil

import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, IntegerType

from batcave.scrape_and_clean import get_item_image, row_to_html_card

In [103]:
# Create (or reuse) a Spark session that runs on the local machine.
local_session_builder = pyspark.sql.SparkSession.builder.master("local")
spark = local_session_builder.getOrCreate()

In [104]:
# Read the review data (JSON) into a Spark DataFrame; every query below runs against it.
all_reviews = spark.read.json('data/all_reviews_fixed_titles.json')

### Getting thumbnail images for web interface
Lucky for me, the Amazon review data I have been working with has links to the thumbnail images for each product.  I wrote a function using ```urllib``` to download those images locally for later use. I downloaded the images for the 1,000 most frequently rated movies (as these will be the initial pool I draw from) and all of the comic book covers, so the recommender has the full scope of options to pull from.

In [105]:
# Spark SQL text: distinct (asin, title, imUrl, item_id, count) rows from the
# `all_reviews` view whose item_id ends in "44", ordered by `count` descending
# and capped at 1,000 rows.
# NOTE(review): item_id ending in "44" presumably marks movies and `count` is
# presumably the number of reviews per item -- confirm against the data prep.
top_movie_url_query = """
SELECT
    DISTINCT asin
,   title
,   imUrl
,   item_id
,   count
FROM
    all_reviews
WHERE 
    item_id LIKE "%44"
ORDER BY 
    count DESC
LIMIT 1000
"""

In [106]:
# Register the reviews as a SQL view, run the movie query, and pull the
# result into pandas for local inspection.
all_reviews.createOrReplaceTempView('all_reviews')
movie_query_result = spark.sql(top_movie_url_query)
movie_image_urls = movie_query_result.toPandas()

In [6]:
# Split the movie table by whether an image URL is available.
# `notna()` is the direct complement of `isna()`; it replaces the
# non-idiomatic `isna() == False` comparison and selects the same rows.
movie_missing_urls = movie_image_urls[movie_image_urls['imUrl'].isna()]
movie_active_urls = movie_image_urls[movie_image_urls['imUrl'].notna()]

In [9]:
# Export the movie table to movies.csv (the DataFrame index is written as the
# first column by default) so it can be reviewed outside the notebook.
movie_image_urls.to_csv('movies.csv')

In [107]:
# Spark SQL text: distinct (asin, imUrl, count, item_id) rows from the
# `all_reviews` view whose item_id ends in "22", ordered by `count`
# descending. There is no LIMIT, so every matching item is returned.
# NOTE(review): item_id ending in "22" presumably marks comics -- confirm.
# A plain string is used: the query has no placeholders, so an f-string
# prefix would only invite accidental `{}` interpolation later.
top_comic_url_query = """
SELECT
    DISTINCT asin
,   imUrl
,   count
,   item_id
FROM
    all_reviews
WHERE 
    item_id LIKE "%22"
ORDER BY 
    count DESC
"""

In [108]:
# Re-register the view (harmless if it already exists) so this cell can be
# run on its own, then fetch the comic rows into pandas.
all_reviews.createOrReplaceTempView('all_reviews')
comic_query_result = spark.sql(top_comic_url_query)
comic_image_urls = comic_query_result.toPandas()

In [109]:
# Split the comic table by whether an image URL is available.
# `notna()` replaces the non-idiomatic `isna() == False` comparison and
# selects exactly the same rows.
comic_missing_urls = comic_image_urls[comic_image_urls['imUrl'].isna()]
comic_active_urls = comic_image_urls[comic_image_urls['imUrl'].notna()]

In [110]:
# Preview the first rows of the comics that do have an image URL.
comic_active_urls.head()

Unnamed: 0,asin,imUrl,count,item_id
0,1607060760,http://ecx.images-amazon.com/images/I/51m-0BhI...,314,501122
1,1607065967,http://ecx.images-amazon.com/images/I/51CnU-l0...,193,205422
2,1401220347,http://ecx.images-amazon.com/images/I/81tnMIDL...,137,279922
3,1401235417,http://ecx.images-amazon.com/images/I/81%2B3jj...,127,303922
4,1607066017,http://ecx.images-amazon.com/images/I/515jeiad...,120,478622


In [111]:
# Fetch cover images for every comic with an image URL into images/comics/.
# get_item_image comes from batcave.scrape_and_clean; per the notebook text it
# downloads the thumbnails with urllib -- its exact behaviour is not visible here.
# NOTE(review): the saved output shows this run ended in a KeyboardInterrupt,
# so the directory may be incomplete.
get_item_image(comic_active_urls, 'images/comics/')

KeyboardInterrupt: 

Since I just needed to manually find covers for these last two, I searched for them by ASIN on Amazon and downloaded to the same directory:

In [33]:
# Comics with no image URL in the data; their covers were sourced by hand.
comic_missing_urls

Unnamed: 0,asin,imUrl,count,item_id
225,785142592,,18,73222
958,785148744,,7,325822


### Selecting movies to use
My goal for the web app interface is to offer a diverse selection of movies for users to choose from, which then drives the recommendation.  To do this, I need to manually review the set of most-reviewed movies and pick a subset that gives a good mix of genres. I want to limit the number of movies based on comic source material so as to avoid the really obvious connections (e.g. you love Iron Man and get a recommendation for Iron Man - far too obvious a leap).

* Remove any titles I don't want to feature
* Do further name cleaning
* Figure out way to get featured images for both
* Hulk vs?, Iron Man: The Art of Iron Man 2
* Bad name features still in place: (DVD), (Widescreen Edition) - anything in parentheses! [Blu-ray]

After review, I chose these 60 movies to make up my application, ranging in genre and age, but all not based on any preceding comic book material:
<br>
<em>Note: these were the indices from my original exploration; the actual results may vary</em>

In [112]:
# Hand-picked index labels into `movie_image_urls` (60 entries, ten per row).
# The order is kept exactly as chosen, including the two late additions at the end.
movies = [
    5, 23, 35, 40, 51, 58, 74, 80, 82, 94,
    95, 96, 106, 107, 114, 124, 133, 151, 167, 169,
    171, 175, 201, 209, 215, 228, 233, 244, 251, 258,
    275, 287, 295, 297, 320, 322, 328, 334, 359, 360,
    374, 375, 378, 382, 388, 424, 427, 459, 463, 465,
    468, 496, 497, 516, 547, 693, 708, 736, 49, 33,
]

In [113]:
# Sanity check: how many index labels were hand-picked.
len(movies)

60

In [114]:
# The index labels in `movies` come from an earlier load of the data; row order
# can differ between loads, so the rows picked here may not match that session.
hand_picked_rows = movie_image_urls.loc[movies]
select_movies = hand_picked_rows.sort_values('asin')

In [115]:
# Show only the title and item_id of the selected movies.
select_movies[['title', 'item_id']]

Unnamed: 0,title,item_id
516,Fifth Element,1375044
215,Ghostbusters,3007644
328,Dogma,3202944
468,The Shawshank Redemption,601444
106,Jaws,589444
375,Jurassic Park,1422044
424,Psycho,2461844
114,Galaxy Quest,2088944
151,Snow White and the Seven Dwarfs - Platinum Edi...,217244
51,"The Chronicles of Narnia: The Lion, the Witch ...",2905844


In [101]:
# Fetch cover images for the selected movies into images/movies/
# (get_item_image is defined in batcave.scrape_and_clean, not shown here).
get_item_image(select_movies, 'images/movies/')

In [89]:
def move(src, dest):
    """Move a file or directory from ``src`` to ``dest``.

    A leading ``~`` in either path is expanded to the current user's home
    directory before the move. ``shutil.move`` does not perform this
    expansion itself, so a path such as ``~/Documents/...`` would otherwise
    be looked up under a literal ``~`` directory and the move would fail.
    Paths without a leading ``~`` are passed through unchanged.

    Args:
        src: Path of the file or directory to move.
        dest: Target path, or an existing directory to move ``src`` into.

    Returns:
        The final destination path, as returned by ``shutil.move``.

    Raises:
        OSError: Propagated from ``shutil.move``, e.g. ``FileNotFoundError``
            when ``src`` does not exist.
    """
    return shutil.move(os.path.expanduser(src), os.path.expanduser(dest))

In [32]:
# Adjust to match directory you're working in
# "~" is expanded explicitly because shutil.move treats it as a literal
# directory name; without this every move fails with FileNotFoundError.
movies_src_dir = os.path.expanduser('~/Documents/intocomics/images/movies/')
movies_dest_dir = os.path.expanduser('~/Documents/intocomics/images/final_movies/')
# Make sure the target folder exists, otherwise each move fails.
os.makedirs(movies_dest_dir, exist_ok=True)

for _, item in movie_image_urls.loc[movies].iterrows():
    # A row without an image URL (NaN) has no downloaded file to move.
    if not isinstance(item['imUrl'], str):
        print(f"Skipping {item['asin']}: no image URL")
        continue

    # File name is the ASIN plus the last four characters of the URL (e.g. ".jpg").
    # NOTE(review): presumably the naming get_item_image used when saving -- confirm.
    file = f"{item['asin']}{item['imUrl'][-4:]}"
    try:
        move(os.path.join(movies_src_dir, file), os.path.join(movies_dest_dir, file))
    except OSError as err:
        # Stay best-effort (one missing image must not stop the rest), but
        # report the failure instead of silently swallowing it.
        print(f"Could not move {file}: {err}")

### Web design helper function
This will give me 15 nice rows of content for my webpage. In a different setting, it would be best to link a database to the page and have it populate the content. Unfortunately, I don't have the time for that level of web development, but I can write a function to generate the individual card markup I need with Python! The function used below takes one row of the dataframe and returns the HTML for that movie's card; the loop writes all of the cards to a single file:

In [10]:
# Render one HTML card per hand-picked movie and write them, in list order,
# to a single file.
with open('all_movie_cards.html', 'w+') as cards_file:
    cards_file.writelines(
        row_to_html_card(movie_image_urls.loc[label]) for label in movies
    )