# Recommendation scraping

=========================================================================================================

**AUTHOR**: Mengshan Jin

**CREATION DATE**: 08/05/2017

=========================================================================================================


**PROGRAM DESCRIPTION**: Scrapes the "People who liked this also liked..." recommendations (up to 12 IMDb title IDs per movie) from each movie's IMDb page.

**INPUT DATASETS**: 01_Data/Outputs/imdb_with_storyline.csv

**OUTPUT DATASETS**: 01_Data/Outputs/imdb_storyline_recs.csv


=========================================================================================================

**PROGRAM CHANGE HISTORY**

Date|Author|Change|
----|------|------|

In [1]:
import urllib2
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

In [3]:
# Load the input dataset (movie metadata plus storyline text). Its
# 'movie_imdb_link' column is what the cells below parse and scrape.
imdb_with_storyline = pd.read_csv("../Outputs/imdb_with_storyline.csv")

In [4]:
# Preview the first rows of the loaded table.
imdb_with_storyline.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,storyline
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,"When his brother is killed in a robbery, parap..."
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,"After Elizabeth, Will, and Captain Barbossa re..."
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,A cryptic message from the past sends James Bo...
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,Despite his tarnished reputation after the eve...
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,


In [30]:
# Derive the IMDb title ID (e.g. 'tt0499549') from each movie link: the path
# segment that immediately follows 'title/'. The vectorised .str.extract
# yields NaN for a missing link or one without 'title/', instead of raising
# part-way through an element-wise apply.
imdb_with_storyline['movie_imdb_id'] = imdb_with_storyline['movie_imdb_link'].str.extract(r'title/([^/]*)', expand=False)

In [31]:
# Preview again to check the newly added 'movie_imdb_id' column.
imdb_with_storyline.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,storyline,movie_imdb_id
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,"When his brother is killed in a robbery, parap...",tt0499549
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,"After Elizabeth, Will, and Captain Barbossa re...",tt0449088
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,A cryptic message from the past sends James Bo...,tt2379713
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,Despite his tarnished reputation after the eve...,tt1345836
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,,tt5289954


In [39]:
def recommendations_scraper(link):
    """Scrape the recommended-title IDs from one IMDb movie page.

    The page at ``link`` is fetched and parsed, every ``<div class="rec_item">``
    is located, and for each of the first 12 the title ID is taken from the
    ``href`` of its first ``<a>`` (the path segment right after ``title/``).

    Parameters
    ----------
    link : str
        URL of the movie page to fetch.

    Returns
    -------
    list
        Always 12 elements: the title IDs found, in page order, padded with
        ``np.nan`` when the page has fewer than 12 usable items. Collection
        stops at the first item whose ``href`` has no ``title/`` segment.

    Notes
    -----
    Errors from ``urllib2.urlopen`` are not caught here and propagate to the
    caller, as does the ``AttributeError`` raised when a ``rec_item`` has no
    ``<a>`` or no ``href``.
    """
    html = urllib2.urlopen(link)
    try:
        soup = BeautifulSoup(html, "lxml")
    finally:
        # The response is fully consumed by the parser; release the connection.
        html.close()

    recs = soup.findAll("div", {"class": "rec_item"})

    # Use an explicit loop rather than reading a list-comprehension variable
    # from an except block: that variable is only visible there on Python 2.
    rec_list = []
    for rec in recs[:12]:
        try:
            rec_list.append(rec.find('a').get('href').split('title/')[1].split('/')[0])
        except IndexError:
            # href without a 'title/' segment: keep what was collected so far.
            break

    # Pad so every movie yields exactly 12 values (one per output column).
    return rec_list + [np.nan] * (12 - len(rec_list))

In [54]:
# Scrape every movie page (one HTTP request per row, so this cell is slow).
# Each call returns a 12-element list, one value per recommendation column.
rec_columns = ['recommendation_' + str(i + 1) for i in range(12)]
rec_lists = imdb_with_storyline['movie_imdb_link'].apply(recommendations_scraper).tolist()

# Build the temporary frame on the source frame's own index: column assignment
# aligns on index labels, so a default 0..n-1 index would only line up by luck.
rec_frame = pd.DataFrame(rec_lists, index=imdb_with_storyline.index, columns=rec_columns)
imdb_with_storyline[rec_columns] = rec_frame

In [55]:
# Preview the table with the appended recommendation_1 .. recommendation_12 columns.
imdb_with_storyline.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,recommendation_3,recommendation_4,recommendation_5,recommendation_6,recommendation_7,recommendation_8,recommendation_9,recommendation_10,recommendation_11,recommendation_12
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,tt0120338,tt0454876,tt1454468,tt3659388,tt0816711,tt1951264,tt0371746,tt0416449,tt0119654,tt0848228
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,tt0325980,tt0903624,tt1170358,tt2310332,tt0121766,tt1201607,tt0371746,tt0121765,tt1300854,tt0800369
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,tt0381061,tt2381249,tt0113189,tt1229238,tt0143145,tt0246460,tt0117060,tt0120347,tt0317919,tt0097742
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,tt1375666,tt0816692,tt0482571,tt0167261,tt0120737,tt1853728,tt0167260,tt0086190,tt0137523,tt0172495
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,,,,


In [56]:
# Persist the enriched table. index=False keeps the DataFrame index out of the
# file; note the 'Unnamed: 0' column read from the input is an ordinary column
# here and is therefore still written.
imdb_with_storyline.to_csv("../Outputs/imdb_storyline_recs.csv", index=False)