In [167]:
import re
import string
from urllib.parse import quote_plus

import numpy as np
from bs4 import BeautifulSoup
from mechanicalsoup import StatefulBrowser
from nltk import word_tokenize
from nltk.corpus import stopwords

In [4]:
def create_search_link(movie_name):
    """
    Creates link for IMDB search

    Runs of whitespace in the name are collapsed to a single separator and
    the result is percent-encoded, so characters that are special in a query
    string (``&``, ``#``, ``+``, ``?``, non-ASCII letters, ...) cannot
    truncate or corrupt the search term.

    Parameters
    ----------
    movie_name: String
        The name of the movie

    Returns
    -------
    url: String
        The search URL, e.g.
        "http://www.imdb.com/find?q=The+Dark+Knight&s=all"
    """
    # Normalise whitespace first so leading/trailing/repeated blanks do not
    # turn into stray '+' signs, then let quote_plus encode spaces as '+'.
    query = quote_plus(' '.join(movie_name.split()))
    url = "http://www.imdb.com/find?q=" + query + "&s=all"
    return url

In [184]:
# Build the IMDB search URL for the film and print it for inspection.
sl = create_search_link("The Dark Knight")
print(sl)
# Fetch the soup for this search via get_page.
# NOTE(review): get_page is defined in a cell further down the notebook, so
# on a fresh kernel that cell has to be run before this one.
page = get_page(sl)

http://www.imdb.com/find?q=The+Dark+Knight&s=all


In [182]:
def get_page(url):
    """
    Open an IMDB search URL and return the page of its first title hit.

    The search results at ``url`` are loaded, the first link whose href
    matches ``/title/tt...`` is selected, and that title page is fetched.

    Parameters
    ----------
    url: String
        The URL of the IMDB search page (as built by create_search_link).

    Returns
    -------
    soup
        The ``.soup`` attribute of the response for the title page.

    Raises
    ------
    IndexError
        If the search page contains no link to a title page.
    """
    bwsr = StatefulBrowser()
    try:
        bwsr.open(url)
        # Title links look like: /title/tt0468569/?ref_=fn_al_tt_1
        search = re.compile(r'/title/tt.*')
        links = bwsr.links(url_regex=search)
        if not links:
            # Same exception type the bare [0] lookup raised, but with context.
            raise IndexError("no IMDB title link found on search page: " + url)
        page = bwsr.open("http://www.imdb.com" + links[0]['href'])
        return page.soup
    finally:
        # Release the browser's HTTP session even when a request fails.
        bwsr.close()

In [210]:
def parse_summary(page):
    """
    Given the page of an IMDb movie, return its basic info and summary tokens.

    Parameters
    ----------
    page: soup
        The page's soup

    Returns
    -------
    movie: dict
        "title", "year" and "rating" as strings scraped from the page, plus
        "tokens": the list of summary words left after removing punctuation
        and dropping stopwords.
    """
    movie = {}
    # Keep only the heading text before the first non-breaking space.
    movie["title"] = page.find(class_='title_wrapper').find('h1').text.split("\xa0")[0]
    # Drop the first and last character (presumably the surrounding
    # parentheses of "(2008)" -- verify against the page markup).
    movie["year"] = page.find(id="titleYear").text[1:-1]
    movie["rating"] = page.find(class_="ratingValue").find('strong').find('span').text

    summary = page.find(class_="summary_text").text
    # Strip every ASCII punctuation character in one pass.
    summary = summary.translate(str.maketrans("", "", string.punctuation))

    # Build the stopword collection once instead of once per token; a set
    # also makes each membership test constant-time.
    stop_set = set(stopwords.words())
    # NOTE(review): the comparison is case-sensitive, so capitalised
    # stopwords such as "When" are kept -- confirm whether that is intended.
    movie["tokens"] = [tok for tok in word_tokenize(summary) if tok not in stop_set]
    return movie

In [211]:
# Store the parsed record under the film's title.
# NOTE(review): `db` is not created in any cell visible here -- presumably a
# dict defined elsewhere; confirm it exists on a fresh kernel.
db["The Dark Knight"] = parse_summary(page)

In [212]:
# Display the stored record for inspection.
db["The Dark Knight"]

{'rating': '9.0',
 'title': 'The Dark Knight',
 'tokens': ['When',
  'menace',
  'known',
  'Joker',
  'emerges',
  'mysterious',
  'past',
  'wreaks',
  'havoc',
  'chaos',
  'people',
  'Gotham',
  'Dark',
  'Knight',
  'must',
  'accept',
  'one',
  'greatest',
  'psychological',
  'physical',
  'tests',
  'ability',
  'fight',
  'injustice'],
 'year': '2008'}