In [1]:
import bs4; print( 'bs4 ' + bs4.__version__)
from bs4 import BeautifulSoup, SoupStrainer

import nltk; print( 'nltk ' + nltk.__version__)
from nltk import word_tokenize, pos_tag, RegexpParser;
from nltk.tokenize import sent_tokenize

import re; print('re ' + re.__version__)
import requests; print('requests ' + requests.__version__)

bs4 4.6.3
nltk 3.3
re 2.2.1
requests 2.19.1


# Homework 5 Write Up

All code can be found below

### 1. Compile a list of static links (permalinks) to individual user movie reviews from one particular
website. This will be your working dataset for this assignment, as well as for assignments 7 and
8, which together will make up your semester project.
 - It does not matter if you use a crawler or if you manually collect the links, but you will
need at least 100 movie review links. Note that, as of this writing, the robots.txt file of
IMDB.com allows the crawling of user reviews.
 - Each link should be to a web page that has only one user review of only one movie, e.g.,
the user review permalinks on the IMDB site.
 - Choose reviews of movies that are all in the same genre, e.g., sci-fi, mystery, romance,
superhero, etc.
 - Make sure your collection includes reviews of several movies in your chosen genre and
that it includes a mix of negative and positive reviews.

>I decided to use various dramas portraied by Tom Hanks. I store the review pages for each of a few of his movies in a variable called `review_home_urls`. Down below I then get all of the reviews and store them in a variable called `all_reviews`. We will have 150 different reviews from these three movies (Green Mile, Forest Gump, and Cast Away, The Terminal, Catch Me If You Can, Road To Perdition)



### 2. Extract noun phrase (NP) chunks from your reviews using the following procedure:
 - In Python, use BeautifulSoup to grab the main review text from each link.
 - Next run each review text through a tokenizer, and then try to NP-chunk it with a
shallow parser.
 - You probably will have too many unknown words, owing to proper names of characters,
actors, and so on that are not in your working dictionary. Make sure the main names
that are relevant to the movies in your collection of reviews are added to the working
lexicon, and then run the NP chunker again.

> I have tokenized and tagged each review and that put them through a shallow parser. The output is pretty good from what I can tell as getting the NPs. I was unable to improve the output.


### 3. Output all the chunks in a single list for each review, and submit that output for this assignment.
Also submit a brief written summary of what you did (describe your selection of genre, your
source of reviews, how many you collected, and by what means).

> I decided to use dramatic films featuring Tom Hanks mostly because of my adoration for the man as an actor. I used IMDB.com. I manually selected the "main pages" for each movie's review and then iterated over each page to gather links to all the reviews on that page. I have found that each "main page" provide exactly `25 reviews`.

# Code

In [2]:
# The home page for various movies' reviews, from which will get the links for individual reviews
review_home_urls = {
    'green_mile': 'https://www.imdb.com/title/tt0120689/reviews?ref_=tt_ql_3',
    'forest_gump': 'https://www.imdb.com/title/tt0109830/reviews?ref_=tt_ov_rt',
    'cast_away': 'https://www.imdb.com/title/tt0162222/reviews?ref_=tt_ov_rt',
    'terminal': 'https://www.imdb.com/title/tt0362227/reviews?ref_=tt_ql_3',
    'catch_me_if_you_can': 'https://www.imdb.com/title/tt0264464/reviews?ref_=tt_ql_3',
    'road_to_perdition': 'https://www.imdb.com/title/tt0257044/reviews?ref_=tt_ql_3',
}

In [3]:
def get_text_from_url(url):
    return requests.get(url).text
text = get_text_from_url(review_home_urls['green_mile'])

In [4]:
def get_all_links_from_html(html):
    tags = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a', href=True))
    urls = [str(tag.attrs['href']) for tag in tags]
    return urls
all_links = get_all_links_from_html(text)

In [5]:
def get_review_urls_from_links(links):
    url_template = 'https://www.imdb.com{}'
    # url_template = 'http://www.gutenberg.org/files/{}/{}-h/{}-h.htm'
    return [url_template.format(link) for link in links]

urls = get_review_urls_from_links(all_links);  

In [6]:
def relevent_link(link):
    if '/review/' in link:
        return True
    return False
def get_relevent_links(links):
    relevent_links = filter(relevent_link, all_links)
    unique_relevent_links = set(relevent_links)
    return list(unique_relevent_links)
relevent_urls = get_relevent_links(urls)
len(relevent_urls)

25

In [7]:
def strain_content(name, attrs):
    if name == 'div' and dict(attrs).get('class', None) == 'content':
        return True
    return False
def clean_review_text(text):
    return re.split('\\n\\n\s+\d+ out of \d+', text)[0]
def get_review_from_url(url):
    html = get_text_from_url(url)
    tags = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer(strain_content))
    review = clean_review_text(tags.text)
    return review

In [8]:
def get_review_from_site(url):
    reviews = []

    reviews_home_text = get_text_from_url(url)
    all_links = get_all_links_from_html(reviews_home_text)
    relevent_links = get_relevent_links(all_links)

    
    
    
    review_urls = get_review_urls_from_links(relevent_links)
    for url in review_urls:
        reviews.append(get_review_from_url(url))
        # break
    return reviews

In [9]:
def get_reviews_from_all_sites():
    all_reviews = []
    review_titles = review_home_urls.keys()
    for title in review_titles:
        review_home_url = review_home_urls[title]
        all_reviews = all_reviews + get_review_from_site(review_home_url)
    return all_reviews

In [10]:
def pos_tagging_for_review(sentence):
    cleaned_review = sentence.lower()
    tokenized_review = word_tokenize(cleaned_review)
    return pos_tag(tokenized_review)
def get_chunking_for_sentence(sentence):
    tagged_sentence = pos_tagging_for_review(sentence)

    grammar = "NP: {<DT>?<JJ>*<NN>}"

    cp = nltk.RegexpParser(grammar)
    result = cp.parse(tagged_sentence)
    return result

def get_chunking_for_review(review):
    sentences = sent_tokenize(review)
    result = [get_chunking_for_sentence(sentence) for sentence in sentences]
    return result

#get_chunking_for_review(all_reviews[0])

In [11]:
def get_chunking_for_reviews(reviews):
    return [get_chunking_for_review(review) for review in reviews]

In [12]:
def get_noun_phrases_for_review(review):
    main_trees = get_chunking_for_review(review)
    subtrees = []
    for main_tree in main_trees:
        for subtree in main_tree.subtrees():
            if subtree.label() == 'NP':
                subtrees.append(subtree)
    return subtrees[1:]

In [13]:
def get_noun_phrases_for_reviews(reviews):
    return [get_noun_phrases_for_review(review) for review in reviews]


# Question 1

In [14]:
all_reviews = get_reviews_from_all_sites()

In [15]:
print(len(all_reviews))

150


# Question 2

In [16]:
noun_phrases_reviews = get_noun_phrases_for_reviews(all_reviews)


In [17]:
for noun_phrases in noun_phrases_reviews[0:6]:
    for i in range(2):
        print(noun_phrases[i])
    print()

(NP another/DT movie/NN)
(NP work/NN)

(NP all/DT manner/NN)
(NP a/DT crime/NN)

(NP ****starring/NN)
(NP david/JJ morse/NN)

(NP the/DT performance/NN)
(NP career/NN)

(NP this/DT picture/NN)
(NP i/NN)

(NP a/DT masterwork/NN)
(NP film/NN)



# Question 3

In [18]:
noun_phrases_reviews

[[Tree('NP', [('another', 'DT'), ('movie', 'NN')]),
  Tree('NP', [('work', 'NN')]),
  Tree('NP', [('frank', 'JJ'), ('darabont', 'NN')]),
  Tree('NP', [('prison', 'NN')]),
  Tree('NP', [('dramas', 'NN')]),
  Tree('NP', [('the', 'DT'), ('green', 'JJ'), ('mile', 'NN')]),
  Tree('NP', [('the', 'DT'), ('shawshank', 'NN')]),
  Tree('NP', [('redemption', 'NN')]),
  Tree('NP', [('real', 'JJ'), ('class', 'NN')]),
  Tree('NP', [('this', 'DT'), ('film', 'NN')]),
  Tree('NP', [('michael', 'NN')]),
  Tree('NP', [('clarke', 'NN')]),
  Tree('NP', [('the', 'DT'), ('performance', 'NN')]),
  Tree('NP', [('career', 'NN')]),
  Tree('NP', [('this', 'DT'), ('film', 'NN')]),
  Tree('NP', [('the', 'DT'), ('edge', 'NN')]),
  Tree('NP', [('seat', 'NN')]),
  Tree('NP', [('a', 'DT'), ('strong', 'JJ'), ('support', 'NN')]),
  Tree('NP', [('a', 'DT'), ('talented', 'JJ'), ('ensemble', 'JJ'), ('cast.darabont', 'NN')]),
  Tree('NP', [('a', 'DT'), ('screenwriter', 'NN')]),
  Tree('NP', [('the', 'DT'), ('film', 'NN')]),
