In [81]:
import bs4; print( 'bs4 ' + bs4.__version__)
from bs4 import BeautifulSoup, SoupStrainer

import nltk; print( 'nltk ' + nltk.__version__)
from nltk import word_tokenize, pos_tag, RegexpParser;
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import sklearn; print( 'sklearn ' + sklearn.__version__)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering, KMeans

import matplotlib; print( 'matplotlib ' + matplotlib.__version__)
import matplotlib.pyplot as plt

import re; print('re ' + re.__version__)
import requests; print('requests ' + requests.__version__)

import numpy as np; print('np ' + np.__version__)

bs4 4.6.3
nltk 3.3
sklearn 0.19.1
matplotlib 2.2.2
re 2.2.1
requests 2.19.1
np 1.14.2


# Homework 8: Sentiment Analysis

### Write Up

First we get the reviews from various Forest Gump dramas. Then we cluster these reviews into three different clusters (using K-Means). Finally we take the sentiment from each cluster.

Results:

```
Cluster 0: mean: -0.06 | median: 0.0 | max: 0.4 | min: -0.6
Cluster 1: mean: 0.10 | median: 0.0 | max: 0.6 | min: -0.2
Cluster 2: mean: -0.03 | median: 0.0 | max: 0.4 | min: -0.6
```

The clusters seem to be fairly neutral in general. Interesting that the mean is negative in two of the clusters but the reviews are mostly favorable (5 stars usually). I think this is due to some words showing up such as "death" which seems to drop the score a bit. There are some sad dramas in here after all (think Green Mile and Road To Perdition).

### Code from Homework 5

In [2]:
# The home page for various movies' reviews, from which will get the links for individual reviews
review_home_urls = {
    'green_mile': 'https://www.imdb.com/title/tt0120689/reviews?ref_=tt_ql_3',
    'forest_gump': 'https://www.imdb.com/title/tt0109830/reviews?ref_=tt_ov_rt',
    'cast_away': 'https://www.imdb.com/title/tt0162222/reviews?ref_=tt_ov_rt',
    'terminal': 'https://www.imdb.com/title/tt0362227/reviews?ref_=tt_ql_3',
    'catch_me_if_you_can': 'https://www.imdb.com/title/tt0264464/reviews?ref_=tt_ql_3',
    'road_to_perdition': 'https://www.imdb.com/title/tt0257044/reviews?ref_=tt_ql_3',
}

In [3]:
def get_text_from_url(url):
    return requests.get(url).text
text = get_text_from_url(review_home_urls['green_mile'])

In [4]:
def get_all_links_from_html(html):
    tags = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a', href=True))
    urls = [str(tag.attrs['href']) for tag in tags]
    return urls
all_links = get_all_links_from_html(text)

In [5]:
def get_review_urls_from_links(links):
    url_template = 'https://www.imdb.com{}'
    # url_template = 'http://www.gutenberg.org/files/{}/{}-h/{}-h.htm'
    return [url_template.format(link) for link in links]

urls = get_review_urls_from_links(all_links);  

In [6]:
def relevent_link(link):
    if '/review/' in link:
        return True
    return False
def get_relevent_links(links):
    relevent_links = filter(relevent_link, all_links)
    unique_relevent_links = set(relevent_links)
    return list(unique_relevent_links)
relevent_urls = get_relevent_links(urls)
len(relevent_urls)

25

In [7]:
def strain_content(name, attrs):
    if name == 'div' and dict(attrs).get('class', None) == 'content':
        return True
    return False
def clean_review_text(text):
    return re.split('\\n\\n\s+\d+ out of \d+', text)[0]
def get_review_from_url(url):
    html = get_text_from_url(url)
    tags = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer(strain_content))
    review = clean_review_text(tags.text)
    return review

In [8]:
def get_review_from_site(url):
    reviews = []

    reviews_home_text = get_text_from_url(url)
    all_links = get_all_links_from_html(reviews_home_text)
    relevent_links = get_relevent_links(all_links)

    
    
    
    review_urls = get_review_urls_from_links(relevent_links)
    for url in review_urls:
        reviews.append(get_review_from_url(url))
        # break
    return reviews

In [9]:
def get_reviews_from_all_sites():
    all_reviews = []
    review_titles = review_home_urls.keys()
    for title in review_titles:
        review_home_url = review_home_urls[title]
        all_reviews = all_reviews + get_review_from_site(review_home_url)
    return all_reviews

### Get All Reviews

In [12]:
all_reviews = get_reviews_from_all_sites()

### Run k-means

In [127]:
# Create a cluster from titles (helped by https://pythonprogramminglanguage.com/kmeans-text-clustering/)
def run_kmeans(data, true_k, n_terms=20):
    def create_vectorization(text):
        vectorizor = TfidfVectorizer(stop_words='english')
        return [vectorizor, vectorizor.fit_transform(text)]
    vectorizor, X = create_vectorization(data)

    def create_cluster_model(X):
        # model = AgglomerativeClustering(linkage='average', n_clusters=true_k)
        model = KMeans(n_clusters=true_k, max_iter=100)
        model.fit(X.toarray())

        return model
    model = create_cluster_model(X)
    
    def get_top_terms_per_cluster(model, vectorizor, true_k, n_terms):
        order_centroids = model.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizor.get_feature_names()
        clusters = []
        for i in range(true_k):
            cluster_terms = []
            for ind in order_centroids[i, :24]:
                cluster_terms.append(terms[ind])
            clusters.append(cluster_terms)
        return clusters

    cluster_top_terms = get_top_terms_per_cluster(model, vectorizor, true_k, n_terms)
    def print_top_terms():
        print("Top terms per cluster:")
        for index, cluster in enumerate(cluster_top_terms):
            print()
            print('cluster', index)
            for term in cluster:
                print(' %s' % term)
    print_top_terms()
    
    return cluster_top_terms

top_terms = run_kmeans(all_reviews, 3)

Top terms per cluster:

cluster 0
 products
 films
 senses
 morality
 trigger
 treat
 multiplicity
 wake
 relations
 moral
 fear
 return
 sadness
 seek
 points
 sex
 meaning
 realities
 reactions
 furious
 expense
 unrealistic
 effects
 unfortunately

cluster 1
 movie
 seen
 movies
 time
 gave
 book
 forget
 like
 best
 long
 shawshank
 redemption
 recommend
 story
 great
 people
 performance
 think
 hours
 miracle
 tom
 say
 read
 marvel

cluster 2
 film
 mile
 green
 cast
 story
 movie
 duncan
 michael
 make
 darabont
 paul
 hanks
 coffey
 prison
 john
 edgecomb
 just
 tom
 king
 long
 like
 death
 scenes
 hours


### Sentiment Analysis

In [79]:
sid = None
try:
    sid = SentimentIntensityAnalyzer('vader_lexicon')
except:
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kjprice/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [78]:
sid.polarity_scores('):{')

{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound': -0.5106}

In [128]:
all_scores = []
for terms in top_terms:
    scores = []
    for term in terms:
        score = sid.polarity_scores(term)
        scores.append(score['compound'])
    all_scores.append(np.array(scores))
for i, scores in enumerate(all_scores):
    print('Cluster {}: '.format(i), end='')

    mean = scores.mean()
    print('mean: {0:.1}'.format(mean), end=' | ')

    median = np.median(scores)
    print('median: {}'.format(median), end=' | ')

    _max = scores.max()
    print('max: {0:.1}'.format(_max), end=' | ')

    _min = scores.min()
    print('min: {0:.1}'.format(_min))

Cluster 0: mean: -0.06 | median: 0.0 | max: 0.4 | min: -0.6
Cluster 1: mean: 0.1 | median: 0.0 | max: 0.6 | min: -0.2
Cluster 2: mean: -0.03 | median: 0.0 | max: 0.4 | min: -0.6


In [124]:
all_scores[0][12]

-0.5994

In [125]:
top_terms[0][12]

'death'