In [13]:
import bs4; print( 'bs4 ' + bs4.__version__)
from bs4 import BeautifulSoup, SoupStrainer

import nltk; print( 'nltk ' + nltk.__version__)
from nltk import word_tokenize, pos_tag, RegexpParser;
from nltk.tokenize import sent_tokenize

import sklearn; print( 'sklearn ' + sklearn.__version__)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering, KMeans

import matplotlib; print( 'matplotlib ' + matplotlib.__version__)
import matplotlib.pyplot as plt

import re; print('re ' + re.__version__)
import requests; print('requests ' + requests.__version__)

bs4 4.6.3
nltk 3.3
sklearn 0.19.1
matplotlib 2.2.2
re 2.2.1
requests 2.19.1


In [2]:
# The home page for various movies' reviews, from which will get the links for individual reviews
review_home_urls = {
    'green_mile': 'https://www.imdb.com/title/tt0120689/reviews?ref_=tt_ql_3',
    'forest_gump': 'https://www.imdb.com/title/tt0109830/reviews?ref_=tt_ov_rt',
    'cast_away': 'https://www.imdb.com/title/tt0162222/reviews?ref_=tt_ov_rt',
    'terminal': 'https://www.imdb.com/title/tt0362227/reviews?ref_=tt_ql_3',
    'catch_me_if_you_can': 'https://www.imdb.com/title/tt0264464/reviews?ref_=tt_ql_3',
    'road_to_perdition': 'https://www.imdb.com/title/tt0257044/reviews?ref_=tt_ql_3',
}

In [3]:
def get_text_from_url(url):
    return requests.get(url).text
text = get_text_from_url(review_home_urls['green_mile'])

In [4]:
def get_all_links_from_html(html):
    tags = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a', href=True))
    urls = [str(tag.attrs['href']) for tag in tags]
    return urls
all_links = get_all_links_from_html(text)

In [5]:
def get_review_urls_from_links(links):
    url_template = 'https://www.imdb.com{}'
    # url_template = 'http://www.gutenberg.org/files/{}/{}-h/{}-h.htm'
    return [url_template.format(link) for link in links]

urls = get_review_urls_from_links(all_links);  

In [6]:
def relevent_link(link):
    if '/review/' in link:
        return True
    return False
def get_relevent_links(links):
    relevent_links = filter(relevent_link, all_links)
    unique_relevent_links = set(relevent_links)
    return list(unique_relevent_links)
relevent_urls = get_relevent_links(urls)
len(relevent_urls)

25

In [7]:
def strain_content(name, attrs):
    if name == 'div' and dict(attrs).get('class', None) == 'content':
        return True
    return False
def clean_review_text(text):
    return re.split('\\n\\n\s+\d+ out of \d+', text)[0]
def get_review_from_url(url):
    html = get_text_from_url(url)
    tags = BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer(strain_content))
    review = clean_review_text(tags.text)
    return review

In [8]:
def get_review_from_site(url):
    reviews = []

    reviews_home_text = get_text_from_url(url)
    all_links = get_all_links_from_html(reviews_home_text)
    relevent_links = get_relevent_links(all_links)

    
    
    
    review_urls = get_review_urls_from_links(relevent_links)
    for url in review_urls:
        reviews.append(get_review_from_url(url))
        # break
    return reviews

In [9]:
def get_reviews_from_all_sites():
    all_reviews = []
    review_titles = review_home_urls.keys()
    for title in review_titles:
        review_home_url = review_home_urls[title]
        all_reviews = all_reviews + get_review_from_site(review_home_url)
    return all_reviews

### Get All Reviews

In [12]:
all_reviews = get_reviews_from_all_sites()

### Run k-means

In [21]:
# Create a cluster from titles (helped by https://pythonprogramminglanguage.com/kmeans-text-clustering/)
def run_kmeans(data, true_k):
    def create_vectorization(text):
        vectorizor = TfidfVectorizer(stop_words='english')
        return [vectorizor, vectorizor.fit_transform(text)]
    vectorizor, X = create_vectorization(data)

    def create_cluster_model(X):
        # model = AgglomerativeClustering(linkage='average', n_clusters=true_k)
        model = KMeans(n_clusters=true_k, max_iter=100)
        model.fit(X.toarray())

        return model
    model = create_cluster_model(X)

    def get_top_terms_per_cluster(model):
        print("Top terms per cluster:")
        order_centroids = model.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizor.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i),
            for ind in order_centroids[i, :24]:
                print(' %s' % terms[ind]),
            print
    get_top_terms_per_cluster(model)
    
    return model

In [23]:
run_kmeans(all_reviews, 3)

Top terms per cluster:
Cluster 0:
 movie
 movies
 recommend
 seen
 time
 story
 songs
 think
 just
 drama
 title
 especially
 mind
 watched
 best
 experience
 sure
 viewing
 goes
 10
 acting
 ve
 steven
 hours
Cluster 1:
 film
 mile
 movie
 green
 story
 paul
 coffey
 duncan
 king
 darabont
 john
 like
 long
 prison
 hanks
 make
 certainly
 death
 michael
 cast
 tom
 hours
 just
 edgecomb
Cluster 2:
 movie
 gave
 cast
 book
 mile
 green
 michael
 performance
 career
 excellent
 film
 vulnerable
 weak
 great
 redemption
 shawshank
 depth
 miracle
 long
 bring
 duncan
 emotion
 hanks
 extraordinary


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)