# Juliann Negron Homework 2 - Rotten Tomatoes

Your script should begin by defining two variables (after importing libraries, etc)
movie        a string variable indicating the movie for which reviews will be parsed
pageNum  the number of review pages to parse

For example, to parse the first 3 pages of “Gangs of New York” reviews, set movie = “gangs_of_new_york” and pageNum = 3. Your code should then request the three pages below, and parse them.
https://rottentomatoes.com/m/gangs_of_new_york/reviews?page=1 
https://rottentomatoes.com/m/gangs_of_new_york/reviews?page=2 
https://rottentomatoes.com/m/gangs_of_new_york/reviews?page=3 


In [56]:
# 1. importing useful libraries

import requests # to get the website
import time     # to force our code to wait a little before re-trying to grab a webpage
import re       # to grab the exact element we need
from bs4 import BeautifulSoup # to grab the html elements we need

# Import pandas to read in data
import numpy as np
import pandas as pd

# Import models and evaluation functions
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
#from sklearn import cross_validation
#from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# Import vectorizers to turn text into numeric
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import plotting
import matplotlib.pylab as plt
%matplotlib inline

In [60]:
# Defining a list of 50 movies and page number variable
movie_list = ['hamilton_2020', 'cats_2019', 'mamma_mia', 'into_the_woods_2014', 'annie_1981', 'les_miserables_2012', 'sweeney_todd_the_demon_barber_of_fleet_street_2007', 'hairspray', "1152276-rent", 'dreamgirls', 
              'phantom_of_the_opera', 'chicago', 'moulin_rouge_2001', "1079818-anastasia", 'prince_of_egypt', 'newsies', "1073037-hunchback_of_notre_dame", 'frozen_2013', 'the_last_5_years', "1012514-little_shop_of_horrors", 
              'grease', 'jersey_boys', 'evita', 'funny_girl', 'oliver', 'joseph_and_the_amazing_technicolor_dreamcoat', "1011605_king_and_i", 'west_side_story', 'guys_and_dolls', 'south_pacific', 
              'american_in_paris', 'singin_in_the_rain', 'oklahoma', 'brigadoon', 'kiss-me-kate1953', 'gigi', 'the_pajama_game', "1005152-damn_yankees", 'sound_of_music', 'beauty_and_the_beast_1991', 
              'aladdin', 'mary_poppins', 'the_lion_king', 'fiddler_on_the_roof', 'hair', "1003339-bye_bye_birdie", "1014453-music_man", 'once', "1205483_nine", 'xanadu']
PageNum = 4

# Starting with an empty list that we will add to in the loop
movie_review_data = []

# Creating for loop to loop through every entry in my movie list

for m in movie_list:

    # Starting with this url, will add to it later
    first_url = 'https://www.rottentomatoes.com/m/' + str(m) + '/reviews'
    
    # Creating a for loop to scape movie data from number of pages provided
    for i in range(1, PageNum + 1):

        # First page is different from the rest, so I made an if statement to give it a custom url
        if i == 1:
            current_page = first_url
            
        # All other pages have this layout for the url
        else:
            current_page = first_url + '?page='+str(i)
            
        # try to scrape times
        for k in range(5): 
            try:
                # get url content
                response = requests.get(current_page,headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', })
                # get the html
                html=response.content
                # if we successuflly got the file, break the loop
                break 
            # requests.get() threw an exception, i.e., the attempt to get the response failed
            except:
                print ('failed attempt #',k)
                # wait 2 secs before trying again
                time.sleep(2)

        if not html:
            # couldnt get the page, ignore
            print('could not get page #', i)
            continue 
        
        # Making the HTML file more readable
        current_content = BeautifulSoup(html.decode('ascii', 'ignore'), 'lxml')
        
        # keep only <div> tags whose class contains the 'review_table_row' substring
        review_data = current_content.findAll('div', {'class':re.compile('review_table_row')})
    
        # For loop to save the review text and the rating into variables to add to list at the end
        for k in review_data:
            
            # Rating and text variables start as NA and stay as NA if variables are not found
            rating = 'NA'
            text = 'NA'
            
            # 1. If there is a rating, get it
            rate_data = k.find('div', {'class': re.compile('review_icon')})
            rating_text = str(rate_data.attrs['class'])
            
            # Inputting good or "fresh" movies as 1, "rotten" movies as 0
            if rating_text: 
                if 'fresh' in rating_text:
                    rating = 1   
                else:
                    rating = 0
        
            # 2. If there is review text, get it  
            text_data = k.find('div',{'class':'the_review'})
                
            if text_data: 
                text = text_data.text.strip() 
            
            # Appending to movie list as a list (creating a list of lists)
            movie_review_data.append([rating, text])

In [61]:
# Creating column names to build data frame
col_names = ('rating', 'text')

# Converting list of lists into data frame
movie_df = pd.DataFrame(movie_review_data, columns=col_names)

# Replacing all empty values in my data frame with "NaN"
nan_value = float("NaN") 
movie_df.replace("", nan_value, inplace=True)

# Removing all entries without review text
movie_data = movie_df.dropna()

In [74]:
#len(movie_df)
# Length of data frame used to be 2734 rows, now 2492 rows
print(movie_data)

      rating                                               text
0          1  There's a line where Hamitlon says "America, y...
1          1  Hamilton is above all an exploration of the co...
2          1  Rarely does an uber-hyped pop culture phenomen...
3          1  The film pulses with energy and life - and yes...
4          1  The vast majority of this is accurate, which i...
...      ...                                                ...
2721       0  Xanadu is a mushy and limp musical fantasy, so...
2722       0  Xanadu doesn't lend itself to quick or easy ch...
2724       1  Great tunes fill this flavorful bubble gum movie.
2728       0   campy joyful fun, with a wee bit too much cheese
2733       1  There's never been a movie quite like this, an...

[2492 rows x 2 columns]


In [68]:
# Part II: Working classifier

# Separating columns into two vectors, using X to predict Y
X_text = movie_data['text']
Y_rating = movie_data['rating']

In [69]:
# Create a vectorizer that will track text as counted features
count_vectorizer = CountVectorizer()

# Let the vectorizer learn what tokens exist in the text data
count_vectorizer.fit(X_text)

# Turn these tokens into a numeric matrix
X = count_vectorizer.transform(X_text)

# Create a model
logistic_regression = LogisticRegression()

# Use this model and our data to get 5-fold cross validation AUCs
accs = cross_val_score(logistic_regression, X, Y_rating, scoring="accuracy", cv=5)

# Print out the average AUC rounded to three decimal points
print("Accuracy of our classifier is " + str(round(np.mean(accs), 3)))

Accuracy of our classifier is 0.724


In [70]:
# Using TF-IDF
# Create a vectorizer that will track text as binary features
tfidf_vectorizer = TfidfVectorizer()

# Let the vectorizer learn what tokens exist in the text data
tfidf_vectorizer.fit(X_text)

# Turn these tokens into a numeric matrix
X = tfidf_vectorizer.transform(X_text)

# Create a model
logistic_regression = LogisticRegression()

# Use this model and our data to get 5-fold cross validation AUCs
aucs = cross_val_score(logistic_regression, X, Y_rating, scoring="accuracy", cv=5)

# Print out the average AUC rounded to three decimal points
print("Accuracy of our classifier is " + str(round(np.mean(aucs), 3)))

Accuracy of our classifier is 0.739


It appears that the TF-IDF method is more accurate than using full counts instead of a binary representation.