# Webscraping Code

### Import Required Packages

In [1]:
import os
import csv
import time
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from datetime import datetime
from selenium import webdriver 
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from googlesearch import search
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer
import math
import re

### Set up the Selenium Chrome Driver

In [2]:
chromedriver = "/Users/liamparker/Downloads/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)

### Initial Dataset Construction

In [3]:
# Download dataset from Kaggle
kaggle_csv = '/Users/liamparker/Downloads/train.csv'
movie_df = pd.read_csv(kaggle_csv)

In [4]:
# Explore the columns in the Dataset
movie_df.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

In [5]:
# Perform initial pruning

# Remove non-english films
en_df = movie_df.loc[movie_df['original_language'] == 'en']

# Remove 0-budget films
en_df = en_df.loc[en_df['budget'] > 0]

# Remove films outside of 2010-2020
en_df['release_date'] = en_df['release_date'].apply(lambda x: datetime.strptime(x, '%m/%d/%y'))
en_df = en_df.loc[en_df['release_date'] > datetime(2010, 1, 1)]
en_df = en_df.loc[en_df['release_date'] < datetime(2020, 1, 1)]

# Drop unneeded columns
en_df = en_df.drop(columns = ['popularity', 'production_companies', 'id', 'belongs_to_collection', 
                              'production_countries', 'spoken_languages', 'status', 'original_language', 
                              'poster_path', 'title', 'tagline', 'cast', 'crew'])

### Find all trailer links using Selenium

In [7]:
results = []
counter = 0

for title in en_df['original_title']:
    query = title + str(' Trailer')
    results.append(next(search(query, tld="co.in", num=1, stop=1, pause=2)))
    if counter % 50 == 0:
        print('Iteration:', counter)
    counter += 1

Iteration: 0
Iteration: 50
Iteration: 100
Iteration: 150
Iteration: 200
Iteration: 250
Iteration: 300
Iteration: 350
Iteration: 400
Iteration: 450
Iteration: 500


In [8]:
# Join links to respective films in the dataset
links_df = pd.DataFrame(results, columns=['link'])
en_df = en_df.reset_index()
en_df = en_df.join(links_df, how='outer')
en_df = en_df.drop(columns = ['index'])

In [9]:
# This code is now obsolete
for i in range(len(en_df['homepage'])):
    if isinstance(en_df['homepage'][i], str):
        en_df['homepage'][i] = 1
    else:
        en_df['homepage'][i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [10]:
# Remove all films that do not have "youtube" in the link for trailer
for i in range(len(en_df)):
    if not 'youtube' in en_df['link'][i]:
        en_df = en_df.drop(i)
en_df = en_df.reset_index()
len(en_df)

546

### Scrape data from Youtube using the Selenium Scraper

In [13]:
def scrape_data(link, num_iters):

    data = []
    
    # Specify Driver
    driver = Chrome(executable_path='/Users/liamparker/Downloads/chromedriver') 
    
    # Open Link
    wait = WebDriverWait(driver, 5)
    driver.get(link)
    
    # Retrieve every item in the link with TAG_NAME = "body" in order to scroll through the page 
    for item in range(num_iters): 
        vis = EC.visibility_of_element_located((By.TAG_NAME, "body"))
        wait.until(vis).send_keys(Keys.END)
        time.sleep(5)
        
    # Retrieve all comments by searching for CSS_SELECTOR = "#content"
    for comment in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#content"))):
        txt = comment.text
        data.append(txt)
    
    return data

In [14]:
# Compile a list of lists of comments using the above definition
full_comments = []
for i in range(1):
    try:
        full_comments.append(scrape_data(en_df['link'][i], 10))
    except:
        print(i)

In [635]:
# Create a dictionary to save comments list
comments_dict = {}

for i in range(len(full_comments)):
    comments_dict[en_df['original_title'][i]] = full_comments[i]

In [638]:
# Dump the dictionary
with open('saved_dictionary.pkl', 'wb') as f:
    pickle.dump(comments_dict, f)

In [None]:
# Retrieve the dictionary
with open('saved_dictionary.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

### Retrieve the Number of Views and Likes for Each Video

In [842]:
views = []
likes = []

# Views and Likes Counts are found in the first Comment Returned from Each List
for i in range(len(full_comments)):
    c = full_comments[i][0]
    views.append(re.split(r"\n(.*) views", c)[1])
    try:
        likes.append(re.split(r"\n(.*)\nDISLIKE", c)[1])
    except:
        likes.append(float("nan"))

In [848]:
# Drop duplicate index column created above
en_df = en_df.drop(columns = 'index')

In [849]:
# Append views and likes columns to the dataframe
views_df = pd.DataFrame(views, columns=['views'])
likes_df = pd.DataFrame(likes, columns=['likes'])
en_df = en_df.join(views_df, how='outer')
en_df = en_df.join(likes_df, how='outer')

In [850]:
# Splice strings returning numbers and converting K to 1,000 and M to 1,000,000
def value_to_float(x):
    if type(x) == float or type(x) == int:
        return x
    if 'K' in x:
        if len(x) > 1:
            return float(x.replace('K', '')) * 1000
        return 1000.0
    if 'M' in x:
        if len(x) > 1:
            return float(x.replace('M', '')) * 1000000
        return 1000000.0
    else:
        return float(x.replace(',', ''))
    return 0.0

In [851]:
# Perform the above operation
for i in range(len(en_df)):
    try:
        en_df['likes'][i] = value_to_float(en_df['likes'][i])
        en_df['views'][i] = value_to_float(en_df['views'][i])
    except:
        en_df['likes'][i] = float('nan')
        en_df['views'][i] = float('nan')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


### Perform Sentiment Analysis on the Comments

In [None]:
# Specify Tokenizer and Lemmatizer
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

In [853]:
# Create function to perform VADER sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def sentiment_scores(sentence):
    sentiment = SentimentIntensityAnalyzer()
    sentiment_dict = sentiment.polarity_scores(sentence)
    return np.array([sentiment_dict['neg'], sentiment_dict['neu'], sentiment_dict['pos']])

In [None]:
# Compile Sentiments for Every Movie
negatives = []
neutrals = []
positives = []
positives_perc = []
negatives_perc = []

# Iterate through all comments
for j in range(len(full_comments)):
    if j % 50 == 0:
        # iterative update printing to track progress
        print(j)
    docs = loaded_dict[en_df['original_title'][j]][2:]
    
    # Append 0 if the comment list is empty
    if not docs:
        negatives.append(0)
        neutrals.append(0)
        positives.append(0)
        continue
    
    # Tokenize all of the items in the doc
    for i in range(len(docs)):
        docs[i] = tokenizer.tokenize(docs[i])
    
    # Perform lemmatization, and remove stopwords, single character words, numbers
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
    docs = [[token for token in doc if len(token) > 1] for doc in docs]
    stop_words = set(stopwords.words('english'))
    docs = [[token for token in doc if not token in stop_words] for doc in docs]
    
    # Join tokens back together to be fed into VADER sentiment analysis
    for i in range(len(docs)):
        docs[i] = ' '.join(docs[i])
    
    neg = 0
    neu = 0
    pos = 0
    neg_perc = 0
    pos_perc = 0
    
    # Perform VADER sentiment analysis using the function built above
    for i in range(len(docs)):
        comp_neg, comp_neu, comp_pos = sentiment_scores(docs[i])
        neg += comp_neg
        neu += comp_neu
        pos += comp_pos
    
    # Append scores to lists
    negatives.append(neg/len(docs))
    neutrals.append(neu/len(docs))
    positives.append(pos/len(docs))

In [861]:
# Join lists to the dataframe
neg_df = pd.DataFrame(negatives, columns=['negative_sentiment'])
neu_df = pd.DataFrame(neutrals, columns=['neutral_sentiment'])
pos_df = pd.DataFrame(positives, columns=['positive_sentiment'])
pos_perc_df = pd.DataFrame(positives_perc, columns=['positive_percentage'])
neg_perc_df = pd.DataFrame(negatives_perc, columns=['negative_percentage'])
en_df = en_df.join(neg_df, how='outer')
en_df = en_df.join(neu_df, how='outer')
en_df = en_df.join(pos_df, how='outer')
en_df = en_df.join(pos_perc_df, how='outer')
en_df = en_df.join(neg_perc_df, how='outer')

### Export the DataFrame to be Used in Colab (see Model Construction)

In [859]:
en_df['target'] = en_df['revenue']
en_df = en_df.drop(columns = 'revenue')
en_df = en_df.drop(columns = 'negative_percentage')
en_df.to_excel('updated_df.xlsx')