# COGS 108 - Data Checkpoint

# Names

- Jonathan Cheung
- Joshua Chuang
- Joyce Hu
- Ester Tsai
- Sam Wong

<a id='research_question'></a>
# Research Question

For American films, is information such as genre, release year, duration, director, production company, and a textual analysis of the film's plot on Wikipedia correlated with the film's box office sales?

Is there a correlation between the box office sales of American films and their genre, release year, duration, director, production company, and a textual analysis of their Wikipedia plots?

Which factor, among factors such as director, production company, genre, and duration, has the most impact on an American film's box office sales?

# Dataset(s)

- Dataset Name: "American Films.csv"
- Link to the dataset: Web scraped from Wikipedia
- Number of observations:

This dataset was scraped from the Wikipedia category page "American films by genre" (https://en.wikipedia.org/wiki/Category:American_films_by_genre). It contains the variables 'Film name', 'Genre', 'Plot', 'Directed by', 'Written by', 'Story by', 'Produced by', 'Starring', 'Cinematography', 'Edited by', 'Music by', 'Production company', 'Distributed by', 'Release date', 'Running time', 'Budget', and 'Box office'.

# Setup

In [1]:
# Standard library
import re
from io import StringIO

# Pandas and numpy
import pandas as pd
import numpy as np

# Data visualization
import seaborn as sns
sns.set(font_scale=2, style="white")

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.style as style

# set plotting size parameter
plt.rcParams['figure.figsize'] = (12, 5)

# Webscraping
import requests
from bs4 import BeautifulSoup

# Textual and sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from textblob import TextBlob

import nltk
nltk.download('punkt')
from LeXmo import LeXmo


# Improve resolution
%config InlineBackend.figure_format ='retina'

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tsaie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tsaie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Cleaning

## 1) Data Overview / Final Result

In [2]:
# Load the final dataset. An empty path always makes read_csv raise, so point it at the
# file named in the "Dataset(s)" section.
# NOTE(review): assumes the scraped and cleaned result has been saved as
# 'American Films.csv' next to this notebook — confirm the actual location.
df = pd.read_csv('American Films.csv')

## 2) Data Collection
### Specify the URLs to scrape

In [None]:
# Maps each genre label to the list of Wikipedia category-page URLs to scrape for it.
# A genre has several URLs because its category listing is split across multiple pages.
genre_link = {
    'Action': [
        'https://en.wikipedia.org/w/index.php?title=Category:American_action_films&pageuntil=Driving+Force+%281989+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_action_films&pagefrom=Driving+Force+%281989+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_action_films&pagefrom=Marksman%2C+The%0AThe+Marksman+%282005+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_action_films&pagefrom=Spider-Man+3#mw-pages'
    ]
    ,
    'Crime': [
        'https://en.wikipedia.org/wiki/Category:American_crime_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_crime_films&pagefrom=Dial+Red+O#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_crime_films&pagefrom=Ivy+%28Film%29%0AIvy+%281947+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_crime_films&pagefrom=One+Stolen+Night+%281929+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_crime_films&pagefrom=Tenderloin+%28film%29#mw-pages'
    ]
    ,
    'War': [
        'https://en.wikipedia.org/wiki/Category:American_war_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_war_films&pagefrom=Retreat%2C+Hell%21#mw-pages'
    ]
    ,
    'Romance': [
        'https://en.wikipedia.org/wiki/Category:American_romance_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_romance_films&pagefrom=Sporting+Venus%2C+The%0AThe+Sporting+Venus#mw-pages'
    ]
    ,
    'Thriller': [
        'https://en.wikipedia.org/wiki/Category:American_thriller_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_thriller_films&pagefrom=Godsend+%282004+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_thriller_films&pagefrom=Poltergeist+%28film%29%0APoltergeist+%281982+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_thriller_films&pagefrom=Winchester+%28film%29#mw-pages'
    ]
    ,
    'Horror': [
        'https://en.wikipedia.org/wiki/Category:American_horror_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_horror_films&pagefrom=Isle+of+the+Dead+%28film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_horror_films&pagefrom=West+of+Hell#mw-pages'
    ]
    ,
    'Biographical': [
        'https://en.wikipedia.org/wiki/Category:American_biographical_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_biographical_films&pagefrom=I+Wanna+Dance+with+Somebody+%28film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_biographical_films&pagefrom=Story+of+Alexander+Graham+Bell%0AThe+Story+of+Alexander+Graham+Bell#mw-pages'
    ]
    ,
    'Satirical': [
        'https://en.wikipedia.org/wiki/Category:American_satirical_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_satirical_films&pagefrom=Hospital%2C+The%0AThe+Hospital#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_satirical_films&pagefrom=Taintlight%0ATaintlight#mw-pages'
    ]
    ,
    'Science Fiction': [
        'https://en.wikipedia.org/wiki/Category:American_science_fiction_films',
        # NOTE(review): the next URL points at Category:American_Western_(genre)_films, not at
        # the science fiction category, so every film listed there would be labelled
        # 'Science Fiction'. Confirm and replace it with the intended continuation page.
        'https://en.wikipedia.org/w/index.php?title=Category:American_Western_(genre)_films&pagefrom=Big+Sombrero%2C+The%0AThe+Big+Sombrero+%28film%29#mw-pages'
    ]
    ,
    'Monster': [
        'https://en.wikipedia.org/wiki/Category:American_monster_movies',
        'https://en.wikipedia.org/w/index.php?title=Category:American_monster_movies&pagefrom=Fly%2C+The%0AThe+Fly+%281986+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_monster_movies&pagefrom=Nailbiter#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_monster_movies&pagefrom=World+Without+End+%28film%29#mw-pages'
    ]
    ,
    'Mystery': [
        'https://en.wikipedia.org/wiki/Category:American_mystery_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_mystery_films&pagefrom=Deceiver+%28film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_mystery_films&pagefrom=House+of+Fear%2C+The%0AThe+House+of+Fear+%281915+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_mystery_films&pagefrom=Murder+She+Baked%0AMurder%2C+She+Baked#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_mystery_films&pagefrom=Seven+Footprints+To+Satan+%28Film%29%0ASeven+Footprints+to+Satan#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_mystery_films&pagefrom=X+Marks+The+Spot%0AX+Marks+the+Spot+%281942+film%29#mw-pages'
    ]
}

### Specify the additional information to include 
Note: The film name, genre, and plot are already included in the final dataset. 

In [2]:
# Column names of the scraped dataset, in output order.
info_list = [
    # Filled in directly by the scraper
    'Film name', 'Genre', 'Plot',
    # Credits
    'Directed by', 'Written by', 'Story by', 'Produced by', 'Starring',
    'Cinematography', 'Edited by', 'Music by',
    # Companies
    'Production company', 'Distributed by',
    # Release details and financials
    'Release date', 'Running time', 'Budget', 'Box office',
]

### Web scrape Wikipedia and construct dataset

In [None]:
# Scrape settings
WIKI_BASE_URL = 'https://en.wikipedia.org'
REQUEST_TIMEOUT = 50  # seconds allowed per HTTP request
PLOT_HEADINGS = ('Plot', 'Premise', 'Synopsis')  # section titles treated as the plot


def _get_soup(url):
    """Download `url` and return its content parsed by BeautifulSoup."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, 'html')


def _get_film_links(category_url):
    """Return {film name: Wikipedia URL} for the films linked from one category page."""
    listing = _get_soup(category_url).find('div', {'class': 'mw-content-ltr'})
    if listing is None:  # page has no listing container: nothing to scrape
        return {}

    name_links = {}
    for anchor in listing.find_all('a', href=True):
        link = WIKI_BASE_URL + anchor['href']
        if 'Categor' not in link:  # skip if 'Category' or 'Categorization' is in the link
            name_links[anchor.text] = link
    return name_links


def _extract_plot(soup):
    """Return the text of the film's plot section, or '' when the page has none.

    The plot is every paragraph between a heading named in PLOT_HEADINGS and the
    next section heading (or the end of the page when no heading follows).
    """
    tag_contents = soup.select('p, span.mw-headline')

    start = None
    end = None  # None slices to the end, so a trailing plot keeps its last paragraph
    for i, tag_content in enumerate(tag_contents):
        if start is not None and tag_content.name == 'span':
            end = i
            break
        if tag_content.text in PLOT_HEADINGS:
            start = i + 1

    if start is None:
        return ''
    return "".join(tag.text for tag in tag_contents[start:end]).strip()


def _normalize_label(label):
    """Tidy an infobox row label so it can be matched against info_list."""
    # '<br/>' was turned into '/ ', so drop the slash and collapse the doubled spaces
    label = ' '.join(str(label).replace('/', ' ').split())
    return label.replace('companies', 'company').replace('dates', 'date')


def _clean_value(value):
    """Drop the trailing '/' separator left over from list items and line breaks."""
    if isinstance(value, str) and value.endswith('/'):
        return value[:-1]
    return value


def _extract_infobox(soup):
    """Return {column: value} for the infobox rows whose label is in info_list.

    Returns an empty dict when no table on the page contains 'Directed by'.
    """
    # Take the first table mentioning 'Directed by'; all tables are searched because
    # soup.select() takes a CSS selector only, not an attribute filter.
    info_table = next(
        (table for table in soup.select('table') if 'Directed by' in table.text),
        None,
    )
    if info_table is None:
        return {}

    # Some minor data cleaning: keep line breaks / list items distinguishable as '/ '
    table_html = str(info_table).replace('<br/>', '/ ').replace('</li>', '/ ')
    table_html = re.sub(r"\[\d+\]", "", table_html)  # remove reference markers like [12]

    # Use a separate name for this table so the accumulated dataset is never overwritten
    info_table_df = pd.read_html(StringIO(table_html))[0]
    info_table_df.columns = ['col_name', 'info']

    labels = info_table_df['col_name'].map(_normalize_label)
    values = info_table_df['info'].map(_clean_value)
    return {label: value for label, value in zip(labels, values) if label in info_list}


# Collect one record (dict) per film, then build the dataframe once at the end
film_records = []

# Loop through the genres
# category_links is a list of URLs for the same genre, but each URL contains around 200 individual films
for genre, category_links in genre_link.items():

    # Each movie genre has several URLs since not everything can fit on the same page
    for category_link in category_links:

        # Loop through the individual films' URLs to extract wanted info
        for film_name, link in _get_film_links(category_link).items():
            film_soup = _get_soup(link)  # fetched once and reused for plot and infobox

            # Only films with a plot (can also be named "Premise" or "Synopsis") are kept
            plot = _extract_plot(film_soup)
            if not plot:
                continue

            film_dict = {'Film name': film_name, 'Genre': genre, 'Plot': plot}

            # Get other info on the film, if the info table exists and can be parsed;
            # otherwise keep only the film name, genre, and plot.
            try:
                info_dict = _extract_infobox(film_soup)
            except Exception as err:
                # Report the failure instead of hiding it, but keep scraping
                print(f'Could not parse the info table of {link}: {err!r}')
                info_dict = {}

            # Scraped name/genre/plot take precedence over any same-named infobox row
            film_records.append({**info_dict, **film_dict})

# Build the dataset with the columns in info_list order (missing info becomes NaN)
df = pd.DataFrame(film_records, columns=info_list)

## 3) Clean and Prepare the Dataset for Analysis

### Remove extra symbols

In [None]:
# Strip bracketed reference markers from 'Plot' in two passes:
# first numeric ones such as "[12]", then single-character ones such as "[a]".
plot_without_numeric_refs = df['Plot'].replace(to_replace=r'\[\d+\]', value='', regex=True)
df['Plot'] = plot_without_numeric_refs.replace(to_replace=r'\[\w\]', value='', regex=True)

### Correct typos

In [None]:
# Hand-corrected 'Running time' values, keyed by the row's index label in df.
# NOTE(review): the labels are hard-coded, so they presumably depend on the row order of
# one particular scrape — confirm they still point at the intended films.
running_time_corrections = {
    2753: '18 minutes',
    1277: '7 minutes',
    1189: '6 minutes',
    942: '1 minute',
}

for row_label, corrected_running_time in running_time_corrections.items():
    df.at[row_label, 'Running time'] = corrected_running_time

### Extract numerical info from text variables

In [None]:
# 'Duration (min)' extracts the number of minutes from 'Running time'.
# expand=False yields a Series (not a one-column DataFrame), and pd.to_numeric stores the
# result as numbers rather than digit strings; rows without a match become NaN.
df['Duration (min)'] = pd.to_numeric(
    df['Running time'].str.lower().str.extract(r'(?P<duration>[\d]+) min', expand=False),
    errors='coerce',
)

# 'Release year' extracts the year (first run of four digits) from 'Release date',
# also stored as a number with NaN where no year is found.
df['Release year'] = pd.to_numeric(
    df['Release date'].str.extract(r'(\d{4})', expand=False),
    errors='coerce',
)

# 'Box office (mil)' extracts the dollar value in millions from 'Box office'
def get_box_office_million_dollar(string):
    """Convert a box-office text such as '$1.5 million' to millions of dollars.

    Parameters
    ----------
    string : object
        Raw 'Box office' value. Anything that is not a str, or that contains no '$',
        is treated as missing.

    Returns
    -------
    float
        The amount in millions of dollars, or np.nan when the value is missing or
        contains no number.

    Notes
    -----
    The number written right after the '$' is used; if there is none, the first number
    in the text is used instead. The unit word following that number decides the scale
    (case-insensitive): 'billion', 'million'/'mil', 'thousand', otherwise plain dollars.
    """
    if not isinstance(string, str) or '$' not in string:
        return np.nan

    # Lower-case so 'Million' and 'million' are treated alike; drop thousands separators
    text = string.lower().replace(',', '')

    # Prefer the number attached to the dollar sign over numbers in other currencies
    match = re.search(r'\$\s*(\d+\.?\d*)', text) or re.search(r'(\d+\.?\d*)', text)
    if match is None:
        return np.nan
    num = float(match.group(1))

    # Only the text after the number is inspected, so the nearest unit word wins
    unit = re.search(r'bil|mil|thousand', text[match.end():])
    if unit is None:
        return num / 1_000_000  # plain dollar amount
    if unit.group(0) == 'bil':
        return num * 1_000
    if unit.group(0) == 'mil':
        return num
    return num / 1_000  # thousand

# Convert every raw 'Box office' entry to a value in millions of dollars
df['Box office (mil)'] = df['Box office'].map(get_box_office_million_dollar)

### Sentiment Analysis

In [None]:
# VADER Sentiment Analysis

# One analyzer instance shared by every call below
VADER_SentimentIntensityAnalyzer = SentimentIntensityAnalyzer()


def get_VADER_sentiment(text):
    """Return the 'compound' entry of VADER's polarity scores for `text`."""
    scores = VADER_SentimentIntensityAnalyzer.polarity_scores(text)
    return scores['compound']


df['VADER Sentiment'] = df['Plot'].map(get_VADER_sentiment)



# Textblob Polarity and Subjectivity Analysis

def get_TextBlob_subjectivity(text):
    """Return the `subjectivity` attribute of TextBlob's sentiment for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity


def get_TextBlob_polarity(text):
    """Return the `polarity` attribute of TextBlob's sentiment for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity


plot_texts = df['Plot']
df['TextBlob Subjectivity'] = plot_texts.map(get_TextBlob_subjectivity)
df['TextBlob Polarity'] = plot_texts.map(get_TextBlob_polarity)

# END