# Web Scraping for Data

In [None]:
# import pandas & numpy library
import pandas as pd
import numpy as np

# Import seaborn and apply its plotting styles
import seaborn as sns
sns.set(font_scale=2, style="white")

# import matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.style as style
# set plotting size parameter
plt.rcParams['figure.figsize'] = (12, 5)

# packages helpful for webscraping
import requests
import bs4
from bs4 import BeautifulSoup
import re

#improve resolution
%config InlineBackend.figure_format ='retina'

# Web Scrape Wikipedia for Movie Plots, Genre, and Other Info
https://en.wikipedia.org/wiki/Category:American_films_by_genre

In [428]:
genre_link = {
    'Action': [
        'https://en.wikipedia.org/w/index.php?title=Category:American_action_films&pageuntil=Driving+Force+%281989+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_action_films&pagefrom=Driving+Force+%281989+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_action_films&pagefrom=Marksman%2C+The%0AThe+Marksman+%282005+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_action_films&pagefrom=Spider-Man+3#mw-pages'
    ]
    ,
    'Crime': [
        'https://en.wikipedia.org/wiki/Category:American_crime_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_crime_films&pagefrom=Dial+Red+O#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_crime_films&pagefrom=Ivy+%28Film%29%0AIvy+%281947+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_crime_films&pagefrom=One+Stolen+Night+%281929+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_crime_films&pagefrom=Tenderloin+%28film%29#mw-pages'
    ]
    ,
    'War': [
        'https://en.wikipedia.org/wiki/Category:American_war_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_war_films&pagefrom=Retreat%2C+Hell%21#mw-pages'
    ]
    ,
    'Romance': [
        'https://en.wikipedia.org/wiki/Category:American_romance_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_romance_films&pagefrom=Sporting+Venus%2C+The%0AThe+Sporting+Venus#mw-pages'
    ]
    ,
    'Thriller': [
        'https://en.wikipedia.org/wiki/Category:American_thriller_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_thriller_films&pagefrom=Godsend+%282004+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_thriller_films&pagefrom=Poltergeist+%28film%29%0APoltergeist+%281982+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_thriller_films&pagefrom=Winchester+%28film%29#mw-pages'
    ]
    ,
    'Horror': [
        'https://en.wikipedia.org/wiki/Category:American_horror_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_horror_films&pagefrom=Isle+of+the+Dead+%28film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_horror_films&pagefrom=West+of+Hell#mw-pages'
    ]
    ,
    'Biographical': [
        'https://en.wikipedia.org/wiki/Category:American_biographical_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_biographical_films&pagefrom=I+Wanna+Dance+with+Somebody+%28film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_biographical_films&pagefrom=Story+of+Alexander+Graham+Bell%0AThe+Story+of+Alexander+Graham+Bell#mw-pages'
    ]
    ,
    'Satirical': [
        'https://en.wikipedia.org/wiki/Category:American_satirical_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_satirical_films&pagefrom=Hospital%2C+The%0AThe+Hospital#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_satirical_films&pagefrom=Taintlight%0ATaintlight#mw-pages'
    ]
    ,
    'Science Fiction': [
        'https://en.wikipedia.org/wiki/Category:American_science_fiction_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_Western_(genre)_films&pagefrom=Big+Sombrero%2C+The%0AThe+Big+Sombrero+%28film%29#mw-pages'
    ]
    ,
    'Monster': [
        'https://en.wikipedia.org/wiki/Category:American_monster_movies',
        'https://en.wikipedia.org/w/index.php?title=Category:American_monster_movies&pagefrom=Fly%2C+The%0AThe+Fly+%281986+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_monster_movies&pagefrom=Nailbiter#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_monster_movies&pagefrom=World+Without+End+%28film%29#mw-pages'
    ]
    ,
    'Mystery': [
        'https://en.wikipedia.org/wiki/Category:American_mystery_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_mystery_films&pagefrom=Deceiver+%28film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_mystery_films&pagefrom=House+of+Fear%2C+The%0AThe+House+of+Fear+%281915+film%29#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_mystery_films&pagefrom=Murder+She+Baked%0AMurder%2C+She+Baked#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_mystery_films&pagefrom=Seven+Footprints+To+Satan+%28Film%29%0ASeven+Footprints+to+Satan#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_mystery_films&pagefrom=X+Marks+The+Spot%0AX+Marks+the+Spot+%281942+film%29#mw-pages'
    ]
    ,
    'Satirical': [
        'https://en.wikipedia.org/wiki/Category:American_satirical_films',
        'https://en.wikipedia.org/w/index.php?title=Category:American_satirical_films&pagefrom=Hospital%2C+The%0AThe+Hospital#mw-pages',
        'https://en.wikipedia.org/w/index.php?title=Category:American_satirical_films&pagefrom=Taintlight%0ATaintlight#mw-pages'
    ]
    ,
    ''
}

In [258]:
info_list = [
    'Directed by',
    'Written by',
    'Story by',
    'Produced by',
    'Starring',
    'Cinematography',
    'Edited by',
    'Music by',
    'Production company',
    'Distributed by',
    'Release date',
    'Running time',
    'Budget',
    'Box office'
]

In [430]:
%%time

# Create a blank dataframe and assign column names
scraped_df = pd.DataFrame(columns=info_list)

# Loop through the genres
# category_links is a list of URLs for the same genre, but each URL contains around 200 individual films
for genre, category_links in genre_link.items():
    
    # Each movie genre has several URLs since not everything can fit on the same page
    # Loop through the URLs in category_links to access all the URLs for each movie genre
    for category_link in category_links:
        req = requests.get(category_link, timeout=50)
        soup = BeautifulSoup(req.content, 'html') # get contents of web page    
        
        soup.find('div', {'class': 'mw-content-ltr'}).find_all('a', href=True)
        
        # Each category_link has around 200 individual films' URLs
        # This for loop populates name_links dictionary with the film name as key and Wikipedia page URL as value
        name_links = {}
        for a in soup.find('div', {'class': 'mw-content-ltr'}).find_all('a', href=True):
            film_name = a.text
            link_end = a['href']
            link = 'https://en.wikipedia.org' + link_end
            if 'Categor' not in link: # skip if 'Category' or 'Categorization' is in the link
                name_links[film_name] = link
        
        # Loop through the individual films' URLs to extract wanted info
        for film_name, link in name_links.items():
            req = requests.get(link, timeout=50)
            soup = BeautifulSoup(req.content, 'html') # get contents of web page
            tag_contents = soup.select('p, span.mw-headline')
            
            # Extract the plot (can also be named "Premise" or "Synopsis") if it exists
            start_index_of_plot = -1
            end_index_of_plot = -1
            plot_exists = False

            for i, tag_content in enumerate(tag_contents):
                tag = tag_content.name
                content = tag_content.text

                if (plot_exists) & (tag == 'span'):
                    end_index_of_plot = i
                    break

                if (content == 'Plot') | (content == 'Premise') | (content == 'Synopsis'):
                    start_index_of_plot = i + 1
                    plot_exists = True

            plot = []
            for content in tag_contents[start_index_of_plot: end_index_of_plot]:
                par = content.text
                plot += [par]

            plot = "".join(plot).strip()
            
            # If plot exists, extract other info as well
            if len(plot) >= 1:
                
                film_dict = {'Film name': film_name, 'Genre': genre, 'Plot': plot}
                
                # Get other info on the film, if the info table eixsts 
                try:
                    req = requests.get(link, timeout=50)
                    soup = BeautifulSoup(req.content, 'html')
                    wiki_tables = soup.select('table', {'class': 'infobox vevent'})
                    wiki_table = wiki_tables[0] # get info table

                    # Loop through the tables until we have the info table we want
                    i = 0
                    while 'Directed by' not in wiki_table.text:
                        i += 1
                        wiki_table = wiki_tables[i]

                    # Some minor data cleaning
                    table_html = str(wiki_table).replace('<br/>', '/ ').replace('</li>', '/ ')
                    table_html = re.sub(r"\[\d+\]", "", table_html) # remove brackets (which provide link to references, but are not needed for our project)

                    # Use pd.read_html to create pandas dataframe of the info table
                    df = pd.read_html(table_html) 
                    df = pd.DataFrame(df[0]) # convert list to dataframe
                    df.columns = ['col_name', 'info']
                    df['col_name'] = df['col_name'].replace('/', ' ').replace('companies', 'company').replace('dates', 'date') # fix minor style error
                    df['info'] = df['info'].apply(lambda x: x[:-1] if str(x)[-1] == "/" else x) # fix minor style error

                    info_dict = {k: v for (k, v) in zip(list(df['col_name']), list(df['info'])) if k in info_list}
                    dict_to_append = {**film_dict, **info_dict}
                    
                # If the info table does not exist, then only append the film name, genre, and plot
                except:
                    dict_to_append = film_dict
                
                # Update the dataframe
                scraped_df = scraped_df.append(dict_to_append, ignore_index=True)

                
# Remove references from 'Plot' column              
scraped_df['Plot'] = scraped_df['Plot'].replace(r'\[\d+\]','', regex=True).replace(r'\[\w\]','', regex=True)           
scraped_df

Wall time: 14min 9s


Unnamed: 0,Film name,Genre,Plot,Directed by,Written by,Story by,Produced by,Starring,Cinematography,Edited by,Music by,Production company,Distributed by,Release date,Running time,Budget,Box office
0,"$30,000",Mystery,"A struggling attorney is given $30,000 by a yo...",Ernest C. Warde,Jack Cunningham / H.B. Daniel,,,J. Warren Kerrigan / Fritzi Brunette / Carl St...,Arthur L. Todd,,,,Hodkinson Pictures / Pathé Exchange,"February 29, 1920",50 minutes,,
1,13 Hours by Air,Mystery,Airline pilot Jack Gordon (Fred MacMurray) on ...,Mitchell Leisen,Kenyon Nicholson/ Bogart Rogers,,E. Lloyd Sheldon,Fred MacMurray/ Joan Bennett,Theodor Sparkuhl,Doane Harrison,Heinz Roemheld (composer)/ Irvin Talbot (condu...,,Paramount Pictures,"April 30, 1936",77 minutes,,
2,13 Lead Soldiers,Mystery,Dr Stedman is murdered by an intruder in his s...,Frank McDonald,Herman C. McNeile/ Irving Elman,,Bernard Small/ Ben Pivar,Tom Conway/ Maria Palmer/ Helen Westcott/ John...,George Robinson,Saul A. Goodkind,,,Twentieth Century-Fox Film Corporation,30 April 1948,64 minutes,"$150,000","$400,000 (worldwide)"
3,The 13th Letter,Mystery,"Doctor Pearson (Michael Rennie), who works at ...",Otto Preminger,,,Otto Preminger,Linda Darnell/ Charles Boyer/ Michael Rennie/ ...,Joseph LaShelle,Louis R. Loeffler,Alex North,,20th Century Fox,,85 minutes,"$1,075,000",
4,The 13th Man,Mystery,"Swifty Taylor, a journalist with the Globe Tim...",William Nigh,John W. Krafft,,Trem Carr (executive producer) Lon Young (asso...,Weldon Heyburn/ Inez Courtney/ Selmer Jackson/...,Paul Ivano,Russell F. Schoengarth,Abe Meyer,,Monogram Pictures,1937,70 minutes,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,The Yank,Satirical,"Irish American, Tom Murphy, is the first in hi...",Sean Lackey,Sean Lackey,,,Colm Meaney/ Fred Willard/ Kevin Farley/ Niki ...,Keith Nickoson,David Jingo,Chris Ainscough,,Vision Films (US) / Bulldog Film Distribution ...,"March 23, 2014",,,
1146,You Nazty Spy!,Satirical,The short begins with a title card disclaimer ...,Jules White,Felix Adler/ Clyde Bruckman,,Jules White,Moe Howard/ Larry Fine/ Curly Howard/ Richard ...,,,,,Columbia Pictures,"January 19, 1940",18:00,,
1147,Zelig,Satirical,"Set in the 1920s and 1930s, the film concerns ...",Woody Allen,Woody Allen,,Robert Greenhut,Woody Allen/ Mia Farrow,Gordon Willis,Susan E. Morse,Dick Hyman,,Warner Bros.,"July 15, 1983",79 minutes,,$11.8 million
1148,Zoolander,Satirical,"In New York City, male model Derek Zoolander i...",Ben Stiller,,Drake Sather/ Ben Stiller,Scott Rudin/ Ben Stiller/ Stuart Cornfeld,Ben Stiller/ Owen Wilson/ Will Ferrell/ Christ...,Barry Peterson,Greg Hayden,David Arnold,,Paramount Pictures,"September 28, 2001",90 minutes,$28 million,$60.8 million


In [434]:
scraped_df= scraped_df[scraped_df['Genre'] == 'Mystery']

In [435]:
scraped_df.to_csv('American Films 3.csv', index=False)

In [337]:
df.at[2753, 'Running time'] = '18 minutes'
df.at[1277, 'Running time'] = '7 minutes'
df.at[1189, 'Running time'] = '6 minutes'
df.at[942, 'Running time'] = '1 minute'

In [344]:
# 'Duration (min)' extracts the number of minutes from 'Running time'
df['Duration (min)'] = df['Running time'].str.lower().str.extract(r'(?P<duration>[\d]+) min')

# 'Release year' extracts the year from 'Release date'
df['Release year'] = df['Release date'].str.extract(r'(\d{4})')

# 'Box office (mil)' extracts the dollar value in millions from 'Box office'
def get_box_office_million_dollar(string):
    if (type(string) != str) or ('$' not in string):
        return np.nan
    
    string = string.replace(',', '').replace('$', '')
    num = float(re.findall(r'(\d+\.?\d*)', string)[0])
    
    if "mil" in string:
        return num
    else:
        return num / 1_000_000

df['Box office (mil)'] = df['Box office'].apply(get_box_office_million_dollar)

In [425]:
df.to_csv('Movie Genre and Plot.csv', index=False)

## Sentiment Analysis

In [343]:
# VADER Sentiment Analysis

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

VADER_SentimentIntensityAnalyzer = SentimentIntensityAnalyzer()

def get_VADER_sentiment(text):
    return VADER_SentimentIntensityAnalyzer.polarity_scores(text)['compound']

df['VADER Sentiment'] = df['Plot'].apply(get_VADER_sentiment)

# Textblob Polarity and Subjectivity Analysis

from textblob import TextBlob

def get_TextBlob_subjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def get_TextBlob_polarity(text):
    return TextBlob(text).sentiment.polarity

df['TextBlob Subjectivity'] = df['Plot'].apply(get_TextBlob_subjectivity)
df['TextBlob Polarity'] = df['Plot'].apply(get_TextBlob_polarity)

# df.sort_values(by='TextBlob Polarity')

Unnamed: 0,Film name,Genre,Plot,Directed by,Written by,Story by,Produced by,Starring,Cinematography,Edited by,...,Production company,Distributed by,Release date,Running time,Budget,Box office,Duration (min),VADER Sentiment,TextBlob Subjectivity,TextBlob Polarity
1997,Disciples (film),Horror,Demons and humans band together to fight for t...,Joe Hollow,Joe Hollow,,Wolfgang Meyer,Tony Todd/ Bill Moseley/ Angus Scrimm,Wolfgang Meyer,Joe Hollow,...,,ITN Distribution,"September 12, 2014",90 minutes,,,90,-0.8979,1.00,-1.000000
1988,Death-Scort Service,Horror,The film follows a group of prostitutes workin...,Sean Donohue,Sean Donohue/ Chris Woods,,Sean Donohue/ Shelby McIntyre/ John Miller/ Ch...,Krystal Pixie Adams/ Ashley Lynn Caputo/ Sean ...,Chris Woods,Chris Woods,...,,,"October 13, 2015",79 minutes,,,79,-0.9274,1.00,-0.875000
1689,Mind Games (1989 film),Thriller,"Rita and Dana Lund's marriage is in a crisis, ...",Bob Yari,Kenneth Dorward,,Louie Lawless/ Randolf Turrow,Maxwell Caulfield/ Edward Albert/ Shawn Weathe...,Arnie Sirlin,Robert Gordon,...,,Metro-Goldwyn-Mayer,"March 3, 1989",93 minutes,,,93,-0.8555,0.60,-0.787500
2024,Gates of Darkness,Horror,A dramatic mystery where a haunted teen endure...,Don E. Fauntleroy,Lesley-Anne Down/ Rob Hickman/ Chris Kanik/ Ra...,,Don E. Fauntleroy/ Rob Hickman/ Laurence Lasca...,Tobin Bell,Don E. Fauntleroy,Bryan Colvin,...,,,October 2019,91 minutes,,,91,-0.7717,0.65,-0.608333
817,"Mobs, Inc.",Crime,Captain John Braddock schools a group of polic...,William Asher,,,,Reed Hadley,,,...,,,"March 21, 1956",62 min.,,,62,-0.4767,0.90,-0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1214,The Road to Freedom (film),War,"Two photojournalists, Sean (Joshua Fredric Smi...",Brendan Moriarty,,,Brendan Moriarty / Henry Bronson,Joshua Fredric Smith/ Scott Maguire,David Mun,Sean Halloran/ Margie Rogers,...,,Creative Freedom,,,$1 million,,,0.5267,1.00,0.800000
2719,The Great American Beauty Contest,Satirical,"A feminist enters a beauty contest, hoping to ...",Robert Day,Stanford Whitmore,,,JoAnna Cameron/ Eleanor Parker/ Robert Cumming...,James A. Crabe,,...,,,,74 minutes,"$410,000",,74,0.8860,0.40,0.800000
1727,Overexposed (film),Thriller,Morrison (Larry Brand) investigates when the c...,Larry Brand,,,Roger Corman,Catherine Oxenberg/ David Naughton/ Karen Black,,,...,,,1990,,,,,0.1280,1.00,0.850000
474,White Fang (1936 film),Action,"Based upon the legendary novel, a woman with h...",David Butler,,,Bogart Rogers/ Darryl F. Zanuck,Michael Whalen/ Jean Muir/ Slim Summerville/ C...,Arthur C. Miller,Irene Morra,...,,20th Century Fox,"July 17, 1936",70 minutes,,,70,-0.9549,1.00,1.000000


## Quantifying emotion of text with the LeXmo package

https://betterprogramming.pub/unlocking-emotions-in-text-using-python-6d062b48d71f

In [372]:
import nltk
nltk.download('punkt')
from LeXmo import LeXmo

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tsaie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tsaie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [381]:
df[df['Genre'] == 'Horror']

Unnamed: 0,Film name,Genre,Plot,Directed by,Written by,Story by,Produced by,Starring,Cinematography,Edited by,...,Distributed by,Release date,Running time,Budget,Box office,Duration (min),VADER Sentiment,TextBlob Subjectivity,TextBlob Polarity,Release year
1907,2 Jennifer,Horror,Spencer wants to create a sequel to one of his...,Hunter Johnson,Hunter Johnson,,,Hunter Johnson/ David Coupe/ Lara Jean Mummert,Hunter Johnson,Frank Merle,...,Gravitas Ventures/ Reality Entertainment/ Sect...,"June 3, 2016",84 minutes,,,84,0.9081,0.594444,0.355556,2016
1908,3 from Hell,Horror,The film opens with several news reports about...,Rob Zombie,Rob Zombie,,Mike Elliott/ Rob Zombie/ Tony Ciulla,Sheri Moon Zombie/ Bill Moseley/ Richard Brake...,David V. Daniel,Glenn Garland,...,Lionsgate,"September 16, 2019",115 minutes,$3 million,$2.2 million,115,-0.9960,0.321978,0.010897,2019
1909,6:45,Horror,Bobby has traveled via ferry to a small bed an...,Craig Singer,Robert Dean Klein,,,,Lucas Pitassi,Sam Adelman,...,Well Go USA (USA),"August 6, 2021",96 minutes,,,96,-0.9928,0.394029,-0.074358,2021
1910,13 Cameras,Horror,The film revolves around a newlywed couple who...,Victor Zarcoff,,,Jim Cummings/ Kevin McManus/ Matthew McManus/ ...,Neville Archambault/ Sean Carrigan/ PJ McCabe/...,Jess Dunlap,Derek Desmond,...,79th & Broadway Releasing,"April 15, 2016",87 minutes,,,87,0.0000,0.727273,0.068182,2016
1911,30 Miles from Nowhere,Horror,The film follows five ex-college buddies who r...,Caitlin Koller,Seana Kofoed,,Kelly Demaret/ Seana Kofoed,Carrie Preston/ Rob Benedict/ Cathy Shim/ Sean...,Ben McBurnett,John Quinn,...,4Digital Media,24 November 2018Monster Fest),84 minutes,,,84,-0.8807,0.000000,0.000000,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2221,Witchcraft IV: The Virgin Heart,Horror,Will Spanner (Charles Solomon) is the son of a...,James Merendino,James Merendino/ Michael Paul Girard,,Jerry Feifer/ Holly MacConkey,Charles Solomon Jr/ Julie Strain,Kevin Morrisey,Tony Miller,...,"Simitar Entertainment (US, DVD)",1992,95 min.,,,95,-0.9690,0.588889,0.138889,1992
2222,Witchcraft VII: Judgement Hour,Horror,"A vampire, Martin Hassa (Loren Schmalle), is a...",Michael Paul Girard,Jerry Feifer (story by)/ Peter Flemming,,Michael Feifer,David Byrnes/ April Breneman/ Loren Schmalle/ ...,Denis Maloney,Tony Miller,...,"Simitar Entertainment (US, DVD)",1995,95 min.,,,95,-0.9325,0.473333,-0.090000,1995
2223,The Wizard (1927 film),Horror,Paul Coriolos is a professor who seeks vengean...,Richard Rosson,,,,Edmund Lowe/ Leila Hyams/ Gustav von Seyfferti...,Frank B. Good,,...,Fox Film Corporation,"December 11, 1927",60 minutes,,,60,-0.9657,0.680000,0.010000,1927
2224,The Woman (2011 film),Horror,"After the events in Maine, an unidentified fer...",Lucky McKee,Jack Ketchum/ Lucky McKee,,Robert Tonino/ Andrew van den Houten,Pollyanna McIntosh/ Angela Bettis/ Sean Bridge...,Alex Vendler,Zach Passero,...,The Collective/ Bloody Disgusting/ Salient Media,"January 23, 2011 (US)",101 minutes,,,101,-0.9968,0.354221,-0.109848,2011


In [382]:
example_plot = df.iloc[1907]['Plot']
LeXmo.LeXmo(example_plot)

{'text': "Spencer wants to create a sequel to one of his favorite horror films, To Jennifer, and is hoping that doing so will launch his career as a filmmaker. As such, he is playing one of the movie's main characters and wants to find the perfect woman to serve as the film's titular character, even going so far as to insist that they only consider actresses that are named Jennifer. However as they find their perfect Jennifer and filming commences, Spencer's hold on reality grows more and more tenuous.",
 'anger': 0.010101010101010102,
 'anticipation': 0.0707070707070707,
 'disgust': 0.010101010101010102,
 'fear': 0.010101010101010102,
 'joy': 0.04040404040404041,
 'negative': 0.010101010101010102,
 'positive': 0.0707070707070707,
 'sadness': 0.010101010101010102,
 'surprise': 0.020202020202020204,
 'trust': 0.04040404040404041}

## Categorizing emotion of text with the text2emotion library
Documentation: https://colab.research.google.com/drive/1sCAcIGk2q9dL8dpFYddnsUin2MlhjaRw?usp=sharing#scrollTo=ivUkOaBPEQYr

Implementation example: https://towardsdatascience.com/text2emotion-python-package-to-detect-emotions-from-textual-data-b2e7b7ce1153

In [370]:
import text2emotion as te

In [368]:
example_plot = df.iloc[0]['Plot']
example_plot

"While picking up his son at his college dorm, Delta Team Leader Vic Davis meets his son's crush Erin Walton, the daughter of a Supreme Court justice. However, Vic notices an unusual number of people on the premises watching Erin closely. When Erin reveals that her driver is not her usual escort, Vic plunges into action to rescue Erin from a kidnapping plot that puts both his and his son's lives in jeopardy."

In [371]:
text = "I was asked to sign a third party contract a week out from stay. If it wasn't an 8 person group that took a lot of wrangling I would have cancelled the booking straight away. Bathrooms - there are no stand alone bathrooms. Please consider this - you have to clear out the main bedroom to use that bathroom. Other option is you walk through a different bedroom to get to its en-suite. Signs all over the apartment - there are signs everywhere - some helpful - some telling you rules. Perhaps some people like this but It negatively affected our enjoyment of the accommodation. Stairs - lots of them - some had slightly bending wood which caused a minor injury."
te.get_emotion(text)

AttributeError: module 'emoji' has no attribute 'UNICODE_EMOJI'

## (Ignore) Testing Individual Links

In [249]:
# https://en.wikipedia.org/wiki/Yeti:_Curse_of_the_Snow_Demon 

link = 'https://en.wikipedia.org/wiki/Cherry_Bomb_(film)'
req = requests.get(link)
soup = BeautifulSoup(req.content, 'html')
wiki_tables = soup.select('table', {'class': 'infobox vevent'})
wiki_table = wiki_tables[0] # get info table

# loop through the tables until we have the info table we want
i = 0
while 'Directed by' not in wiki_table.text:
    i += 1
    wiki_table = wiki_tables[i]

# Data cleaning
table_html = str(wiki_table).replace('<br/>', '/ ').replace('</li>', '/ ')
table_html = re.sub(r"\[\d+\]", "", table_html) # remove brackets (which provide link to references, but are not needed for our project)

# Use pd.read_html to create pandas dataframe of the info table
df = pd.read_html(table_html) 
df = pd.DataFrame(df[0]) # convert list to dataframe
df.columns = ['col_name', 'info']
df['col_name'] = df['col_name'].replace('/', ' ').replace('companies', 'company').replace('dates', 'date') # fix minor style error
df['info'] = df['info'].apply(lambda x: x[:-1] if str(x)[-1] == "/" else x) # fix minor style error
display(df)

info_dict = {k: v for (k, v) in zip(list(df['col_name']), list(df['info'])) if k in info_list}

info_dict

Unnamed: 0,col_name,info
0,DVD cover,DVD cover
1,Directed by,Kyle Day
2,Screenplay by,Garrett Hargrove
3,Story by,Kyle Day
4,Produced by,Kyle Day/ Garrett Hargrove/ Jason Latimer
5,Starring,Julin Jean/ Nick Manning/ John Gabriel Rodrigu...
6,Cinematography,Andrew Michael Barrera
7,Edited by,Kyle Day/ David Ward
8,Production/ company,Strike Anywhere Productions
9,Distributed by,Well Go USA


{'Directed by': 'Kyle Day',
 'Story by': 'Kyle Day',
 'Produced by': 'Kyle Day/ Garrett Hargrove/ Jason Latimer',
 'Starring': 'Julin Jean/ Nick Manning/ John Gabriel Rodriguez/ Allen Hackley/ Jeremy James Douglas Norton/ Aaron Alexander',
 'Cinematography': 'Andrew Michael Barrera',
 'Edited by': 'Kyle Day/ David Ward',
 'Distributed by': 'Well Go USA',
 'Running time': '82 minutes'}

In [None]:
df1 = pd.read_csv('')