## Data Collection
### Load libraries, packages

In [1]:
import os
import re
from collections import Counter
from os.path import join
from statistics import mean

import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from nltk.corpus import movie_reviews, stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download(['stopwords', 'vader_lexicon','punkt'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Volumes/Users/ly_k1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Volumes/Users/ly_k1/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Volumes/Users/ly_k1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Create metadata 
I first get all the titles of the movies available on the IMSDb website

In [29]:
#read the webpage and extract the name of the movies that have moviescripts
url = 'https://imsdb.com/all-scripts.html'
r = requests.get(url, timeout = 30)    #without a timeout a stalled connection blocks the cell forever
r.raise_for_status()                   #fail loudly on an HTTP error instead of parsing an error page
#NOTE(review): no parser is named, so BeautifulSoup picks the best one installed and the
#parse tree can differ between machines; pin it once the parser used for this run is confirmed
bs = BeautifulSoup(r.text)

#the third top-aligned <td> is the cell whose links are read as the movie list
td_tag = bs.find_all('td', {'valign': 'top'})[2]
a_tag = td_tag.find_all('a')

mov_title = []          #lower-cased titles, in page order
mov_info = {}           #save the link of the movies' webpages on IMSDb (lower-cased title -> URL)

for anchor in a_tag:
    title = anchor.text.lower()
    mov_title.append(title)
    mov_info[title] = 'https://imsdb.com' + anchor['href']

I then load two data sets from the IMDb website that contain the metadata of the movies.

In [30]:
#Load the IMDb dumps. These files use the text marker \N for missing values, so when pandas
#infers dtypes chunk by chunk a column can end up holding a mix of ints and strings.
#The columns that later cells compare against string literals (isOriginalTitle == '1',
#startYear == '<yyyy>') and the join key are therefore pinned to str, so those
#comparisons cannot silently skip the rows that were parsed as integers.
imdb_data = pd.read_csv(
    'title_akas.tsv',
    sep = '\t',
    dtype = {'titleId': str, 'isOriginalTitle': str},
)
imdb_info_data = pd.read_csv(
    'title_basics.tsv',
    sep = '\t',
    dtype = {'tconst': str, 'startYear': str},
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [31]:
#preview the title_basics frame (the notebook shows only its first and last rows)
imdb_info_data

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
8872256,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
8872257,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
8872258,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
8872259,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


To pull metadata for the right movies, I need their IMDb title ids (`tconst`), so I filter the IMDb data in several steps:
1. Only take original titles
2. Only take films that have the same title as movies on IMSDb
3. Only take movies (remove tv shows, series, short films)

In [32]:
#only take original titles; the flag is matched as both text and integer because a
#column read with mixed dtypes can hold 1 in some rows and '1' in others
original_titles = imdb_data[imdb_data['isOriginalTitle'].isin(['1', 1])]

#only take movies in our list of movie titles (IMSDb titles were lower-cased when scraped)
imsdb_matches = original_titles[original_titles['title'].str.lower().isin(mov_title)]

#rename the id column so it can be merged with the title_basics frame, then keep feature
#films only (the titleType filter drops every other type, e.g. shorts and tv episodes)
imdb_data = (
    imsdb_matches
    .rename(columns = {'titleId': 'tconst'})
    .merge(imdb_info_data, on = 'tconst')
)
imdb_data = imdb_data[imdb_data['titleType'] == 'movie']

Many titles are duplicated, which makes it hard to tell which IMDb entry is the movie that we need the metadata for. Therefore, I pull out all of the duplicated titles and compare the release year listed on the IMSDb website with the release year in the IMDb dataset. If both the title and the release year match, then that entry should be the movie that we need metadata for.

In [33]:
title_ids = []
duplicated_titles = list(imdb_data[imdb_data['title'].duplicated(keep = False)]['title'])

#for duplicated titles, try to get the release date on IMSDb to compare
pattern = r'Movie Release Date : \w* (\d{4})'
script_date_pattern = r'Script Date : \w* (\d{4})'    #fallback when no release date is listed
duplicated_titles_info = {}     #title -> four-digit year as text, or 'NA' when none was found

for title in duplicated_titles:

    #duplicated_titles lists every occurrence of a title, so scrape each page only once
    if title in duplicated_titles_info:
        continue

    title_url = mov_info[title.lower()]

    r = requests.get(title_url, timeout = 30)
    bs = BeautifulSoup(r.text)

    #a page without the details table is recorded as 'NA' instead of stopping the whole loop
    details = bs.find_all('table',{'class':'script-details'})
    target = details[0].text if details else ''
    year = re.findall(pattern, target) or re.findall(script_date_pattern, target)
    duplicated_titles_info[title] = year[0] if year else 'NA'

#get the right title ids for duplicated titles, print out titles that don't have release year on IMSDb, I'll double check those by hand
i = 0
for key, release_year in duplicated_titles_info.items():
    if release_year == 'NA':
        print(i, key)
        i += 1
        continue    #nothing to compare against, so no id is picked automatically

    #startYear is compared as text because the scraped year is a string
    same_movie = (imdb_data['title'] == key) & (imdb_data['startYear'].astype(str) == release_year)
    candidates = imdb_data.loc[same_movie, 'tconst']
    if candidates.empty:
        continue    #no IMDb entry with this title and year
    title_ids.append(candidates.iloc[0])

0 Scarface
1 All About Eve
2 Lone Star
3 Fargo
4 Ghost Ship
5 Taxi Driver
6 Blade
7 Dark Star
8 Independence Day
9 Made
10 Friday the 13th
11 Jacob's Ladder
12 Sneakers
13 Body of Evidence
14 Speed
15 Casino
16 Copycat
17 Custody
18 U Turn
19 At First Sight
20 Casablanca
21 American Psycho
22 Cast Away
23 Bones
24 Traffic
25 Frequency
26 12
27 Equilibrium
28 Storytelling
29 Life as a House
30 They
31 Witness
32 Simone
33 Vertigo
34 Romeo & Juliet


In [34]:
#title ids entered directly for duplicated titles (presumably the hand-checked ones printed above - confirm)
missing_ids = [
    'tt0075314', 'tt0069945', 'tt0288477', 'tt0116282', 'tt0116629', 'tt0116905',
    'tt0120611', 'tt0166110', 'tt0227005', 'tt0250081', 'tt0258153', 'tt0090329',
    'tt0192947', 'tt7464054', 'tt0111257', 'tt0052357', 'tt0132512', 'tt0105435',
    'tt0117509', 'tt0181865', 'tt0086250', 'tt0144084', 'tt0112641', 'tt0112722',
    'tt0186151', 'tt0238380', 'tt0099871', 'tt0759956', 'tt0120399', 'tt0042192',
    'tt0283632', 'tt0106453', 'tt0162222',
]
title_ids.extend(missing_ids)

#add the non-duplicated titles: a title that occurs once needs no disambiguation
has_unique_title = ~imdb_data['title'].duplicated(keep = False)
title_ids.extend(imdb_data.loc[has_unique_title, 'tconst'])

Lastly, after having all of the title ids, I just create a dataframe for the metadata that I need, add more metadata based on the title ids and remove unnecessary pieces of data.

In [36]:
#keep only the rows whose id was selected above, renumbered from 0
metadata = imdb_data[imdb_data['tconst'].isin(title_ids)].reset_index(drop = True)

rating_data = pd.read_csv('title_ratings.tsv',sep = '\t')              #add ratings

#columns from the akas/basics frames that are dropped from the final table
unused_columns = [
    'ordering', 'region', 'language', 'types', 'attributes',
    'isOriginalTitle', 'titleType', 'primaryTitle', 'originalTitle', 'endYear',
]

#left join so that movies without a rating stay in the table (their rating columns are NaN)
metadata = (
    metadata
    .merge(rating_data, on = 'tconst', how = 'left')
    .drop(columns = unused_columns)
)
metadata

Unnamed: 0,tconst,title,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0017496,Under Fire,0,1926,50,Western,,
1,tt0020564,Wall Street,0,1929,68,Drama,6.6,23.0
2,tt0022626,American Madness,0,1932,75,Drama,7.4,2034.0
3,tt0023969,Duck Soup,0,1933,69,"Comedy,Musical",7.8,59637.0
4,tt0036613,Arsenic and Old Lace,0,1943,118,"Comedy,Crime,Thriller",7.9,69969.0
...,...,...,...,...,...,...,...,...
680,tt6133532,AssAssINS,0,2016,78,"Action,Adventure,Comedy",,
681,tt6573718,Day of the Dead,0,\N,\N,Horror,,
682,tt6644200,A Quiet Place,0,2018,90,"Drama,Horror,Sci-Fi",7.5,504862.0
683,tt6647576,BEAN,0,2017,66,Documentary,8.3,45.0


### Web-scraping for movie scripts 

I first get the links to the movie scripts and remove the movies that appear to have scripts online but turn out not to.

In [37]:
#get link of the movie scripts
drop = []     #this is used for some movies that don't have available scripts
mov_script_urls = {}    #title -> URL of the page that holds the script text
for title in metadata['title']:

    title_url = mov_info[title.lower()]
    r = requests.get(title_url, timeout = 30)
    bs = BeautifulSoup(r.text)
    try:
        #the first link inside the first centered paragraph is taken as the script link
        script_link = bs.find_all('p', {'align':'center'})[0].a
        if 'Script' in script_link.text:
            mov_script_urls[title] = 'https://imsdb.com' + script_link['href']
        #a title whose link text lacks 'Script' is kept in metadata without a URL entry
    except (IndexError, AttributeError, KeyError):
        #no centered paragraph, no link inside it, or a link without href
        drop.append(title)

#copy() makes the filtered frame independent, so adding columns to it later does not warn
metadata = metadata[~metadata['title'].isin(drop)].copy()

I'm now doing web-scraping to get the scripts.

In [38]:
#fix some links: the titles below contain '&', and their script URLs are written out by hand
mov_script_urls.update({
    'Thelma & Louise': 'https://imsdb.com/scripts/Thelma-%2526-Louise.html',
    'Benny & Joon': 'https://imsdb.com/scripts/Benny-%2526-Joon.html',
    'Bodies, Rest & Motion': 'https://imsdb.com/scripts/Bodies,-Rest-%2526-Motion.html',
    'Cowboys & Aliens': 'https://imsdb.com/scripts/Cowboys-%2526-Aliens.html',
    'How to Lose Friends & Alienate People': 'https://imsdb.com/scripts/How-to-Lose-Friends-%2526-Alienate-People.html',
    'Angels & Demons': 'https://imsdb.com/scripts/Angels-%2526-Demons.html',
    'Marley & Me': 'https://imsdb.com/scripts/Marley-%2526-Me.html',
    'Celeste & Jesse Forever': 'https://imsdb.com/scripts/Celeste-%2526-Jesse-Forever.html',
    'Papadopoulos & Sons': 'https://imsdb.com/scripts/Papadopoulos-%2526-Sons.html',
})

In [39]:
#Download every script into scripts/<file_name> and record the file name per title.
#file_list gets exactly one entry per row of metadata (NaN when no script was saved),
#so it can be attached to metadata as a column afterwards.
file_list = []
os.makedirs('scripts', exist_ok = True)     #open() does not create the folder itself

for title in metadata['title']:
    file_name = np.nan      #stays NaN unless the script is written successfully
    try:
        r = requests.get(mov_script_urls[title], timeout = 30)
        bs = BeautifulSoup(r.text)

        #the script text is read from the <pre> tag when there is one, else from the 'scrtext' cell
        if bs.pre is not None:
            script = bs.pre.text
        else:
            script = bs.find_all('td',{'class':'scrtext'})[0].text

        if script != '':
            candidate = title.lower().replace(' ','-')
            #the with-block closes the file even if the write fails
            with open(join('scripts', candidate), 'w', encoding = 'utf-8') as f:
                f.write(script)
            file_name = candidate   #only recorded once the file is on disk
    except (KeyError, IndexError, OSError, requests.RequestException):
        #no URL for this title, no script container on the page, a file that cannot be
        #written, or a failed download: the title is marked as missing
        file_name = np.nan
    file_list.append(file_name)

In [43]:
#Attach the script file names, then drop the movies without a saved script and the rows
#that share a file name with an earlier row. assign() returns a new frame, so nothing is
#written into a filtered view of the earlier table.
metadata = (
    metadata
    .assign(file_name = file_list)
    .dropna(subset = ['file_name'])
    .drop_duplicates(subset = ['file_name'])
    .reset_index(drop = True)
)
metadata.to_csv('metadata.csv')