## Scraping data

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://ofdollarsanddata.com/wall-street-movies/"

# Fetch the article. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# silently parsing an error page into the dataframe.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the returned HTML.
soup = BeautifulSoup(response.content, "html.parser")

# Collect the text of every <h2> and <p> element on the page.
h2_tags = [h2.text for h2 in soup.find_all("h2")]
p_tags = [p.text for p in soup.find_all("p")]

# Keep only the relevant entries: drop the first 3 and last 9 paragraphs and the
# last 3 headings.
# NOTE(review): these offsets are tied to the page layout at scrape time -
# re-check them if the page changes.
p_tags = p_tags[3:-9]
h2_tags = h2_tags[:-3]

# Pair each heading with a paragraph. pandas raises ValueError here if the two
# lists do not have the same length.
data = {"title": h2_tags, "description": p_tags}
df = pd.DataFrame(data)

# Show the scraped table.
display(df)

In [None]:
# Rearrange the order of columns.
def rearrange_col(df):
    """Return ``df`` with its columns in reverse order (last column first)."""
    reversed_columns = df.columns[::-1]
    return df[reversed_columns]
    
# Derive `year` and `movie` from the scraped `title`, drop `title`, then reverse the
# column order.
#   year  - the four digits found inside parentheses in the title
#   movie - the title without its first 3 and last 7 characters, left-stripped
# NOTE(review): the fixed slice presumably removes a leading list number and a
# trailing " (YYYY)" - confirm against the scraped headings.
df = (
    df
    .assign(
        year=lambda frame: frame["title"].str.extract(r"\((\d{4})\)"),
        movie=lambda frame: frame["title"].str.slice(3, -7).str.lstrip(),
    )
    .drop(columns='title')
    .pipe(rearrange_col)
)

In [None]:
# df.to_csv('wall_street_movies.csv', index=False)

## Getting ratings
I manually added the ratings for each movie. The ratings were obtained from IMDb.

## Subtitles to markdown files
Full scripts for some of the movies were not freely available online, and I did not want to pay for them. As a workaround, I downloaded the subtitles for each movie and converted the files from `.srt` to markdown (`.md`).

In [None]:
# Convert .srt files to .md
import os

def srt_to_md(filename, encoding="utf-8"):
    """Write a ``.md`` copy of a subtitle file with every newline turned into ``<br>``.

    The output is written next to the input, using the same name with the
    extension replaced by ``.md``. An existing file of that name is overwritten.

    Parameters
    ----------
    filename : str
        Path of the ``.srt`` file to convert.
    encoding : str, default "utf-8"
        Encoding used for both reading and writing. It is passed explicitly so the
        conversion does not depend on the platform's default encoding; pass another
        value (e.g. ``"cp1252"``) for subtitle files that are not UTF-8.

    Raises
    ------
    UnicodeDecodeError
        If the file content is not valid in ``encoding``.
    """
    # Read the whole subtitle file as text.
    with open(filename, "r", encoding=encoding) as srt_file:
        content = srt_file.read()

    # Flatten the file onto a single line, marking each line end with <br>.
    content = content.replace("\n", "<br>")

    # Same path, ".md" extension.
    new_filename = os.path.splitext(filename)[0] + ".md"
    with open(new_filename, "w", encoding=encoding) as md_file:
        md_file.write(content)

# Every subtitle file (*.srt) found in the current working directory
files = [name for name in os.listdir() if name.endswith(".srt")]

# Write a .md companion for each subtitle file
for srt_name in files:
    srt_to_md(srt_name)

## Dataframe of movie and script
Here I create another dataframe that holds each movie's title (taken from the markdown file name) and its corresponding script.

In [5]:
# Create a dataframe from markdown files.
import re

def read_file(filename):
    """Return the entire text content of ``filename``.

    The file is opened in text mode with the platform's default encoding.
    """
    # NOTE(review): the cleaning patterns later in this notebook match sequences such
    # as 'â€™' and 'ï»¿', which look like UTF-8 text decoded as cp1252. Confirm
    # before changing the encoding here, or those patterns will stop matching.
    with open(filename) as handle:
        text = handle.read()
    return text

# Names of every markdown file (*.md) in the current working directory.
# NOTE(review): any other .md file in that directory is picked up as well.
files = [name for name in os.listdir() if name.endswith(".md")]

# Movie identifier = file name minus its ".md" suffix; script = full text of the file
movies = [name[:-len(".md")] for name in files]
contents = list(map(read_file, files))

# One row per markdown file: identifier in "movies", file text in "script"
df = pd.DataFrame({"movies": movies, "script": contents})


## First round text cleaning
There are many unnecessary characters in the script text such as timestamps for each movie clip. I will tidy the script text in three phases.

In [6]:
def remove_numbers(df, column_name):
    """Strip stray digit runs from ``df[column_name]`` and trim surrounding whitespace.

    Three cases are rewritten:

    * one or more digits directly followed by a letter: only the letter is kept;
    * a single digit directly followed by ``$``: both characters are removed;
    * one or more digits directly followed by ``(``: digits and parenthesis are removed.

    The column is overwritten on ``df`` itself and the same ``df`` is returned, so
    the function can be used with ``DataFrame.pipe``.
    """
    pattern = re.compile(r'\d+([a-zA-Z])|\d\$|\d+\(')
    # regex=True is required: pandas >= 2.0 defaults to regex=False and rejects a
    # compiled pattern in that mode.
    # For the `$` and `(` alternatives group 1 does not take part in the match, so
    # r'\1' expands to an empty string.
    df[column_name] = df[column_name].str.replace(pattern, r'\1', regex=True).str.strip()
    return df

def replace_first_two_digits(df, column_name):
    """Drop the first two digits of every stand-alone six-digit number in ``df[column_name]``.

    A run of exactly six digits delimited by word boundaries is replaced by its
    last four digits; shorter or longer digit runs are left untouched.

    The column is overwritten on ``df`` itself and the same ``df`` is returned.
    """
    pattern = re.compile(r'\b(\d{2})(\d{4})\b')
    # regex=True is required: pandas >= 2.0 defaults to regex=False and rejects a
    # compiled pattern in that mode.
    df[column_name] = df[column_name].str.replace(pattern, r'\2', regex=True)
    return df

def add_space_after_punctuation(df, column_name):
    """Separate words glued together by ``?`` or ``.`` in ``df[column_name]``.

    Wherever a letter is directly followed by ``?`` or ``.`` and then another
    letter, the punctuation is replaced by ``"? "`` (question mark plus space).

    The column is overwritten on ``df`` itself and the same ``df`` is returned.
    """
    pattern = re.compile(r'([a-zA-Z])[?\.]([a-zA-Z])')
    # NOTE(review): a matched "." is also rewritten as "?". This is kept as-is because
    # later cleaning steps in this notebook look for literals produced by it (e.g.
    # "www? OpenSubtitles? org"); change both together if the period should survive.
    # regex=True is required: pandas >= 2.0 defaults to regex=False and rejects a
    # compiled pattern in that mode.
    df[column_name] = df[column_name].str.replace(pattern, r'\1? \2', regex=True)
    return df

def first_cleaning(df):
    """First cleaning pass over the ``script`` column.

    Applies a fixed sequence of replacements to ``df['script']`` (subtitle cue
    headers, ``<br>`` markers, markup tags, mis-decoded character sequences,
    bracketed upper-case cues, ...), then runs ``remove_numbers``,
    ``replace_first_two_digits`` and ``add_space_after_punctuation`` on the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``script``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and ``script`` replaced by the
        cleaned text. ``df`` itself is not modified.
    """
    # (pattern, replacement, is_regex), applied in this order. Patterns are raw
    # strings so that sequences such as \d reach the regex engine without relying
    # on Python's handling of invalid string escapes.
    steps = [
        # a digit, <br>, a "HH:MM:SS,mmm --> HH:MM:SS,mmm" range and <br>; also 'ï»¿'
        (r'\d<br>\d{2}:\d{2}:\d{2},\d{3}\ -->\ \d{2}:\d{2}:\d{2},\d{3}<br>|ï»¿', '', True),
        # <br> runs followed by a digit, remaining <br>, 'â™ªâ™ªâ™ª', and digit-hyphen -> space
        (r'(<br>)+\d|<br><br>|<br>|â™ªâ™ªâ™ª|\d-', ' ', True),
        # digits + "[ ]", one of the characters â ™ ª followed by a space, <i>/</i>, 'â€¦', " - "
        (r'\d+\[\ \]|[â™ªâ™ªâ™ª]\ |</i>|<i>|â€¦|\ -\ ', '', True),
        # any <br> still present (optionally followed by digits or "- ") -> space
        (r'(<br>)+\d+|<br>-\ |<br>', ' ', True),
        # bracketed upper-case words with optional leading digits, plus digit-prefixed
        # "** ", ".." and "- ".
        # NOTE(review): the nested quantifiers here may backtrack heavily on long
        # unterminated "[UPPER CASE" runs - worth confirming on new inputs.
        (r'\[â™ªâ™ªâ™ª\]|(\d+)?\[((([A-Z])+ ?)+)+\]|\d+\*\*\ |\d+\.\.|\d+-\ ', '', True),
        # 'â€™' -> plain apostrophe
        ('â€™', "'", True),
        # literal " [ ] "
        (' [ ] ', '', False),
        # upper-case run followed by ": " (optional leading digits), and '<font color="#'
        (r'(\d+)?([A-Z])+:\ |<font\ color="#', '', True),
    ]

    script = df['script']
    for pattern, replacement, is_regex in steps:
        script = script.str.replace(pattern, replacement, regex=is_regex)

    # assign() returns a copy, so the helpers below (which write to the frame they
    # are given) never touch the caller's df.
    return (
        df
        .assign(script=script)
        .pipe(remove_numbers, 'script')
        .pipe(replace_first_two_digits, 'script')
        .pipe(add_space_after_punctuation, 'script')
    )
    
# Run the first cleaning pass on the raw scripts; the result is kept under its own name (dd).
dd = first_cleaning(df)

## Second round text cleaning

In [7]:
def second_cleaning(df):
    """Second cleaning pass over the ``script`` column.

    Applies a fixed sequence of replacements to ``df['script']`` and strips
    leading/trailing whitespace from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``script``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and ``script`` replaced by the
        cleaned text. ``df`` itself is not modified.
    """
    # (pattern, replacement, is_regex), applied in this order. Regex patterns are raw
    # strings so that sequences such as \d reach the regex engine without relying
    # on Python's handling of invalid string escapes.
    steps = [
        # upper-case run, space, one digit, optional space, ": "
        (r'([A-Z])+\ \d(\ )?:\ ', '', True),
        ('</font>', '', False),
        ('e020">', '', False),
        # 'â™' with an optional 'â™' and optional digits in front of it
        (r'(â™)?(\d+)?â™', '', True),
        # "** " or "* * "
        (r'\*(\ )?\*\ ', '', True),
        # a space together with adjacent digits/asterisks -> a single space
        # (this also removes any digit run that directly precedes a space)
        (r'(\d+)?(\*)?\ (\d+\*)?(\ )?', ' ', True),
        # "[word" cues (optional leading digits, optional "]"), optionally continued
        # by further words up to a closing "] "
        (r'(\d+)?\[([A-Za-z])+(\])?\ (([A-Za-z])+\ ([A-Za-z])+(\])?(\ )?([A-Za-z])+\]\ )?', '', True),
        # digits directly before a double quote -> just the quote
        (r'\d+"', '"', True),
        # parenthesised words, stray ")", "...", "--", digits followed by "# "
        (r'\([a-zA-Z]+((\ [a-zA-Z]+)+)?\)|\)|\.{3}|--|\d+\#\ ', '', True),
        ('Subtitles downloaded from www? OpenSubtitles? org ', '', False),
        (' #', '', False),
        ('Sync for "Wall? Street.1987.BluRay? P? DTS? x CHD" ::nlsinh@gmail? com:: WALLSTREET hddanang? com ', '', False),
    ]

    # Work on the frame passed in rather than on a notebook-level global.
    script = df['script']
    for pattern, replacement, is_regex in steps:
        script = script.str.replace(pattern, replacement, regex=is_regex)

    return df.assign(script=script.str.strip())

# Second cleaning pass, called with the output of the first pass (dd); result kept as ff.
ff = second_cleaning(dd)

In [8]:
def third_cleaning(df):
    """Third cleaning pass: remove leftover subtitle credits from the ``script`` column.

    Every entry below is removed as a literal substring (no regex), after which
    leading/trailing whitespace is stripped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``script``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and ``script`` replaced by the
        cleaned text. ``df`` itself is not modified.
    """
    # Literal fragments to delete, in this order.
    leftovers = [
        'Best watched using Open Subtitles MKV Player',
        'clanking]',
        ' Visiontext subtitles: Paul Sofer',
        'Cleaned, corrected and OCR issues fixed by Tronar Hiya,',
        'Sync for "Wall? Street.1987.BluRay? P.DTS? x CHD" ::nlsinh@gmail? com:: WALLSTREET hddanang? com ',
    ]

    # Work on the frame passed in rather than on a notebook-level global.
    script = df['script']
    for fragment in leftovers:
        script = script.str.replace(fragment, '', regex=False)

    return df.assign(script=script.str.strip())

# Third cleaning pass, called with the output of the second pass (ff); result kept as gg.
gg = third_cleaning(ff)

## Make movies in both dataframes same
I want to have a single dataframe to work with, which means I’ll have to merge the two dataframes. Hence, I need to make sure that the names in the movie column match in both dataframes. This will make it possible for me to merge the dataframes by joining on the movie column.

In [29]:
#Give proper titles to the movie names.
rename_dict = {
    # markdown file name (without ".md") -> movie title used in the `movie` column
    'AmericanPsycho': 'American Psycho',
    'Arbitrage': 'Arbitrage',
    'BoilerRoom': 'Boiler Room',
    'Equity': 'Equity',
    'InsideJob': 'Inside Job',
    'MarginCall': 'Margin Call',
    'OtherPeoplesMoney': 'Other Peoples’ Money',
    'RogueTrader': 'Rogue Trader',
    'TheBigShort': 'The Big Short',
    'TheWizardOfLies': 'The Wizard of Lies',
    'TheWolfofWallStreet': 'The Wolf of Wall Street',
    'TradingPlaces': 'Trading Places',
    'WallStreet': 'Wall Street',
}

def rename_column_values(df, column_name, rename_dict):
    """Return a copy of ``df`` whose ``column_name`` values are mapped through ``rename_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Column whose values are rewritten.
    rename_dict : dict
        Maps an existing value to its replacement. Values that are not keys of
        the dict are left unchanged.

    Returns
    -------
    pandas.DataFrame
        New frame with the rewritten column; ``df`` itself is not modified.
    """
    # Series.replace returns a new Series, so its result has to be put back into the
    # frame - calling it and discarding the result renames nothing.
    return df.assign(**{column_name: df[column_name].replace(rename_dict)})

# Rename the `movies` column to `movie`, then pass that column through
# rename_column_values together with rename_dict.
gg = rename_column_values(
    gg.rename(columns={'movies': 'movie'}),
    'movie',
    rename_dict,
)

In [35]:
# Left-join the cleaned scripts (gg) onto real_df by the `movie` column, keeping
# every row of real_df.
# NOTE(review): real_df is not defined in any cell shown here - presumably the scraped
# table with the manually added IMDb ratings, loaded elsewhere. Confirm and add the
# loading cell so the notebook survives Restart & Run All.
merged_df = pd.merge(real_df, gg, on='movie', how='left')
merged_df

Unnamed: 0,movie,year,description,rating,script
0,Margin Call,2011,Set in the early stages of the 2008 financial ...,7.1,Is that them? Jesus Christ. Are they going to ...
1,Wall Street,1987,This is the classic film that started it all. ...,7.3,Easy! Excuse me! Good morning. Jackson Steinem...
2,The Big Short,2015,"Based on the book by Michael Lewis, The Big Sh...",7.8,"Frank. How are the wife and kids? You know, fo..."
3,Trading Places,1983,"Being the only pure comedy on this list, Tradi...",7.5,"Your breakfast, sir. Pork bellies! I have a hu..."
4,The Wolf of Wall Street,2013,"Directed by Martin Scorsese, The Wolf of Wall ...",8.2,The world of investing can be a jungle. Bulls....
5,American Psycho,2000,American Psycho is a cult classic in Wall Stre...,7.6,rare roasted partridge breast in raspberry cou...
6,Arbitrage,2012,Arbitrage is a 2012 film that tells the story ...,6.6,But you took a huge bet on the housing crisis ...
7,Equity,2016,Equity follows the story of Naomi Bishop (Anna...,5.6,The entire market is watching? Back with Dynac...
8,Inside Job,2010,Though this movie is technically a documentary...,8.2,Iceland is a stable democracy with a high stan...
9,Boiler Room,2000,Boiler Room tells the story of Seth Davis (Gio...,7.0,I read this article a while back that said Mic...


In [37]:
# Write merged data frame to CSV.
# merged_df.to_csv('finance_movies.csv', index=False)