# Scraping Movies Data Using BeautifulSoup

In [3]:
# Importing Libraries

from bs4 import BeautifulSoup
import requests

In [4]:
# Requesting Website for HTML code
website = 'https://subslikescript.com/movie/Titanic-120338'
print(website)
result = requests.get(website)
print(result)
content = result.text
soup = BeautifulSoup(content, 'lxml')


https://subslikescript.com/movie/Titanic-120338
<Response [200]>


In [5]:
# print(soup.prettify())  # prints the HTML of the website

In [7]:
# Locate the box that contains title and transcript
box = soup.find('article', class_='main-article')

# Locate title and transcript
title = box.find('h1').get_text()
transcript = box.find('div', class_='full-script').get_text(strip = True, separator= ' ')
print(title)
print(transcript[:200])

Titanic (1997) - full transcript
13 meters. You should see it. Okay, take her up and over the bow rail. Mir 2, we're going over the bow.
Stay with us. Okay, quiet. We're rolling. Seeing her coming out of the
darkness like a ghost shi


In [None]:
# Exporting data in a text file with the "title" name

with open(f'{title}.txt', 'w', encoding='utf-8') as file:
    file.write(transcript)

# Extracting the links of multiple movie transcripts

In [9]:
# Requesting Website for HTML code

root = 'https://subslikescript.com'  # This is the homepage of the website
website = f'{root}/movies' # Concatenating the homepage with the movies section. 
result = requests.get(website)
content = result.text
soup = BeautifulSoup(content, 'lxml')
print(website)
print(result)


https://subslikescript.com/movies
<Response [200]>


In [10]:
# Locate the box that contains a list of movies
box = soup.find('article', class_='main-article')


# Store each link in "links" list (href doesn't consider root aka "homepage", so we have to concatenate it later)
links = []
for link in box.find_all('a', href=True):  # find_all returns a list
    links.append(link['href'])

print(links)

['movie/How_to_Talk_to_Girls_at_Parties-3859310', 'movie/I_Am_Not_Him-3228886', 'movie/Drengen_der_gik_baglns-109673', 'movie/A_forza_di_sberle-71090', 'movie/Ms_amaneceres-1846671', 'movie/Friendships_Death-93050', 'movie/Rama_Rao_on_Duty-15028848', 'movie/The_Student-1865357', 'movie/Get_Charlie_Tully-68636', 'movie/Costa_Concordia_-_Chronik_einer_Katastrophe-16025668', 'movie/She_Didnt_Say_No-199014', 'movie/Le_silence_dabord-387593', 'movie/Dead_Scared-366555', 'movie/The_Pentagon_Wars-144550', 'movie/Sheng_zhe_wei_wang-5257730', 'movie/Blackout_Effect-135807', 'movie/The_Invisible_Life-1999246', 'movie/Broad_Peak-8983230', 'movie/The_Second_Civil_War-120086', 'movie/The_Ghost_and_the_Darkness-116409', 'movie/The_Flesh_of_the_Orchid-71297', 'movie/The_Tuskegee_Airmen-114745', 'movie/Demons_Never_Die-1777612', 'movie/The_Catholic_School-10345782', 'movie/Last_Call-1472583', 'movie/Code_Name_Wolverine-115138', 'movie/Dead_Silence-118942', 'movie/My_11th_Mother-1233503', 'movie/Flypap

In [11]:
# Extracting the movie transcript
# Loop through the "links" list and sending a request to each link
for link in links:
    result = requests.get(f'{root}/{link}')
    content = result.text
    soup = BeautifulSoup(content, 'lxml')
    
    # Locate the box that contains title and transcript
    box = soup.find('article', class_='main-article')
    
    # Locate title and transcript
    title = box.find('h1').get_text()
    transcript = box.find('div', class_='full-script').get_text(strip=True, separator=' ')
    
    # Exporting data in a text file with the "title" name
#    with open(f'{title}.txt', 'w', encoding='utf-8') as file:
#         file.write(transcript)
    

# Extracting links from pagination bar

In [12]:
# Requesting Website for HTML code

root = 'https://subslikescript.com'  # this is the homepage of the website
website = f'{root}/movies_letter-X'  # concatenating the homepage with the movies "letter-X" section. You can choose any section (e.g., letter-A, letter-B, ...)
result = requests.get(website)
content = result.text
soup = BeautifulSoup(content, 'lxml')

In [13]:
# Locate the box that contains the pagination bar
pagination = soup.find('ul', class_='pagination')
pages = pagination.find_all('li', class_='page-item')
last_page = pages[-2].text  # this is the number of pages that the website has inside the movies "letter X" section

In [14]:
# Extracting the links of multiple movie transcripts inside each page listed

# Loop through all tbe pages and sending a request to each link

for page in range(1, int(last_page)+1):
    result = requests.get(f'{website}?page={page}')  # structure --> https://subslikescript.com/movies_letter-X?page=2
    content = result.text
    soup = BeautifulSoup(content, 'lxml')

    # Locate the box that contains a list of movies
    box = soup.find('article', class_='main-article')

    # Store each link in "links" list (href doesn't consider root aka "homepage", so we have to concatenate it later)
    links = []
    for link in box.find_all('a', href=True):  # find_all returns a list
        links.append(link['href'])

    #################################################
    # Extracting the movie transcript
    #################################################

    for link in links:
        try:  # "try the code below. if something goes wrong, go to the "except" block"
            result = requests.get(f'{root}/{link}')  # structure --> https://subslikescript.com/movie/X-Men_2-290334
            content = result.text
            soup = BeautifulSoup(content, 'lxml')

            # Locate the box that contains title and transcript
            box = soup.find('article', class_='main-article')
            # Locate title and transcript
            title = box.find('h1').get_text()
            transcript = box.find('div', class_='full-script').get_text(strip=True, separator=' ')

            # Exporting data in a text file with the "title" name
            with open(f'{title}.txt', 'w') as file:
                file.write(transcript)
        except:
            print('------ Link not working -------')
            print(link)


------ Link not working -------
movie/X-13560574
------ Link not working -------
movie/X-118200
------ Link not working -------
movie/X_-_The_eXploited-6190456
------ Link not working -------
movie/X-Men_2-290334
------ Link not working -------
movie/X-Men_Apocalypse-3385516
------ Link not working -------
movie/X-Men_Days_of_Future_Past-1877832
------ Link not working -------
movie/X-Men_The_Last_Stand-376994
------ Link not working -------
movie/X-Rated_2_The_Greatest_Adult_Stars_of_All_Time-6189052
------ Link not working -------
movie/X_The_Man_with_the_X-Ray_Eyes-57693
------ Link not working -------
movie/XY-2790182
------ Link not working -------
movie/Xanadu-81777
------ Link not working -------
movie/Xenophobia-8571404
------ Link not working -------
movie/XTC_This_Is_Pop-7465694
