In [15]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import json
from fake_headers import Headers
import requests
from bs4 import BeautifulSoup as bs

import string
from bs4 import BeautifulSoup, NavigableString
import re

def get_page(base_url, data):
    try:
        headers = Headers().generate()
        r = requests.get(base_url, headers=headers, params=data)
    except:
        r = None
    return r


def extract_writers(soup):
    writers_tag = soup.find('b', string='Writers')
    authors = []
    if writers_tag:
        sibling = writers_tag.next_sibling
        while sibling:
            if sibling.name == 'a':
                authors.append(sibling.text.strip())
            elif sibling.name == 'b':  # stop at next <b> section
                break
            sibling = sibling.next_sibling
    return authors

def extract_genres(soup):
    genres = []
    genres_tag = soup.find('b', string='Genres')
    if genres_tag:
        sibling = genres_tag.next_sibling
        while sibling:
            if sibling.name == 'a':
                genres.append(sibling.text.strip())
            elif sibling.name == 'b':
                break
            sibling = sibling.next_sibling
    return genres

def extract_rating(soup, tag_label):
    """Extracts a float rating from a labeled <b> tag, or returns None if not available."""
    tag = soup.find('b', string=tag_label)
    if not tag:
        return None

    next_elem = tag.find_next_sibling()
    while next_elem:
        if isinstance(next_elem, str):
            text = next_elem.strip().lower()
            if text in ["not available", "none available"]:
                return None
            match = re.search(r'\(([\d.]+)\s+out of 10\)', text)
            if match:
                return float(match.group(1))
        next_elem = next_elem.next_sibling

    return None

    
def extract_script_date(soup):
    script_date = None
    for b_tag in soup.find_all('b'):
        if 'Script Date' in b_tag.text:
            if b_tag.next_sibling:
                script_date = b_tag.next_sibling.strip(" :\n")
            break
    return script_date

def extract_url(soup):
    script_link = soup.find('a', href=True, string=lambda x: x and "Read" in x)
    script_url = script_link['href'] if script_link else None
    return script_url

def get_meta_from_imsdb(url):
    headers = Headers().generate()
    r = requests.get(url, headers=headers)
    soup = bs(r.text, 'html.parser')
    soup = soup.find_all("table", class_="script-details")[0]
    authors = extract_writers(soup)
    genres = extract_genres(soup)
    imsdb_rating = extract_rating(soup, "IMSDb rating")
    user_rating = extract_rating(soup, "Average user rating")
    script_date = extract_script_date(soup)
    script_url = extract_url(soup)
    data = {
        "url": url,
        "authors": authors,
           "genres": genres,
           "imsdb_rating": imsdb_rating,
           "user_rating": user_rating,
           "script_date": script_date,
           "script_url": script_url}
    time.sleep(1)
    return data

In [4]:
all_links = []
for k in tqdm(["0"] + list(string.ascii_uppercase)):
    headers = Headers().generate()
    r = requests.get("https://imsdb.com/alphabetical/{}".format(k), headers=headers)
    soup = bs(r.text, 'html.parser')
    links = [link for link in soup.find_all("a", href=True) if link['href'][:15] == '/Movie Scripts/' and link.has_attr("title")]
    all_links.extend(links)
    time.sleep(1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:31<00:00,  1.17s/it]


In [98]:
link_df = []
for link in all_links:
    link_data = {"title": link.text,
                "url": "https://imsdb.com" + link["href"]}
    link_df.append(link_data)
link_df = pd.DataFrame(link_df)

In [191]:
meta_df = []
for i in tqdm(range(link_df.shape[0])):
    url = link_df.url.iloc[i]
    data = get_meta_from_imsdb(url)
    meta_df.append(data)

In [197]:
meta_df = pd.DataFrame(meta_df)
meta_df = meta_df.merge(link_df, how="left", on="url")
meta_df = meta_df[~meta_df.script_url.isna()].reset_index()
meta_df["script_url"] = "https://imsdb.com/" + meta_df.script_url

In [227]:
script_soup_list = []
for i in tqdm(range(meta_df.shape[0])):
    try:
        url = meta_df.script_url.iloc[i]
        headers = Headers().generate()
        r = requests.get(url, headers=headers)
        soup = bs(r.text, 'html.parser')
        script_soup_list.append(soup)
    except:
        script_soup_list.append(None)
    time.sleep(1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1297/1297 [24:45<00:00,  1.15s/it]


In [233]:
meta_df["script_soup"] = script_soup_list

In [389]:
for i in range(meta_df.shape[0]):
    soup = meta_df["script_soup"].iloc[i]
    title = meta_df["title"].iloc[i]
    with open("raw_imsdb_html/{}.html".format(title), "w") as file:
        file.write(str(soup))

In [337]:
meta_df.script_url.iloc[1]

'https://imsdb.com//scripts/12.html'

In [5]:
meta_df.shape
meta_df.to_csv("meta_df.csv")

(1266, 12)