In [None]:
# setup cell
import time
import uuid
import json
import os
import requests
import random
import gzip
import calendar
from datetime import datetime
import pandas as pd
import urllib.request
from urllib.parse import urlparse
import re

## Extract data from the HTML pages stored in the SQLite database

In [None]:
import sqlite3
from db import scan_table_limit_offset, decompress_string
from bs4 import BeautifulSoup

# Connection to the SQLite database that the scan cells below query (table `pages_dump`).
# NOTE(review): os.path.dirname(__name__) yields "" whenever __name__ contains no path
# separator (e.g. "__main__"), so "data/films.db" resolves against the current working
# directory — confirm this is intended rather than a path derived from the notebook location.
db_conn = sqlite3.connect(os.path.join(os.path.dirname(__name__), "data/films.db"))

Generate keyword CSV

In [None]:
# One keyword frame per film is collected here by process_keywords.
all_dfs = []


def process_keywords(df):
    """Collect plot keywords from every page in ``df``.

    Each row must provide ``url`` and ``page_content_zip``.  For a page that
    lists keywords, a frame with the columns ``keyword`` and ``film_id`` is
    appended to the module-level ``all_dfs`` list.  A page that lists none is
    skipped, but only after asserting that it carries the "no plot keywords"
    notice (``AssertionError`` otherwise).
    """
    for _, page in df.iterrows():
        html_content = decompress_string(page['page_content_zip'])
        film_id = get_film_id_from_url(page['url'])

        soup = BeautifulSoup(html_content, 'html.parser')

        # Keyword entries are the elements carrying the summary-item class.
        keywords = [
            node.text.strip()
            for node in soup.find_all(class_="ipc-metadata-list-summary-item__tc")
        ]
        if not keywords:
            assert "have any plot keywords for this title yet" in html_content.lower()
            continue

        keywords_df = pd.DataFrame({'keyword': keywords}).assign(film_id=film_id)
        all_dfs.append(keywords_df)


# Feed every stored page whose URL contains "keywords" to process_keywords
# (1000 is presumably the batch size — confirm in db.scan_table_limit_offset),
# then merge the per-film frames and write them to data/film_keywords.csv.
scan_table_limit_offset(db_conn, "select * from pages_dump where url like '%keywords%'", 1000, process_keywords)
all_keywords_df = pd.concat(all_dfs)
all_keywords_df.to_csv(os.path.join(os.path.dirname(__name__), "data/film_keywords.csv"), index=False)
print("ok")


Generate plot summaries

In [None]:
# Per-film summary frames are gathered here by process_plots.
all_dfs = []


def clean_summary(s):
    """Return ``s`` with every tab expanded to four spaces and outer whitespace trimmed."""
    return s.replace('\t', '    ').strip()

def process_plots(df):
    """Collect plot summaries from every page in ``df``.

    Each row must provide ``url`` and ``page_content_zip``.  For a page with at
    least one summary, a frame with the columns ``summary``, ``film_id`` and a
    1-based ``summary_id`` is appended to the module-level ``all_dfs`` list.
    Pages without summaries are skipped.
    """
    for _, page in df.iterrows():
        html_content = decompress_string(page['page_content_zip'])
        film_id = get_film_id_from_url(page['url'])

        soup = BeautifulSoup(html_content, 'html.parser')

        # Summaries live in the inner content divs; tabs are expanded so the
        # text cannot collide with the tab separator of the TSV output.
        summaries = [
            clean_summary(node.text.strip())
            for node in soup.find_all(class_="ipc-html-content-inner-div")
        ]
        if not summaries:
            # No check is made that the page shows a "no summaries" notice.
            continue

        item_df = pd.DataFrame({'summary': summaries}).assign(
            film_id=film_id,
            summary_id=range(1, len(summaries) + 1),
        )
        all_dfs.append(item_df)


# Feed every stored page whose URL contains "plotsummary" to process_plots
# (1000 is presumably the batch size — confirm in db.scan_table_limit_offset),
# then merge the per-film frames and write them tab-separated to data/film_plots.tsv.
scan_table_limit_offset(db_conn, "select * from pages_dump where url like '%plotsummary%'", 1000, process_plots)
all_plots_df = pd.concat(all_dfs)
all_plots_df.to_csv(os.path.join(os.path.dirname(__name__), "data/film_plots.tsv"), sep='\t', index=False)
print("ok")
# Drop the list of per-film frames; the merged all_plots_df is kept.
del all_dfs

Generate Location CSV

In [None]:
# Per-film location frames, plus a running total of films whose page shows
# the "no filming & production" notice.
all_locations = []
nolocation_counter = 0


def process_locations(df):
    """Collect filming locations from every page in ``df``.

    Each row must provide ``url`` and ``page_content_zip``.  For every page a
    frame with the columns ``locations`` and ``film_id`` is appended to the
    module-level ``all_locations`` list.  The number of pages showing the
    "no filming & production" notice is printed for this batch and added to
    the module-level ``nolocation_counter``.

    Raises:
        AssertionError: a page has no location entries and its notice text is
            missing or different, and the fallback phrase is not in the page.
    """
    # Without this declaration the augmented assignment at the end makes the
    # name local and raises UnboundLocalError.
    global nolocation_counter

    counter = 0
    for _, row in df.iterrows():
        html_content = decompress_string(row['page_content_zip'])
        film_id = get_film_id_from_url(row['url'])

        soup = BeautifulSoup(html_content, 'html.parser')

        # Location entries are the elements carrying this exact class string.
        items = soup.find_all(class_="sc-bec740f7-5 eLQUFg")
        locations = [p.text.strip() for p in items]
        if len(locations) == 0:
            nolocation_text = "It looks like we don't have any filming & production for this title yet. Be the first to contribute.Learn more"
            containers = soup.find_all('article', class_="sc-b707829e-0 umohR")
            locations = [p.text.strip() for p in containers]
            # `not locations` guards the index below when no notice article exists.
            if not locations or locations[0] != nolocation_text:
                assert "have any plot locations for this title yet" in html_content.lower()
                continue
            counter += 1
            # NOTE(review): execution continues, so the notice text itself is
            # stored as this film's single "location" row — confirm this is
            # wanted in the exported CSV.
        locations_df = pd.DataFrame(locations, columns=['locations'])
        locations_df['film_id'] = film_id

        all_locations.append(locations_df)
    print(f"Movies with no locations = {counter}")
    nolocation_counter += counter
        
# Feed every stored page whose URL contains "locations" to process_locations
# (1000 is presumably the batch size — confirm in db.scan_table_limit_offset),
# then merge the per-film frames, write them to data/filming_locations.csv
# and report the accumulated count of films without locations.
scan_table_limit_offset(db_conn, "select * from pages_dump where url like '%locations%'", 1000, process_locations)
all_locations_df = pd.concat(all_locations)
all_locations_df.to_csv(os.path.join(os.path.dirname(__name__), "data/filming_locations.csv"), index=False)
print(f"Total films with no locations = {nolocation_counter}")
print("ok")
# Drop the list of per-film frames; the merged all_locations_df is kept.
del all_locations

Generate User Reviews TSV

In [None]:
# Fresh accumulator for the per-film review frames built by process_user_reviews.
all_dfs = []


def clean_summary(s):
    """Expand tabs to four spaces, then trim surrounding whitespace.

    This repeats the definition from the plot-summaries cell; the user-review
    scan in this cell does not call it.
    """
    expanded = s.replace('\t', '    ')
    return expanded.strip()

def process_user_reviews(df):
    """Collect user reviews from every page in ``df``.

    Each row must provide ``url`` and ``page_content_zip``.  For a page with at
    least one review, a frame with the columns ``title``, ``user_review``,
    ``film_id`` and a 1-based ``review_id`` is appended to the module-level
    ``all_dfs`` list.  Pages without a review list, or with an empty one, are
    skipped.
    """
    for _, row in df.iterrows():
        html_content = decompress_string(row['page_content_zip'])
        film_id = get_film_id_from_url(row['url'])

        soup = BeautifulSoup(html_content, 'html.parser')

        # `find` returns None when the page has no review list, so guard
        # before searching inside it.
        box = soup.find('div', class_='lister-list')
        items = box.find_all(class_="lister-item") if box is not None else []

        # The previous emptiness check tested the length of a dict that always
        # had two keys, so it never fired and review-less pages either crashed
        # or contributed an empty frame.  They are now skipped.
        if not items:
            continue

        all_titles = []
        all_reviews = []
        for item in items:
            all_titles.append(item.find(class_="title").get_text())
            all_reviews.append(item.find(class_="text").get_text(separator=' ', strip=True))

        item_df = pd.DataFrame({'title': all_titles, 'user_review': all_reviews})
        item_df['film_id'] = film_id
        item_df['review_id'] = range(1, len(item_df) + 1)

        all_dfs.append(item_df)


# Feed every stored page whose URL contains "reviews?sort=curated" to process_user_reviews
# (1000 is presumably the batch size — confirm in db.scan_table_limit_offset),
# then merge the per-film frames and write them tab-separated to data/user_reviews.tsv.
scan_table_limit_offset(db_conn, "select * from pages_dump where url like '%reviews?sort=curated%'", 1000, process_user_reviews)
# NOTE(review): the name all_plots_df is reused here for user reviews, replacing the
# plot-summaries frame assigned in the earlier cell.
all_plots_df = pd.concat(all_dfs)
all_plots_df.to_csv(os.path.join(os.path.dirname(__name__), "data/user_reviews.tsv"), sep='\t', index=False)
print("ok")
# Drop the list of per-film frames; the merged frame is kept.
del all_dfs

Generate Critic Reviews TSV

In [None]:
from selenium import webdriver

# One critic-review frame per film is collected here by process_criticreviews.
all_dfs = []


def process_criticreviews(url, film_id):
    """Load a critic-reviews page in Chrome and record its reviews.

    The page at ``url`` is rendered with Selenium, parsed, and — when it
    contains at least one review — a single frame is appended to the
    module-level ``all_dfs`` list.  Its columns are ``reviewscore``, ``logo``,
    ``critic_review``, ``critic_name``, ``film_id``, ``universal_score`` and a
    1-based ``review_id``.

    Args:
        url: address of the critic-reviews page to open.
        film_id: identifier stored in the ``film_id`` column of every row.

    Returns:
        None.  Nothing is appended when the page has no review container or
        the container holds no reviews.
    """
    # Raw string: "\c" is not a valid escape sequence in a normal literal.
    path = os.path.join(os.path.dirname(__name__), r"chromedriver-win64\chromedriver.exe")
    # NOTE(review): passing the driver path positionally depends on the
    # installed Selenium version — confirm against the version in use.
    driver = webdriver.Chrome(path)
    try:
        driver.get(url)
        time.sleep(5)  # ensure the page is fully loaded
        html = driver.page_source
    finally:
        # Always shut the browser down; otherwise every film leaves a Chrome
        # process behind.
        driver.quit()

    page_soup = BeautifulSoup(html, "html.parser")

    # Overall score shown for the title; None when the element is absent.
    uscore = page_soup.find(class_='c-siteReviewScore_background')
    score_span = uscore.find('span') if uscore is not None else None
    universal_metascore = score_span.get_text() if score_span is not None else None

    # Individual critic reviews; a page without the container has none.
    boxA = page_soup.find('div', class_='c-pageProductReviews_row')
    reviews = boxA.find_all(class_='c-siteReview') if boxA is not None else []
    if not reviews:
        return

    records = []
    for review in reviews:
        boxB = review.find('div', class_='c-siteReviewHeader_reviewScore')
        boxC = review.find('div', class_='c-siteReview_extra')
        critic = boxC.find('a', class_='c-siteReview_criticName')
        records.append({
            'reviewscore': boxB.find('span').get_text(),
            'logo': review.find('a', class_='c-siteReviewHeader_publicationName').get_text(separator=' ', strip=True),
            'critic_review': review.find('div', class_='c-siteReview_quote').get_text(),
            # Some reviews carry no critic name; store an empty string then.
            'critic_name': "" if critic is None else critic.get_text(separator=' ', strip=True),
        })

    # Build and append once per film, after the loop.  Doing this inside the
    # loop appended a growing, overlapping frame for every single review.
    item_df = pd.DataFrame(records)
    item_df['film_id'] = film_id
    item_df['universal_score'] = universal_metascore
    item_df['review_id'] = range(1, len(item_df) + 1)

    all_dfs.append(item_df)

In [None]:

# Running total of films whose page shows the "no metacritic reviews" notice.
noreview_counter = 0


def process_imdb_critics(df):
    """Follow the critic-review link of every page in ``df``.

    Each row must provide ``url`` and ``page_content_zip``.  The first link
    found on the page has its query string removed and ``/critic-reviews/``
    appended, and the result is passed to ``process_criticreviews`` together
    with the film id.  Pages without such a link are skipped; those showing
    the "no metacritic reviews" notice are counted and the count is added to
    the module-level ``noreview_counter``.

    Raises:
        AssertionError: a page has no link and its notice text is missing or
            different, and the fallback phrase is not in the page.
    """
    # Without this declaration the augmented assignment at the end makes the
    # name local and raises UnboundLocalError.
    global noreview_counter

    counter = 0
    for _, row in df.iterrows():
        html_content = decompress_string(row['page_content_zip'])
        film_id = get_film_id_from_url(row['url'])

        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract href link of metacritics; elements without an href are ignored.
        containers = soup.find_all(class_="ipc-metadata-list-item__label ipc-metadata-list-item__label--link")
        href = [p['href'] for p in containers if p.has_attr('href')]

        if len(href) == 0:
            noreview_text = "It looks like we don't have any metacritic reviews for this title yet."
            containers = soup.find_all('article', class_="sc-b707829e-0 umohR")
            no_review = [p.text.strip() for p in containers]
            # `not no_review` guards the index when no notice article exists.
            if not no_review or no_review[0] != noreview_text:
                assert "have any critic reviews for this title yet" in html_content.lower()
            else:
                counter += 1
            # There is no link to follow on either path; falling through
            # would index the empty href list.
            continue

        website = re.sub(r'\?.*', "", href[0]) + '/critic-reviews/'
        process_criticreviews(website, film_id)
    noreview_counter += counter

# Feed every stored page whose URL matches the critic-reviews pattern to process_imdb_critics
# (1000 is presumably the batch size — confirm in db.scan_table_limit_offset); that callback
# calls process_criticreviews, which fills all_dfs. The merged result is written
# tab-separated to data/critic_reviews.tsv.
scan_table_limit_offset(db_conn, "select * from pages_dump where url like '%criticreviews/?ref_=tt_ov_rt%'", 1000, process_imdb_critics)
# NOTE(review): the name all_plots_df is reused here for critic reviews, replacing whatever
# frame an earlier cell stored under it.
all_plots_df = pd.concat(all_dfs)
all_plots_df.to_csv(os.path.join(os.path.dirname(__name__), "data/critic_reviews.tsv"), sep='\t', index=False)
print(f"Total films with no reviews = {noreview_counter}")
print("ok")
# Drop the list of per-film frames; the merged frame is kept.
del all_dfs