In [None]:
import re
import string
import time
from tqdm import tqdm
import unicodedata

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver


In [None]:
def rendering(url):
    """Load ``url`` in a ChromeDriver-controlled browser and return its HTML.

    A new Chrome instance is started for every call and is always shut down
    again, even when loading the page or reading its source raises.

    Parameters
    ----------
    url : str
        Address handed to ``driver.get``.

    Returns
    -------
    str
        ``driver.page_source`` captured after a fixed 3-second wait.
    """
    driver = webdriver.Chrome('/usr/local/bin/chromedriver') # run ChromeDriver
    try:
        driver.get(url) # load the web page from the URL
        time.sleep(3) # wait for the web page to load
        return driver.page_source # return the page source HTML
    finally:
        # Quit unconditionally so a failed page load does not leave an
        # orphaned browser process behind.
        driver.quit()
    

In [None]:
# Root of the site; relative links are appended to it when building page URLs.
base_url = 'https://d23.com/'

# Path segments for the index pages: the digits bucket followed by a..z.
end_urls = ['0-9', *string.ascii_lowercase]


In [None]:
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
link_records = []

for end in tqdm(end_urls):
    search_url = base_url + f'disney-a-to-z/{end}/'

    page = rendering(search_url)

    soup = BeautifulSoup(page, 'html.parser')

    page_content = soup.find_all('div', class_='a-z-letter-wrapper')

    for content in page_content:
        for a in content.select('a'):
            text = a.text
            href = a.get('href')  # None for anchors without an href attribute

            # Keep entries whose text mentions "film" and carries a
            # parenthesised tag. The previous test `('(' or ')') in text`
            # only ever checked for '(' because `'(' or ')'` evaluates to '('.
            if not href or 'film' not in text or not ('(' in text or ')' in text):
                continue

            # Remove base_url as a *prefix*. str.lstrip(base_url) strips any
            # leading characters contained in base_url, which can also eat
            # the start of the path itself.
            if href.startswith(base_url):
                href = href[len(base_url):]

            # A leading slash is dropped so that base_url + d23_link does not
            # end up with a double slash.
            link_records.append({'title': text, 'd23_link': href.lstrip('/')})

df = pd.DataFrame(link_records, columns=['title', 'd23_link'])
            

In [None]:
# Translation table (code point -> code point) that replaces typographic
# quotes, the acute accent and long dashes with plain ASCII quotes and hyphens.
translate_dict = str.maketrans(u"‘’´“”—–-", u"'''\"\"---")


In [None]:
# fix titles with unicode and non-ascii characters
ascii_titles = []
for raw_title in df['title']:
    # 1. swap typographic quotes/dashes for their ASCII counterparts
    punct_fixed = raw_title.translate(translate_dict)
    # 2. decompose accented characters so the base letter survives step 3
    decomposed = unicodedata.normalize('NFD', punct_fixed)
    # 3. drop everything that is still outside ASCII
    ascii_only = decomposed.encode('ascii', 'ignore').decode("utf-8")
    # 4. collapse double spaces (single pass)
    ascii_titles.append(ascii_only.replace('  ', ' '))

df['title'] = ascii_titles


In [None]:
# Display df to inspect the collected titles and links.
df

In [None]:
def get_d23(i, row):
    """Fetch the entry page for one row and return its cleaned summary text.

    Parameters
    ----------
    i
        Row label. Not used by this function; kept so existing callers that
        pass it keep working.
    row
        Mapping-like object with a ``'d23_link'`` entry holding the path that
        is appended to ``base_url``.

    Returns
    -------
    str
        Text of every ``<p>`` found inside ``div.entry-content`` elements,
        concatenated without a separator, with typographic punctuation
        mapped through ``translate_dict``, non-ASCII characters removed and
        double spaces collapsed once.
    """
    search_url = base_url + row['d23_link']

    page = rendering(search_url)

    soup = BeautifulSoup(page, 'html.parser')

    page_content = soup.find_all('div', class_='entry-content')

    # get film summary: collect whole paragraph strings. The earlier
    # list.extend(p.text) split every paragraph into single characters and
    # only produced the right text because of the "".join that followed.
    paragraphs = [p.text for content in page_content for p in content.select('p')]
    summary = "".join(paragraphs)

    # fix summaries with unicode and non-ascii characters
    summary = unicodedata.normalize('NFD', summary.translate(translate_dict))
    summary = summary.encode('ascii', 'ignore').decode("utf-8")

    return summary.replace('  ', ' ')


In [None]:
# Create the summary column only when it is missing. Re-running this cell
# would otherwise blank out summaries that were already scraped, defeating
# the `row['summary'] == ''` resume check in the fetch loop.
if 'summary' not in df.columns:
    df['summary'] = ''


In [None]:
# Fill in the summary of every row that does not have one yet, so the cell
# can be re-run after an interruption without re-fetching finished rows.
for i, row in tqdm(df.iterrows(), total=len(df)):
    if row['summary'] != '':
        continue

    df.at[i, 'summary'] = get_d23(i, row)


In [None]:
# Display df to inspect the summary column.
df

In [None]:
# remove title from beginning of summary
def _strip_title_from_summary(summary, title):
    """Return ``summary`` without the text up to and including ``title``.

    When ``title`` does not occur in ``summary`` the summary is returned
    whole (left-stripped). ``str.find`` returns -1 in that case, and slicing
    from ``-1 + len(title)`` would cut off the start of an unrelated summary.
    """
    position = summary.find(title)
    if position == -1:
        return summary.lstrip()
    return summary[position + len(title):].lstrip()


df['summary'] = [
    _strip_title_from_summary(summary, title)
    for summary, title in zip(df['summary'], df['title'])
]

In [None]:
# remove 'film' and 'live-action' tags from titles
def _strip_title_tags(title):
    """Cut ``title`` at the first '(film' or '(live' tag and trim trailing space.

    Titles containing neither tag are returned unchanged apart from the
    trailing-whitespace trim. ``str.find`` returns -1 for a missing tag, and
    using that value as a slice end would drop the title's last character.
    When both tags occur, the cut is made at the earlier one so that no tag
    is left behind.
    """
    tag_positions = [
        position
        for position in (title.find('(film'), title.find('(live'))
        if position != -1
    ]
    if not tag_positions:
        return title.rstrip()
    return title[:min(tag_positions)].rstrip()


df['title'] = [_strip_title_tags(title) for title in df['title']]


In [None]:
# Display df for a final check before it is written to disk.
df

In [None]:
# Write the finished table to a CSV file in the current working directory.
output_path = 'movie_summaries.csv'
df.to_csv(output_path)