In [32]:
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import json

In [6]:


# Configure the root logger: timestamped "LEVEL:message" records, INFO and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')

class Crawler:
    """Simple breadth-first crawler that follows ``<a href>`` links.

    Attributes:
        visited_urls: URLs already taken from the queue, in crawl order.
        urls_to_visit: FIFO queue of URLs that still have to be crawled.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    def __init__(self, urls=None, timeout=10):
        """Create a crawler seeded with ``urls``.

        Args:
            urls: Optional iterable of start URLs. It is copied, so the
                caller's list is never mutated and no default list is
                shared between instances.
            timeout: Per-request timeout in seconds, passed to ``requests.get``.
        """
        self.visited_urls = []
        self.urls_to_visit = list(urls) if urls is not None else []
        self.timeout = timeout

    def download_url(self, url):
        """Fetch ``url`` and return the response body as text."""
        # Without a timeout a stalled server would block the crawl forever.
        return requests.get(url, timeout=self.timeout).text

    def get_linked_urls(self, url, html):
        """Yield the link target of every ``<a>`` tag in ``html``.

        Targets starting with ``/`` are resolved against ``url``; all other
        targets are yielded unchanged. Anchors without an ``href`` (or with
        an empty one) are skipped.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if not path:
                # No usable target; yielding None would queue a bogus URL.
                continue
            if path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        """Queue ``url`` unless it was already visited or is already queued."""
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    def crawl(self, url):
        """Download ``url`` and queue every link found on the page."""
        html = self.download_url(url)
        for linked_url in self.get_linked_urls(url, html):
            self.add_url_to_visit(linked_url)

    def run(self):
        """Crawl queued URLs in FIFO order until the queue is empty.

        Failures while crawling a URL are logged and do not stop the run.
        """
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            # Mark as visited *before* crawling: the URL has already left the
            # queue, so a page that links to itself would otherwise be
            # re-queued and crawled a second time.
            self.visited_urls.append(url)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')

In [9]:
# Download the Pilot episode transcript page and parse it into a soup.
url = 'https://bigbangtheory.fandom.com/wiki/Transcripts/Pilot'
r = requests.get(url, timeout=10)  # bounded wait so the cell cannot hang forever
# Stop here on an HTTP error rather than parsing an error page as a transcript.
r.raise_for_status()
# Decode the body with the encoding detected from its content.
r.encoding = r.apparent_encoding
html = r.text
soup = BeautifulSoup(html, 'html.parser')

In [55]:
# The first <tbody> on the page is taken to be the transcript table.
br = soup.find_all('tbody')
if not br:
    raise ValueError('No <tbody> element found; cannot locate the transcript table')
transcript = br[0]

transcript_dict = {
    'Season': 1,
    'Episode': 1,
    'Title': 'Pilot',
    'Actor2Line': {},     # actor name -> list of (line index, line text)
    'Actor2Profile': {},  # actor name -> href of the link in the actor cell
    'AllLines': [],       # every kept line, in order of appearance
}

transcript_data = transcript.find_all('tr')

line_idx = 0
# The first row is skipped (presumably a header row -- confirm against the page).
for pair in transcript_data[1:]:
    cells = pair.find_all('td')
    if len(cells) != 2:
        # Not an (actor, line) row; unpacking it would raise ValueError.
        continue
    actor, content = cells

    # `.string` is None when the cell holds more than one child node
    # (e.g. nested markup); such rows are skipped.
    if content.string is None:
        continue

    anchor = actor.find('a')
    if anchor is not None:
        # get_text() still works when the link itself contains nested tags,
        # where `.string` would be None.
        actor_name = anchor.get_text().rstrip()
        actor_link = str(anchor.get('href'))
        transcript_dict['Actor2Profile'][actor_name] = actor_link
    else:
        actor_name = actor.get_text().rstrip()

    actor_line = content.string.rstrip()

    transcript_dict['Actor2Line'].setdefault(actor_name, []).append((line_idx, actor_line))
    transcript_dict['AllLines'].append(actor_line)
    line_idx += 1

# Tuples in 'Actor2Line' are written as JSON arrays.
with open('transcript.json', 'w', encoding='utf-8') as fout:
    json.dump(transcript_dict, fout, indent=4)