# Psych Scraper

The goal is to download all of the Psych transcripts that someone posted on a [blog](https://jpgr.livejournal.com/) I came across, so that they can be used as a dataset.

In [153]:
import requests
from bs4 import BeautifulSoup
import re

In [88]:
# Tag-search pages for seasons 1 through 3 (the range end is exclusive); each URL
# filters the blog by that season's transcript tag.
search_urls = list(
    map(
        "https://jpgr.livejournal.com/?tag=transcripts%3A%20psych%3A%20season%20{}".format,
        range(1, 4),
    )
)

In [89]:
def get_all_post_urls(search_url, timeout=30):
    """Collect the post links from a search page and every older page it links to.

    Starting at ``search_url``, gathers the ``href`` of every ``<a class="subj-link">``
    element, then follows the "go earlier" link (if present) and repeats until a
    page has no such link.

    Args:
        search_url: URL of the first page of search results.
        timeout: Seconds passed to ``requests.get`` for each page. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        A list of post URLs (``href`` values) in the order they were encountered,
        newest page first.

    Raises:
        requests.HTTPError: If any page responds with an error status, instead of
            silently parsing the error page and returning an incomplete list.
    """
    result = []
    visited = set()
    page_url = search_url

    # Iterate rather than recurse so a long chain of pages cannot exhaust the
    # call stack; `visited` stops a pagination link that loops back on itself.
    while page_url is not None and page_url not in visited:
        visited.add(page_url)

        response = requests.get(page_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        result.extend(el["href"] for el in soup.find_all("a", class_="subj-link"))

        # Older results, when they exist, are reachable through a "go earlier" link.
        next_page_link_el = soup.find("a", string="go earlier")
        page_url = next_page_link_el["href"] if next_page_link_el is not None else None

    return result

In [185]:
# Gather every post URL across all of the season search pages.
post_urls = []
for search_url in search_urls:
    post_urls.extend(get_all_post_urls(search_url))

# Download the HTML of each post. The timeout keeps a stalled connection from
# hanging the cell, and raise_for_status() stops an HTTP error page from being
# stored as if it were a transcript post.
posts = []
for post_url in post_urls:
    response = requests.get(post_url, timeout=30)
    response.raise_for_status()
    posts.append(response.text)

In [329]:
def get_episode_data(post):
    """Extract the episode title and cleaned transcript text from one post's HTML.

    Args:
        post: HTML source of a single transcript post.

    Returns:
        A ``(episode_title, transcript)`` tuple of strings. ``episode_title`` is
        the post subject starting at its ``<season>x<episode>`` marker (for
        example "3x09 Christmas Joy"), or the whole stripped subject when no
        marker is found. ``transcript`` is the entry text with the tag list and
        parse-error blocks removed, ``<br>`` turned into newlines, typographic
        characters replaced by ASCII equivalents, and every line stripped.

    Side effects:
        Prints one "Error in episode ..." line for each ``ljparseerror`` block
        found in the post.
    """
    soup = BeautifulSoup(post, "html.parser")

    # get_text() rather than .string: .string is None when the subject div has
    # more than one child, which would make the regex search raise.
    subject = soup.find("div", class_="subject").get_text()
    # \d+ (not a single \d) so a multi-digit season number is kept whole.
    marker = re.search(r'\d+x\d*', subject)
    # Fall back to the full subject instead of crashing when there is no marker.
    episode_title = (subject[marker.start():] if marker else subject).strip()

    entry_text = soup.find("div", class_="entry_text")
    # Drop the tag list so it does not end up in the transcript; not every post
    # is guaranteed to have one.
    tags = entry_text.find("div", class_="ljtags")
    if tags is not None:
        tags.extract()
    for br in entry_text.find_all("br"):
        br.replace_with("\n")

    # Some posts contain markup errors that the blog renders as "ljparseerror"
    # blocks; report which episode is affected and remove the block.
    for error in entry_text.find_all("div", class_="ljparseerror"):
        print(f'Error in episode {episode_title}')
        error.extract()

    transcript_raw = entry_text.get_text()

    # Remove formatting tags that survive as literal text (presumably the raw
    # markup left behind in the posts that failed to parse).
    for literal_tag in ("<b>", "</b>", "<i>", "</i>", "<u>", "</u>"):
        transcript_raw = transcript_raw.replace(literal_tag, "")

    # Replace non-standard chars with plain ASCII equivalents.
    replacements = (
        ("\t", " "),
        ("–", "-"),
        ("—", "-"),
        ("‘", "'"),
        ("’", "'"),
        ("“", "\""),
        ("”", "\""),
        ("…", "..."),
        # NOTE(review): "™" is mapped to "..." as in the original code -
        # presumably a mis-encoded ellipsis in the source posts; confirm.
        ("™", "..."),
    )
    for old, new in replacements:
        transcript_raw = transcript_raw.replace(old, new)

    transcript_lines = [line.strip() for line in transcript_raw.split("\n")]
    transcript = "\n".join(transcript_lines).strip()

    return episode_title, transcript

# get_episode_data(posts[-9])[0]

Error in episode 3x09 Christmas Joy


'3x09 Christmas Joy'

In [330]:
def _episode_sort_key(episode):
    """Sort key ordering episodes by numeric season and episode number.

    A plain string sort places "2x9" after "2x10"; comparing the numbers as
    integers keeps the episodes in broadcast order. Titles that do not start
    with a "<season>x<episode>" marker sort after all numbered episodes. The
    full (title, transcript) tuple is the final tie-breaker, so multi-part
    episodes such as "(1/2)" and "(2/2)" keep their title order.
    """
    match = re.match(r'(\d+)x(\d+)', episode[0])
    if match is None:
        return (float("inf"), float("inf"), episode)
    return (int(match.group(1)), int(match.group(2)), episode)


episodes = [get_episode_data(post) for post in posts]
episodes.sort(key=_episode_sort_key)

Error in episode 2x9 Bounty Hunters!
Error in episode 3x09 Christmas Joy
Error in episode 3x05 Disco Didn't Die, It Was Murdered


In [331]:
# Sanity check: show the first episode's title followed by the first 200
# characters of its transcript, each starting on its own line.
print(episodes[0][0], episodes[0][1][:200], sep="\n")

1x01 Pilot (1/2)
1986, Santa Barbara, California

INT. DINER, DAY

A young boy of around 9 years old, SHAWN, sits across from his father, HENRY in a booth. HENRY is in his police

uniform.

HENRY:
Did you do your home


In [332]:
# Build one document from all episodes: an "Episode <title>" header, a blank
# line, the transcript, then a blank line before the next episode.
all_text = "".join(
    f"Episode {title}\n\n{transcript}\n\n" for title, transcript in episodes
)

In [333]:
# Write the combined transcripts to disk as UTF-8, replacing any existing file.
with open(file="psych-transcripts.txt", mode="w+", encoding="utf8") as transcript_file:
    transcript_file.write(all_text)