# Scrape How I Built This (Podcast)

In [36]:
# Jupyter magic
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
# imports
import requests
from bs4 import BeautifulSoup
import re
import time
import random
import pandas as pd
from seleniumrequests import Chrome
import pathlib

In [38]:
save_dir = "../1_data/how_i_built_this"
url = "https://www.happyscribe.com/public/how-i-built-this-with-guy-raz"

In [39]:
# Change this to where the chromedriver is in your environment
path_to_chromedriver = '../chromedriver'

driver = Chrome(executable_path=path_to_chromedriver)
driver.get(url)

### Note: to get all the links, use the selinium window to scroll down repeatedly triggerint the "load more" action in the window

In [41]:
links = driver.find_elements_by_class_name('hsp-card-episode')
len(links)

60

In [42]:
episode_urls = []
for l in links:
    episode_urls.append(l.get_property('href'))
    
episode_urls

['https://www.happyscribe.com/public/how-i-built-this-with-guy-raz/kenneth-cole-kenneth-cole',
 'https://www.happyscribe.com/public/how-i-built-this-with-guy-raz/dropbox-drew-houston',
 'https://www.happyscribe.com/public/how-i-built-this-with-guy-raz/how-i-built-resilience-varshini-prakash-of-sunrise-movement',
 'https://www.happyscribe.com/public/how-i-built-this-with-guy-raz/famous-dave-s-dave-anderson',
 'https://www.happyscribe.com/public/how-i-built-this-with-guy-raz/how-i-built-resilience-justin-gold-of-justin-s',
 'https://www.happyscribe.com/public/how-i-built-this-with-guy-raz/how-i-built-resilience-cheryl-contee-of-do-big-things',
 'https://www.happyscribe.com/public/how-i-built-this-with-guy-raz/mcbride-sisters-wine-part-2-of-2-robin-mcbride-and-andrea-mcbride-john',
 'https://www.happyscribe.com/public/how-i-built-this-with-guy-raz/how-i-built-resilience-sonia-gil-of-fluenz',
 'https://www.happyscribe.com/public/how-i-built-this-with-guy-raz/mcbride-sisters-wine-part-1-of-

In [88]:
def clean_text(text):
    drop_list = ["NPR sponsor", 
                 "before we start", "support of this show", "support show", 
                 "how I built this book", "Support for NPR", "our full interview",  "That's an excerpt", 
                 "question from Facebook", "Stay with us", "I'm Guy"]
    clean = []
    for t in text:
        drop = False
        for d in drop_list:
            if d in t:
                drop = True
        if drop:
            continue
        clean.append(t)
    return clean

def parse_title(title):
    
    if 'How I Built Resilience:' in title:
        t = title.split('How I Built Resilience:')[1]
        if 'of' in t:
            name =  t.split('of')[0]
            org =  t.split('of')[-1]
            return name.strip(), org.strip()
        if 'Live with':
            name = t.split('Live with')[-1]
            org = ""
    else:
        t = title.split(':')[-1]
        name = t.split('(')[0]
        org = title.split(':')[0]
    return name.strip(), org.strip()

def get_episode(episode_url):
    response = requests.get(episode_url)
    print(response)
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find('h1').text
    text = [t.text for t in soup.find('main').find_all('p')]
    text = clean_text(text)
    year = soup.find(id='date').text.split()[-1]
    return title, text, year


title, text, year = get_episode(episode_urls[2])


<Response [200]>


In [85]:
episode_url = episode_urls[5]
response = requests.get(episode_url)
print(response)
soup = BeautifulSoup(response.text, 'html.parser')
episode_url

<Response [200]>


'https://www.happyscribe.com/public/how-i-built-this-with-guy-raz/how-i-built-resilience-cheryl-contee-of-do-big-things'

In [89]:
parse_title(title)

('Varshini Prakash', 'Sunrise Movement')

In [47]:
name, org = parse_title(title)
name, org

('Cheryl Contee of Do Big Things', '')

In [92]:

data = []
for episode_url in episode_urls:
    title, text, year = get_episode(episode_url)

    
    name, org = parse_title(title)
    print(title)
    print(len(text), 'lines')
    
    fname = f"{name.split()[-1].lower()}_hibt_{year}"
    fpath = f"{save_dir}/{fname}.txt"
    print(fname)
    with open(fpath, 'w') as f:
        f.write("\n".join(text))
    d = {
        'name': name,
        'org': org,
        'title': title,
        'year': year,
        'filename': name,
        'source': 'hibt',
        'url': episode_url,
    }
    data.append(d)

<Response [200]>
Kenneth Cole: Kenneth Cole
270 lines
cole_hibt_2020
<Response [200]>
Dropbox: Drew Houston
139 lines
houston_hibt_2020
<Response [200]>
How I Built Resilience: Varshini Prakash of Sunrise Movement
58 lines
prakash_hibt_2020
<Response [200]>
Famous Dave's: Dave Anderson
218 lines
anderson_hibt_2020
<Response [200]>
How I Built Resilience: Justin Gold of Justin's 
64 lines
gold_hibt_2020
<Response [200]>
How I Built Resilience: Cheryl Contee of Do Big Things
57 lines
contee_hibt_2020
<Response [200]>
McBride Sisters Wine (Part 2 of 2): Robin McBride and Andréa McBride John 
221 lines
john_hibt_2020
<Response [200]>
How I Built Resilience: Sonia Gil of Fluenz
55 lines
gil_hibt_2020
<Response [200]>
McBride Sisters Wine (Part 1 of 2): Robin McBride and Andréa McBride John 
170 lines
john_hibt_2020
<Response [200]>
method: Adam Lowry & Eric Ryan (2018)
205 lines
ryan_hibt_2020
<Response [200]>
How I Built Resilience: Cynt Marshall of Dallas Mavericks
69 lines
marshall_hibt_

In [93]:
df = pd.DataFrame(data)
df.to_csv(f"{save_dir}/index.csv")
df

Unnamed: 0,name,org,title,year,filename,source,url
0,Kenneth Cole,Kenneth Cole,Kenneth Cole: Kenneth Cole,2020,Kenneth Cole,hibt,https://www.happyscribe.com/public/how-i-built...
1,Drew Houston,Dropbox,Dropbox: Drew Houston,2020,Drew Houston,hibt,https://www.happyscribe.com/public/how-i-built...
2,Varshini Prakash,Sunrise Movement,How I Built Resilience: Varshini Prakash of Su...,2020,Varshini Prakash,hibt,https://www.happyscribe.com/public/how-i-built...
3,Dave Anderson,Famous Dave's,Famous Dave's: Dave Anderson,2020,Dave Anderson,hibt,https://www.happyscribe.com/public/how-i-built...
4,Justin Gold,Justin's,How I Built Resilience: Justin Gold of Justin's,2020,Justin Gold,hibt,https://www.happyscribe.com/public/how-i-built...
5,Cheryl Contee,Do Big Things,How I Built Resilience: Cheryl Contee of Do Bi...,2020,Cheryl Contee,hibt,https://www.happyscribe.com/public/how-i-built...
6,Robin McBride and Andréa McBride John,McBride Sisters Wine (Part 2 of 2),McBride Sisters Wine (Part 2 of 2): Robin McBr...,2020,Robin McBride and Andréa McBride John,hibt,https://www.happyscribe.com/public/how-i-built...
7,Sonia Gil,Fluenz,How I Built Resilience: Sonia Gil of Fluenz,2020,Sonia Gil,hibt,https://www.happyscribe.com/public/how-i-built...
8,Robin McBride and Andréa McBride John,McBride Sisters Wine (Part 1 of 2),McBride Sisters Wine (Part 1 of 2): Robin McBr...,2020,Robin McBride and Andréa McBride John,hibt,https://www.happyscribe.com/public/how-i-built...
9,Adam Lowry & Eric Ryan,method,method: Adam Lowry & Eric Ryan (2018),2020,Adam Lowry & Eric Ryan,hibt,https://www.happyscribe.com/public/how-i-built...


# cleanup

In [None]:
# Save them as backup
with open(f'{save_dir}/index.txt', 'w') as f:
    f.writelines(episode_urls)

In [35]:


old_dir = "../1_data/how_i_built_this_backup"

p = pathlib.Path(old_dir)
data = []
for i in p.glob('*'):
    title = i.name.split('.txt')[0]
    if title == 'index':
        continue
    name, org = parse_title(title)
    if name is None:
        print(name, ' - ', org, ' - ', title)
    
    text  = i.read_text()
    year = 2000
    d = {
        'name': name,
        'org': org,
        'year': year,
        'title': title,
        'url': url,
        'source': 'hibt',
        'filename': f"{name.split()[-1]}_hibt_{year}.txt",
    }
    print(title)
    a = "guy here" in text
    if a:
        print(text)
        break

How I Built Resilience: Brian Chesky of Airbnb
True
The Laundress: Lindsey Boyd
False
Supergoop!: Holly Thaggard
False
How I Built Resilience: Bert and John Jacobs of Life is Good
True
How I Built Resilience: Pokimane
False
How I Built Resilience: Luke Holden and Ben Conniff of Luke's Lobster
False
How I Built Resilience: Live with John Foley
False
How I Built Resilience: Whitney Wolfe Herd of Bumble
False
Khan Academy: Sal Khan
True
How I Built Resilience: Ajay Prakash and James Joun of Rinse
True
How I Built Resilience: Songe LaRon of Squire
False
Briogeo: Nancy Twine
False
How I Built Resilience: Sarah Harden and Lauren Neustadter of Hello Sunshine
False
How I Built Resilience: Live with Sadie Lincoln
False
How I Built Resilience: Sandra Oh Lin of KiwiCo
False
How I Built Resilience: Sonia Gil of Fluenz
False
How I Built Resilience: Cynt Marshall of Dallas Mavericks
False
Lush Cosmetics: Mark Constantine
False
How I Built Resilience: John Zimmer of Lyft
False
McBride Sisters Wine (P

In [71]:
print("\n")



