## McKinsey podcast transcript scraper

In [18]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm

In [715]:
def get_page_html(url):
    """Get page's HTML from URL."""
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup


def get_all_titles(base_url, home_url):
    """Get podcast titles & URLs from podcast home page."""
    
    soup = get_page_html(home_url)
    # titles = soup.find_all('h3', class_='headline -arrow')
    # s = soup.find('div', class_='wrapper universal-page')
    s = soup.find('h2', string=lambda t: 'recent mckinsey podcast episodes' in t.lower()).parent
    ts = s.find_all('a', class_="item-title-link")

    episodes = {}
    for i, t in enumerate(ts): 
        # TMP
        # if i in range(20,23):
        # print(t.text)
        # print(f"{BASE_URL}{t['href']}")
        episodes[i] = {'title': t.text.strip().replace('\n', ''), 'url': f"{base_url}{t['href']}"}
    
    return episodes 

# yyy

def get_page_text(episodes):
    """Get all podcast transcripts."""
    for i, e in tqdm(episodes.items()):
        # TMP
        s2 = get_page_html(e['url'])
        # print(f"Title: {e['title']} ({i}/{total})")
        # print(f"URL: {e['url']}")

        # Get article intro
        intro = s2.find_all('div', class_='deck-content-wrapper')
        assert len(intro) == 1, 'Number of intro is not 1!'
        intro = intro[0].text.strip()

        # Get sub intro
        sub_intro = ''
        r2 = s2.find_all(id='backtracks-player')
        if r2: 
            assert len(r2) == 1, 'Number of sub intro is not 1!'
            c = r2[0]
            while c != None: 
                c = c.previous_sibling
                if c and c.name == 'p' and c.text != '\n':
                    sub_intro = c.text.strip() + ' ' + sub_intro
            replace_str = r'(The following transcript .* edited.*\.|The McKinsey [pP]odcast is.*)'
            sub_intro = re.sub(replace_str, '', sub_intro).strip()
        else: 
            print(f"Check sub intro in episode {i}.")

        # Combine intros
        intro += ' ' + sub_intro

        # Get article paragraphs
        article = s2.find_all('div', class_='article-body-wrapper')
        assert len(article) == 1, 'Check number of tag matches!'

        ps = article[0].find_all('p')

        paragraphs = []
        paragraphs.append(intro.strip())
        for p in ps:
            # print(p.text + '\n')
            paragraphs.append(p.text.strip())

        # Put it all together
        episodes.update({i: 
                         {
                             'title': e['title'], 
                             'url': e['url'],
                             'intro': intro, 
                             'paragraphs': paragraphs, 
                         }
                        }
                       )

    return episodes



In [716]:
# Scrapes Mckinsey podcasts
base_url = 'https://www.mckinsey.com'
home_url = "https://www.mckinsey.com/featured-insights/mckinsey-podcast"
episodes = get_all_titles(base_url, home_url)
episodes = get_page_text(episodes)

 58%|████████████████████████▊                  | 49/85 [00:50<00:38,  1.07s/it]

Check sub intro in episode 48.


100%|███████████████████████████████████████████| 85/85 [01:22<00:00,  1.03it/s]


In [719]:
# Save transcripts to CSV
import pandas as pd

df = pd.DataFrame(episodes).T
# Clean up escape sequences
df['paragraphs'] = df.paragraphs.apply(lambda p: [re.sub(r'(\n|\xa0)', ' ', x) for x in p])
df.to_csv('./mckinsey_podcasts.csv')

In [743]:
df.intro.apply(lambda x: '\u' in x).sum()

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: truncated \uXXXX escape (3837299423.py, line 1)

### Create embeddings


In [3]:
import cohere
import numpy as np
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# from annoy import AnnoyIndex
from dotenv import load_dotenv
import os
load_dotenv()

True

In [720]:
# Paste your API key here. Remember to not share publicly
api_key = os.getenv('COHERE_API_KEY')

# Create and retrieve a Cohere API key from os.cohere.ai
co = cohere.Client(api_key)

In [721]:
df = pd.read_csv('./mckinsey_podcasts.csv', index_col=0, converters={'paragraphs': lambda x: x[2:-2].split("', '")})
df['embeds'] = None  # initialize embeddings

In [723]:
# Get the embeddings for each sentence/paragraph of each episode
# TMP
# for i, row in df.iloc[:2].iterrows():
for i, row in tqdm(df.iterrows()):
    # Embed intros
    embeds = co.embed(texts=[row.intro],
                  model="large",
                  truncate="LEFT").embeddings
    # Embed paragraphs
    # embeds = co.embed(texts=row.paragraphs,
    #                   model="large",
    #                   truncate="LEFT").embeddings
    df.at[i, 'embeds'] = np.array(embeds)
    

85it [00:18,  4.49it/s]


In [724]:
df.to_csv('./mckinsey_podcasts_embeds.csv')

### Index embeddings

In [725]:
embeds = df.embeds.to_list()
texts = df.intro.to_numpy()

In [727]:
from annoy import AnnoyIndex

embed_dim = embeds[0].shape[1]

# Index the embeddings
search_index = AnnoyIndex(embed_dim, 'angular')
for i in range(len(embeds)):
    search_index.add_item(i, embeds[i][0])

search_index.build(10) # 10 trees
search_index.save('./mckinsey_podcasts.ann')

True

### Test indexing

In [728]:
# Load index
search_index = AnnoyIndex(embed_dim, 'angular')
search_index.load('./mckinsey_podcasts.ann') # super fast, will just mmap the file

True

In [729]:
# Choose an example (we'll retrieve others similar to it)
example_id = 52

# Retrieve nearest neighbors
similar_item_ids = search_index.get_nns_by_item(example_id, 10,
                                                include_distances=True)

In [730]:
# Format and print the text and distances
# Drop idx 0 since it's same as example_id
results = pd.DataFrame(data={'texts': texts[similar_item_ids[0]], 
                             'distance': similar_item_ids[1]}).drop(0)

print(f"Question:'{texts[example_id]}'\nNearest neighbors:")
results
# STOP

Question:'Airlines and passengers alike are flying into a world of flygskam and flying taxis. In this episode of the McKinsey Podcast, Simon London speaks with McKinsey senior partner Alex Dichter and partner Robin Riedel about the economics of the airline industry.'
Nearest neighbors:


Unnamed: 0,texts,distance
1,The COVID-19 pandemic decimated airlines in 20...,0.824408
2,With more than half the world’s population liv...,0.937529
3,The competitive landscape is shifting across t...,0.955826
4,The cloud-service cofounder and CEO talks abou...,0.968307
5,The industry has taken a breather in 2018 as f...,0.968789
6,"Here’s how technology, data, and human insight...",1.003303
7,With the global economy cooling and concerns a...,1.006455
8,Next-generation mobile networks promise lower ...,1.018946
9,McKinsey research shows that adoption of IoT t...,1.022226


In [731]:
query = "tips on how I can accomplish more in my professional life"
query = "what are some major problems affecting the world recently?"

# Get the query's embedding
query_embed = co.embed(texts=[query],
                  model="large",
                  truncate="LEFT").embeddings

# Retrieve the nearest neighbors
similar_item_ids = search_index.get_nns_by_vector(query_embed[0],10,
                                                include_distances=True)
# Format the results
results_new = pd.DataFrame(data = {
    'id': similar_item_ids[0],
    'texts': texts[similar_item_ids[0]], 
    'distance': similar_item_ids[1],
})

In [732]:
print(f"Query:'{query}'\nNearest neighbors:")
results_new

Query:'what are some major problems affecting the world recently?'
Nearest neighbors:


Unnamed: 0,id,texts,distance
0,13,Lives lost and upended. Soaring food and energ...,1.054381
1,12,The war in Ukraine poses a looming threat to t...,1.063986
2,81,Great strides have been made since 2008 to pre...,1.139327
3,8,"Fashion suppliers and brands, like other compa...",1.15079
4,58,"Across the developed world, health systems mus...",1.170376
5,33,The pandemic has put demands on the public sec...,1.17508
6,34,COVID-19 has pushed working women to the point...,1.175303
7,42,Rarely have supply-chain leaders faced more co...,1.185897
8,39,The pandemic is exacerbating the US behavioral...,1.189055
9,66,With more than half the world’s population liv...,1.192957


In [197]:
print(f"Query:'{query}'\nNearest neighbors:")
results_new

Query:'tips on how I can accomplish more in my professional life'
Nearest neighbors:


Unnamed: 0,id,texts,distance
0,40,"How can senior managers get better, faster bus...",1.077713
1,29,How you get along with your manager can shape ...,1.091231
2,24,McKinsey’s Joanna Barsh discusses how you can ...,1.097185
3,19,Women are doing more to support employee well-...,1.152625
4,70,Research shows that companies can improve shor...,1.154624
5,3,"Despite leaders’ best efforts, worldwide burno...",1.170799
6,67,Bringing advanced computing power and analytic...,1.174368
7,7,McKinsey’s report on human capital highlights ...,1.176258
8,23,"To keep top talent in the fold, managers must ...",1.179255
9,61,Cognitive biases can trip up even the most exp...,1.180501


In [190]:
# tmp
results_new.texts.values


array(['How can senior managers get better, faster business decisions from the meetings they attend or lead? Planning is key.',
       'How you get along with your manager can shape your health, happiness, and productivity.',
       'McKinsey’s Joanna Barsh discusses how you can become an effective and engaged leader.',
       'Women are doing more to support employee well-being but face higher stress levels as a result. Here’s how leaders can help.',
       'Research shows that companies can improve short- and long-term performance by focusing on a small number of carefully selected management practices.',
       'Despite leaders’ best efforts, worldwide burnout persists. New research reveals why—and how to help employees begin to thrive again.',
       'Bringing advanced computing power and analytics capabilities to bear on people decisions in an organization is crucial to driving lasting and effective change.',
       'McKinsey’s report on human capital highlights the importance of 

## TFTS

### Scrape episodes

In [392]:
# Find all episode URLs
base = 'https://www.gsb.stanford.edu'
URL = "https://www.gsb.stanford.edu/business-podcasts/think-fast-talk-smart-podcast"

page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

n = soup.find_all('div', class_='field__item field--item-view_item')

episodes = {}
i = 0
for _n in n:
    n2 = _n.find_all('div', class_='views-field views-field-title')
    for _n2 in n2:
        _url = _n2.find('a')
        # print(f"{_n2.text} | {base}{_url['href']}")
        
        # Put it all together
        episodes.update({i: {'title': _n2.text, 'url': f"{base}{_url['href']}"}})
        i += 1

In [415]:
# Find and combine all intro sentences within an episode
for i, episode in tqdm(episodes.items()):
    _page = requests.get(episode['url'])
    _soup = BeautifulSoup(_page.content, "html.parser")
    r = _soup.find_all('p', {'class':['intro', 'intro-copy']})
    # print(episode['url'])
    if len(r) > 1:
        intro = r[0].text.strip() + ' ' + r[1].text.strip()
        for sib in r[1].next_siblings:
            if sib.name == 'p' and 'a podcast produced by Stanford Graduate School' not in sib.text:
                intro += ' ' + sib.text.strip()
    elif len(r) == 1:
        r = _soup.find_all('p')
        r_skipped_date = r[0].findNext('p').findNext('p')
        intro = r[0].text.strip() + ' ' + r_skipped_date.text.strip()
        for sib in r_skipped_date.next_siblings:
            if sib.name == 'p' and 'a podcast produced by Stanford Graduate School' not in sib.text:
                intro += ' ' + sib.text.strip()

    episodes.update({i: {'title': episode['title'],
                                   'url': episode['url'],
                                   'intro': intro,
                                   'paragraphs': ''
                                  }})
    

100%|███████████████████████████████████████████| 66/66 [00:29<00:00,  2.24it/s]


In [419]:
df2 = pd.DataFrame(episodes).T
df2.head()
df2.to_csv('./tfts_podcasts.csv')

### Embed and index

In [421]:
# Get the embeddings for intro of each episode
df2['embeds'] = ''
for i, row in tqdm(df2.iterrows()):
    # Embed intros
    embeds = co.embed(texts=[row.intro],
                  model="large",
                  truncate="LEFT").embeddings
    # Embed paragraphs
    # embeds = co.embed(texts=row.paragraphs,
    #                   model="large",
    #                   truncate="LEFT").embeddings
    df2.at[i, 'embeds'] = np.array(embeds)
    

66it [00:13,  4.89it/s]


In [424]:
df2.to_csv('./tfts_podcasts_embeds.csv')

In [422]:
# Check the dimensions of the embeddings
embeds = df2.embeds.to_list()
texts = df2.intro.to_numpy()
# df.embeds.shape
# embeds = df.loc[0, 'embeds']
# texts = np.array(df.loc[0, 'paragraphs'])

In [423]:
from annoy import AnnoyIndex

embed_dim = embeds[0].shape[1]

# Index the embeddings
search_index = AnnoyIndex(embed_dim, 'angular')
for i in range(len(embeds)):
    search_index.add_item(i, embeds[i][0])

search_index.build(10) # 10 trees
search_index.save('./tfts_podcasts.ann')

True

# Appendix

---

## Eth doc scraper

Use BeautifulSoup to scrape documentation sites for text content.


In [7]:
from bs4 import BeautifulSoup
import requests

In [None]:
URL = "https://ethereum.org/en/developers/docs/"
# URL = "https://ethereum.org/en/developers/docs/intro-to-ethereum/#what-is-ethereum"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

In [None]:
# Find child URLs
base = 'https://ethereum.org'
n = soup.find_all('a')
for _n in n:
    if 'docs' in _n['href']:
        print(f"{base}{_n['href']}")

In [125]:
# Get all article text (title, headers, paragraphs)
a = soup.find_all("article")
for _a in a:
    s = _a.find_all(["h1", "h2", "h3", 'p'])
    for _s in s:
        if _s.name in ["h1", "h2", "h3"]:
            print(f"{_s.name}: {_s.text}\n")
        else: 
            print(_s.text + '\n')

h1: Ethereum development documentation

This documentation is designed to help you build with Ethereum. It covers Ethereum as a concept, explains the Ethereum tech stack, and documents advanced topics for more complex applications and use cases.

This is an open-source community effort, so feel free to suggest new topics, add new content, and provide examples wherever you think it might be helpful. All documentation can be edited via GitHub – if you're unsure how, follow these instructions.

h2: Development modules

If this is your first attempt at Ethereum development, we recommend starting at the beginning and working your way through like a book.

h3: Foundational topics

h3: Ethereum stack

h3: Advanced

h3: Was this article helpful?

Next



### manually explore soup

In [425]:
URL2 = '22'
URL2 = 'https://www.gsb.stanford.edu/insights/monologue-dialogue-how-handle-skeptical-audience'
URL2 = 'https://www.gsb.stanford.edu/insights/quick-thinks-how-being-present-improves-communication'
URL2 = 'https://www.gsb.stanford.edu/insights/feeling-nervous-how-anxiety-can-fuel-better-communication'
URL2 = 'https://www.mckinsey.com/mhi/our-insights/beyond-burnout-what-helps-and-what-doesnt'
page2 = requests.get(URL2)
soup2 = BeautifulSoup(page2.content, "html.parser")


In [401]:
r = soup2.find_all('p', {'class':['intro', 'intro-copy']})
intro = r[0].text + ' ' + r[1].text
for sib in r[1].next_siblings:
    if sib.name == 'p' and 'a podcast produced by Stanford Graduate School' not in sib.text:
        intro += ' ' + sib.text
    
    
# r = soup2.find('a', string='Think Fast, Talk Smart')#string='a podcast designed to hone your communication skills')
# r.previous_sibling
# r
# r[1].findNext('p').findNext('p').findNext('p')#.find_next_sibling().find_next_sibling()#.text.strip()#.find_next_sibling()
# r.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent.parent#find_next_sibling()

IndexError: list index out of range

In [409]:
# print(intro)
r = soup2.find_all('p', {'class':['intro', 'intro-copy']})
r[0].findNext('p').findNext('p')

<p>Stress, anxiety, nervousness — when these feelings inevitably arise, lecturer Kelly McGonigal says it’s not about making them go away, but using them to your advantage.</p>

In [367]:
'a podcast produced by Stanford Graduate School' in r[1].findNext('p').findNext('p').findNext('p').text

True

In [261]:
r = soup2.find('p', class_=['intro-copy', 'intro'])
# .findNext('p').findNext('p')#, string=lambda x: 'podcast produced by stanford graduate school of business and hosted by matt abrahams' in x.lower())
# intro selectionShareable
print(r.text)
print(r.findNext('p').text)
print(r.findNext('p').findNext('p').text)


Use these techniques when handling challenges and objections.
March 12, 2020
Preparing to speak in front of a skeptical audience is more than thinking about objections beforehand — there are specific techniques you can use to respond to these situations without sounding defensive, evasive, or dismissive.


# Example website scraper 

In [7]:
from bs4 import BeautifulSoup
import requests

In [8]:
URL = "https://realpython.github.io/fake-jobs/"
page = requests.get(URL)

# print(page.text)

soup = BeautifulSoup(page.content, "html.parser")


In [None]:
results = soup.find(id="ResultsContainer")
job_elements = results.find_all("div", class_="card-content")

# print(results.prettify())

for job_element in job_elements:
    title_element = job_element.find("h2", class_="title")
    company_element = job_element.find("h3", class_="company")
    location_element = job_element.find("p", class_="location")
    print(title_element.text.strip())
    print(company_element.text.strip())
    print(location_element.text.strip())
    print()
    


In [None]:
python_jobs = results.find_all(
    "h2", string=lambda text: "python" in text.lower()
)

python_job_elements = [
    h2_element.parent.parent.parent for h2_element in python_jobs
]

In [None]:
for job_element in python_job_elements:
    # -- snip --
    links = job_element.find_all("a")
    for link in links:
        link_url = link["href"]
        print(f"Apply here: {link_url}\n")