## McKinsey podcast transcript scraper

In [18]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm

In [20]:
def get_page_html(url, timeout=30):
    """Download a page and parse its body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response bytes using the
        built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never parsed as if it were the requested content.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    page = requests.get(url, timeout=timeout)
    # Fail here rather than letting callers search an error page for tags.
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")


def get_all_titles(base_url, home_url):
    """Get podcast titles & URLs from the podcast landing page.

    Args:
        base_url: Site root that relative episode links are resolved against.
        home_url: URL of the landing page that lists the episodes.

    Returns:
        A dict keyed by the 0-based position of each link, whose values are
        ``{'title': <link text>, 'url': <absolute URL>}``.

    Raises:
        AttributeError: If no ``<h2>`` containing
            'recent mckinsey podcast episodes' is found (``find`` returns
            ``None`` and ``.parent`` cannot be read from it).
    """
    # Local import so the function does not depend on another cell's imports.
    from urllib.parse import urljoin

    soup = get_page_html(home_url)

    # BeautifulSoup passes ``tag.string`` to the matcher, which is None for a
    # heading that wraps more than one child node, so guard before lower().
    section = soup.find(
        'h2',
        string=lambda t: t is not None and 'recent mckinsey podcast episodes' in t.lower(),
    ).parent
    links = section.find_all('a', class_="item-title-link")

    # urljoin resolves relative hrefs against base_url and leaves hrefs that
    # are already absolute untouched.
    return {
        i: {
            'title': link.text.strip().replace('\n', ''),
            'url': urljoin(base_url, link['href']),
        }
        for i, link in enumerate(links)
    }


def get_page_text(episodes):
    """Get all podcast transcripts.

    Downloads the page of every episode and replaces its record with one that
    also carries the intro and the transcript paragraphs.

    Args:
        episodes: Dict mapping a key to ``{'title': ..., 'url': ...}``.

    Returns:
        The same ``episodes`` dict (updated in place). Each value holds
        'title', 'url', 'intro' (stripped intro text) and 'paragraphs' (the
        intro followed by the stripped text of every ``<p>`` in the article
        body).

    Raises:
        AssertionError: If a page does not contain exactly one
            'deck-content-wrapper' div or exactly one 'article-body-wrapper'
            div.
    """
    # Iterate over a snapshot of the real keys: the dict is updated inside the
    # loop, and the keys need not be 0..n-1. A list also lets tqdm show a total.
    for key, episode in tqdm(list(episodes.items())):
        page = get_page_html(episode['url'])

        # Get article intro
        intro_blocks = page.find_all('div', class_='deck-content-wrapper')
        assert len(intro_blocks) == 1, (
            f"Expected exactly one intro, found {len(intro_blocks)}: {episode['url']}"
        )
        intro = intro_blocks[0].text.strip()

        # Get article paragraphs
        article_blocks = page.find_all('div', class_='article-body-wrapper')
        assert len(article_blocks) == 1, (
            f"Expected exactly one article body, found {len(article_blocks)}: {episode['url']}"
        )

        # The intro comes first, followed by the body paragraphs in page order.
        paragraphs = [intro]
        paragraphs.extend(p.text.strip() for p in article_blocks[0].find_all('p'))

        # Put it all together
        episodes[key] = {
            'title': episode['title'],
            'url': episode['url'],
            'intro': intro,
            'paragraphs': paragraphs,
        }

    return episodes



In [21]:
# Scrape the McKinsey podcast landing page for episode links, then fetch each
# episode page to collect its intro and transcript paragraphs.
base_url = 'https://www.mckinsey.com'  # prefix applied to each episode href
home_url = "https://www.mckinsey.com/featured-insights/mckinsey-podcast"  # page listing the episodes
episodes = get_all_titles(base_url, home_url)
episodes = get_page_text(episodes)

85it [01:23,  1.01it/s]


In [42]:
# Save transcripts to CSV
import re  # used by the clean-up below; imported here so this cell runs on a fresh kernel

import pandas as pd

# `episodes` maps key -> record, so transpose to get one row per episode.
df = pd.DataFrame(episodes).T
# Clean up escape sequences: newlines and non-breaking spaces become plain spaces.
df['paragraphs'] = df.paragraphs.apply(lambda p: [re.sub(r'(\n|\xa0)', ' ', x) for x in p])
df.to_csv('./mckinsey_podcasts.csv')

## Embed and compute cosine similarity


In [3]:
import cohere
import numpy as np
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# from annoy import AnnoyIndex
from dotenv import load_dotenv
import os
load_dotenv()

True

In [4]:
# Read the Cohere API key from the COHERE_API_KEY environment variable
# (load_dotenv() above can populate it from a local .env file), so the key
# never has to be pasted into the notebook or shared publicly.
api_key = os.getenv('COHERE_API_KEY')

# Create and retrieve a Cohere API key from os.cohere.ai
co = cohere.Client(api_key)

In [138]:
import ast

# The 'paragraphs' column is stored as the text repr of a Python list, so parse
# it back with ast.literal_eval. Splitting the raw text on "', '" breaks as soon
# as a paragraph contains an apostrophe (repr then quotes it with double
# quotes) and leaves escape sequences undecoded.
df = pd.read_csv(
    './mckinsey_podcasts.csv',
    index_col=0,
    converters={'paragraphs': lambda cell: ast.literal_eval(cell) if cell else []},
)
df['embeds'] = None  # initialize embeddings

In [140]:
# Get the embedding of each episode's intro.
# (To embed the full transcript instead, pass texts=row.paragraphs.)

# iterrows() is a generator with no length, so give tqdm the total explicitly;
# otherwise the bar shows a bare counter with no percentage or ETA.
for i, row in tqdm(df.iterrows(), total=len(df)):
    # One text in, one vector out: the result is stored as a 2-D array with a
    # single row, which is how later cells index it (embeds[i][0]).
    embeds = co.embed(texts=[row.intro],
                      model="large",
                      truncate="LEFT").embeddings
    df.at[i, 'embeds'] = np.array(embeds)
    

85it [00:17,  4.89it/s]


In [202]:
import time

# Persist the frame together with its embeddings. Arrays are converted to
# nested lists first: to_csv writes the string form of each cell, and NumPy
# abbreviates large arrays with "...", which would silently drop most of every
# vector. Rows without an embedding (None) are written unchanged.
df.assign(
    embeds=df.embeds.apply(lambda e: np.asarray(e).tolist())
).to_csv('./mckinsey_podcasts_embeds.csv')

# time.sleep(3)
# print('y')

# df.paragraphs.apply(lambda x: len(x))
# df.intro
# df.head()

In [168]:
# Pull the per-episode embedding arrays and the matching intro texts out of the
# frame; both keep the row order of `df`.
embeds = df['embeds'].to_list()
texts = df['intro'].to_numpy()
# df.embeds.shape
# embeds = df.loc[0, 'embeds']
# texts = np.array(df.loc[0, 'paragraphs'])

In [169]:
from annoy import AnnoyIndex

# Every stored embedding is a 2-D array with one row; its second axis is the
# vector length the index must be created with.
embed_dim = embeds[0].shape[1]

# Index the embeddings: the item id is the position of the episode in `embeds`
search_index = AnnoyIndex(embed_dim, 'angular')
for item_id, embedding in enumerate(embeds):
    search_index.add_item(item_id, embedding[0])

search_index.build(10) # 10 trees
search_index.save('./podcasts.ann')

True

In [170]:
# Load index
# Re-create the index object with the same dimension and metric that the build
# cell used, then attach the file saved there.
search_index = AnnoyIndex(embed_dim, 'angular')
search_index.load('./podcasts.ann') # super fast, will just mmap the file

True

In [171]:
# Choose an example (we'll retrieve others similar to it)
example_id = 52  # item id in the index, which is also the position in `texts`

# Retrieve nearest neighbors
# With include_distances=True the result is a pair: (item ids, distances)
similar_item_ids = search_index.get_nns_by_item(example_id, 10,
                                                include_distances=True)

In [172]:
# Format and print the text and distances
neighbor_ids, neighbor_distances = similar_item_ids
results = pd.DataFrame(data={'texts': texts[neighbor_ids],
                             'distance': neighbor_distances})
# Remove the example itself by id rather than by position: the search is
# approximate, so the queried item is usually, but not necessarily, the first hit.
results = results.loc[[neighbor_id != example_id for neighbor_id in neighbor_ids]]

print(f"Question:'{texts[example_id]}'\nNearest neighbors:")
results
# STOP

Question:'Airlines and passengers alike are flying into a world of flygskam and flying taxis.'
Nearest neighbors:


Unnamed: 0,texts,distance
1,The COVID-19 pandemic decimated airlines in 20...,1.100947
2,Scooter skeptic? Think again. Micromobility is...,1.109316
3,Innovation is accelerating in Europe and beyon...,1.139736
4,With more than half the world’s population liv...,1.142045
5,The competitive landscape is shifting across t...,1.142707
6,"A transformative year in the technology, trend...",1.149731
7,"Here’s how technology, data, and human insight...",1.162902
8,"Globalization isn’t in retreat, but it has mor...",1.163296
9,Next-generation mobile networks promise lower ...,1.172421


1. Compute the query embedding (q).
2. Compare q to every paragraph of every podcast transcript.
3. Return the nearest neighbors (the cells below currently search the episode intros only).

In [198]:
# Candidate questions: the second assignment overwrites the first, so only the
# last one is searched. Reorder or comment out a line to try the other.
query = "tips on how I can accomplish more in my professional life"
query = "what are some major problems affecting the world recently?"

# Embed the question with the same model that was used for the indexed intros
query_embed = co.embed(model="large", truncate="LEFT", texts=[query]).embeddings

# Look up the 10 closest intros; the result is an (ids, distances) pair
similar_item_ids = search_index.get_nns_by_vector(
    query_embed[0], 10, include_distances=True
)
neighbor_ids, neighbor_distances = similar_item_ids

# One row per neighbor: item id, intro text and distance to the query
results_new = pd.DataFrame({
    'id': neighbor_ids,
    'texts': texts[neighbor_ids],
    'distance': neighbor_distances,
})

In [199]:
# Show the active query followed by its nearest-neighbor table
print(f"Query:'{query}'\nNearest neighbors:")
results_new

Query:'what are some major problems affecting the world recently?'
Nearest neighbors:


Unnamed: 0,id,texts,distance
0,13,Lives lost and upended. Soaring food and energ...,1.034461
1,12,The war in Ukraine poses a looming threat to t...,1.080329
2,8,"Fashion suppliers and brands, like other compa...",1.119978
3,66,With more than half the world’s population liv...,1.129816
4,74,Governments and companies have much work to do...,1.144383
5,81,Great strides have been made since 2008 to pre...,1.146366
6,57,"Around the world, women are paid at lower rate...",1.157874
7,39,The pandemic is exacerbating the US behavioral...,1.172694
8,41,The COVID-19 pandemic has upended norms about ...,1.191313
9,58,"Across the developed world, health systems mus...",1.195445


In [197]:
# Same display as the previous cell; the output kept below comes from an
# earlier run (In [197]) made while `query` held the first candidate question.
print(f"Query:'{query}'\nNearest neighbors:")
results_new

Query:'tips on how I can accomplish more in my professional life'
Nearest neighbors:


Unnamed: 0,id,texts,distance
0,40,"How can senior managers get better, faster bus...",1.077713
1,29,How you get along with your manager can shape ...,1.091231
2,24,McKinsey’s Joanna Barsh discusses how you can ...,1.097185
3,19,Women are doing more to support employee well-...,1.152625
4,70,Research shows that companies can improve shor...,1.154624
5,3,"Despite leaders’ best efforts, worldwide burno...",1.170799
6,67,Bringing advanced computing power and analytic...,1.174368
7,7,McKinsey’s report on human capital highlights ...,1.176258
8,23,"To keep top talent in the fold, managers must ...",1.179255
9,61,Cognitive biases can trip up even the most exp...,1.180501


In [190]:
# Scratch check: show the full, untruncated intro texts of the current results
results_new.texts.values


array(['How can senior managers get better, faster business decisions from the meetings they attend or lead? Planning is key.',
       'How you get along with your manager can shape your health, happiness, and productivity.',
       'McKinsey’s Joanna Barsh discusses how you can become an effective and engaged leader.',
       'Women are doing more to support employee well-being but face higher stress levels as a result. Here’s how leaders can help.',
       'Research shows that companies can improve short- and long-term performance by focusing on a small number of carefully selected management practices.',
       'Despite leaders’ best efforts, worldwide burnout persists. New research reveals why—and how to help employees begin to thrive again.',
       'Bringing advanced computing power and analytics capabilities to bear on people decisions in an organization is crucial to driving lasting and effective change.',
       'McKinsey’s report on human capital highlights the importance of 

# Appendix

---

## Eth doc scraper

Use BeautifulSoup to scrape documentation sites for text content.


In [7]:
from bs4 import BeautifulSoup
import requests

In [124]:
URL = "https://ethereum.org/en/developers/docs/"
# URL = "https://ethereum.org/en/developers/docs/intro-to-ethereum/#what-is-ethereum"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status stops here instead of parsing an HTTP error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

In [None]:
# Find child URLs
from urllib.parse import urljoin

base = 'https://ethereum.org'
# href=True skips anchors without an href attribute; indexing those with
# ['href'] raises KeyError.
for link in soup.find_all('a', href=True):
    href = link['href']
    if 'docs' in href:
        # urljoin resolves relative links against `base` and leaves links that
        # are already absolute untouched.
        print(urljoin(base, href))

In [125]:
# Print the text of every article: headings are prefixed with their tag name,
# paragraphs are printed as-is, each followed by a blank line.
HEADING_TAGS = ("h1", "h2", "h3")
for article in soup.find_all("article"):
    for node in article.find_all([*HEADING_TAGS, 'p']):
        if node.name not in HEADING_TAGS:
            print(node.text + '\n')
            continue
        print(f"{node.name}: {node.text}\n")

h1: Ethereum development documentation

This documentation is designed to help you build with Ethereum. It covers Ethereum as a concept, explains the Ethereum tech stack, and documents advanced topics for more complex applications and use cases.

This is an open-source community effort, so feel free to suggest new topics, add new content, and provide examples wherever you think it might be helpful. All documentation can be edited via GitHub – if you're unsure how, follow these instructions.

h2: Development modules

If this is your first attempt at Ethereum development, we recommend starting at the beginning and working your way through like a book.

h3: Foundational topics

h3: Ethereum stack

h3: Advanced

h3: Was this article helpful?

Next



# Example website scraper 

In [7]:
from bs4 import BeautifulSoup
import requests

In [8]:
URL = "https://realpython.github.io/fake-jobs/"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status stops here instead of parsing an HTTP error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# print(page.text)

soup = BeautifulSoup(page.content, "html.parser")


In [None]:
# Every job posting sits in a "card-content" div inside the results container
results = soup.find(id="ResultsContainer")
job_elements = results.find_all("div", class_="card-content")

# print(results.prettify())

# (tag, CSS class) of the fields to print for each job, in output order
JOB_FIELDS = (("h2", "title"), ("h3", "company"), ("p", "location"))

for job_element in job_elements:
    for tag_name, css_class in JOB_FIELDS:
        print(job_element.find(tag_name, class_=css_class).text.strip())
    print()
    


In [None]:
# Keep only the job titles that mention "python". BeautifulSoup passes
# ``tag.string`` to the matcher, which is None for an <h2> that wraps more than
# one child node, so guard before calling lower().
python_jobs = results.find_all(
    "h2", string=lambda text: text is not None and "python" in text.lower()
)

# Climb three levels from each matching title to reach the element that holds
# the whole job card.
python_job_elements = [
    h2_element.parent.parent.parent for h2_element in python_jobs
]

In [None]:
# Print the target of every link found inside each Python job card
for job_element in python_job_elements:
    # -- snip --
    for link_url in (anchor["href"] for anchor in job_element.find_all("a")):
        print(f"Apply here: {link_url}\n")