In [1]:
#We'll need spaCy for this

!pip install -Uqq pip setuptools wheel
!pip install -Uqq 'spacy[cuda-autodetect]'
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl (460.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.3/460.3 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [2]:
import pandas as pd
from collections import Counter
from bs4 import BeautifulSoup
import re

In [3]:
df = pd.read_csv("../../data/htbaayj/raw.csv", encoding = 'UTF-8')

In [4]:
#There are some encoding issues we need to address, so let's tackle those:
df['text'] = df['text'].apply(lambda x: str(x).replace(u'\xa0', u''))

In [5]:
#We know this transcript scraped the wrong information, so let's get this out of the way...
df = df[df['title'] != '131: The Art of Public Speaking with Dr. Ray Hull...PLUS How Fast Are You Talking?! - How to be Awesome at Your Job']

#We know that some transcripts didn't scrape so let's get those out of the way, too!
#Note this removes 23 episodes
df = df[df['text'] != '\n']

#There are also some episodes that aren't conversations:
df = df[df['title'] != 'Happy 4th of July! - How to be Awesome at Your Job']
df = df[df['title'] !='Announcement - How to be Awesome at Your Job']
df = df[df['title'] != 'Winning the Mental Game of Quarantine - How to be Awesome at Your Job']

df.reset_index(drop=True, inplace = True)

In [6]:
#These are spelling errors and typos we've found in our cleaning process...
df.replace('Pete Mockatis', "Pete Mockaitis", inplace=True)
df.replace('Pete Mockaits', "Pete Mockaitis", inplace=True)
df.replace('Pete Mockaitis:', "Pete Mockaitis", inplace=True)
df.replace('Pete Mocakitis', "Pete Mockaitis", inplace=True)
df.replace('[Pete Mockaitis', "Pete Mockaitis", inplace=True)
df.replace('Michael Kerr|', "Michael Kerr", inplace=True)
df.replace('Bob Pozen', "Robert Pozen", inplace=True)
df.replace('Michael. P. Dolan', "Michael P. Dolan", inplace=True)
df.replace('Nicole Merrill', 'Nicolle Merrill', inplace=True)
df.replace('Karlo Siriban:', 'Karlo Siriban', inplace = True)
df.replace('Tonya Dalton:', 'Tonya Dalton', inplace = True)

In [7]:
#There are a lot of random hexcode and unicode characters that we need to remove.
#Also, the date column didn't accurately gather the publish date
#.replace seems to leave some values, so we're going to use a different approach

#BeautifulSoup throws a warning about how what we're giving it looks like a filename...
#We don't need that warning
import warnings

warnings.filterwarnings(
    action='ignore',
    module=r'.*bs4')

lines = []

for index, row in df.iterrows():
    text = row['text']
    title = row['title']
    transcript = text.split("\n")
    for line in transcript:
        line = BeautifulSoup(line, 'html.parser').get_text(strip=True)
        lines.append([title, line])  
        
new_df = pd.DataFrame(lines, columns = ['title', 'text'])
new_df['text'] = new_df['text'].str.strip()
new_df['title'] = new_df['title'].str.strip()

new_df.head()

Unnamed: 0,title,text
0,835: How to Thrive amid Stress and Irritation ...,
1,835: How to Thrive amid Stress and Irritation ...,Pete Mockaitis
2,835: How to Thrive amid Stress and Irritation ...,"Sharon, welcome to How to be Awesome at Your Job."
3,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick
4,835: How to Thrive amid Stress and Irritation ...,"Delighted to be here, Pete."


In [8]:
new_df.tail()

Unnamed: 0,title,text
165651,001: Communicating with Inspiration and Clarit...,"Are you going to say, “Well I spent my life do..."
165652,001: Communicating with Inspiration and Clarit...,"I would challenge anybody here, life is too pr..."
165653,001: Communicating with Inspiration and Clarit...,Pete Mockaitis
165654,001: Communicating with Inspiration and Clarit...,Fantastic. Mawi thanks so much for kicking us ...
165655,001: Communicating with Inspiration and Clarit...,


In [9]:
# Now we want to chat this so that our dataframe looks like this
# title | speaker | text
# Let's start by getting a collection of who our speakers are:

#Let's use some machine learning to our advantage. We can use Spacy to identify speakers from the list of titles
import spacy
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_trf')

#Convert all of our titles into one string and tokenize
tokens = nlp(' '.join(df.title.tolist()))

#Make a list of speakers
entities = [entity.text for entity in tokens.ents if entity.label_ == 'PERSON']


Counter(entities).most_common(5)

[('Brian Ahearn', 4),
 ('Michael Bungay Stanier', 4),
 ('Laura Vanderkam', 3),
 ('Devora Zack', 3),
 ('Brad Stulberg', 3)]

In [10]:
#Limit the list of entities to unique speakers
unique_speakers = list(set(entities))

print(len(unique_speakers))

754


In [11]:
# Find the episodes that don't have a speaker in our list

# To do that, we need a list of episode titles
title_list = df['title'].tolist()

filtered_titles = [title for title in title_list if not any(name in title for name in unique_speakers)]

print(len(filtered_titles))
print(filtered_titles)

4
['429: A Navy SEAL’s Surprising Key to Building Unstoppable Teams: Caring - How to be Awesome at Your Job', '425: Achieving More by Constantly Embarrassing Yourself with Case Kenny - How to be Awesome at Your Job', '237: Crafting Memorable Stories with Dr. Carmen Simon - How to be Awesome at Your Job', '236: Persuasion Pointers from a Legendary Infomercial Pitchman with Anthony Sullivan - How to be Awesome at Your Job']


In [12]:
#Let's just manually add these speakers.
missing_speakers = ['Alden Mills', 'Case Kenny', 'Dr. Carmen Simon', 'Anthony Sullivan']
unique_speakers.extend(missing_speakers)

#Let's compare how many speakers we have vs the number of episodes
print(len(unique_speakers))
df.shape

758


(819, 4)

In [13]:
#We know some of our speakers were in multiple episodes:

#Let's get a list of speakers listed more than once
speaker_counts = Counter(entities)
multiple_counts = Counter({k: c for k, c in speaker_counts.items() if c > 1})

#And see how many episodes are unaccounted for..
multiple_counts_episodes = (sum(multiple_counts.values()) - len(multiple_counts))
print(multiple_counts_episodes)

76


In [14]:
#That puts us at 834 speakers and only 819 episodes...
#Some episodes may have multiple speakers names in the titles...
#And some of the episodes don't have the speakers names in the title...
#We're within 2%. Let's see if some other processing helps us get to where we should be
#Let's figure out what the minimum and maximum number of words in our list of names is:

split_speakers = [name.split() for name in unique_speakers]

min = 1
max = 0
for item in split_speakers:
    if len(item) < min:
        min = len(item)
    elif len(item) > max:
        max = len(item)
        
print("minimum :", min)
print("maximum :", max)    

minimum : 1
maximum : 4


In [15]:
short_names = []
long_names = []
for item in split_speakers:
    if len(item) == 1:
        short_names.append(item[0])
    elif len(item) >= 3:
        name = ' '.join(item)
        print(name)
        long_names.append(name)

Alan Stein Jr.
Gemma Leigh Roberts
Sukhinder Singh Cassidy
R. Michael Anderson
Bradley R. Staats
Scott H. Young
Gayle Van Gils
Shawn C. Jones
Michael Bungay Stanier
Michael A. Roberto
Maura Nevel Thomas
W. Chris Winter
Michael D. Watkins
J. Kelly Hoey
Scott Anthony Barlow
Mamie Kanfer Stewart
J. Elise Keith
Kwame Christian, Esq.
Pam Fox Rollin
Michael J. Arena
Robert D. Smith
Michael J. Gelb
Olivia June Poole
John C. Maxwell
Guy Pierce Bell
Stephen M. R. Covey
Beth Benatti Kennedy
G. Richard Shell
Jim Harshaw, Jr.
John V. Petrocelli
Lee Hartley Carter
Brandi Nicole Johnson
Chris De Santis
Sarah Noll Wilson
Vanessa Van Edwards
Frances Cole Jones
S. Chris Edmonds
Michelle Tillis Lederman
Scott Jeffrey Miller
Oren Jay Sofer
Dr. Carmen Simon


In [16]:
print(short_names)

['Lisa', 'Steinman', 'David', 'Einhorn', 'Guy', 'Heather', 'ex-FBI']


In [17]:
#Okay, this shows us there's definitely some stuff that's not quite right!
#Let's start with the short names:

short_name_titles = [title for title in title_list if any(name in title for name in short_names)]

print(len(short_name_titles))
print(short_name_titles)

28
['807: How to Develop Confidence, Credibility, and Advocates with Heather Hansen - How to be Awesome at Your Job', '780: How Minds Change and How to Change Minds with David McRaney - How to be Awesome at Your Job', '720: Navigating the Great Resignation with Dr. David Rock - How to be Awesome at Your Job', '672: How to Ask For and Get What You Want with Heather Hansen - How to be Awesome at Your Job', '624: How to Be More Engaging with Storytelling and Humor with David Nihill - How to be Awesome at Your Job', '617: Enhancing Your Productivity by Managing Your Mental Energy with David Kadavy - How to be Awesome at Your Job', '578: How to Stay Calm and Productive Amid Uncertainty with David Lebel - How to be Awesome at Your Job', '559: How to Unify, Motivate, and Direct Any Team by Picking a Fight with David Burkus - How to be Awesome at Your Job', '546: Choosing Better Words for Better Leadership with David Marquet - How to be Awesome at Your Job', '516: Making Difficult Conversation

In [18]:
#there is probably a more efficient way to do this, but I'm going manual...

missing_short_names = ['Heather Hansen', 'David McRaney', 'Dr. David Rock', 'David Nihill', 'David Kadavy', 
                       'David Lebel', 'David Burkus', 'David Marquet', 'David Wood', 'David Allen', 
                       'David Komlos', 'Diana Wu David', 'Lisa Wentz', 'David Epstein', 'David Greene', 
                       'Guy Pierce Bell', 'Joe Navarro', 'David Okuniev', 'Heather Ackmann', 'Alan Ackmann', 
                       'David Mead', 'Lisa McLeod', 'Elizabeth McLeod', 'Cheryl Strauss Einhorn', 
                       'Guy Ferdman', 'Ilan Ferdman', 'Topper Steinman', 'Lisa Bodell', 'Lisa Cummings', 
                       'David Kadavy']

#Add all of the names to the list of speakers
unique_speakers.extend(missing_short_names)

#Remove all of the names from the list of speakers
for name in short_names:
    unique_speakers.remove(name)

In [19]:
#Now let's look at the long names
print(long_names)

['Alan Stein Jr.', 'Gemma Leigh Roberts', 'Sukhinder Singh Cassidy', 'R. Michael Anderson', 'Bradley R. Staats', 'Scott H. Young', 'Gayle Van Gils', 'Shawn C. Jones', 'Michael Bungay Stanier', 'Michael A. Roberto', 'Maura Nevel Thomas', 'W. Chris Winter', 'Michael D. Watkins', 'J. Kelly Hoey', 'Scott Anthony Barlow', 'Mamie Kanfer Stewart', 'J. Elise Keith', 'Kwame Christian, Esq.', 'Pam Fox Rollin', 'Michael J. Arena', 'Robert D. Smith', 'Michael J. Gelb', 'Olivia June Poole', 'John C. Maxwell', 'Guy Pierce Bell', 'Stephen M. R. Covey', 'Beth Benatti Kennedy', 'G. Richard Shell', 'Jim Harshaw, Jr.', 'John V. Petrocelli', 'Lee Hartley Carter', 'Brandi Nicole Johnson', 'Chris De Santis', 'Sarah Noll Wilson', 'Vanessa Van Edwards', 'Frances Cole Jones', 'S. Chris Edmonds', 'Michelle Tillis Lederman', 'Scott Jeffrey Miller', 'Oren Jay Sofer', 'Dr. Carmen Simon']


In [20]:
#Let's look at the titles with the long:

long_name_titles = [title for title in title_list if any(name in title for name in long_names)]

print(len(long_name_titles))
print(long_name_titles)

48
['819: How to Stop Avoiding Conflict with Sarah Noll Wilson - How to be Awesome at Your Job', '818: How to Find Greater Clarity, Satisfaction, and Fulfillment in Your Career with Scott Anthony Barlow - How to be Awesome at Your Job', '802: How to Level Up Your Career and Find a Job You Love with Brandi Nicole Johnson - How to be Awesome at Your Job', '789: How to Beat Stress, Stagnation, and Burnout with Alan Stein Jr. - How to be Awesome at Your Job', '772: How to Build Resilience to Thrive in Uncertainty with Gemma Leigh Roberts - How to be Awesome at Your Job', '768: How to Embrace Generational Differences and Resolve Conflict with Chris De Santis - How to be Awesome at Your Job', '763: Stephen M. R. Covey Reveals How Great Leaders Inspire Teams - How to be Awesome at Your Job', '727: How to Start Something New and See it Through with Michael Bungay Stanier - How to be Awesome at Your Job', '696: How to Separate Truth from Bullsh*t for Smarter Decisions with John V. Petrocelli - 

In [21]:
#These names are in our unique names list, but are missing the speaker's title
#We're probably going to need to revisit these names because the transcripts tend to not be so formal with the
#speakers name. For example Stephen M. R. Covey, is just Stephen Covey...
#In other cases, doctors may be referred to as Dr. Last name or First name Last name
#We'll revisit this at a later time, but it's helping us get in the direction we need to go!

missing_long_names = ['Dr. Michael D. Watkins', 'Dr. W. Chris Winter']

long_names.extend(missing_long_names)
long_names.remove('Michael D. Watkins')
long_names.remove('W. Chris Winter')

#Add the names to the list of unique speakers
unique_speakers.extend(missing_long_names)

#Remove the names missing the titles...
unique_speakers.remove('Michael D. Watkins')
unique_speakers.remove('W. Chris Winter')

In [22]:
#Now that we have a list of names pulled from the titles of episodes,
#Let's see if we can filter the episodes to those where the guest's name doesn't match the episode

#First, let's find the episodes where the guest's name is in the episode transcript
filtered_df = new_df[new_df['text'].isin(unique_speakers)]

#Let's make that a list
missing = filtered_df.drop_duplicates(subset='title', keep='first', inplace=False)

filtered_title_list = missing['title'].tolist()


#And subtract the episodes that aren't in that list
filtered_titles = [title for title in title_list if not any(name == title for name in filtered_title_list)]
print(len(filtered_titles))
print(filtered_titles)

3
['415: Pursuing Your Passion the Smart Way with Brad Stulberg - How to be Awesome at Your Job', '291: Deciding Whether to Stay or Go with Pete Mockaitis - How to be Awesome at Your Job', '034: Accelerating Amid Complexity with Kevan Hall - How to be Awesome at Your Job']


In [23]:
missing_names = ['Brad Stulberg', 'Pete Mockaitis', 'Kevan Hall']

#Add all of the names to the list of speakers
unique_speakers.extend(missing_names)

#And remove any duplicates
unique_speakers = list(set(unique_speakers))
print(len(unique_speakers))

758


In [24]:
# Now that we have a list of speakers, let's use that to setup our dataframe
# title | speaker | text
# Let's start by getting a collection of who our speakers are:

speakers = []
statements = []
current_speaker = None
for index, row in new_df.iterrows():
    title = row['title']
    text = row['text']
    if text in unique_speakers:
        current_speaker = text
        speakers.append(text)
    else:
        statements.append([title, current_speaker, text])

statement_df = pd.DataFrame(statements, columns=['title', 'speaker', 'statement'])
statement_df.head(20)

Unnamed: 0,title,speaker,statement
0,835: How to Thrive amid Stress and Irritation ...,,
1,835: How to Thrive amid Stress and Irritation ...,Pete Mockaitis,"Sharon, welcome to How to be Awesome at Your Job."
2,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,"Delighted to be here, Pete."
3,835: How to Thrive amid Stress and Irritation ...,Pete Mockaitis,"Well, I’d love it if you could kick us off by ..."
4,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,"What a claim to fame. Well, it was early on in..."
5,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,"Without overthinking it, I went right over, I ..."
6,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,"At a certain point, she turns to her chief of ..."
7,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,And she starts telling me about all the initia...
8,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,"Well, I don’t exactly say no, like, I said, “W..."
9,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,What I did was I prioritized my own evaluation...


In [25]:
#This looks really good. Now let's see if we can find any episodes missing a speaker other than Pete

# Group the data by "title" and count the unique values in the "speaker" column
speaker_count = statement_df.groupby('title')['speaker'].nunique()

# Filter the data to only keep the rows where the count is less than 2
filtered_df = statement_df[statement_df['title'].isin(speaker_count[speaker_count < 2].index)]

filtered_df.head(20)

Unnamed: 0,title,speaker,statement
48963,426: How to Feel Limitless in Your Career with...,Pete Mockaitis,
48964,426: How to Feel Limitless in Your Career with...,Pete Mockaitis,"Laura, thanks so much for joining us here on t..."
48965,426: How to Feel Limitless in Your Career with...,Pete Mockaitis,Laura Gassner Otting
48966,426: How to Feel Limitless in Your Career with...,Pete Mockaitis,Thank you so much. This is such a better podca...
48967,426: How to Feel Limitless in Your Career with...,Pete Mockaitis,"Yeah, that one sort of petered out pretty quic..."
48968,426: How to Feel Limitless in Your Career with...,Pete Mockaitis,Laura Gassner Otting
48969,426: How to Feel Limitless in Your Career with...,Pete Mockaitis,Yeah. I was that kid in gym class growing up t...
48970,426: How to Feel Limitless in Your Career with...,Pete Mockaitis,I also went to cyber camp.
48971,426: How to Feel Limitless in Your Career with...,Pete Mockaitis,Laura Gassner Otting
48972,426: How to Feel Limitless in Your Career with...,Pete Mockaitis,Did you really?


In [26]:
#And let's see if we can clean these 1 by 1
#We'll add Laura to our list and recreate our dataframe

unique_speakers.append('Laura Gassner Otting')

#We'll be doing this a few times, so let's make this a function
def create_statement_df(df):
    global unique_speakers
    speakers = []
    statements = []
    current_speaker = None
    for index, row in df.iterrows():
        title = row['title']
        text = row['text']
        if text in unique_speakers:
            current_speaker = text
            speakers.append(text)
        else:
            statements.append([title, current_speaker, text])
    statements = pd.DataFrame(statements, columns=['title', 'speaker', 'statement'])
    return statements

statement_df = create_statement_df(new_df)
statement_df.head(20)

Unnamed: 0,title,speaker,statement
0,835: How to Thrive amid Stress and Irritation ...,,
1,835: How to Thrive amid Stress and Irritation ...,Pete Mockaitis,"Sharon, welcome to How to be Awesome at Your Job."
2,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,"Delighted to be here, Pete."
3,835: How to Thrive amid Stress and Irritation ...,Pete Mockaitis,"Well, I’d love it if you could kick us off by ..."
4,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,"What a claim to fame. Well, it was early on in..."
5,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,"Without overthinking it, I went right over, I ..."
6,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,"At a certain point, she turns to her chief of ..."
7,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,And she starts telling me about all the initia...
8,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,"Well, I don’t exactly say no, like, I said, “W..."
9,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,What I did was I prioritized my own evaluation...


In [27]:
#And since we'll be cleaning these, let's make some functions for them as well...

def get_dirty_data(df):
    # Group the data by "title" and count the unique values in the "speaker" column
    speaker_count = df.groupby('title')['speaker'].nunique()
    # Filter the data to only keep the rows where the count is less than 2
    filtered_df = df[df['title'].isin(speaker_count[speaker_count < 2].index)]    
    return filtered_df

filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
50452,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,
50453,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Pete Mockaitis: Brad, welcome back to the How ..."
50454,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Brad Stulberg: Hey, thanks so much for having me."
50455,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Pete Mockaitis: Well, I’m excited to dig into ..."
50456,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,Brad Stulberg: My love of cats. How do you kno...
50457,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Pete Mockaitis: Well, there’s a form I have gu..."
50458,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Brad Stulberg: Oh, I said I loved-"
50459,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Pete Mockaitis: Oh, yeah, you totally … You ju..."
50460,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Brad Stulberg: I’ve got two, as you said, Sonn..."
50461,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Pete Mockaitis: Well, tell me what are some of..."


#Note, look at episode 415 again

In [28]:
#This looks like an easy fix!

df.replace('Pete Mockaitis:', "Pete Mockaitis", inplace=True)
df.replace('Brad Stulberg:', "Brad Stulberg", inplace=True)

lines = []

for index, row in df.iterrows():
    text = row['text']
    title = row['title']
    transcript = text.split("\n")
    for line in transcript:
        line = BeautifulSoup(line, 'html.parser').get_text(strip=True)
        lines.append([title, line])  
        
new_df = pd.DataFrame(lines, columns = ['title', 'text'])
new_df['text'] = new_df['text'].str.strip()
new_df['title'] = new_df['title'].str.strip()


statement_df = create_statement_df(new_df)
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
50452,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,
50453,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Pete Mockaitis: Brad, welcome back to the How ..."
50454,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Brad Stulberg: Hey, thanks so much for having me."
50455,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Pete Mockaitis: Well, I’m excited to dig into ..."
50456,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,Brad Stulberg: My love of cats. How do you kno...
50457,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Pete Mockaitis: Well, there’s a form I have gu..."
50458,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Brad Stulberg: Oh, I said I loved-"
50459,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Pete Mockaitis: Oh, yeah, you totally … You ju..."
50460,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Brad Stulberg: I’ve got two, as you said, Sonn..."
50461,415: Pursuing Your Passion the Smart Way with ...,Steven Landsburg,"Pete Mockaitis: Well, tell me what are some of..."


In [29]:
#Looks like our colons weren't removed... let's try that again...

df['text'] = df['text'].apply(lambda x: str(x).replace(u'Pete Mockaitis:', u'\nPete Mockaitis\n'))
df['text'] = df['text'].apply(lambda x: str(x).replace(u'Brad Stulberg:', u'\nBrad Stulberg\n'))

lines = []

for index, row in df.iterrows():
    text = row['text']
    title = row['title']
    transcript = text.split("\n")
    for line in transcript:
        line = BeautifulSoup(line, 'html.parser').get_text(strip=True)
        lines.append([title, line])  
        
new_df = pd.DataFrame(lines, columns = ['title', 'text'])
new_df['text'] = new_df['text'].str.strip()
new_df['title'] = new_df['title'].str.strip()


statement_df = create_statement_df(new_df)
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
54599,390: Five Practices for Flexible Course Correc...,Pete Mockaitis,
54600,390: Five Practices for Flexible Course Correc...,Pete Mockaitis,"Ed, thanks so much for joining us here on the ..."
54601,390: Five Practices for Flexible Course Correc...,Pete Mockaitis,Edward Muzio
54602,390: Five Practices for Flexible Course Correc...,Pete Mockaitis,"Hi Pete, thanks for having me."
54603,390: Five Practices for Flexible Course Correc...,Pete Mockaitis,"Oh, well, it’s good to have you. I’m excited t..."
54604,390: Five Practices for Flexible Course Correc...,Pete Mockaitis,Edward Muzio
54605,390: Five Practices for Flexible Course Correc...,Pete Mockaitis,"My kindergartener had a talent for music, so w..."
54606,390: Five Practices for Flexible Course Correc...,Pete Mockaitis,What I can tell you is I’m ahead of him now Pe...
54607,390: Five Practices for Flexible Course Correc...,Pete Mockaitis,He has a sort of infinite patience in the sens...
54608,390: Five Practices for Flexible Course Correc...,Pete Mockaitis,"Well, I have a feeling he’s going to love that..."


In [30]:
#We'll add Edward Muzio to our list

unique_speakers.append('Edward Muzio')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
71759,291: Deciding Whether to Stay or Go with Pete ...,Oren Jay Sofer,
71760,291: Deciding Whether to Stay or Go with Pete ...,Oren Jay Sofer,I’m so fired up to dig into this stuff. I want...
71761,291: Deciding Whether to Stay or Go with Pete ...,Oren Jay Sofer,The reason is because I’ve done many coaching ...
71762,291: Deciding Whether to Stay or Go with Pete ...,Oren Jay Sofer,"Beyond that, in talking with listeners, it see..."
71763,291: Deciding Whether to Stay or Go with Pete ...,Oren Jay Sofer,It’s a tricky question and one that cannot be ...
71764,291: Deciding Whether to Stay or Go with Pete ...,Oren Jay Sofer,I want to share with you some of the greatest ...
71765,291: Deciding Whether to Stay or Go with Pete ...,Oren Jay Sofer,I’m going to talk through each of these questi...
71766,291: Deciding Whether to Stay or Go with Pete ...,Oren Jay Sofer,You might have a decision by the end of this p...
71767,291: Deciding Whether to Stay or Go with Pete ...,Oren Jay Sofer,The first question to consider here is what’s ...
71768,291: Deciding Whether to Stay or Go with Pete ...,Oren Jay Sofer,I’ve already had folks share with me as they’v...


In [31]:
#This episode is actually a monologue, so let's get rid of it...
new_df = new_df[new_df['title'] != '291: Deciding Whether to Stay or Go with Pete Mockaitis - How to be Awesome at Your Job']
df = df[df['title'] != '291: Deciding Whether to Stay or Go with Pete Mockaitis - How to be Awesome at Your Job']

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
83953,"203: Cultivating Sponsors, Developing Fearless...",Pete Mockaitis,
83954,"203: Cultivating Sponsors, Developing Fearless...",Pete Mockaitis,"Simon, thanks for joining us here on the How t..."
83955,"203: Cultivating Sponsors, Developing Fearless...",Pete Mockaitis,Simon T. Bailey
83956,"203: Cultivating Sponsors, Developing Fearless...",Pete Mockaitis,Thank you so much for having me. Good to be wi...
83957,"203: Cultivating Sponsors, Developing Fearless...",Pete Mockaitis,"Well, I’m so excited to dig into this conversa..."
83958,"203: Cultivating Sponsors, Developing Fearless...",Pete Mockaitis,Simon T. Bailey
83959,"203: Cultivating Sponsors, Developing Fearless...",Pete Mockaitis,"We do about 175 days a year, speak about 100 t..."
83960,"203: Cultivating Sponsors, Developing Fearless...",Pete Mockaitis,"Oh, that’s wild. I love it when speakers get t..."
83961,"203: Cultivating Sponsors, Developing Fearless...",Pete Mockaitis,Simon T. Bailey
83962,"203: Cultivating Sponsors, Developing Fearless...",Pete Mockaitis,"Oh, my goodness. It just happened a couple of ..."


In [32]:
#We'll add Simon T. Bailey to our list

unique_speakers.append('Simon T. Bailey')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
103608,041: Developing a Powerful Presence with Dr. N...,Pete Mockaitis,
103609,041: Developing a Powerful Presence with Dr. N...,Pete Mockaitis,"Nick, thank you so much for joining us here on..."
103610,041: Developing a Powerful Presence with Dr. N...,Pete Mockaitis,Dr. Nick Morgan
103611,041: Developing a Powerful Presence with Dr. N...,Pete Mockaitis,"It’s a pleasure, Pete. Thanks for having me on..."
103612,041: Developing a Powerful Presence with Dr. N...,Pete Mockaitis,"Oh, yes well I’m so pumped up. I’ve been a big..."
103613,041: Developing a Powerful Presence with Dr. N...,Pete Mockaitis,Dr. Nick Morgan
103614,041: Developing a Powerful Presence with Dr. N...,Pete Mockaitis,"Yes, sure. I worked only recently with a consu..."
103615,041: Developing a Powerful Presence with Dr. N...,Pete Mockaitis,Her affect was very minimal. She was successfu...
103616,041: Developing a Powerful Presence with Dr. N...,Pete Mockaitis,You don’t want to get somebody pretending to b...
103617,041: Developing a Powerful Presence with Dr. N...,Pete Mockaitis,"Well it is a great story, and now you got me c..."


In [33]:
#We'll add Dr. Nick Morgan to our list

unique_speakers.append('Dr. Nick Morgan')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement


In [34]:
#Looks like we've handled our missing speakers! Let's see if there's anything where there are possibly too many speakers

# Group the data by "title" and count the unique values in the "speaker" column
speaker_count = statement_df.groupby('title')['speaker'].nunique()

# Filter the data to only keep the rows where the count is more than 2
filtered_df = statement_df[statement_df['title'].isin(speaker_count[speaker_count > 2].index)]

filtered_df.head(20)

Unnamed: 0,title,speaker,statement
107,834: How to End Micromanagement Once and For A...,Sharon Melnick,
108,834: How to End Micromanagement Once and For A...,Pete Mockaitis,"Lia, welcome back to How to be Awesome at Your..."
109,834: How to End Micromanagement Once and For A...,Lia Garvin,Thank you so much for having me. So excited to...
110,834: How to End Micromanagement Once and For A...,Pete Mockaitis,"Well, I’m excited to dig into what you’ve been..."
111,834: How to End Micromanagement Once and For A...,Lia Garvin,"Yes, with micromanagement and how to end it on..."
112,834: How to End Micromanagement Once and For A...,Pete Mockaitis,"Yes. Well, lay it on us, what’s the story?"
113,834: How to End Micromanagement Once and For A...,Lia Garvin,"Yeah. Since we last met, I actually ended up l..."
114,834: How to End Micromanagement Once and For A...,Pete Mockaitis,"Well, yes, I’ve heard it many a time. So, mayb..."
115,834: How to End Micromanagement Once and For A...,Lia Garvin,"So, I think a lot does depend on the kind of j..."
116,834: How to End Micromanagement Once and For A...,Lia Garvin,Let’s say we bring that into a job that’s more...


In [35]:
#Looks like we're carrying over the last speaker from the previous episode... Let's remove those lines

new_df = new_df[new_df['text'] != '']

#Recreate our dataframe
statement_df = create_statement_df(new_df)

# Group the data by "title" and count the unique values in the "speaker" column
speaker_count = statement_df.groupby('title')['speaker'].nunique()

# Filter the data to only keep the rows where the count is more than 2
filtered_df = statement_df[statement_df['title'].isin(speaker_count[speaker_count > 2].index)]

filtered_df.head(20)

Unnamed: 0,title,speaker,statement
6079,783: How to Restore Energy and Clarity by Tuni...,Pete Mockaitis,"Leigh and Justin, welcome to How to be Awesome..."
6080,783: How to Restore Energy and Clarity by Tuni...,Leigh Marz,"Hey, thanks, Pete."
6081,783: How to Restore Energy and Clarity by Tuni...,Justin Zorn,"Thanks for having us, Pete."
6082,783: How to Restore Energy and Clarity by Tuni...,Pete Mockaitis,"Well, I’m excited to dig in. And, first, I’d l..."
6083,783: How to Restore Energy and Clarity by Tuni...,Leigh Marz,"Well, I guess what we found is that when we st..."
6084,783: How to Restore Energy and Clarity by Tuni...,Justin Zorn,We really did start thinking about the importa...
6085,783: How to Restore Energy and Clarity by Tuni...,Justin Zorn,"And as we asked people this question, “What’s ..."
6086,783: How to Restore Energy and Clarity by Tuni...,Pete Mockaitis,That’s so funny because the first thing I thou...
6087,783: How to Restore Energy and Clarity by Tuni...,Leigh Marz,"That was the big surprise. Yeah, that was the ..."
6088,783: How to Restore Energy and Clarity by Tuni...,Justin Zorn,"And the funny thing is, even an anechoic chamb..."


In [36]:
#I went through all of these and Pete's name is spelled wrong in the following episode
#287: Establishing Motivation, Intention, and Boundaries Like a Boss with Emily Thompson and Kathleen Shannon - How to be Awesome at Your Job

df['text'] = df['text'].apply(lambda x: str(x).replace(u'Pete Mocakitis', u'Pete Mockaitis'))
new_df.replace('Pete Mocakitis', "Pete Mockaitis", inplace=True)

#Recreate our dataframe
statement_df = create_statement_df(new_df)

In [37]:
#This dataset is actually in a really, really great place!
#We have an effective list of all of our speakers, and our statements are mostly labeled!
#In previous pipeline attempts, we learned that there are some quirks we're going to attempt to address now
#There are a few lines in a few episodes that were not separated by a '\n', for example:
#John Poelstra: Yes.
#Matthew Abrahams Yes.
#Pete Mockaitis Okay.
#What we want to do is try to find the times where the statement starts with a name in our list of speakers
#And then we can review those

filtered_df = statement_df[statement_df['statement'].str.startswith(tuple(unique_speakers))]
filtered_df.head()

Unnamed: 0,title,speaker,statement
2104,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2107,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2110,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2113,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2117,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger


In [38]:
filtered_df.at[2113, 'title']

'817: How to Navigate Complexity and Win with Jennifer Garvey Berger - How to be Awesome at Your Job'

In [39]:
filtered_df = filtered_df[filtered_df['title'] == '817: How to Navigate Complexity and Win with Jennifer Garvey Berger - How to be Awesome at Your Job']
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
2104,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2107,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2110,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2113,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2117,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2123,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2126,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2129,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2132,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2135,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger


In [40]:
filtered_df.at[2104, 'statement']

'Jennifer Garvey Berger'

In [41]:
statement_df.iloc[2100:2115]

Unnamed: 0,title,speaker,statement
2100,"818: How to Find Greater Clarity, Satisfaction...",Scott Anthony Barlow,"We’ve had almost 50,000 people at this point t..."
2101,"818: How to Find Greater Clarity, Satisfaction...",Pete Mockaitis,"All right. Scott, this has been a treat. I wis..."
2102,"818: How to Find Greater Clarity, Satisfaction...",Scott Anthony Barlow,I appreciate it.
2103,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,"Jennifer, welcome to How to be Awesome at Your..."
2104,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2105,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Thank you for having me. It’s great to be here.
2106,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,"Well, it’s great to have you and I appreciate ..."
2107,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,Jennifer Garvey Berger
2108,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,"You know, we moved to New Zealand in 2006, and..."
2109,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,"That is good. And I’ve been surprised at how, ..."


In [42]:
#Let's add Jennifer to our unique speakers list and see if that changes anything?

unique_speakers.append('Jennifer Garvey Berger')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
statement_df.iloc[2100:2115]

Unnamed: 0,title,speaker,statement
2100,"818: How to Find Greater Clarity, Satisfaction...",Scott Anthony Barlow,"We’ve had almost 50,000 people at this point t..."
2101,"818: How to Find Greater Clarity, Satisfaction...",Pete Mockaitis,"All right. Scott, this has been a treat. I wis..."
2102,"818: How to Find Greater Clarity, Satisfaction...",Scott Anthony Barlow,I appreciate it.
2103,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,"Jennifer, welcome to How to be Awesome at Your..."
2104,817: How to Navigate Complexity and Win with J...,Jennifer Garvey Berger,Thank you for having me. It’s great to be here.
2105,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,"Well, it’s great to have you and I appreciate ..."
2106,817: How to Navigate Complexity and Win with J...,Jennifer Garvey Berger,"You know, we moved to New Zealand in 2006, and..."
2107,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,"That is good. And I’ve been surprised at how, ..."
2108,817: How to Navigate Complexity and Win with J...,Jennifer Garvey Berger,It was amazing. I used to get into taxicabs an...
2109,817: How to Navigate Complexity and Win with J...,Pete Mockaitis,"That makes sense. Well, let’s talk about attai..."


In [43]:
#Let's get back into that dirty data
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
5285,"789: How to Beat Stress, Stagnation, and Burno...",Pete Mockaitis,"Alan, welcome back to How to be Awesome at You..."
5286,"789: How to Beat Stress, Stagnation, and Burno...",Pete Mockaitis,"Alan Stein, Jr."
5287,"789: How to Beat Stress, Stagnation, and Burno...",Pete Mockaitis,"Oh, it’s so great to be with you again. I’ve b..."
5288,"789: How to Beat Stress, Stagnation, and Burno...",Pete Mockaitis,"Oh, me, too. Well, I’m curious to hear, any pa..."
5289,"789: How to Beat Stress, Stagnation, and Burno...",Pete Mockaitis,"Alan Stein, Jr."
5290,"789: How to Beat Stress, Stagnation, and Burno...",Pete Mockaitis,"Yeah, I would say a pretty long list of them, ..."
5291,"789: How to Beat Stress, Stagnation, and Burno...",Pete Mockaitis,"So, my most recent book is about stress, stagn..."
5292,"789: How to Beat Stress, Stagnation, and Burno...",Pete Mockaitis,"Well, let’s talk about the book Sustain Your G..."
5293,"789: How to Beat Stress, Stagnation, and Burno...",Pete Mockaitis,"Alan Stein, Jr."
5294,"789: How to Beat Stress, Stagnation, and Burno...",Pete Mockaitis,"Well, the big idea, I think the cornerstone of..."


In [44]:
#We'll add Alan Stein, Jr.

unique_speakers.append('Alan Stein, Jr.')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
7403,772: How to Build Resilience to Thrive in Unce...,Pete Mockaitis,"Gemma, welcome to How to be Awesome at Your Job."
7404,772: How to Build Resilience to Thrive in Unce...,Pete Mockaitis,Gemma Roberts
7405,772: How to Build Resilience to Thrive in Unce...,Pete Mockaitis,"Hello. Thanks for having me, Pete."
7406,772: How to Build Resilience to Thrive in Unce...,Pete Mockaitis,"Well, I’m excited to hear your wisdom. And I’d..."
7407,772: How to Build Resilience to Thrive in Unce...,Pete Mockaitis,Gemma Roberts
7408,772: How to Build Resilience to Thrive in Unce...,Pete Mockaitis,"Yes. Well, I guess a lot changed in life aroun..."
7409,772: How to Build Resilience to Thrive in Unce...,Pete Mockaitis,"So, I thought at that time that, actually, the..."
7410,772: How to Build Resilience to Thrive in Unce...,Pete Mockaitis,"So, I actually thought of it as an obligation ..."
7411,772: How to Build Resilience to Thrive in Unce...,Pete Mockaitis,All right.
7412,772: How to Build Resilience to Thrive in Unce...,Pete Mockaitis,Gemma Roberts


In [45]:
#We'll add Gemma Roberts

unique_speakers.append('Gemma Roberts')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
8830,763: Stephen M. R. Covey Reveals How Great Lea...,Pete Mockaitis,"Stephen, welcome to How to be Awesome at Your ..."
8831,763: Stephen M. R. Covey Reveals How Great Lea...,Pete Mockaitis,Stephen Covey
8832,763: Stephen M. R. Covey Reveals How Great Lea...,Pete Mockaitis,"Hi, Pete. Excited to be with you today."
8833,763: Stephen M. R. Covey Reveals How Great Lea...,Pete Mockaitis,"Oh, I’m excited as well. I’m really looking fo..."
8834,763: Stephen M. R. Covey Reveals How Great Lea...,Pete Mockaitis,Stephen Covey
8835,763: Stephen M. R. Covey Reveals How Great Lea...,Pete Mockaitis,"Yeah, absolutely, several but I’ll share one o..."
8836,763: Stephen M. R. Covey Reveals How Great Lea...,Pete Mockaitis,"Now, this was back in the days before automati..."
8837,763: Stephen M. R. Covey Reveals How Great Lea...,Pete Mockaitis,And then he kind of taught me what clean meant...
8838,763: Stephen M. R. Covey Reveals How Great Lea...,Pete Mockaitis,Okay.
8839,763: Stephen M. R. Covey Reveals How Great Lea...,Pete Mockaitis,Stephen Covey


In [46]:
#We'll add Stephen Covey

unique_speakers.append('Stephen Covey')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
16204,696: How to Separate Truth from Bullsh*t for S...,Pete Mockaitis,We’ll get a few seconds of silence for the aud...
16205,696: How to Separate Truth from Bullsh*t for S...,Pete Mockaitis,John Petrocelli
16206,696: How to Separate Truth from Bullsh*t for S...,Pete Mockaitis,"Well, thank you. Thank you for having me."
16207,696: How to Separate Truth from Bullsh*t for S...,Pete Mockaitis,"Well, I’m so excited to dig into the wisdom of..."
16208,696: How to Separate Truth from Bullsh*t for S...,Pete Mockaitis,John Petrocelli
16209,696: How to Separate Truth from Bullsh*t for S...,Pete Mockaitis,"Well, I think everybody does actually. When I ..."
16210,696: How to Separate Truth from Bullsh*t for S...,Pete Mockaitis,"So, I’m convinced that we are constantly surro..."
16211,696: How to Separate Truth from Bullsh*t for S...,Pete Mockaitis,But we often think that it doesn’t have the de...
16212,696: How to Separate Truth from Bullsh*t for S...,Pete Mockaitis,"Okay. Well, I want to definitely hear about th..."
16213,696: How to Separate Truth from Bullsh*t for S...,Pete Mockaitis,John Petrocelli


In [47]:
#We'll add John Petrocelli

unique_speakers.append('John Petrocelli')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
23922,"625: How to Be Happier, More Fulfilled, and Mo...",Pete Mockaitis,"Tal, thanks for joining us here on the How to ..."
23923,"625: How to Be Happier, More Fulfilled, and Mo...",Pete Mockaitis,Tal Ben-Shahar
23924,"625: How to Be Happier, More Fulfilled, and Mo...",Pete Mockaitis,"Thank you, Pete. It’s great to be here."
23925,"625: How to Be Happier, More Fulfilled, and Mo...",Pete Mockaitis,"Well, I’m so excited to chat with you. I’ve re..."
23926,"625: How to Be Happier, More Fulfilled, and Mo...",Pete Mockaitis,Tal Ben-Shahar
23927,"625: How to Be Happier, More Fulfilled, and Mo...",Pete Mockaitis,"So, I became interested in happiness because o..."
23928,"625: How to Be Happier, More Fulfilled, and Mo...",Pete Mockaitis,"Now, I remember, this was a very cold Boston m..."
23929,"625: How to Be Happier, More Fulfilled, and Mo...",Pete Mockaitis,"And I said, “Because I have two questions. The..."
23930,"625: How to Be Happier, More Fulfilled, and Mo...",Pete Mockaitis,"Actually, I did become happier as a result of ..."
23931,"625: How to Be Happier, More Fulfilled, and Mo...",Pete Mockaitis,"Well, that’s great. I got a chuckle out of “Ab..."


In [48]:
#We'll add Tal Ben-Shahar

unique_speakers.append('Tal Ben-Shahar')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
26878,599: How to Break the Habit of Anxiety Using C...,Pete Mockaitis,"Jud, thanks so much for joining us here on the..."
26879,599: How to Break the Habit of Anxiety Using C...,Pete Mockaitis,Judson Brewer
26880,599: How to Break the Habit of Anxiety Using C...,Pete Mockaitis,Thanks for having me.
26881,599: How to Break the Habit of Anxiety Using C...,Pete Mockaitis,"Well, I’m excited to dig into your wisdom. One..."
26882,599: How to Break the Habit of Anxiety Using C...,Pete Mockaitis,Judson Brewer
26883,599: How to Break the Habit of Anxiety Using C...,Pete Mockaitis,"Well, just one of the many is that it’s actual..."
26884,599: How to Break the Habit of Anxiety Using C...,Pete Mockaitis,"Okay, yeah. Thank you. Much to chew on already..."
26885,599: How to Break the Habit of Anxiety Using C...,Pete Mockaitis,Judson Brewer
26886,599: How to Break the Habit of Anxiety Using C...,Pete Mockaitis,"Yeah, I think I have a very simple one. It’s k..."
26887,599: How to Break the Habit of Anxiety Using C...,Pete Mockaitis,"Okay. Well, then it sounds like we all do some..."


In [49]:
#We'll add Judson Brewer

unique_speakers.append('Judson Brewer')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
30800,562: How to Get More Done by Working Less with...,Pete Mockaitis,"Alex, thanks so much for joining us here on th..."
30801,562: How to Get More Done by Working Less with...,Pete Mockaitis,Alex Soojung-Kim Pang
30802,562: How to Get More Done by Working Less with...,Pete Mockaitis,"Oh, thanks very much. It’s a pleasure to be wi..."
30803,562: How to Get More Done by Working Less with...,Pete Mockaitis,"Well, I’m excited to talk about working less a..."
30804,562: How to Get More Done by Working Less with...,Pete Mockaitis,Alex Soojung-Kim Pang
30805,562: How to Get More Done by Working Less with...,Pete Mockaitis,I think it’s a challenge for everybody. I do a...
30806,562: How to Get More Done by Working Less with...,Pete Mockaitis,"Excellent. Well, I’m glad to hear that you’re ..."
30807,562: How to Get More Done by Working Less with...,Pete Mockaitis,Alex Soojung-Kim Pang
30808,562: How to Get More Done by Working Less with...,Pete Mockaitis,"So, Shorter is essentially a sequel to my prev..."
30809,562: How to Get More Done by Working Less with...,Pete Mockaitis,"And so, I started looking for organizations th..."


In [50]:
#We'll add Alex Soojung-Kim Pang

unique_speakers.append('Alex Soojung-Kim Pang')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
31782,552: The Foundational Principle that Separates...,Pete Mockaitis,"Pat, thanks for joining us here on the How to ..."
31783,552: The Foundational Principle that Separates...,Pete Mockaitis,Patrick Lencioni
31784,552: The Foundational Principle that Separates...,Pete Mockaitis,"It’s great to be with you, Pete."
31785,552: The Foundational Principle that Separates...,Pete Mockaitis,"Well, I’ve been so excited to chat with you he..."
31786,552: The Foundational Principle that Separates...,Pete Mockaitis,Patrick Lencioni
31787,552: The Foundational Principle that Separates...,Pete Mockaitis,"Wow, there’s a lot there."
31788,552: The Foundational Principle that Separates...,Pete Mockaitis,Just breaking the ice.
31789,552: The Foundational Principle that Separates...,Pete Mockaitis,Patrick Lencioni
31790,552: The Foundational Principle that Separates...,Pete Mockaitis,I think the thing I would say is it’s messier ...
31791,552: The Foundational Principle that Separates...,Pete Mockaitis,"Yes. Well, and I think that kind of goes right..."


In [51]:
#We'll add Patrick Lencioni

unique_speakers.append('Patrick Lencioni')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
34480,527: How to Boost Energy through Greater Menta...,Pete Mockaitis,"Chris, thanks so much for joining us here on t..."
34481,527: How to Boost Energy through Greater Menta...,Pete Mockaitis,Chris Barez-Brown
34482,527: How to Boost Energy through Greater Menta...,Pete Mockaitis,"Hey, it’s absolutely a pleasure to be here, Pete."
34483,527: How to Boost Energy through Greater Menta...,Pete Mockaitis,"Well, I am so intrigued. You have taken the bo..."
34484,527: How to Boost Energy through Greater Menta...,Pete Mockaitis,Chris Barez-Brown
34485,527: How to Boost Energy through Greater Menta...,Pete Mockaitis,"Yeah, usually it raises an eyebrow or two to t..."
34486,527: How to Boost Energy through Greater Menta...,Pete Mockaitis,"Okay, certainly. Well, energy, I mean, hey, I’..."
34487,527: How to Boost Energy through Greater Menta...,Pete Mockaitis,Chris Barez-Brown
34488,527: How to Boost Energy through Greater Menta...,Pete Mockaitis,"Well, I think the biggest problem is that we’r..."
34489,527: How to Boost Energy through Greater Menta...,Pete Mockaitis,The business world in which we live right now ...


In [52]:
#We'll add Chris Barez-Brown

unique_speakers.append('Chris Barez-Brown')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
35987,513: How to Persuade When Facts Don’t Seem to ...,Pete Mockaitis,"Lee, thanks for joining us here on the How to ..."
35988,513: How to Persuade When Facts Don’t Seem to ...,Pete Mockaitis,Lee Carter
35989,513: How to Persuade When Facts Don’t Seem to ...,Pete Mockaitis,I’m so happy to be here and excited about this...
35990,513: How to Persuade When Facts Don’t Seem to ...,Pete Mockaitis,"Oh, me too. And one thing that we share is tha..."
35991,513: How to Persuade When Facts Don’t Seem to ...,Pete Mockaitis,Lee Carter
35992,513: How to Persuade When Facts Don’t Seem to ...,Pete Mockaitis,"Okay. So, I was a finalist and it was a long, ..."
35993,513: How to Persuade When Facts Don’t Seem to ...,Pete Mockaitis,And what I realized was that my parents and my...
35994,513: How to Persuade When Facts Don’t Seem to ...,Pete Mockaitis,"Tell me more. So, what would’ve been the negat..."
35995,513: How to Persuade When Facts Don’t Seem to ...,Pete Mockaitis,Lee Carter
35996,513: How to Persuade When Facts Don’t Seem to ...,Pete Mockaitis,"I just don’t think at age, whatever I was, 18,..."


In [53]:
#We'll add Lee Carter

unique_speakers.append('Lee Carter')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
36440,510: The Science Behind Successful Teams with ...,Pete Mockaitis,"Janice, thanks for joining us here on the How ..."
36441,510: The Science Behind Successful Teams with ...,Pete Mockaitis,Dr. Janice Presser
36442,510: The Science Behind Successful Teams with ...,Pete Mockaitis,It’s awesome to be here.
36443,510: The Science Behind Successful Teams with ...,Pete Mockaitis,"Well, I’m excited to get into your wisdom. And..."
36444,510: The Science Behind Successful Teams with ...,Pete Mockaitis,Dr. Janice Presser
36445,510: The Science Behind Successful Teams with ...,Pete Mockaitis,"Well, I started out life like anybody else try..."
36446,510: The Science Behind Successful Teams with ...,Pete Mockaitis,"And so, I actually started to think about, “Wh..."
36447,510: The Science Behind Successful Teams with ...,Pete Mockaitis,"You see, back in the day, there were lots of p..."
36448,510: The Science Behind Successful Teams with ...,Pete Mockaitis,"I mean, I had a whole lot of questions that ma..."
36449,510: The Science Behind Successful Teams with ...,Pete Mockaitis,"So, what made a whole lot of sense to me was, ..."


In [54]:
#Okay... This is a lot of manual adds. Let's consider how we might make this faster
#First let's see how many more there are...
len(filtered_df['title'].unique().tolist())

34

In [55]:
#34 is too many to do manually
#Let's see if we can identify the names

#We'll start with spaCy's name recognition
def is_name(word):
    doc = nlp(word)
    for entity in doc.ents:
        if entity.label_ == "PERSON":
            return True
    return False

names = []
for index, row in filtered_df.iterrows():
    text = row['statement']
    name = is_name(text)
    if name:
            names.append(text)

#Limit the list of entities to unique speakers
unique_names = list(set(names))

print(unique_names)

['Oh, all right. Well, I don’t know if you’ve read this, but they did make a movie, and I haven’t been able to bring myself to see the movie because I love the book so much. And it’s Madeleine L’Engle’s book A Wrinkle in Time and it’s a children’s book, and it’s part of her Time Trilogy which won all kinds of wonderful awards. And I love it because of the science in it.', 'They really took a role. They saw themselves as healers. And Wrzesniewski explained to me that when the hospital found out about this, the custodians were often told not to do this, because this was not part of their job description.', 'John, thanks so much for joining us here on the How to be Awesome at Your Job podcast.', 'Kim, that’s dead on, thank you. Please, unpack the others just like this.', 'And he kind of got a little upset about this, he said, “No, I got lots of friends.” And I said, “Yes, you do. Yes, you do. But you don’t have a lot of people that you need. And I don’t mean need for, ‘Give me a ride to t

In [56]:
#That did not work. Let's try this to use istitle

names = []
for index, row in filtered_df.iterrows():
    text = row['statement']
    if text.istitle():
            names.append(text)

#Limit the list of entities to unique speakers
unique_names = list(set(names))

print(len(unique_names))
print(unique_names)

79
['Great.', 'Drinking Jack Bauer.', 'Yeah, 2%.', 'Okay.', 'No.', 'Measure.', 'Jaws?', 'Christine E. Hassler', 'Pete', 'Cool.', 'Brad R. Staats', 'Perform.', 'Interesting.', 'Joseph Sanok', 'Well, Covey.', 'Yes.', 'Right', 'Lovely.', 'Edward Latimore', 'Wow. Yeah.', 'Yeah.', 'Sure.', 'Awesome.', 'Scott Young', 'John C Maxwell', 'Like Keifer Sutherland?', 'Excellent.', 'Congratulations! Congratulations!', 'Hawaii.', 'Zing!', 'Kim Powell', 'Whoa.', 'Oh.', 'Drew Geant', 'Right.', 'Jolene Blackbourn', 'Dr. Janice Presser', 'Mike Marquardt', 'Yay.', 'Me.', 'Michael J Arena', 'Ellen Ruppel Shell', 'Carmen Simon', 'Yes', 'Beautiful.', 'Exactly.', 'Awesome, Pete.', 'Kevan Hall:', 'Shawn Jones', 'Michael. P. Dolan', 'Yeah', 'James E. Lukaszewski', 'Yeah. Classic.', 'Thanks, Pete.', 'Yes. ….', 'Okay. Good.', 'Dr. Richard Shuster', 'W. Chris Winter', 'Michael Dow', 'Guy Bell', '#Kardashians.', 'Dr. Bob Nelson', 'Dr. John Townsend', 'Nice.', 'Pam Skillings', 'Tony Tjan', 'Michelle Lederman', 'Abs

In [57]:
doc = nlp(filtered_df.at[36441, 'statement'])

ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)
print(ent_francisco)

['Dr.', 'O', '']
['Janice', 'B', 'PERSON']


In [58]:
#Still pretty messy
#Let's revisit spaCy's named entity recognition with some better parameters around it

names = []
for index, row in filtered_df.iterrows():
    text = row['statement']
    doc = nlp(text)
    
    # Check if there are at least two and no more than four tokens in the Doc object
    if 2<= len(doc) <= 4:
        if doc[0].ent_type_ == 'PERSON' or doc[1].ent_type_ == 'PERSON':
            names.append(text)
    # If there's only one token, check if it's a 'PERSON'
    elif len(doc) == 1:
        if doc[0].ent_type_ == 'PERSON':
            names.append(text)

            
# Limit the list of entities to unique speakers
unique_names = list(set(names))

print(len(unique_names))
print(unique_names)

36
['Jolene Blackbourn', 'Dr. Janice Presser', 'Mike Marquardt', 'Pam McLean', 'Drinking Jack Bauer.', 'Dr. Richard Shuster', 'W. Chris Winter', 'Michael Dow', 'Guy Bell', 'Christine E. Hassler', 'Ellen Ruppel Shell', 'Michael J Arena', 'Dr. Bob Nelson', 'Dr. John Townsend', 'Pete', 'Brad R. Staats', 'Carmen Simon', 'Scott Young', 'John C Maxwell', 'Tony Tjan', 'Like Keifer Sutherland?', 'Michelle Lederman', 'Bill Poundstone', 'Kevan Hall:', 'Shawn Jones', 'Michael P. Dolan', 'Bob Sutton', 'Michael. P. Dolan', 'Kim Powell', 'Joseph Sanok', 'Linda Thaler', 'James E. Lukaszewski', 'Edward Latimore', 'Mamie Stewart', 'Dr. Michelle Reina', 'Drew Geant']


In [59]:
#That's pretty good. Let's do a little digging
Counter(names).most_common()

[('Joseph Sanok', 88),
 ('Kevan Hall:', 64),
 ('Drew Geant', 62),
 ('Christine E. Hassler', 54),
 ('Carmen Simon', 54),
 ('Michelle Lederman', 54),
 ('Bill Poundstone', 49),
 ('Brad R. Staats', 47),
 ('Pam McLean', 44),
 ('Michael Dow', 39),
 ('Kim Powell', 39),
 ('John C Maxwell', 38),
 ('W. Chris Winter', 35),
 ('Jolene Blackbourn', 34),
 ('Tony Tjan', 33),
 ('Michael J Arena', 32),
 ('Mamie Stewart', 32),
 ('Scott Young', 30),
 ('Shawn Jones', 30),
 ('Dr. Bob Nelson', 30),
 ('Edward Latimore', 30),
 ('Linda Thaler', 30),
 ('Dr. Michelle Reina', 29),
 ('Dr. Richard Shuster', 26),
 ('Ellen Ruppel Shell', 25),
 ('Bob Sutton', 24),
 ('Michael P. Dolan', 23),
 ('Dr. Janice Presser', 22),
 ('Mike Marquardt', 22),
 ('Dr. John Townsend', 21),
 ('Guy Bell', 21),
 ('James E. Lukaszewski', 15),
 ('Michael. P. Dolan', 5),
 ('Like Keifer Sutherland?', 1),
 ('Drinking Jack Bauer.', 1),
 ('Pete', 1)]

In [60]:
#Let's start by looking at the instances of a name being used only 1 time in case there's a problem

filtered_df.reset_index(drop=True, inplace = True)
i = filtered_df[filtered_df['statement'] == 'Pete'].index[0]
filtered_df.iloc[i-10:i+10]

Unnamed: 0,title,speaker,statement
3182,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis,"Because it’s one thing to say, “I’m working he..."
3183,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis,"Oh, I really like that. And so, you’re bringin..."
3184,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis,Carmen Simon
3185,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis,So true. And the nice thing about being able t...
3186,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis,"Yes. Well, I was going to go there next in ter..."
3187,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis,Carmen Simon
3188,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis,I really like that question because you’re so ...
3189,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis,"I’ve never said that. One year in, I’ve manage..."
3190,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis,Carmen Simon
3191,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis,You just recently got married. You just give i...


In [61]:
#Pete is going to be a challenge because it's supposed to be Pete Mockaitis...
title_pete = filtered_df.at[i, 'title']
episode = df[df['title'] == title_pete]
episode.reset_index(drop=True, inplace = True)
episode_text = episode.at[0, 'text']
print(episode_text)


Pete Mockaitis
Carmen, welcome back to the How to Be Awesome at Your Job podcast.
Carmen Simon
Thank you. Thank you so much and welcome back, everyone.
Pete Mockaitis
Well, it’s so fun. There’s only been about three guests who have done, well, exactly three, I believe, who have made a repeat appearance, so welcome. It’s cool to have you in the club here.
Carmen Simon
Thank you so much. And, you know, repetition is the mother of memory so repeated exposure with yet some statements that people remember.
Pete Mockaitis
Oh, that is well-played. You know, Skype just informed me that your birthday is on New Year’s Day.
Carmen Simon
Oh, I wonder how it got that information. No, it’s actually equally cool birthday. I’m a Halloween baby.
Pete Mockaitis
No kidding? Well, maybe you just set said, “Forget this, Skype. I’m not telling you my birthday. I’m filling in, oh, 1-01.”
Carmen Simon
Yes, it’s relinking this with the concept of memory. False memories are very much of a cautionary topic for 

In [62]:
df_i = df[df['title'] == title_pete].index[0]
print(df_i)
df.iloc[df_i-3:df_i+2]

602


Unnamed: 0.1,Unnamed: 0,title,text,date
600,605,239: Building Yours Systems for Success with S...,"\nPete Mockaitis\nSam, thanks so much for join...","January 16, 2023"
601,606,238: The Ingredients of a Great First Impressi...,"\nPete Mockaitis\nAnn, thanks so much for join...","January 16, 2023"
602,607,237: Crafting Memorable Stories with Dr. Carme...,"\nPete Mockaitis\nCarmen, welcome back to the ...","January 16, 2023"
603,608,236: Persuasion Pointers from a Legendary Info...,"\nPete Mockaitis\nSully, thanks so much for jo...","January 16, 2023"
604,609,235: The Power of Finding Your Why with David ...,"\nPete Mockaitis\nDavid, thanks so much for jo...","January 16, 2023"


In [63]:
#Let's replace it in our original transcript incase we need to re-use it later..

def replace_pete(text):
    lines = text.split("\n")
    for i, line in enumerate(lines):
        if line.strip() == "Pete":
            lines[i] = "Pete Mockaitis"
    return "\n".join(lines)

# Replace 'Pete' with 'Pete Mockaitis' in the specific row and column
df.at[df_i, 'text'] = replace_pete(df.at[df_i, 'text'])

title_pete = filtered_df.at[i, 'title']
episode = df[df['title'] == title_pete]
episode.reset_index(drop=True, inplace = True)
episode_text = episode.at[0, 'text']
print(episode_text)


Pete Mockaitis
Carmen, welcome back to the How to Be Awesome at Your Job podcast.
Carmen Simon
Thank you. Thank you so much and welcome back, everyone.
Pete Mockaitis
Well, it’s so fun. There’s only been about three guests who have done, well, exactly three, I believe, who have made a repeat appearance, so welcome. It’s cool to have you in the club here.
Carmen Simon
Thank you so much. And, you know, repetition is the mother of memory so repeated exposure with yet some statements that people remember.
Pete Mockaitis
Oh, that is well-played. You know, Skype just informed me that your birthday is on New Year’s Day.
Carmen Simon
Oh, I wonder how it got that information. No, it’s actually equally cool birthday. I’m a Halloween baby.
Pete Mockaitis
No kidding? Well, maybe you just set said, “Forget this, Skype. I’m not telling you my birthday. I’m filling in, oh, 1-01.”
Carmen Simon
Yes, it’s relinking this with the concept of memory. False memories are very much of a cautionary topic for 

In [64]:
#And we can target this specific line in the speaker separated dataframe

new_df[new_df['text'] == 'Pete']
new_df.reset_index(drop=True, inplace = True)

pete_i = new_df[new_df['text'] == 'Pete'].index[0]

new_df.iloc[pete_i-2:pete_i+3]

Unnamed: 0,title,text
118965,237: Crafting Memorable Stories with Dr. Carme...,Carmen Simon
118966,237: Crafting Memorable Stories with Dr. Carme...,You just recently got married. You just give i...
118967,237: Crafting Memorable Stories with Dr. Carme...,Pete
118968,237: Crafting Memorable Stories with Dr. Carme...,Okay.
118969,237: Crafting Memorable Stories with Dr. Carme...,Carmen Simon


In [65]:
new_df.at[pete_i, 'text'] = 'Pete Mockaitis'
new_df.iloc[pete_i-2:pete_i+3]

Unnamed: 0,title,text
118965,237: Crafting Memorable Stories with Dr. Carme...,Carmen Simon
118966,237: Crafting Memorable Stories with Dr. Carme...,You just recently got married. You just give i...
118967,237: Crafting Memorable Stories with Dr. Carme...,Pete Mockaitis
118968,237: Crafting Memorable Stories with Dr. Carme...,Okay.
118969,237: Crafting Memorable Stories with Dr. Carme...,Carmen Simon


Note: revisit episode 234

In [66]:
#Let's remove the 'Pete from the list of unique names
unique_names.remove('Pete')

i = filtered_df[filtered_df['statement'] == 'Drinking Jack Bauer.'].index[0]
filtered_df.iloc[i-10:i+10]

Unnamed: 0,title,speaker,statement
2510,306: Taking Care of Your Brain With Dr. Mike D...,Pete Mockaitis,Michael Dow
2511,306: Taking Care of Your Brain With Dr. Mike D...,Pete Mockaitis,Yeah.
2512,306: Taking Care of Your Brain With Dr. Mike D...,Pete Mockaitis,I’d love to hear some hard-hitting numbers fro...
2513,306: Taking Care of Your Brain With Dr. Mike D...,Pete Mockaitis,Michael Dow
2514,306: Taking Care of Your Brain With Dr. Mike D...,Pete Mockaitis,"Yeah, thank you. Omega-3’s – and I have a chap..."
2515,306: Taking Care of Your Brain With Dr. Mike D...,Pete Mockaitis,We know from research – there was a really gro...
2516,306: Taking Care of Your Brain With Dr. Mike D...,Pete Mockaitis,"After a month, they scanned their brains and t..."
2517,306: Taking Care of Your Brain With Dr. Mike D...,Pete Mockaitis,Like Keifer Sutherland?
2518,306: Taking Care of Your Brain With Dr. Mike D...,Pete Mockaitis,Michael Dow
2519,306: Taking Care of Your Brain With Dr. Mike D...,Pete Mockaitis,"Yeah, exactly."


In [67]:
#And we can remove the other two instances of single use names in this list!
unique_names.remove('Like Keifer Sutherland?')
unique_names.remove('Drinking Jack Bauer.')

In [68]:
i = filtered_df[filtered_df['statement'] == 'Michael P. Dolan'].index[0]
filtered_df.iloc[i-10:i+10]

Unnamed: 0,title,speaker,statement
4461,099: Likability Principles with Michelle Tilli...,Pete Mockaitis,That’s good. Thank you. And what would you say...
4462,099: Likability Principles with Michelle Tilli...,Pete Mockaitis,Michelle Lederman
4463,099: Likability Principles with Michelle Tilli...,Pete Mockaitis,The best hub is my website which is michelleti...
4464,099: Likability Principles with Michelle Tilli...,Pete Mockaitis,"Perfect. Thank you. Well, is there perhaps a f..."
4465,099: Likability Principles with Michelle Tilli...,Pete Mockaitis,Michelle Lederman
4466,099: Likability Principles with Michelle Tilli...,Pete Mockaitis,"Well, two things. One, figure out what your th..."
4467,099: Likability Principles with Michelle Tilli...,Pete Mockaitis,"Beautiful. Thank you. Well, Michelle, this has..."
4468,099: Likability Principles with Michelle Tilli...,Pete Mockaitis,Michelle Lederman
4469,099: Likability Principles with Michelle Tilli...,Pete Mockaitis,My pleasure.
4470,091: The Path to Truly Productive Leadership w...,Pete Mockaitis,Michael thanks so much for joining us here on ...


In [69]:
#Why are there only 5 instances of Michael?
title_mike = filtered_df.at[i, 'title']
mike_df = new_df[new_df['title'] == title_mike]
mike_df.tail(50)

Unnamed: 0,title,text
145815,091: The Path to Truly Productive Leadership w...,Interesting.
145816,091: The Path to Truly Productive Leadership w...,Michael P. Dolan
145817,091: The Path to Truly Productive Leadership w...,"Immunity to Change is what it’s called, ITC."
145818,091: The Path to Truly Productive Leadership w...,Pete Mockaitis
145819,091: The Path to Truly Productive Leadership w...,And how about a favorite book?
145820,091: The Path to Truly Productive Leadership w...,Michael. P. Dolan
145821,091: The Path to Truly Productive Leadership w...,A book lately that I’ve been liking and recomm...
145822,091: The Path to Truly Productive Leadership w...,This book goes through unapologetically what d...
145823,091: The Path to Truly Productive Leadership w...,Pete Mockaitis
145824,091: The Path to Truly Productive Leadership w...,And how about a favorite tool?


In [70]:
#It's just weird... 
print(len(unique_names))
print(unique_names)

33
['Jolene Blackbourn', 'Dr. Janice Presser', 'Mike Marquardt', 'Pam McLean', 'Dr. Richard Shuster', 'W. Chris Winter', 'Michael Dow', 'Guy Bell', 'Christine E. Hassler', 'Ellen Ruppel Shell', 'Michael J Arena', 'Dr. Bob Nelson', 'Dr. John Townsend', 'Brad R. Staats', 'Carmen Simon', 'Scott Young', 'John C Maxwell', 'Tony Tjan', 'Michelle Lederman', 'Bill Poundstone', 'Kevan Hall:', 'Shawn Jones', 'Michael P. Dolan', 'Bob Sutton', 'Michael. P. Dolan', 'Kim Powell', 'Joseph Sanok', 'Linda Thaler', 'James E. Lukaszewski', 'Edward Latimore', 'Mamie Stewart', 'Dr. Michelle Reina', 'Drew Geant']


In [71]:
#We're missing 34 names and we have 33!
#Let's add them and see what's left!

for name in unique_names:
    unique_speakers.append(name)
    
#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement
66170,299: How to Rock an Interview with Pamela Skil...,Pete Mockaitis,"Pam, thank you so much for joining us here on ..."
66171,299: How to Rock an Interview with Pamela Skil...,Pete Mockaitis,Pam Skillings
66172,299: How to Rock an Interview with Pamela Skil...,Pete Mockaitis,Thanks for inviting me. I’m excited to chat to...
66173,299: How to Rock an Interview with Pamela Skil...,Pete Mockaitis,"Oh, I’m excited too. I understand you also hav..."
66174,299: How to Rock an Interview with Pamela Skil...,Pete Mockaitis,Pam Skillings
66175,299: How to Rock an Interview with Pamela Skil...,Pete Mockaitis,"Well, I have a five-year-old. As you can imagi..."
66176,299: How to Rock an Interview with Pamela Skil...,Pete Mockaitis,I don’t think I can summon an image of what a ...
66177,299: How to Rock an Interview with Pamela Skil...,Pete Mockaitis,Pam Skillings
66178,299: How to Rock an Interview with Pamela Skil...,Pete Mockaitis,"I’m trying to be a little bit different, you k..."
66179,299: How to Rock an Interview with Pamela Skil...,Pete Mockaitis,Does it have any noteworthy features or abilit...


In [72]:
#We'll add Pam Skillings mannually

unique_speakers.append('Pam Skillings')

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And see how it's changed
filtered_df = get_dirty_data(statement_df)
filtered_df.head(20)

Unnamed: 0,title,speaker,statement


In [73]:
#This is really exciting!
#Let's take a look at how many speakers we have
print(len(unique_speakers))
df.shape

807


(818, 4)

In [74]:
#This is looking good, let's look at some of our lowest count names:
statement_df['speaker'].value_counts().tail(10)

Luke Yoquinto        36
Mark McLaughlin      36
Bill Schiemann       34
Rich Jones           34
Guy Ferdman          34
Howie Jacobson       30
Elizabeth McLeod     30
Jennifer Rock        28
Michael Voss         25
Michael. P. Dolan    12
Name: speaker, dtype: int64

In [75]:
#Looking really good! Let's revisit Mike Dolan's episode

mike_df = statement_df[statement_df['title'] == title_mike]
mike_df.tail(50)

Unnamed: 0,title,speaker,statement
90425,091: The Path to Truly Productive Leadership w...,Pete Mockaitis,"I would love to get your take, if you’re think..."
90426,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,You know it reminds me of an article I saw rec...
90427,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,Because they have this culture of…
90428,091: The Path to Truly Productive Leadership w...,Pete Mockaitis,Well the surf’s up bro
90429,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,"Exactly, you got to get out there on the water..."
90430,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,The focus during the working hours on getting ...
90431,091: The Path to Truly Productive Leadership w...,Pete Mockaitis,I think I buy it especially if there’s a meeti...
90432,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,"Right, I’m going to go over to Facebook here o..."
90433,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,If I go beyond four hours of that per day I fe...
90434,091: The Path to Truly Productive Leadership w...,Pete Mockaitis,"Beautiful, thank you. So now this has been a f..."


In [76]:
mike_df.head(50)

Unnamed: 0,title,speaker,statement
90395,091: The Path to Truly Productive Leadership w...,Pete Mockaitis,Michael thanks so much for joining us here on ...
90396,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,"Good to be here, I’m excited to have this conv..."
90397,091: The Path to Truly Productive Leadership w...,Pete Mockaitis,I am too and it’s so fun. So I see that you we...
90398,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,"Well, I started working there about eleven yea..."
90399,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,"And I loved working for David at the company, ..."
90400,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,Another thing a little bit deeper that still I...
90401,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,But to learn and see someone like David who ha...
90402,091: The Path to Truly Productive Leadership w...,Pete Mockaitis,That’s fun. I was so glad to hear and that’s j...
90403,091: The Path to Truly Productive Leadership w...,Michael P. Dolan,And I have to say only in hindsight could I re...
90404,091: The Path to Truly Productive Leadership w...,Pete Mockaitis,So the name of your practice currently is Trul...


In [77]:
#This looks like more than 12 to me:
mike_df['speaker'].value_counts().tail(10)

Michael P. Dolan     39
Pete Mockaitis       29
Michael. P. Dolan    12
Name: speaker, dtype: int64

In [78]:
#Oooohhh... It's because some are Michael. and not Michael.. Easy fix!

df.replace('Michael. P. Dolan', 'Michael P. Dolan', inplace = True)
new_df.replace('Michael. P. Dolan', 'Michael P. Dolan', inplace = True)


#Recreate our dataframe
statement_df = create_statement_df(new_df)

#let's revisit some of our lowest count names:
statement_df['speaker'].value_counts().tail(10)

Victor Yocco        38
Luke Yoquinto       36
Mark McLaughlin     36
Rich Jones          34
Bill Schiemann      34
Guy Ferdman         34
Elizabeth McLeod    30
Howie Jacobson      30
Jennifer Rock       28
Michael Voss        25
Name: speaker, dtype: int64

In [79]:
#Let's look at Michael Voss

i = statement_df[statement_df['speaker'] == 'Michael Voss'].index[0]

title_voss = statement_df.at[i, 'title']
voss_df = statement_df[statement_df['title'] == title_voss]
voss_df.head(50)

Unnamed: 0,title,speaker,statement
79899,185: Banishing BS at Work with Jennifer Rock a...,Pete Mockaitis,"Jennifer and Michael, welcome to the How to be..."
79900,185: Banishing BS at Work with Jennifer Rock a...,Jennifer Rock,Thanks for having us.
79901,185: Banishing BS at Work with Jennifer Rock a...,Michael Voss,It’s great to be here.
79902,185: Banishing BS at Work with Jennifer Rock a...,Pete Mockaitis,It’s great to have you. And so I want to kick ...
79903,185: Banishing BS at Work with Jennifer Rock a...,Michael Voss,"Well, when we decided to start the company, as..."
79904,185: Banishing BS at Work with Jennifer Rock a...,Jennifer Rock,Clearly.
79905,185: Banishing BS at Work with Jennifer Rock a...,Michael Voss,"Yeah, so we toyed with a couple other things, ..."
79906,185: Banishing BS at Work with Jennifer Rock a...,Jennifer Rock,And clearly if we went with who is better look...
79907,185: Banishing BS at Work with Jennifer Rock a...,Michael Voss,"But ultimately, Jennifer and I had worked toge..."
79908,185: Banishing BS at Work with Jennifer Rock a...,Pete Mockaitis,"Well, that’s good. Certainly, whatever concess..."


In [80]:
#It's because there's multiple speakers! Let's see if there are any episodes showing more than 3 speakers, again:
# Group the data by "title" and count the unique values in the "speaker" column
speaker_count = statement_df.groupby('title')['speaker'].nunique()
# Filter the data to only keep the rows where the count is less than 2
filtered_df = df[df['title'].isin(speaker_count[speaker_count > 3].index)]    

filtered_df.head(20)

Unnamed: 0.1,Unnamed: 0,title,text,date


In [81]:
#Awesome! Let's check our statements and see if anything seems amiss
#We're going to use isTitle to see if anything pops ups:

names = []
for index, row in statement_df.iterrows():
    text = row['statement']
    if text.istitle():
            names.append(text)

#Limit the list of entities to unique speakers
unique_names = []
unique_names = list(set(names))

print(len(unique_names))
print(unique_names)

450
['Steve Jobs.', 'Certainly. Yes.', 'Congrats.', 'Yeah, 2%.', 'Ambitious.', 'Toxic?', 'Uuu.', 'Whoa!', 'Again.', 'Napping.', 'Jaws?', 'Nice. Perfect.', 'The 4-Hour Workweek.', 'Crickets.', 'Amen.', 'My Peloton.', 'Nobody.', 'Paper.', 'Boom.', 'Kahneman?', '“Oh.”', 'Persuasive.', 'Zeigarnik.', 'Simple Habits.', 'So—', 'Right', 'Yoga.', 'Highlight.', 'Right? Yeah.', 'Two! Okay.', 'Wow. Yeah.', 'Yes, ….', 'Yeah.', 'Pete Mockatis', 'Mihaly Csikszentmihalyi.', 'Pomodoro Technique.', 'Karlo Siriban:', 'Oh Bo Jackson.', 'Awesome.', 'Dramatic.', 'Like, “Wow!”', 'Marvelous.', 'Anger. Yep.', 'Excellent.', 'Uh-Huh.', '“Yup.”', 'Correct. Absolutely.', 'Huh-', 'Later. Yes!', 'Indeed. Indeed.', 'Email.', 'Yeah, 100%. Yeah, 100%.', 'Or Darth Vader.', 'Content.', 'Noted.', 'Competence.', 'Mookie.', 'Awesome. Thanks.', 'Hotdog.', 'Matthew Abrahams\u2028Yes.', 'Yeah ….', 'K-N-O-W-D-E-L-L.', 'Snipers.', 'Booyeah! Congratulations!', 'Pickle Tracker?', 'Amplify Your Influence.', 'Contraceptive?', 'How S

In [82]:
#There's still some mess in here... let's see if we can't isolate the problems
Counter(names).most_common()

[('Yeah.', 469),
 ('Okay.', 390),
 ('Right.', 309),
 ('Yes.', 201),
 ('Exactly.', 132),
 ('Absolutely.', 93),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Pete Mockaits', 29),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Pete', 16),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Nicole Merrill', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 

In [83]:
#First one to address is Pete Mockaits

i = statement_df[statement_df['statement'] == 'Pete Mockaits'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
pete_df.head(50)

Unnamed: 0,title,speaker,statement
46889,425: Achieving More by Constantly Embarrassing...,Laura Gassner Otting,Pete Mockaits
46890,425: Achieving More by Constantly Embarrassing...,Laura Gassner Otting,"Case, thanks so much for joining us here on th..."
46891,425: Achieving More by Constantly Embarrassing...,Case Kenny,"Yeah, thank you. Thanks for having me. Very ex..."
46892,425: Achieving More by Constantly Embarrassing...,Case Kenny,Pete Mockaits
46893,425: Achieving More by Constantly Embarrassing...,Case Kenny,"Well, I think we’ve got so much fun stuff to d..."
46894,425: Achieving More by Constantly Embarrassing...,Case Kenny,Fluent is gracious. It’s very nice to say. In ...
46895,425: Achieving More by Constantly Embarrassing...,Case Kenny,At one point I was quite good at it. I lived i...
46896,425: Achieving More by Constantly Embarrassing...,Case Kenny,Pete Mockaits
46897,425: Achieving More by Constantly Embarrassing...,Case Kenny,That’s intriguing because I understand these a...
46898,425: Achieving More by Constantly Embarrassing...,Case Kenny,"Oh man, yeah, they’re very different. Chinese,..."


In [84]:
df['text'] = df['text'].apply(lambda x: str(x).replace(u'Pete Mockaits', u'Pete Mockaitis'))
new_df.replace('Pete Mockaits', 'Pete Mockaitis', inplace = True)

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#And we're going to be doing this a few times, so let's make a function
def dirty_data(df):
    names = []
    for index, row in df.iterrows():
        text = row['statement']
        if text.istitle():
            names.append(text)
    return names

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 390),
 ('Right.', 309),
 ('Yes.', 201),
 ('Exactly.', 132),
 ('Absolutely.', 93),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Pete', 16),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Nicole Merrill', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Prescott', 4),
 (

In [85]:
#Next, let's address Pete

i = statement_df[statement_df['statement'] == 'Pete'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
pete_df.head(50)

Unnamed: 0,title,speaker,statement
74545,234: Sharper Critical Thinking for Better Solu...,Pete Mockaitis,"Mike, thanks so much for joining us here on th..."
74546,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,It’s my pleasure to be here. Thanks for having...
74547,234: Sharper Critical Thinking for Better Solu...,Pete Mockaitis,"Oh yeah. Well, you’ve been on the list since E..."
74548,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Absolutely.
74549,234: Sharper Critical Thinking for Better Solu...,Pete Mockaitis,Now I understand you have a bit of a fondness ...
74550,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,"Yeah, not really, and that’s what’s funny abou..."
74551,234: Sharper Critical Thinking for Better Solu...,Pete Mockaitis,"Okay. Well, I totally misinterpreted that tidb..."
74552,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,"I mean, it was cool. It was cool. After the se..."
74553,234: Sharper Critical Thinking for Better Solu...,Pete Mockaitis,Yeah. I’ve done it once and I liked it. I coul...
74554,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,It’s a little different when you’ve got on a r...


In [86]:
#This is tricky, like when we addressed 'Pete' in a previous episode
pete_df = statement_df[statement_df['statement'] == 'Pete']
pete_df.head(50)

Unnamed: 0,title,speaker,statement
74563,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Pete
74565,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Pete
74568,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Pete
74572,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Pete
74577,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Pete
74580,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Pete
74583,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Pete
74586,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Pete
74590,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Pete
74593,234: Sharper Critical Thinking for Better Solu...,Mike Figliuolo,Pete


In [87]:
title_pete = statement_df.at[i, 'title']
episode = df[df['title'] == title_pete]
episode.reset_index(drop=True, inplace = True)
episode_text = episode.at[0, 'text']
print(episode_text)
df_i = df[df['title'] == title_pete].index[0]
print(df_i)
df.iloc[df_i-3:df_i+2]


Pete Mockaitis
Mike, thanks so much for joining us here on the How to be Awesome at Your Job podcast.
Mike Figliuolo
It’s my pleasure to be here. Thanks for having me as a guest.
Pete Mockaitis
Oh yeah. Well, you’ve been on the list since Episode 3 with Victor Prince’s co-author, and now seemed like a fine time. So, I’m glad you made it happen.
Mike Figliuolo
Absolutely.
Pete Mockaitis
Now I understand you have a bit of a fondness for skydiving. What’s the backstory here?
Mike Figliuolo
Yeah, not really, and that’s what’s funny about it. So, I was in the army, and I’ve always hated heights. My father used to laugh at me when we would go up on the roof to clean out the gutters and I’m looking like Spider-Man plastered to the roof, just worried about falling off. So when I was in the army, they have you go to military schools during your summers when you’re at West Point. And one of my summers I put in for a specific type of very ground-based training, and the Army and its wisdom decide

Unnamed: 0.1,Unnamed: 0,title,text,date
603,608,236: Persuasion Pointers from a Legendary Info...,"\nPete Mockaitis\nSully, thanks so much for jo...","January 16, 2023"
604,609,235: The Power of Finding Your Why with David ...,"\nPete Mockaitis\nDavid, thanks so much for jo...","January 16, 2023"
605,610,234: Sharper Critical Thinking for Better Solu...,"\nPete Mockaitis\nMike, thanks so much for joi...","January 16, 2023"
606,611,233: Best Practices for Better Relationships a...,"\nPete Mockaitis\nTodd, thanks so much for joi...","January 16, 2023"
607,612,232: How to Be a Better Leader by Being More P...,"\nPete Mockaitis\nBrenda, thanks so much for j...","January 16, 2023"


In [88]:
#This is weird. After reviewing the episode and the transcript online, the transcript is actually missing Pete's text, let's just remove this episode
df = df[df['title'] != title_pete]
new_df = new_df[new_df['title'] != title_pete]

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 390),
 ('Right.', 309),
 ('Yes.', 201),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Nicole Merrill', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Prescott', 4),
 ('Okay', 3),
 ('

In [89]:
#Let's look at Nicole

i = statement_df[statement_df['statement'] == 'Nicole Merrill'].index[0]

title_nicole = statement_df.at[i, 'title']
nicole_df = statement_df[statement_df['title'] == title_nicole]
nicole_df.head(50)

Unnamed: 0,title,speaker,statement
33594,535: How to Conquer Doubt and Pursue New Caree...,Pete Mockaitis,"Nicolle, thanks so much for joining us here on..."
33595,535: How to Conquer Doubt and Pursue New Caree...,Pete Mockaitis,Nicole Merrill
33596,535: How to Conquer Doubt and Pursue New Caree...,Pete Mockaitis,"Well, hey, thanks for having me. I’m really ex..."
33597,535: How to Conquer Doubt and Pursue New Caree...,Pete Mockaitis,"Oh, I’m excited to have you here and I want to..."
33598,535: How to Conquer Doubt and Pursue New Caree...,Pete Mockaitis,Nicole Merrill
33599,535: How to Conquer Doubt and Pursue New Caree...,Pete Mockaitis,"Oh, thanks. It was actually harder to name my ..."
33600,535: How to Conquer Doubt and Pursue New Caree...,Pete Mockaitis,"But, first, I want to hear about you and pinba..."
33601,535: How to Conquer Doubt and Pursue New Caree...,Pete Mockaitis,Nicole Merrill
33602,535: How to Conquer Doubt and Pursue New Caree...,Pete Mockaitis,"Well, I grew up with my dad really taking the ..."
33603,535: How to Conquer Doubt and Pursue New Caree...,Pete Mockaitis,And it’s really funny too. I’m actually a huge...


In [90]:
#Her name was spelled wrong

df['text'] = df['text'].apply(lambda x: str(x).replace(u'Nicole Merrill', u'Nicolle Merrill'))
new_df.replace('Nicole Merrill', 'Nicolle Merrill', inplace = True)

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 390),
 ('Right.', 309),
 ('Yes.', 201),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Prescott', 4),
 ('Okay', 3),
 ('Brilliant.', 3),
 ('Abso

In [91]:
#Let's look at Prescott

i = statement_df[statement_df['statement'] == 'Prescott'].index[0]

title_prescott = statement_df.at[i, 'title']
prescott_df = statement_df[statement_df['title'] == title_prescott]
prescott_df.head(50)

Unnamed: 0,title,speaker,statement
90919,085: Keeping the Creative Juices Flowing with ...,Pete Mockaitis,"Prescott, thanks so much for being here on the..."
90920,085: Keeping the Creative Juices Flowing with ...,Prescott Perez-Fox,"Yeah, thanks for having me. I’m really looking..."
90921,085: Keeping the Creative Juices Flowing with ...,Pete Mockaitis,"Oh yes, me too. And I think we’re going to hav..."
90922,085: Keeping the Creative Juices Flowing with ...,Prescott Perez-Fox,"Oh goodness, this is actually kind of funny, b..."
90923,085: Keeping the Creative Juices Flowing with ...,Prescott Perez-Fox,"And they were saying, “No, you’ve got to do it..."
90924,085: Keeping the Creative Juices Flowing with ...,Prescott Perez-Fox,"And I was like, “You know, Starship is a good ..."
90925,085: Keeping the Creative Juices Flowing with ...,Pete Mockaitis,"I’d take it too, and it makes me think of Star..."
90926,085: Keeping the Creative Juices Flowing with ...,Prescott Perez-Fox,"Yeah, that’s funny. I have Google alert actual..."
90927,085: Keeping the Creative Juices Flowing with ...,Pete Mockaitis,"I hear you. Well, that’s fun. And so another t..."
90928,085: Keeping the Creative Juices Flowing with ...,Pete Mockaitis,"And it just cracks me up because it’s so bold,..."


In [92]:
#Anoter mispelling, but this is going to be a little more challenging because we don't want Prescott Perez-Fox to become Prescott Prescott Perez-Fox

df_i = df[df['title'] == title_prescott].index[0]
df.iloc[df_i-3:df_i+2]

Unnamed: 0.1,Unnamed: 0,title,text,date
752,758,086: Honing Your Persuasive Skills with Kwame ...,"\nPete Mockaitis\nKwame, thanks so much for jo...","January 16, 2023"
753,759,085: Keeping the Creative Juices Flowing with ...,"\nPete Mockaitis\nPrescott, thanks so much for...","January 16, 2023"
754,760,084: Navigating Your Career Climb with Jolene ...,"\nPete Mockaitis\nJolene, thanks so much for j...","January 16, 2023"
755,761,083: Expanding Your Career Network with Mac Pr...,"\nPete Mockaitis\nMac, thanks so much for join...","January 16, 2023"
756,762,082: Why You Might Not Need to be a Manager wi...,\nPete Mockaitis\nMark thanks so much for join...,"January 16, 2023"


In [93]:
df.reset_index(drop=True, inplace = True)
new_df.reset_index(drop=True, inplace = True)

df_i = df[df['title'] == title_prescott].index[0]
df.iloc[df_i-3:df_i+2]

Unnamed: 0.1,Unnamed: 0,title,text,date
748,756,088: Getting Automated with Dan Caspi - How to...,"\nPete Mockaitis\nDan, thanks so much for join...","January 16, 2023"
749,757,087: More Positivity with Michelle Gielan - Ho...,"\nPete Mockaitis\nMichelle, thanks so much for...","January 16, 2023"
750,758,086: Honing Your Persuasive Skills with Kwame ...,"\nPete Mockaitis\nKwame, thanks so much for jo...","January 16, 2023"
751,759,085: Keeping the Creative Juices Flowing with ...,"\nPete Mockaitis\nPrescott, thanks so much for...","January 16, 2023"
752,760,084: Navigating Your Career Climb with Jolene ...,"\nPete Mockaitis\nJolene, thanks so much for j...","January 16, 2023"


In [94]:
print(df_i)

751


In [95]:
# Replace 'Prescott' with 'Prescott Perez-Fox' in the specific row and column
def replace_prescott(text):
    lines = text.split("\n")
    for i, line in enumerate(lines):
        if line.strip() == "Prescott":
            lines[i] = "Prescott Perez-Fox"
    return "\n".join(lines)

df.at[df_i, 'text'] = replace_prescott(df.at[df_i, 'text'])


episode = df[df['title'] == title_prescott]
episode.reset_index(drop=True, inplace = True)
episode_text = episode.at[0, 'text']
print(episode_text)


Pete Mockaitis
Prescott, thanks so much for being here on the How To Be Awesome At Your Job podcast.
Prescott Perez-Fox
Yeah, thanks for having me. I’m really looking forward to it.
Pete Mockaitis
Oh yes, me too. And I think we’re going to have some real fun here. And I want to know first of all, why did you name your company Starship Design?
Prescott Perez-Fox
Oh goodness, this is actually kind of funny, because it stemmed from a comment thread on a blog, where we were talking about how to name a design firm. And this is maybe 10 years ago, and the folks who brought it up were saying that you should name a design firm the way you name a band, I don’t know, just coming up with weird names. ‘Cause most people do it like a law firm, and it’s like Rigby Jones. I guess that could be a design firm, but it could also be accounting.
And they were saying, “No, you’ve got to do it like a band, it’d be like The Flaming Lips or something crazy like that.” And then people were saying, “What about

In [96]:
#And we can target this specific line in the working dataframe

new_df[new_df['text'] == 'Prescott']
new_df.reset_index(drop=True, inplace = True)

prescott_i = new_df[new_df['text'] == 'Prescott'].index[0]

new_df.iloc[prescott_i-2:prescott_i+3]

Unnamed: 0,title,text
146731,085: Keeping the Creative Juices Flowing with ...,"And it just cracks me up because it’s so bold,..."
146732,085: Keeping the Creative Juices Flowing with ...,Could you share what are some kind of things t...
146733,085: Keeping the Creative Juices Flowing with ...,Prescott
146734,085: Keeping the Creative Juices Flowing with ...,"Right, right. Well, I’m going to back up just ..."
146735,085: Keeping the Creative Juices Flowing with ...,Pete Mockaitis


In [97]:
new_df.at[prescott_i, 'text'] = 'Prescott Perez-Fox'
new_df.iloc[prescott_i-2:prescott_i+3]

Unnamed: 0,title,text
146731,085: Keeping the Creative Juices Flowing with ...,"And it just cracks me up because it’s so bold,..."
146732,085: Keeping the Creative Juices Flowing with ...,Could you share what are some kind of things t...
146733,085: Keeping the Creative Juices Flowing with ...,Prescott Perez-Fox
146734,085: Keeping the Creative Juices Flowing with ...,"Right, right. Well, I’m going to back up just ..."
146735,085: Keeping the Creative Juices Flowing with ...,Pete Mockaitis


In [98]:
prescott_i = new_df[new_df['text'] == 'Prescott'].index[0]
new_df.at[prescott_i, 'text'] = 'Prescott Perez-Fox'
new_df.iloc[prescott_i-2:prescott_i+3]

Unnamed: 0,title,text
146735,085: Keeping the Creative Juices Flowing with ...,Pete Mockaitis
146736,085: Keeping the Creative Juices Flowing with ...,I’m inflamed!
146737,085: Keeping the Creative Juices Flowing with ...,Prescott Perez-Fox
146738,085: Keeping the Creative Juices Flowing with ...,"And cheeky, you know. Yeah. And I never actual..."
146739,085: Keeping the Creative Juices Flowing with ...,Those are the types of things that really suck...


In [99]:
prescott_i = new_df[new_df['text'] == 'Prescott'].index[0]
new_df.at[prescott_i, 'text'] = 'Prescott Perez-Fox'
new_df.iloc[prescott_i-2:prescott_i+3]

Unnamed: 0,title,text
146740,085: Keeping the Creative Juices Flowing with ...,Pete Mockaitis
146741,085: Keeping the Creative Juices Flowing with ...,That is fun. I’m looking forward to getting my...
146742,085: Keeping the Creative Juices Flowing with ...,Prescott Perez-Fox
146743,085: Keeping the Creative Juices Flowing with ...,Awesome!
146744,085: Keeping the Creative Juices Flowing with ...,Pete Mockaitis


In [100]:
prescott_i = new_df[new_df['text'] == 'Prescott'].index[0]
new_df.at[prescott_i, 'text'] = 'Prescott Perez-Fox'
new_df.iloc[prescott_i-2:prescott_i+3]

Unnamed: 0,title,text
146745,085: Keeping the Creative Juices Flowing with ...,"Alright, anyway. Let’s get into some good stuf..."
146746,085: Keeping the Creative Juices Flowing with ...,"But nonetheless, there comes a time when folks..."
146747,085: Keeping the Creative Juices Flowing with ...,Prescott Perez-Fox
146748,085: Keeping the Creative Juices Flowing with ...,"Yeah, okay. Goodness, where do I start? Well, ..."
146749,085: Keeping the Creative Juices Flowing with ...,And in terms of the workplace there’s all sort...


In [101]:
#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 390),
 ('Right.', 309),
 ('Yes.', 201),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Okay', 3),
 ('Brilliant.', 3),
 ('Absolutely. Absolutely

In [102]:
#Let's look at Pete Mockaitis. and Okay. Pete Mockaitis.

i = statement_df[statement_df['statement'] == 'Pete Mockaitis.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
pete_df.head(50)

Unnamed: 0,title,speaker,statement
7980,"769: How to Command the Room, Connect with You...",Pete Mockaitis,"Laura, welcome to How to be Awesome at Your Job."
7981,"769: How to Command the Room, Connect with You...",Laura Sicola,"Hi, Pete. Thanks so much for having me on the ..."
7982,"769: How to Command the Room, Connect with You...",Pete Mockaitis,"Well, I’m so excited to get into your wisdom, ..."
7983,"769: How to Command the Room, Connect with You...",Laura Sicola,"The funny thing is that most people, how often..."
7984,"769: How to Command the Room, Connect with You...",Laura Sicola,"So, the challenge is that the way that we usua..."
7985,"769: How to Command the Room, Connect with You...",Pete Mockaitis,"And you said, “Hi, I’m Laura Sicola.”"
7986,"769: How to Command the Room, Connect with You...",Laura Sicola,Right. Exactly. And then you go from there int...
7987,"769: How to Command the Room, Connect with You...",Pete Mockaitis,I want to pause right there. I think that’s pe...
7988,"769: How to Command the Room, Connect with You...",Laura Sicola,Exactly.
7989,"769: How to Command the Room, Connect with You...",Pete Mockaitis,"It’s likewise with your name, you said it many..."


In [103]:
#Looked this up in the official transcript page and it's actually what's supposed to be there!
#So is Laura Sicola.
#Let's look at Bob Pozen

i = statement_df[statement_df['statement'] == 'Bob Pozen'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
pete_df.head(50)

Unnamed: 0,title,speaker,statement
17600,682: How to Boost Your Results through Extreme...,Pete Mockaitis,"Bob, thanks for joining us here on the How to ..."
17601,682: How to Boost Your Results through Extreme...,Robert Pozen,"Glad to be with you, Pete."
17602,682: How to Boost Your Results through Extreme...,Pete Mockaitis,"Well, I’m excited to dig into your wisdom. So,..."
17603,682: How to Boost Your Results through Extreme...,Robert Pozen,"Yeah. Well, I was a member of the board of the..."
17604,682: How to Boost Your Results through Extreme...,Robert Pozen,"We have an induction ceremony every year, and ..."
17605,682: How to Boost Your Results through Extreme...,Robert Pozen,He was one of the few players to hit a triple ...
17606,682: How to Boost Your Results through Extreme...,Pete Mockaitis,"Oh, sure."
17607,682: How to Boost Your Results through Extreme...,Robert Pozen,"“Robertson, Robertson, Robertson."
17608,682: How to Boost Your Results through Extreme...,Robert Pozen,"Robertson, Robertson, Robertson."
17609,682: How to Boost Your Results through Extreme...,Robert Pozen,"Robertson, Robertson, Robertson."


In [104]:
#Bob is a nickname for Robert. It was just used wrong

df['text'] = df['text'].apply(lambda x: str(x).replace(u'Bob Pozen', u'Robert Pozen'))
new_df.replace('Bob Pozen', 'Robert Pozen', inplace = True)

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 390),
 ('Right.', 309),
 ('Yes.', 201),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Okay', 3),
 ('Brilliant.', 3),
 ('Absolutely. Absolutely

In [105]:
#Let's look at Cal Newport, Michael Hyatt

i = statement_df[statement_df['statement'] == 'Cal Newport, Michael Hyatt.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
pete_df.tail(50)

Unnamed: 0,title,speaker,statement
42243,461: Tactics for Boosting Productivity and Ban...,Pete Mockaitis,I dig.
42244,461: Tactics for Boosting Productivity and Ban...,Erik Fisher,"So, yeah."
42245,461: Tactics for Boosting Productivity and Ban...,Pete Mockaitis,"Cool. All right. Well, maybe we’ll zoom out a ..."
42246,461: Tactics for Boosting Productivity and Ban...,Erik Fisher,"Yeah, we’re basically at the seven-year mark."
42247,461: Tactics for Boosting Productivity and Ban...,Pete Mockaitis,"That’s so amazing. Well, congratulations. And ..."
42248,461: Tactics for Boosting Productivity and Ban...,Erik Fisher,Thank you.
42249,461: Tactics for Boosting Productivity and Ban...,Pete Mockaitis,"So, yeah, what are some themes that have come ..."
42250,461: Tactics for Boosting Productivity and Ban...,Erik Fisher,"Well, I kind of alluded to it a little bit jus..."
42251,461: Tactics for Boosting Productivity and Ban...,Pete Mockaitis,Effectiveness?
42252,461: Tactics for Boosting Productivity and Ban...,Erik Fisher,That is it. I feel like you’ve listened to tha...


In [106]:
#That's how it's supposed to be in the transcript
#Let's look at Hermann Hesse.

i = statement_df[statement_df['statement'] == 'Hermann Hesse.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
pete_df.tail(50)

Unnamed: 0,title,speaker,statement
53844,379: The Four Steps to Creating Chemistry with...,Barney Feinberg,"But just to go through that sentence quickly, ..."
53845,379: The Four Steps to Creating Chemistry with...,Barney Feinberg,"Then “We’re always looking for better ways,” s..."
53846,379: The Four Steps to Creating Chemistry with...,Barney Feinberg,What usually happens in a good conversation is...
53847,379: The Four Steps to Creating Chemistry with...,Pete Mockaitis,That’s interesting how you said maybe there a ...
53848,379: The Four Steps to Creating Chemistry with...,Barney Feinberg,Maybe I misheard it. Maybe I was listening for...
53849,379: The Four Steps to Creating Chemistry with...,Barney Feinberg,"To listen for values, you want to be using wha..."
53850,379: The Four Steps to Creating Chemistry with...,Pete Mockaitis,"Oh yeah, got it."
53851,379: The Four Steps to Creating Chemistry with...,Barney Feinberg,Yeah. How did that feel?
53852,379: The Four Steps to Creating Chemistry with...,Pete Mockaitis,"Well, it was-"
53853,379: The Four Steps to Creating Chemistry with...,Barney Feinberg,Thinking about that vacation.


In [107]:
#That's how it's supposed to be in the transcript
#Let's look at Pete Mockaitis,

i = statement_df[statement_df['statement'] == 'Pete Mockaitis,'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
pete_df.head(50)

Unnamed: 0,title,speaker,statement
65270,305: Screwing Up Masterfully with Kristen Hade...,Pete Mockaitis,"Kristen, thanks so much for joining us here on..."
65271,305: Screwing Up Masterfully with Kristen Hade...,Kristen Hadeed,Thank you for having me.
65272,305: Screwing Up Masterfully with Kristen Hade...,Pete Mockaitis,"Oh, I think we’re going to have so much fun he..."
65273,305: Screwing Up Masterfully with Kristen Hade...,Kristen Hadeed,I’m the worst Gator football fan ever. I went ...
65274,305: Screwing Up Masterfully with Kristen Hade...,Pete Mockaitis,He was a student there?
65275,305: Screwing Up Masterfully with Kristen Hade...,Kristen Hadeed,He was a student. Yes. That was when he was – ...
65276,305: Screwing Up Masterfully with Kristen Hade...,Pete Mockaitis,"He’s living large, college student having some..."
65277,305: Screwing Up Masterfully with Kristen Hade...,Kristen Hadeed,Yeah. I’d say so.
65278,305: Screwing Up Masterfully with Kristen Hade...,Pete Mockaitis,That’s funny. They treat the athletes well. Th...
65279,305: Screwing Up Masterfully with Kristen Hade...,Kristen Hadeed,Yeah.


In [108]:
#Anoter mispelling, but this is going to be a little more challenging because we don't want Pete Mockaitis, to be changed to Pete Mockaitis in every instance

df_i = df[df['title'] == title_pete].index[0]
df.iloc[df_i-3:df_i+2]

Unnamed: 0.1,Unnamed: 0,title,text,date
530,533,308: How to Make Creative Ideas Irresistible w...,"\nPete Mockaitis\nAllen, thanks for joining us...","January 16, 2023"
531,534,307: Persuasive Speaking with Carmine Gallo - ...,"\nPete Mockaitis\nCarmine, thanks so much for ...","January 16, 2023"
532,535,306: Taking Care of Your Brain With Dr. Mike D...,"\nPete Mockaitis\nDr. Mike, thanks so much for...","January 16, 2023"
533,536,305: Screwing Up Masterfully with Kristen Hade...,"\nPete Mockaitis\nKristen, thanks so much for ...","January 16, 2023"
534,537,304: Resigning Perfectly with Joseph Liu - How...,"\nPete Mockaitis\nJoseph, thanks so much for j...","January 16, 2023"


In [109]:
df.reset_index(drop=True, inplace = True)
new_df.reset_index(drop=True, inplace = True)

df_i = df[df['title'] == title_pete].index[0]
df.iloc[df_i-3:df_i+2]

Unnamed: 0.1,Unnamed: 0,title,text,date
530,533,308: How to Make Creative Ideas Irresistible w...,"\nPete Mockaitis\nAllen, thanks for joining us...","January 16, 2023"
531,534,307: Persuasive Speaking with Carmine Gallo - ...,"\nPete Mockaitis\nCarmine, thanks so much for ...","January 16, 2023"
532,535,306: Taking Care of Your Brain With Dr. Mike D...,"\nPete Mockaitis\nDr. Mike, thanks so much for...","January 16, 2023"
533,536,305: Screwing Up Masterfully with Kristen Hade...,"\nPete Mockaitis\nKristen, thanks so much for ...","January 16, 2023"
534,537,304: Resigning Perfectly with Joseph Liu - How...,"\nPete Mockaitis\nJoseph, thanks so much for j...","January 16, 2023"


In [110]:
# Replace 'Pete Mockaitis,' with 'Pete Mockaitis' in the specific row and column
def replace_pete_comma(text):
    lines = text.split("\n")
    for i, line in enumerate(lines):
        if line.strip() == "Pete Mockaitis,":
            lines[i] = "Pete Mockaitis"
    return "\n".join(lines)

df.at[df_i, 'text'] = replace_pete_comma(df.at[df_i, 'text'])

episode = df[df['title'] == title_pete]
episode.reset_index(drop=True, inplace = True)
episode_text = episode.at[0, 'text']
print(episode_text)


Pete Mockaitis
Kristen, thanks so much for joining us here on the How to Be Awesome At Your Job podcast.
Kristen Hadeed
Thank you for having me.
Pete Mockaitis
Oh, I think we’re going to have so much fun here. First, I want to hear the tale, I understand that one time you actually cleaned Tim Tebow’s house while in college. Did you bump into him? You didn’t know who he was? What’s the story here?
Kristen Hadeed
I’m the worst Gator football fan ever. I went to the University of Florida, which is a big school, cares about football. Yes, I cleaned Tim Tebow’s apartment several times and I did not even know it was him.
Pete Mockaitis
He was a student there?
Kristen Hadeed
He was a student. Yes. That was when he was – we won the National Championship. I should have known who he was, but nope. No clue.
Pete Mockaitis
He’s living large, college student having someone clean his house.
Kristen Hadeed
Yeah. I’d say so.
Pete Mockaitis
That’s funny. They treat the athletes well. There’s probably 

In [111]:
#And we can target this specific line in the working dataframe

new_df[new_df['text'] == 'Pete Mockaitis,']
new_df.reset_index(drop=True, inplace = True)

pete_i = new_df[new_df['text'] == 'Pete Mockaitis,'].index[0]

new_df.at[pete_i, 'text'] = 'Pete Mockaitis'
new_df.iloc[pete_i-2:pete_i+3]

Unnamed: 0,title,text
104631,305: Screwing Up Masterfully with Kristen Hade...,I remember – I was so shocked. I couldn’t even...
104632,305: Screwing Up Masterfully with Kristen Hade...,The work was absolutely horrible. You’re clean...
104633,305: Screwing Up Masterfully with Kristen Hade...,Pete Mockaitis
104634,305: Screwing Up Masterfully with Kristen Hade...,Understood. Was there like a speech or did the...
104635,305: Screwing Up Masterfully with Kristen Hade...,Kristen Hadeed


In [112]:
#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 390),
 ('Right.', 309),
 ('Yes.', 201),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Okay', 3),
 ('Brilliant.', 3),
 ('Absolutely. Absolutely

In [113]:
#Let's look at Pete Mockaitis\u2028Okay

i = statement_df[statement_df['statement'] == 'Pete Mockaitis\u2028Okay.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-2:i+3]

Unnamed: 0,title,speaker,statement
70633,265: Getting the Most Out of Each Day with Pet...,Peter Shankman,"Pete Mockaitis So, the corollary then, on the ..."
70634,265: Getting the Most Out of Each Day with Pet...,Peter Shankman,"It varies. We could do everything from, “Oh, m..."
70635,265: Getting the Most Out of Each Day with Pet...,Peter Shankman,Pete Mockaitis Okay.
70636,265: Getting the Most Out of Each Day with Pet...,Peter Shankman,"Right? And it’s so funny because people, “Oh, ..."
70637,265: Getting the Most Out of Each Day with Pet...,Peter Shankman,I’m pretty sure that we both live on the same ...


In [114]:
i = statement_df[statement_df['statement'] == 'Matthew Abrahams\u2028Yes.'].index[0]

title_pete = statement_df.at[i, 'title']
statement_df.iloc[i-2:i+3]

Unnamed: 0,title,speaker,statement
72161,253: How to Speak Out...Without a Freak Out wi...,Matthew Abrahams,"You seem so natural, Pete. It’s amazing. We kn..."
72162,253: How to Speak Out...Without a Freak Out wi...,Pete Mockaitis,"I’m not a robot. Cool. So, I dig that. And the..."
72163,253: How to Speak Out...Without a Freak Out wi...,Pete Mockaitis,Matthew Abrahams Yes.
72164,253: How to Speak Out...Without a Freak Out wi...,Pete Mockaitis,And I’ve seen sort of a similar view in terms ...
72165,253: How to Speak Out...Without a Freak Out wi...,Matthew Abrahams,Absolutely. There’s a wonderful saying from th...


In [115]:
#We need to change these \u2028's into \n's
df['text'] = df['text'].apply(lambda x: str(x).replace(u'\u2028', '\n '))

#And to fix that in our working dataframe, we actually need to recreate it.

lines = []

for index, row in df.iterrows():
    text = row['text']
    title = row['title']
    transcript = text.split("\n")
    for line in transcript:
        line = BeautifulSoup(line, 'html.parser').get_text(strip=True)
        lines.append([title, line])  
        
new_df = pd.DataFrame(lines, columns = ['title', 'text'])
new_df['text'] = new_df['text'].str.strip()
new_df['title'] = new_df['title'].str.strip()

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 391),
 ('Right.', 309),
 ('Yes.', 202),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Okay', 3),
 ('Brilliant.', 3),
 ('Absolutely. Absolutely

In [116]:
#Let's look at Nir Eyal.

i = statement_df[statement_df['statement'] == 'Nir Eyal.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-2:i+3]

Unnamed: 0,title,speaker,statement
25374,623: Mastering the 7 Habits of Highly Effectiv...,Pete Mockaitis,"Okay, thank you. And maybe the last question b..."
25375,623: Mastering the 7 Habits of Highly Effectiv...,Jennifer Colosimo,"Well, of course, there’s a lot in The 7 Habits..."
25376,623: Mastering the 7 Habits of Highly Effectiv...,Pete Mockaitis,Nir Eyal.
25377,623: Mastering the 7 Habits of Highly Effectiv...,Jennifer Colosimo,"Yeah, Indistractable. Have you read that book?"
25378,623: Mastering the 7 Habits of Highly Effectiv...,Pete Mockaitis,He’s been on the show.


In [117]:
#Let's look at Karlo Siriban:

i = statement_df[statement_df['statement'] == 'Karlo Siriban:'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement
67793,309: Preventing Burnout in Yourself and Your W...,Karlo Siriban,Mine is How to Be Black by Baratunde Thurston.
67794,309: Preventing Burnout in Yourself and Your W...,Pete Mockaitis,I met him at a book signing and I have a signe...
67795,309: Preventing Burnout in Yourself and Your W...,Karlo Siriban,Really?
67796,309: Preventing Burnout in Yourself and Your W...,Pete Mockaitis,"I did, yes."
67797,309: Preventing Burnout in Yourself and Your W...,Pete Mockaitis,Karlo Siriban:
67798,309: Preventing Burnout in Yourself and Your W...,Pete Mockaitis,Oh you lucky duck.
67799,309: Preventing Burnout in Yourself and Your W...,Pete Mockaitis,"He’s good friends with my buddy, Mawi, who was..."
67800,309: Preventing Burnout in Yourself and Your W...,Anne Donovan,I love it.
67801,309: Preventing Burnout in Yourself and Your W...,Pete Mockaitis,I guess I have to get rid of this book now. Ve...
67802,309: Preventing Burnout in Yourself and Your W...,Anne Donovan,I love it.


In [118]:
#Looks like a typo

df['text'] = df['text'].apply(lambda x: str(x).replace(u'Karlo Siriban:', u'Karlo Siriban'))
new_df.replace('Karlo Siriban:', 'Karlo Siriban', inplace = True)

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 391),
 ('Right.', 309),
 ('Yes.', 202),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Okay', 3),
 ('Brilliant.', 3),
 ('Absolutely. Absolutely

In [119]:
#Found a for sure typo

df['text'] = df['text'].apply(lambda x: str(x).replace(u'[Pete Mockaitis', u'Pete Mockaitis'))
new_df.replace('[Pete Mockaitis', 'Pete Mockaitis', inplace = True)

#Let's look at Paul Zak.

i = statement_df[statement_df['statement'] == 'Paul Zak.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement
90753,137: Calming Performance Anxiety Like a Pro wi...,Tom Hanson,"And then the honor, no one fulfills all their ..."
90754,137: Calming Performance Anxiety Like a Pro wi...,Tom Hanson,"And then if that hasn’t happened, the person w..."
90755,137: Calming Performance Anxiety Like a Pro wi...,Tom Hanson,"When I work with the little league team, or a ..."
90756,137: Calming Performance Anxiety Like a Pro wi...,Tom Hanson,I’m using your guy’s book there – Zak?
90757,137: Calming Performance Anxiety Like a Pro wi...,Pete Mockaitis,Paul Zak.
90758,137: Calming Performance Anxiety Like a Pro wi...,Tom Hanson,Great interview you did with him. Thank you fo...
90759,137: Calming Performance Anxiety Like a Pro wi...,Tom Hanson,How do you build trust? Business moves at the ...
90760,137: Calming Performance Anxiety Like a Pro wi...,Tom Hanson,"So the words like declaration, you’ve got to h..."
90761,137: Calming Performance Anxiety Like a Pro wi...,Tom Hanson,You see how there’s a clarity in that. It does...
90762,137: Calming Performance Anxiety Like a Pro wi...,Pete Mockaitis,"Understood. Well, that’s a great sort of set o..."


In [120]:
#Let's look at Uncle Rico

i = statement_df[statement_df['statement'] == 'Uncle Rico.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement
94371,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,I’m in favor. I don’t know if there’s like a “...
94372,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,
94373,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,My wife had a natural birth with both our chil...
94374,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,
94375,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,Uncle Rico.
94376,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,
94377,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,"I was told by my track coach, “Breathe into yo..."
94378,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,
94379,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,"Okay. Well, I think people really do kind of f..."
94380,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,


Note: review episode 105

In [121]:
#Let's look at Daniel Perez.

i = statement_df[statement_df['statement'] == 'Daniel Perez.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement
95911,094: Upgrading Your Professional Style with Sh...,Pete Mockaitis,"Cool. I got to get your take, have you read Th..."
95912,094: Upgrading Your Professional Style with Sh...,Sharon Haver,Who’s that by?
95913,094: Upgrading Your Professional Style with Sh...,Pete Mockaitis,"Details Magazine, I believe."
95914,094: Upgrading Your Professional Style with Sh...,Sharon Haver,"I know, but it has an author."
95915,094: Upgrading Your Professional Style with Sh...,Pete Mockaitis,Daniel Perez.
95916,094: Upgrading Your Professional Style with Sh...,Sharon Haver,"No, because I read one a few years ago. I’m ac..."
95917,094: Upgrading Your Professional Style with Sh...,Pete Mockaitis,"How about a favorite tool, something you use o..."
95918,094: Upgrading Your Professional Style with Sh...,Sharon Haver,Oh my God. You know I’m like the internet tech...
95919,094: Upgrading Your Professional Style with Sh...,Pete Mockaitis,"A favorite habit, a personal practice of yours..."
95920,094: Upgrading Your Professional Style with Sh...,Sharon Haver,"Oh my God, just know when to give up, know whe..."


In [122]:
#That's how it's supposed to be
#And we found a for sure typo

df['text'] = df['text'].apply(lambda x: str(x).replace(u'Pete Mockatis', u'Pete Mockaitis'))
new_df.replace('Pete Mockatis', 'Pete Mockaitis', inplace = True)

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 391),
 ('Right.', 309),
 ('Yes.', 202),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Okay', 3),
 ('Brilliant.', 3),
 ('Absolutely. Absolutely

In [123]:
#Let's look at John Poelstra: Yes.

i = statement_df[statement_df['statement'] == 'John Poelstra: Yes.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement
100152,058: Better Meetings with John Poelstra - How ...,John Poelstra,need to be moved?
100153,058: Better Meetings with John Poelstra - How ...,John Poelstra,
100154,058: Better Meetings with John Poelstra - How ...,Pete Mockaitis,Oh that’s so good. That reminds me of David Al...
100155,058: Better Meetings with John Poelstra - How ...,Pete Mockaitis,"to do list, if it just says “mom,” that’s not ..."
100156,058: Better Meetings with John Poelstra - How ...,Pete Mockaitis,John Poelstra: Yes.
100157,058: Better Meetings with John Poelstra - How ...,John Poelstra,What are we going to do about mom? Are we goin...
100158,058: Better Meetings with John Poelstra - How ...,John Poelstra,Are we just going to give her a call because i...
100159,058: Better Meetings with John Poelstra - How ...,John Poelstra,I have seen agendas that just say “release dat...
100160,058: Better Meetings with John Poelstra - How ...,John Poelstra,"sense that, “Okay the release date is an impor..."
100161,058: Better Meetings with John Poelstra - How ...,John Poelstra,dose of thinking to determine what is the anti...


In [124]:
#This is another typo. We'll need to make some adjustments to our raw dataframe and recreate our working dataframe

#We need to change this text
df['text'] = df['text'].apply(lambda x: str(x).replace(u'John Poelstra: Yes.', 'John Poelstra\n Yes.\n'))

#And to fix that in our working dataframe, we actually need to recreate it.

lines = []

for index, row in df.iterrows():
    text = row['text']
    title = row['title']
    transcript = text.split("\n")
    for line in transcript:
        line = BeautifulSoup(line, 'html.parser').get_text(strip=True)
        lines.append([title, line])  
        
new_df = pd.DataFrame(lines, columns = ['title', 'text'])
new_df['text'] = new_df['text'].str.strip()
new_df['title'] = new_df['title'].str.strip()

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 391),
 ('Right.', 309),
 ('Yes.', 203),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Okay', 3),
 ('Brilliant.', 3),
 ('Absolutely. Absolutely

In [125]:
#And we found a for sure typo

df['text'] = df['text'].apply(lambda x: str(x).replace(u'Michael Kerr|', u'Michael Kerr'))
new_df.replace('Michael Kerr|', 'Michael Kerr', inplace = True)

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 391),
 ('Right.', 309),
 ('Yes.', 203),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Okay', 3),
 ('Brilliant.', 3),
 ('Absolutely. Absolutely

In [126]:
#Let's look at Tonya Dalton:

i = statement_df[statement_df['statement'] == 'Tonya Dalton:'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement
38607,506: Finding the Joy of Missing Out with Tonya...,Tonya Dalton,"So what happens is, if we’re checking email ev..."
38608,506: Finding the Joy of Missing Out with Tonya...,Tonya Dalton,"If instead we batch it, we have these bigger b..."
38609,506: Finding the Joy of Missing Out with Tonya...,Tonya Dalton,Now I have worked with other people who are li...
38610,506: Finding the Joy of Missing Out with Tonya...,Pete Mockaitis,"And so for you personally, four bouts of 15 to..."
38611,506: Finding the Joy of Missing Out with Tonya...,Pete Mockaitis,Tonya Dalton:
38612,506: Finding the Joy of Missing Out with Tonya...,Pete Mockaitis,"Yes. And like anything else, when you’re batch..."
38613,506: Finding the Joy of Missing Out with Tonya...,Pete Mockaitis,Anytime that we’re batching tasks and we’re do...
38614,506: Finding the Joy of Missing Out with Tonya...,Pete Mockaitis,
38615,506: Finding the Joy of Missing Out with Tonya...,Pete Mockaitis,
38616,506: Finding the Joy of Missing Out with Tonya...,Pete Mockaitis,"All right, so Tonya, when folks are on board, ..."


In [127]:
#That's a for sure typo

df['text'] = df['text'].apply(lambda x: str(x).replace(u'Tonya Dalton:', u'Tonya Dalton'))
new_df.replace('Tonya Dalton:', 'Tonya Dalton', inplace = True)

#Recreate our dataframe
statement_df = create_statement_df(new_df)

#Get the list of names
dirty_names = dirty_data(statement_df)
Counter(dirty_names).most_common()

[('Yeah.', 469),
 ('Okay.', 391),
 ('Right.', 309),
 ('Yes.', 203),
 ('Exactly.', 132),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Alright.', 42),
 ('No.', 34),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Perfect.', 15),
 ('Totally.', 14),
 ('Awesome.', 14),
 ('Oh.', 11),
 ('Nice.', 11),
 ('Understood.', 11),
 ('Really?', 10),
 ('Correct.', 10),
 ('Congratulations.', 10),
 ('Exactly. Exactly.', 10),
 ('Beautiful.', 10),
 ('Lovely.', 9),
 ('Thanks.', 9),
 ('Right?', 8),
 ('Excellent.', 8),
 ('Right. Right.', 7),
 ('Intriguing.', 7),
 ('Yes. Yes.', 7),
 ('Gotcha.', 7),
 ('Right. Exactly.', 7),
 ('Great.', 7),
 ('Okay. Cool.', 6),
 ('Hotdog.', 6),
 ('Yep.', 6),
 ('Agreed.', 6),
 ('Interesting.', 5),
 ('Okay. Understood.', 5),
 ('Thanks, Pete.', 5),
 ('Fascinating.', 5),
 ('Good.', 5),
 ('Yeah', 5),
 ('Maybe.', 4),
 ('Indeed.', 4),
 ('Zeigarnik.', 4),
 ('Okay. Good.', 4),
 ('Whoa.', 4),
 ('Okay. Yeah.', 4),
 ('Okay', 3),
 ('Brilliant.', 3),
 ('Absolutely. Absolutely

In [128]:
#And we've fixed all of those issues! 
#Let's look at any statement that is shorter than our longest name (4 words) and see if we find anything else amiss:

def count_words(statement):
    return len(statement.split())

filtered_df = statement_df
filtered_df['wordcount'] = statement_df['statement'].apply(count_words)

filtered_df = filtered_df[filtered_df['wordcount'] < 5]

short_statements = filtered_df['statement'].tolist()
Counter(short_statements).most_common()

[('', 6392),
 ('Yeah.', 469),
 ('Okay.', 391),
 ('Right.', 309),
 ('All right.', 253),
 ('Yes.', 203),
 ('Exactly.', 132),
 ('Thank you.', 100),
 ('And a favorite habit?', 96),
 ('Absolutely.', 92),
 ('Sure.', 84),
 ('Oh, yeah.', 76),
 ('That’s right.', 71),
 ('And a favorite book?', 66),
 ('There you go.', 63),
 ('Oh, sure.', 43),
 ('Alright.', 42),
 ('Oh yeah.', 40),
 ('No.', 34),
 ('You’re welcome.', 32),
 ('Wow.', 32),
 ('Certainly.', 32),
 ('Oh, thank you.', 31),
 ('No kidding.', 31),
 ('That’s good.', 28),
 ('Oh, wow.', 28),
 ('Yeah, exactly.', 27),
 ('Yup.', 27),
 ('Cool.', 24),
 ('Thanks for having me.', 24),
 ('And a favorite tool?', 23),
 ('Thank you so much.', 22),
 ('Oh, cool.', 22),
 ('That’s true.', 22),
 ('Mm-hmm.', 22),
 ('Yeah, absolutely.', 21),
 ('Got you.', 19),
 ('Well, thank you.', 18),
 ('Oh, okay.', 17),
 ('Yeah, yeah.', 17),
 ('Oh sure.', 17),
 ('That’s cool.', 15),
 ('Let’s do it.', 15),
 ('Perfect.', 15),
 ('Right, right.', 15),
 ('My pleasure.', 14),
 ('Tota

In [129]:
#Let's look at All right. Pete Mockaitis.

i = statement_df[statement_df['statement'] == 'All right. Pete Mockaitis.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement,wordcount
8855,"769: How to Command the Room, Connect with You...",Pete Mockaitis,That’s true. Lithuanian.,3
8856,"769: How to Command the Room, Connect with You...",Laura Sicola,"So, that one, I would slow it. Lithuanian, you...",15
8857,"769: How to Command the Room, Connect with You...",Pete Mockaitis,Yeah.,1
8858,"769: How to Command the Room, Connect with You...",Laura Sicola,"Very cool. So, then I would slow that one down...",46
8859,"769: How to Command the Room, Connect with You...",Pete Mockaitis,All right. Pete Mockaitis.,4
8860,"769: How to Command the Room, Connect with You...",Laura Sicola,"Mockaitis. So, I’d slow it down even a hair mo...",12
8861,"769: How to Command the Room, Connect with You...",Pete Mockaitis,Okay.,1
8862,"769: How to Command the Room, Connect with You...",Laura Sicola,Because it’ll feel weird to you but it won’t s...,14
8863,"769: How to Command the Room, Connect with You...",Pete Mockaitis,Okay. Pete Mockaitis.,3
8864,"769: How to Command the Room, Connect with You...",Laura Sicola,Yeah.,1


In [130]:
#That's how it's supposed to be
#Let's look at \u200b

i = statement_df[statement_df['statement'] == '\u200b'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement,wordcount
9205,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,for some new delight.,4
9206,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,"​The dark thought, the shame, the malice,",7
9207,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,"meet them at the door laughing,",6
9208,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,and invite them in.,4
9209,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,​,1
9210,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,"Be grateful for whoever comes,",5
9211,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,because each has been sent,5
9212,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,as a guide from beyond.”,5
9213,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,"So, that’s a poem. I think there’s some profun...",70
9214,767: How to Build Tremendous Mental Strength w...,Amy Morin,I think there’s a lot of power in just sometim...,84


Note: revisit episode 767

In [131]:
#There's some more stuff in here worth looking at:

# Count the occurrences of each element in the list
counter = Counter(short_statements)

# Get the most common elements with count less than 5
filtered_items = [(item, count) for item, count in counter.most_common() if count < 5]

# Sort the filtered items alphabetically
alphabetized_items = sorted(filtered_items, key=lambda x: x[0])

# Print the alphabetized list
for item in alphabetized_items:
    print(item)

('#Kardashians.', 1)
('$17.25?', 1)
('$300 Superman costume.', 1)
('-are uncomfortable talking.', 1)
('-dramatic torture or killing-', 1)
('.', 1)
('12.', 1)
('18-ish, right.', 1)
('19%?', 1)
('2007.', 1)
('2009, yeah.', 1)
('28.', 1)
('4, 0.', 1)
('75.', 1)
('A 100%, that’s right.', 1)
('A 100%.', 1)
('A bot.', 1)
('A favorite book?', 1)
('A favorite habit?', 1)
('A favorite research study?', 1)
('A favorite tool?', 1)
('A favorite tool? PowerPoint.', 1)
('A fellow guest.', 1)
('A finger.', 1)
('A guitar.', 1)
('A habit itself.', 1)
('A hundred percent.', 1)
('A hundred-forty total.', 1)
('A kid murderer?', 1)
('A little bit evil.', 1)
('A little bit.', 1)
('A little hint?', 1)
('A little nervous too.', 1)
('A particular nugget?', 2)
('A particular…', 1)
('A pleasure.', 2)
('A quake.', 1)
('A quote?', 1)
('A rationalization.', 1)
('A specific story or…?', 1)
('A teacher. Yeah. Right.', 1)
('ASL.', 1)
('About five pounds.', 1)
('About sleep?', 1)
('Absolutely not.', 2)
('Absolutely rig

In [132]:
#Let's look at that weird .

i = statement_df[statement_df['statement'] == '.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement,wordcount
99703,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,"I’ve watched that video, and I wish I had that...",78
99704,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,You could either watch Game of Thrones or you ...,78
99705,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,,0
99706,061: Crafting Your Perfect Day with Craig Ball...,Pete Mockaitis,That sounds tweetable,3
99707,061: Crafting Your Perfect Day with Craig Ball...,Pete Mockaitis,.,1
99708,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,Yeah,1
99709,061: Crafting Your Perfect Day with Craig Ball...,Pete Mockaitis,Could you give us maybe some examples of what ...,75
99710,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,"Great question again, and I want to say one th...",86
99711,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,"Now, you can also do it at night. I have a fri...",147
99712,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,It’s much easier and easy is a relative term h...,84


Note: revisit episode 061 and delete random .

In [133]:
#We'll delete that later
#Let's look at Jonathan.

i = statement_df[statement_df['statement'] == 'Jonathan.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement,wordcount
33156,552: The Foundational Principle that Separates...,Pete Mockaitis,"It’s like, “I would have to remove this from y...",82
33157,552: The Foundational Principle that Separates...,Patrick Lencioni,"Yeah, I absolutely agree. You know, Pete, I’m ...",127
33158,552: The Foundational Principle that Separates...,Pete Mockaitis,That’s cool.,2
33159,552: The Foundational Principle that Separates...,Patrick Lencioni,"So, it’s going to be fun watching. What’s your...",12
33160,552: The Foundational Principle that Separates...,Pete Mockaitis,Jonathan.,1
33161,552: The Foundational Principle that Separates...,Patrick Lencioni,Jonathan. It’s going to be fun talking to you ...,15
33162,552: The Foundational Principle that Separates...,Pete Mockaitis,"Oh, yes. Yes, I think so too.",7
33163,552: The Foundational Principle that Separates...,Patrick Lencioni,That’s an exciting thing.,4
33164,552: The Foundational Principle that Separates...,Pete Mockaitis,"So, here’s a scenario I thought I might run by...",167
33165,552: The Foundational Principle that Separates...,Pete Mockaitis,"And so then, I got you in my ear, thinking abo...",154


In [134]:
#That's how it's supposed to be

#Let's look at Okay [laughter].

i = statement_df[statement_df['statement'] == 'Okay [laughter].'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement,wordcount
104231,005: Pitching Like a Champion with Chris Westf...,Pete Mockaitis,Re trying to be persuasive. I think we talked ...,49
104232,005: Pitching Like a Champion with Chris Westf...,Chris Westfall,Have you ever heard somebody say the facts spe...,11
104233,005: Pitching Like a Champion with Chris Westf...,Pete Mockaitis,I have heard that.,4
104234,005: Pitching Like a Champion with Chris Westf...,Chris Westfall,"Or the numbers speak for themselves, or the dr...",21
104235,005: Pitching Like a Champion with Chris Westf...,Pete Mockaitis,Okay [laughter].,2
104236,005: Pitching Like a Champion with Chris Westf...,Chris Westfall,Which is why we’re talking.,5
104237,005: Pitching Like a Champion with Chris Westf...,Pete Mockaitis,All right.,2
104238,005: Pitching Like a Champion with Chris Westf...,Chris Westfall,To simply say the facts speak for themselves i...,21
104239,005: Pitching Like a Champion with Chris Westf...,Pete Mockaitis,All right.,2
104240,005: Pitching Like a Champion with Chris Westf...,Chris Westfall,If we could simply read things and get where w...,25


In [135]:
#That's how it's supposed to be

#Let's look at Pat McDaniel.

i = statement_df[statement_df['statement'] == 'Pat McDaniel.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement,wordcount
94893,101: Optimal Decision-Making with Pat McDaniel...,Pat McDaniel,"Well, I always want to say the I is my favorit...",137
94894,101: Optimal Decision-Making with Pat McDaniel...,Pete Mockaitis,"That’s great, thank you. And how about a favor...",15
94895,101: Optimal Decision-Making with Pat McDaniel...,Pat McDaniel,"Well, I thought about that. And one that I hav...",117
94896,101: Optimal Decision-Making with Pat McDaniel...,Pete Mockaitis,I can relate to that. It’s like some maybe sof...,23
94897,101: Optimal Decision-Making with Pat McDaniel...,Pete Mockaitis,Pat McDaniel.,2
94898,101: Optimal Decision-Making with Pat McDaniel...,Pete Mockaitis,Yeah. Or you just get so busy that your brain ...,79
94899,101: Optimal Decision-Making with Pat McDaniel...,Pete Mockaitis,And what it does is you can pick the chime and...,124
94900,101: Optimal Decision-Making with Pat McDaniel...,Pete Mockaitis,"Oh, that’s really cool. And I’ve done that in ...",60
94901,101: Optimal Decision-Making with Pat McDaniel...,Pat McDaniel,"Well, actually it only goes 6 times, so usuall...",27
94902,101: Optimal Decision-Making with Pat McDaniel...,Pete Mockaitis,"No, that’s fine. Cool. It sort of sounds like ...",33


In [136]:
#That's a for sure typo

df['text'] = df['text'].apply(lambda x: str(x).replace(u'Pat McDaniel.', u'Pat McDaniel'))
new_df.replace('Pat McDaniel.', 'Pat McDaniel', inplace = True)

#Recreate our dataframe
statement_df = create_statement_df(new_df)


#Let's look at The ums [inaudible 00:26:21].

i = statement_df[statement_df['statement'] == 'The ums [inaudible 00:26:21].'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement
101829,043: Finding Your Voice Power with Renee Grant...,Renee Grant-Williams,If you want to improve your speaking start lis...
101830,043: Finding Your Voice Power with Renee Grant...,Pete Mockaitis,Okay. That sounds good. You’ve given us a few ...
101831,043: Finding Your Voice Power with Renee Grant...,Renee Grant-Williams,In general speakers?
101832,043: Finding Your Voice Power with Renee Grant...,Pete Mockaitis,"Right. I’m just thinking about, you’re in a pr..."
101833,043: Finding Your Voice Power with Renee Grant...,Renee Grant-Williams,The ums [inaudible 00:26:21].
101834,043: Finding Your Voice Power with Renee Grant...,Pete Mockaitis,Ums. Okay.
101835,043: Finding Your Voice Power with Renee Grant...,Renee Grant-Williams,They’re hard to … I was watching a political r...
101836,043: Finding Your Voice Power with Renee Grant...,Pete Mockaitis,It’s youthfulness versus experience. It’s like...
101837,043: Finding Your Voice Power with Renee Grant...,Renee Grant-Williams,Right.
101838,043: Finding Your Voice Power with Renee Grant...,Pete Mockaitis,For having a little bit of grey hair.


In [137]:
#That's how it's supposed to be

#Let's look at Zajonc.

i = statement_df[statement_df['statement'] == 'Zajonc.'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement
70453,293: Body Language Insights that Get You Promo...,Pete Mockaitis,That makes me smile saying it.
70454,293: Body Language Insights that Get You Promo...,Pete Mockaitis,
70455,293: Body Language Insights that Get You Promo...,Denise Dudley,"Yeah, I know."
70456,293: Body Language Insights that Get You Promo...,Denise Dudley,
70457,293: Body Language Insights that Get You Promo...,Pete Mockaitis,Zajonc.
70458,293: Body Language Insights that Get You Promo...,Pete Mockaitis,
70459,293: Body Language Insights that Get You Promo...,Denise Dudley,"Zajonc, yeah. He did all kinds of interesting ..."
70460,293: Body Language Insights that Get You Promo...,Denise Dudley,
70461,293: Body Language Insights that Get You Promo...,Pete Mockaitis,That’s intriguing. It sounds like part of the ...
70462,293: Body Language Insights that Get You Promo...,Pete Mockaitis,


In [138]:
#That's how it's supposed to be

#Let's look at the timestamps starting with[00:12:12]

i = statement_df[statement_df['statement'] == '[00:12:12]'].index[0]

title_pete = statement_df.at[i, 'title']
pete_df = statement_df[statement_df['title'] == title_pete]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement
78631,238: The Ingredients of a Great First Impressi...,Pete Mockaitis,"Well, it’s funny, I’m right now thinking about..."
78632,238: The Ingredients of a Great First Impressi...,Pete Mockaitis,"So, I was playing the game and I went back to ..."
78633,238: The Ingredients of a Great First Impressi...,Ann Demarais,"So, he was like generally interested and total..."
78634,238: The Ingredients of a Great First Impressi...,Pete Mockaitis,"Well, I don’t think he was really interested, ..."
78635,238: The Ingredients of a Great First Impressi...,Pete Mockaitis,[00:12:12]
78636,238: The Ingredients of a Great First Impressi...,Pete Mockaitis,"And so, I thought that was just noteworthy bec..."
78637,238: The Ingredients of a Great First Impressi...,Ann Demarais,"Right. So, he was connecting and elevating and..."
78638,238: The Ingredients of a Great First Impressi...,Pete Mockaitis,Right. And I think the levity in particular is...
78639,238: The Ingredients of a Great First Impressi...,Ann Demarais,"Part of it is being in the moment. I mean, he ..."
78640,238: The Ingredients of a Great First Impressi...,Pete Mockaitis,"Yes, that’s great way to say it, feeding off t..."


In [139]:
#We need to isolate these and setup a systematic approach for handling them

# Filter the DataFrame to get rows where the 'Statement' column starts with '[some number:some number'
filtered_df = statement_df[statement_df['statement'].str.match(r'^\[\d{1,2}:\d{2}')]

filtered_df.head()

Unnamed: 0,title,speaker,statement
1483,825: The Six Steps of Masterful Delegation wit...,Aaron Schmookler,[39:21]
13395,732: How Aspiring Leaders Can Succeed Today wi...,Pete Mockaitis,[06:02]
15411,714: How to Find Success and Purpose with Tany...,Tanya Dalton,[03:22]
17972,689: How Introverts Win at Work with Jennifer ...,Jennifer Kahnweiler,[06:05]
19410,675: How to Boost Your Brain for Better Happin...,Pete Mockaitis,[03:12]


In [140]:
filtered_df.shape

(75, 3)

In [141]:
#Let's take a gander at these
short_statements = filtered_df['statement'].tolist()
Counter(short_statements).most_common()

[('[15:00]', 3),
 ('[6:00]', 3),
 ('[3:00]', 3),
 ('[06:05]', 2),
 ('[03:12]', 2),
 ('[36:01]', 2),
 ('[18:00]', 2),
 ('[24:00]', 2),
 ('[12:00]', 2),
 ('[33:00]', 2),
 ('[39:21]', 1),
 ('[06:02]', 1),
 ('[03:22]', 1),
 ('[09:16]', 1),
 ('[27:05]', 1),
 ('[03:31]', 1),
 ('[15:50]', 1),
 ('[33:28]', 1),
 ('[36:18]', 1),
 ('[09:07]', 1),
 ('[30:02]', 1),
 ('[06:38]', 1),
 ('[39:03]', 1),
 ('[39:25]', 1),
 ('[30:04]', 1),
 ('[03:02]', 1),
 ('[06:01]', 1),
 ('[09:15]', 1),
 ('[12:28]', 1),
 ('[15:01]', 1),
 ('[18:15]', 1),
 ('[21:16]', 1),
 ('[24:07]', 1),
 ('[27:10]', 1),
 ('[30:27]', 1),
 ('[33:04]', 1),
 ('[36:20]', 1),
 ('[39:05]', 1),
 ('[42:12]', 1),
 ('[45:00]', 1),
 ('[48:03]', 1),
 ('[51:02]', 1),
 ('[24:27]', 1),
 ('[11:55]', 1),
 ('[51:00]', 1),
 ('[54:00]', 1),
 ('[57:00]', 1),
 ('[9:00]', 1),
 ('[21:00]', 1),
 ('[27:00]', 1),
 ('[30:00]', 1),
 ('[36:00]', 1),
 ('[39:00]', 1),
 ('[00:33:03]', 1),
 ('[00:18:14]', 1),
 ('[00:18:10]', 1),
 ('[00:12:12]', 1),
 ('[00:33:20]', 1),
 (

In [142]:
#We've gone through and reviewed a sufficient chunk of these in the original transcript. I feel comfortable just removing them

for index, row in filtered_df.iterrows():
    timestamp = filtered_df.at[index, 'statement']
    df['text'] = df['text'].apply(lambda x: str(x).replace(timestamp, ''))
    new_df.replace(timestamp, '', inplace = True)

In [143]:
#Let's move on to some other things we should consider
#Some of these messages are something to do with a sponsor and that doesn't contribute to our conversational dialogue

#Recreate our dataframe
statement_df = create_statement_df(new_df)
#Find the messages that include the words insert sponsor
filtered_df = statement_df[statement_df['statement'].str.contains('insert sponsor', case = False)]

filtered_df.head(15)

Unnamed: 0,title,speaker,statement
19144,678: How to Win Trust and Connect Masterfully ...,Riaz Meghji,Insert sponsor: Care.com
32638,557: How to Outthink Fear with Dr. Mark McLaug...,Mark McLaughlin,{Insert sponsor: Formstack}
35132,534: Moving from Top Performer to Excellent Le...,Ryan Hawk,{Insert Sponsor: Feder Play
40048,493: How to Amplify Your Impact through Great ...,Anese Cavanaugh,{Insert Sponsor Simple habit}
46652,440: Accomplishing More in Less Time by Buildi...,Pete Mockaitis,{Insert sponsor High Brew Coffee: One of the m...
63244,335: Become a High Performer in Eight (Scienti...,Marc Effron,{Insert Sponsor here}
79550,230: How to Get an MBA Education (and more!) f...,Pete Mockaitis,[Insert sponsor here]
80199,225: How to Build Your Dream Network with J. K...,J. Kelly Hoey,INSERT SPONSOR
83173,199: Supercharging Your Productivity with Erik...,Erik Fisher,[Insert Sponsor]
85406,181: How to Hone Your Strengths at a Job You L...,Pete Mockaitis,[INSERT SPONSOR HERE]


In [144]:
#We can remove those
for index, row in filtered_df.iterrows():
    timestamp = filtered_df.at[index, 'statement']
    df['text'] = df['text'].apply(lambda x: str(x).replace(timestamp, ''))
    new_df.replace(timestamp, '', inplace = True)
    
#We can also remove \u200b
df['text'] = df['text'].apply(lambda x: str(x).replace(u'\u200b', u''))
new_df.replace('\u200b', '', inplace = True)

#And let's address the empty statements
new_df = new_df[new_df['text'] != '']

#Recreate our dataframe
statement_df = create_statement_df(new_df)

In [145]:
#And let's look to see if there's anything else glaring to address in the statements
filtered_df = statement_df
filtered_df['wordcount'] = statement_df['statement'].apply(count_words)

filtered_df = filtered_df[filtered_df['wordcount'] < 5]

short_statements = filtered_df['statement'].tolist()

# Count the occurrences of each element in the list
counter = Counter(short_statements)

# Get the most common elements
filtered_items = counter.most_common()

# Sort the filtered items alphabetically
alphabetized_items = sorted(filtered_items, key=lambda x: x[0])

# Print the alphabetized list
for item in alphabetized_items:
    print(item)

('#Kardashians.', 1)
('$17.25?', 1)
('$300 Superman costume.', 1)
('-are uncomfortable talking.', 1)
('-dramatic torture or killing-', 1)
('.', 1)
('12.', 1)
('18-ish, right.', 1)
('19%?', 1)
('2007.', 1)
('2009, yeah.', 1)
('28.', 1)
('4, 0.', 1)
('75.', 1)
('A 100%, that’s right.', 1)
('A 100%.', 1)
('A bot.', 1)
('A favorite book?', 1)
('A favorite habit?', 1)
('A favorite research study?', 1)
('A favorite tool?', 1)
('A favorite tool? PowerPoint.', 1)
('A fellow guest.', 1)
('A finger.', 1)
('A guitar.', 1)
('A habit itself.', 1)
('A hundred percent.', 1)
('A hundred-forty total.', 1)
('A kid murderer?', 1)
('A little bit evil.', 1)
('A little bit.', 1)
('A little hint?', 1)
('A little nervous too.', 1)
('A particular nugget?', 2)
('A particular…', 1)
('A pleasure.', 2)
('A quake.', 1)
('A quote?', 1)
('A rationalization.', 1)
('A specific story or…?', 1)
('A teacher. Yeah. Right.', 1)
('ASL.', 1)
('About five pounds.', 1)
('About sleep?', 1)
('Absolutely not.', 2)
('Absolutely rig

In [146]:
#I'm feeling really good about where this is. Let's review the episodes we made notes to revisit:
#415, 234, 105, 767, and 061 (specifically that period)

#Let's start with episode 415
ep_df = statement_df[statement_df['title'].str.startswith('415')]
ep_df.head(10)

Unnamed: 0,title,speaker,statement,wordcount
48135,415: Pursuing Your Passion the Smart Way with ...,Pete Mockaitis,"Brad, welcome back to the How to be Awesome at...",13
48136,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,"Hey, thanks so much for having me.",7
48137,415: Pursuing Your Passion the Smart Way with ...,Pete Mockaitis,"Well, I’m excited to dig into your next book, ...",20
48138,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,My love of cats. How do you know I love cats?,11
48139,415: Pursuing Your Passion the Smart Way with ...,Pete Mockaitis,"Well, there’s a form I have guests fill out ab...",10
48140,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,"Oh, I said I loved-",5
48141,415: Pursuing Your Passion the Smart Way with ...,Pete Mockaitis,"Oh, yeah, you totally … You just gave it up th...",36
48142,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,"I’ve got two, as you said, Sonny and Bryant an...",31
48143,415: Pursuing Your Passion the Smart Way with ...,Pete Mockaitis,"Well, tell me what are some of the goofy behav...",10
48144,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,"The goofy behaviors. Well, let’s see. So Sonny...",119


In [147]:
ep_df.tail(10)

Unnamed: 0,title,speaker,statement,wordcount
48228,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,Meditation. That is a daily practice for me an...,57
48229,415: Pursuing Your Passion the Smart Way with ...,Pete Mockaitis,Is there particular nugget you share that real...,22
48230,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,I think it’s really important to ask yourself ...,121
48231,415: Pursuing Your Passion the Smart Way with ...,Pete Mockaitis,"And Brad, if folks want to learn more or get i...",17
48232,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,So you can get in touch on Twitter where I am ...,30
48233,415: Pursuing Your Passion the Smart Way with ...,Pete Mockaitis,And do you have a final challenge or call to a...,21
48234,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,I obviously am going to encourage folks to rea...,94
48235,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,We spent a lot of time talking about this neat...,67
48236,415: Pursuing Your Passion the Smart Way with ...,Pete Mockaitis,"Well Brad, thank you so much for sharing the g...",27
48237,415: Pursuing Your Passion the Smart Way with ...,Brad Stulberg,Thanks so much Pete. I really enjoyed being on...,11


In [148]:
#Looks good to me, let's look at 234

ep_df = statement_df[statement_df['title'].str.startswith('234')]
ep_df.head(10)

Unnamed: 0,title,speaker,statement,wordcount


In [149]:
#I think we already removed that one...
#Let's get the index of 233 and use that to get the index for the tail of 234
ep_df = statement_df[statement_df['title'].str.startswith('233')]
ep_df.head(10)

Unnamed: 0,title,speaker,statement,wordcount
74456,233: Best Practices for Better Relationships a...,Pete Mockaitis,"Todd, thanks so much for joining us here on th...",18
74457,233: Best Practices for Better Relationships a...,Todd Davis,"Thank you, Pete. I’m happy to be here.",8
74458,233: Best Practices for Better Relationships a...,Pete Mockaitis,"Well, I’m excited myself. So, I want to get th...",33
74459,233: Best Practices for Better Relationships a...,Todd Davis,"Yeah, thanks for asking. It is, it’s an honor....",118
74460,233: Best Practices for Better Relationships a...,Todd Davis,He used the analogy of the old nursery rhyme T...,158
74461,233: Best Practices for Better Relationships a...,Pete Mockaitis,"Understood, intriguing. And so now, I remember...",50
74462,233: Best Practices for Better Relationships a...,Todd Davis,"I did, and what a privilege for me. As I said,...",86
74463,233: Best Practices for Better Relationships a...,Todd Davis,I used to at one point in my career work in th...,91
74464,233: Best Practices for Better Relationships a...,Pete Mockaitis,"I love it. Well, could you maybe share an anec...",160
74465,233: Best Practices for Better Relationships a...,Todd Davis,"Well, what a great story that you just shared....",139


In [150]:
statement_df.iloc[74445:74455]

Unnamed: 0,title,speaker,statement,wordcount
74445,235: The Power of Finding Your Why with David ...,Pete Mockaitis,All right. Thank you. And how about a favorite...,19
74446,235: The Power of Finding Your Why with David ...,David Mead,Can I give you a habit?,6
74447,235: The Power of Finding Your Why with David ...,Pete Mockaitis,"Oh, let’s take it.",4
74448,235: The Power of Finding Your Why with David ...,David Mead,"Yeah, so a habit that I have is every time rig...",107
74449,235: The Power of Finding Your Why with David ...,David Mead,"I’m not thinking about, “Well, jeez, I hope th...",80
74450,235: The Power of Finding Your Why with David ...,Pete Mockaitis,"Oh, thanks. And do you have a final challenge ...",24
74451,235: The Power of Finding Your Why with David ...,David Mead,"I think, again, it ties to what we’ve been tal...",75
74452,235: The Power of Finding Your Why with David ...,David Mead,Human beings are not inspired to make a huge p...,81
74453,235: The Power of Finding Your Why with David ...,David Mead,"It shouldn’t be like, we shouldn’t feel lucky ...",80
74454,235: The Power of Finding Your Why with David ...,Pete Mockaitis,"All right. Well, David, this has been a real t...",42


In [151]:
#Yep. 234 has already been removed!
#Let's look at 105

ep_df = statement_df[statement_df['title'].str.startswith('105')]
ep_df.head(10)

Unnamed: 0,title,speaker,statement,wordcount
88578,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,"Bob, thanks so much for joining us here on the...",18
88579,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,"Thanks for having me, Pete. I’m excited to be ...",10
88580,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,"Oh, I’m so excited to have you. I think we’re ...",45
88581,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,Uh-oh. You’re setting the expectations pretty ...,8
88582,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,Don’t fail me and the thousands listening.,7
88583,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,I’ll do the best I can.,6
88584,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,"Super. Actually, we’ll start with a rather ser...",104
88585,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,I could introduce you to many. So it’s really ...,65
88586,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,I would like a hard-nosed business money-lovin...,13
88587,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,"Oh, absolutely. Okay. So this was actually men...",71


In [152]:
ep_df.tail(10)

Unnamed: 0,title,speaker,statement,wordcount
88694,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,Yeah.,1
88695,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,“There’s more than one way to hit a piñata” se...,55
88696,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,"Oh, good. Thank you. And what would you say is...",27
88697,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,Contact Business Improv. That’s my company. Or...,24
88698,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,There you go. Releasing shortly. Coming out ve...,9
88699,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,Yeah. January 24th. Stanford University Press....,10
88700,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,Very cool. And would you say you have a final ...,29
88701,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,Yes. Try. Just simply try. Create places in wh...,54
88702,"105: “Yes, and…” approaches to improv-ing work...",Pete Mockaitis,"Fantastic. Thank you. Well, Bob, this has been...",36
88703,"105: “Yes, and…” approaches to improv-ing work...",Bob Kulhan,Yeah! Woo woo! The pleasure is mine. I appreci...,15


In [153]:
#Looks good to me, let's look at 767

ep_df = statement_df[statement_df['title'].str.startswith('767')]
ep_df.head(10)

Unnamed: 0,title,speaker,statement,wordcount
8273,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,"Amy, welcome to How to be Awesome at Your Job.",10
8274,767: How to Build Tremendous Mental Strength w...,Amy Morin,"Hey, thanks for having me.",5
8275,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,"Well, I’m so excited to dig into your wisdom. ...",42
8276,767: How to Build Tremendous Mental Strength w...,Amy Morin,"It is true. So, I guess six years ago, we deci...",128
8277,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,"So, as we speak, you’re on a sailboat?",8
8278,767: How to Build Tremendous Mental Strength w...,Amy Morin,"I am, yes.",3
8279,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,I don’t see anything rocking.,5
8280,767: How to Build Tremendous Mental Strength w...,Amy Morin,"Yeah. So, a lot of the time, because I need su...",28
8281,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,"And so, has that been working out well for you...",30
8282,767: How to Build Tremendous Mental Strength w...,Amy Morin,"Yeah. So, there are some pros and cons. The pr...",101


In [154]:
ep_df.tail(10)

Unnamed: 0,title,speaker,statement,wordcount
8444,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,"And a favorite habit, something you do that he...",15
8445,767: How to Build Tremendous Mental Strength w...,Amy Morin,I would say running every day.,6
8446,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,And is there a key nugget you share that peopl...,28
8447,767: How to Build Tremendous Mental Strength w...,Amy Morin,"Yeah, I think I said something to the effect o...",27
8448,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,All right. And if folks want to learn more or ...,18
8449,767: How to Build Tremendous Mental Strength w...,Amy Morin,My website AmyMorinLCSW.com.,3
8450,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,And do you have a final challenge or call to a...,20
8451,767: How to Build Tremendous Mental Strength w...,Amy Morin,I would say set a goal this week and challenge...,42
8452,767: How to Build Tremendous Mental Strength w...,Pete Mockaitis,"All right. Amy, thank you. This has been a tre...",20
8453,767: How to Build Tremendous Mental Strength w...,Amy Morin,Thank you. I appreciate it.,5


In [155]:
#Looks good to me, that leaves 061 and that "."

i = statement_df[statement_df['statement'] == '.'].index[0]
statement_df.iloc[i-4:i+6]

Unnamed: 0,title,speaker,statement,wordcount
93468,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,"For example, this summer alone, I spent 3 week...",118
93469,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,"I’ve watched that video, and I wish I had that...",78
93470,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,You could either watch Game of Thrones or you ...,78
93471,061: Crafting Your Perfect Day with Craig Ball...,Pete Mockaitis,That sounds tweetable,3
93472,061: Crafting Your Perfect Day with Craig Ball...,Pete Mockaitis,.,1
93473,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,Yeah,1
93474,061: Crafting Your Perfect Day with Craig Ball...,Pete Mockaitis,Could you give us maybe some examples of what ...,75
93475,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,"Great question again, and I want to say one th...",86
93476,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,"Now, you can also do it at night. I have a fri...",147
93477,061: Crafting Your Perfect Day with Craig Ball...,Craig Ballantyne,It’s much easier and easy is a relative term h...,84


In [156]:
#For the initial dataframe, we should be able to do this pretty easily
df['text'] = df['text'].apply(lambda x: str(x).replace(u'That sounds tweetable\n.', u'That sounds tweetable.\n'))

#For the working dataframe, we should be able to just remove that line.
new_df = new_df[new_df['text'] != '.']

#And recreate our dataframe
statement_df = create_statement_df(new_df)

In [157]:
statement_df.tail()

Unnamed: 0,title,speaker,statement
98339,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"Now, actually the really sad thing about the s..."
98340,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"It was really simple, but he says, “I didn’t s..."
98341,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"Are you going to say, “Well I spent my life do..."
98342,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"I would challenge anybody here, life is too pr..."
98343,001: Communicating with Inspiration and Clarit...,Pete Mockaitis,Fantastic. Mawi thanks so much for kicking us ...


In [160]:
#Last thing I want to do is concatenate the strings so that we only get a new line when the speaker changes

lines = []
current_speaker = None
current_episode = None
current_text = ''
for index, row in statement_df.iterrows():
    if row['speaker'] == current_speaker and row['title'] == current_episode:
        current_text += ' ' + row['statement']
    else:
        if current_speaker is not None:
            lines.append([current_episode, current_speaker, current_text])
        current_speaker = row['speaker']
        current_episode = row['title']
        current_text = row['statement']

#Add the last row
lines.append([current_episode, current_speaker, current_text])
#Create the final dataframe
final_df = pd.DataFrame(lines, columns = ['title', 'speaker', 'statement'])

final_df.tail(10)

Unnamed: 0,title,speaker,statement
60747,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"All the business tools, Google Drive, Google D..."
60748,001: Communicating with Inspiration and Clarit...,Pete Mockaitis,Yeah.
60749,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,Okay? We don’t even realize how productive tec...
60750,001: Communicating with Inspiration and Clarit...,Pete Mockaitis,That’s fantastic. How about a favorite habit t...
60751,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,Yeah. I would say the early morning. One of th...
60752,001: Communicating with Inspiration and Clarit...,Pete Mockaitis,All right. Favorite way to find you if you wan...
60753,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"Yeah, mawilearning.com, M-A-W-I learning dot c..."
60754,001: Communicating with Inspiration and Clarit...,Pete Mockaitis,Favorite parting tip or thought or a call to a...
60755,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"Yeah. One thing I thought about a lot, and I t..."
60756,001: Communicating with Inspiration and Clarit...,Pete Mockaitis,Fantastic. Mawi thanks so much for kicking us ...


In [161]:
#Let's double check our work
statement_df.tail(10)

Unnamed: 0,title,speaker,statement
98334,001: Communicating with Inspiration and Clarit...,Pete Mockaitis,All right. Favorite way to find you if you wan...
98335,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"Yeah, mawilearning.com, M-A-W-I learning dot c..."
98336,001: Communicating with Inspiration and Clarit...,Pete Mockaitis,Favorite parting tip or thought or a call to a...
98337,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"Yeah. One thing I thought about a lot, and I t..."
98338,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,They assign you a mentor to help you with that...
98339,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"Now, actually the really sad thing about the s..."
98340,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"It was really simple, but he says, “I didn’t s..."
98341,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"Are you going to say, “Well I spent my life do..."
98342,001: Communicating with Inspiration and Clarit...,Mawi Asgedom,"I would challenge anybody here, life is too pr..."
98343,001: Communicating with Inspiration and Clarit...,Pete Mockaitis,Fantastic. Mawi thanks so much for kicking us ...


In [163]:
#Looks good, let's do a little more checking
final_df.at[60755, 'statement']

'Yeah. One thing I thought about a lot, and I think folks in the corporate world will really be able to identify with this. I hope I’m not convincing you right now to quit your jobs and do something else, because that’s not my intent. I remember when I was giving the graduation speech at Harvard, it’s a lot of pressure. It’s 30,000 people that you’re speaking to. It’s graduation day, you don’t have your notes, you have to speak from memory, and there’s not a podium it’s just a thin mic in the crowd. They assign you a mentor to help you with that, who reviews your speaking. You have speaking training with him 2, 3 times a week. By the way, one of the things he taught, this is a more minor point, was, “You’ve got to hit the adjectives when you speak because the adjectives describe.” Instead of saying, “Wow, that’s a beautiful car,” you say, “That’s a beautiful car.” Now all of a sudden it pops. That’s another little speaking tip for people, hit the adjective. Now, actually the really sad

In [165]:
#And what did it do for the end of an episode
final_df.iloc[45:51]

Unnamed: 0,title,speaker,statement
45,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,Here’s the challenge that I want to leave you ...
46,835: How to Thrive amid Stress and Irritation ...,Pete Mockaitis,"All right. Beautiful. Sharon, this has been a ..."
47,835: How to Thrive amid Stress and Irritation ...,Sharon Melnick,Thank you.
48,834: How to End Micromanagement Once and For A...,Pete Mockaitis,"Lia, welcome back to How to be Awesome at Your..."
49,834: How to End Micromanagement Once and For A...,Lia Garvin,Thank you so much for having me. So excited to...
50,834: How to End Micromanagement Once and For A...,Pete Mockaitis,"Well, I’m excited to dig into what you’ve been..."


In [170]:
#And what did it do for the end of an episode
final_df.iloc[124:135]

Unnamed: 0,title,speaker,statement
124,834: How to End Micromanagement Once and For A...,Pete Mockaitis,All right. And if folks want to learn more or ...
125,834: How to End Micromanagement Once and For A...,Lia Garvin,"Yeah, so I would say connect with me on Linked..."
126,834: How to End Micromanagement Once and For A...,Pete Mockaitis,Okay. And do you have a final challenge or cal...
127,834: How to End Micromanagement Once and For A...,Lia Garvin,"Yeah, I think for folks, giving yourself a lit..."
128,834: How to End Micromanagement Once and For A...,Pete Mockaitis,"All right. Lia, thanks. This has been a treat...."
129,834: How to End Micromanagement Once and For A...,Lia Garvin,Thank you so much.
130,834: How to End Micromanagement Once and For A...,Pete Mockaitis,All right.
131,833: The Four-Step Process to Influencing Peop...,Pete Mockaitis,"Andres, welcome to How to be Awesome at Your Job."
132,833: The Four-Step Process to Influencing Peop...,Andres Lares,Thank you for having me.
133,833: The Four-Step Process to Influencing Peop...,Pete Mockaitis,"Well, I’m so excited to hear your insights on ..."


In [171]:
#And let's save it as a csv

final_df.to_csv("../../data/htbaayj/speaker_tagged.csv")