# CNN Presidential Debate Speech Analysis

## Setup

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

### Scraper

In [2]:
url = "https://www.cnn.com/2024/06/27/politics/read-biden-trump-debate-rush-transcript/index.html"
html = requests.get(url).text

In [3]:
soup = BeautifulSoup(html)

In [4]:
article = soup.find('div', class_='article__content')
article

<div class="article__content" data-editable="content" data-reorderable="content" itemprop="articleBody">
<div class="source inline-placeholder" data-article-gutter="true" data-uri="cms.cnn.com/_components/source/instances/clxy4twa2002p65qj0f1o9t3m@published">
<cite class="source__cite">
<span class="source__location" data-editable="location"></span>
<span class="source__text" data-editable="source">CNN</span>
         — 
    </cite>
</div>
<p class="paragraph inline-placeholder vossi-paragraph-primary-core-light" data-article-gutter="true" data-component-name="paragraph" data-editable="text" data-uri="cms.cnn.com/_components/paragraph/instances/clxy4twa2002q65qj0td56mfh@published">
<a href="https://www.cnn.com/politics/joe-biden">President Joe Biden </a>and former President <a href="https://www.cnn.com/politics/president-donald-trump-45">Donald Trump </a>participated in their <a href="https://www.cnn.com/politics/live-news/cnn-debate-trump-biden-06-27-24/index.html">first debate </a>of

In [5]:
text = []
for p in article:
    text.append(p.text)

text

['\n',
 '\n\n\nCNN\n        \xa0—\xa0\n    \n',
 '\n',
 '\nPresident Joe Biden and former President Donald Trump participated in their first debate of the 2024 election season on CNN in Atlanta Thursday.\n    ',
 '\n',
 '\n            Read the final, corrected transcript of the debate below:\n    ',
 '\n',
 '\nJAKE TAPPER, CNN MODERATOR:\xa0We’re live from Georgia, a key battleground state in the race for the White House. In just moments, the current U.S. president will debate the former U.S. president as their parties’ presumptive nominees, a first in American history.\n    ',
 '\n',
 '\n            We want to welcome our viewers in the United States and around the world to our studios in Atlanta.\n    ',
 '\n',
 '\n            This is the CNN presidential debate.\n    ',
 '\n',
 '\nDANA BASH, CNN MODERATOR:\xa0This debate is being produced by CNN and it’s coming to you live on CNN, CNN International,\xa0CNN.com, CNN Max, and CNN Espanol.\n    ',
 '\n',
 '\n            This is a pivot

### Assign text to speaker

In [6]:
transcript = ' '.join(text)

# List of speakers
speakers = ['BIDEN', 'TRUMP', 'BASH', 'TAPPER']

# Dictionary to store each speaker's text
speaker_texts = {speaker: [] for speaker in speakers}

# Regular expression to split the transcript
pattern = r'({}):\s*'.format('|'.join(speakers))
segments = re.split(pattern, transcript)

current_speaker = None
for segment in segments:
    if segment in speakers:
        current_speaker = segment
    elif current_speaker and segment.strip():
        speaker_texts[current_speaker].append(segment.strip())

speaker_texts

{'BIDEN': ['You have to take a look at what I was left when I became president, what Mr. Trump left me.\n     \n \n            We had an economy that was in freefall. The pandemic are so badly handled, many people were dying. All he said was, it’s not that serious. Just inject a little bleach in your arm. It’d be all right.\n     \n \n            The economy collapsed. There were no jobs. Unemployment rate rose to 15 percent. It was terrible.\n     \n \n            And so, what we had to do is try to put things back together again. That’s exactly what we began to do. We created 15,000 new jobs. We brought on – in a position where we have 800,000 new manufacturing jobs.\n     \n \n            But there’s more to be done. There’s more to be done. Working class people are still in trouble.\n     \n \n            I come from Scranton, Pennsylvania. I come from a household where the kitchen table – if things weren’t able to be met during the month was a problem. Price of eggs, the price of 

### Save speaker_texts as json

In [7]:
# save speaker_texts as json
#import json
#with open('speaker_texts.json', 'w') as f:
#    json.dump(speaker_texts, f)

## Basic Word Count

In [8]:
# Function to count words
def word_count(text):
    return len(re.findall(r'\w+', text))

# Count words for TRUMP
trump_words = sum(word_count(segment) for segment in speaker_texts['TRUMP'])
trump_words

8274

In [9]:
# Function to count words
def word_count(text):
    return len(re.findall(r'\w+', text))

# Count words for BIDEN
biden_words = sum(word_count(segment) for segment in speaker_texts['BIDEN'])
biden_words

7021

In [10]:
speaker_texts['TRUMP']

# find the longest sentence

# Function to find the longest sentence
def longest_sentence(text):
    sentences = re.findall(r'[^.!?]+[.!?]', text)
    longest = max(sentences, key=len)
    return longest

# Find the longest sentence for TRUMP  
longest_trump = longest_sentence(' '.join(speaker_texts['TRUMP']))
longest_trump

'\n     \n \n            I’ll tell you what happened, he was so bad with Afghanistan, it was such a horrible embarrassment, most embarrassing moment in the history of our country, that when Putin watched that and he saw the incompetence that he should – he should have fired those generals like I fired the one that you mentioned, and so he’s got no love lost.'

In [11]:
speaker_texts['BIDEN']

# find the longest sentence

# Function to find the longest sentence
def longest_sentence(text):
    sentences = re.findall(r'[^.!?]+[.!?]', text)
    longest = max(sentences, key=len)
    return longest

# Find the longest sentence for TRUMP  
longest_biden = longest_sentence(' '.join(speaker_texts['BIDEN']))
longest_biden

'\n     \n \n            I made sure that we’re in a situation where all those black families and those black individuals who provided had to take out student loans that were ballooning, that if they were engaged in nursing and anything having to do with volunteerism, if they paid their bills for 10 years on their student debt, all the rest was forgiven after 10 years.'

## Speech Analyzer

### Setup

In [12]:
from collections import Counter

In [13]:
def process_text(text):
    # Convert to lowercase and split into words
    words = re.findall(r'\w+', text.lower())
    # Remove common stop words (you can expand this list)
    stop_words = set([
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
        'he', 'i', 'it', 'we', 'that', 's', 'they', 'you', 'is', 'are', 'was', 'were', 'be',
        'this', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should',
        'there', 'their', 'them', 'these', 'those', 'am', 'as', 'if', 'so', 'than', 'then',
        'no', 'not', 'only', 'very', 'can', 'just', 'should', 'now', 't', 're', 'who', 'what', 'said', 'like'
    ])
    return [word for word in words if word not in stop_words]

def analyze_speech(speaker):
    all_words = process_text(' '.join(speaker_texts[speaker]))
    word_counts = Counter(all_words)
    
    print(f"\nAnalysis for {speaker}:")
    print(f"Total words spoken (excluding stop words): {len(all_words)}")
    print(f"Number of times 'economy' was mentioned: {word_counts['economy']}")
    print(f"Number of times 'jobs' was mentioned: {word_counts['jobs']}")
    print(f"Number of times 'abortion' was mentioned: {word_counts['abortion']}")
    print(f"Number of times 'climate' was mentioned: {word_counts['climate']}")
    print(f"Number of times 'golf' was mentioned: {word_counts['golf']}")
    print(f"Number of times 'border' was mentioned: {word_counts['border']}")
    print(f"Number of times 'childcare' was mentioned: {word_counts['childcare']}")
    print(f"Number of times 'child' was mentioned: {word_counts['child']}")
    print(f"Number of times 'vaccine' was mentioned: {word_counts['vaccine']}")
    print(f"Number of times 'Russia' was mentioned: {word_counts['russia']} versus the number of times Ukraine was used' {word_counts['ukraine']}")
    print(f"Number of times 'taxes' was mentioned: {word_counts['taxes']}")
    print(f"Number of times Biden or Trump called something 'great': {word_counts['great']}")
    print(f"Number of times Biden referred to Trump as 'Trump' {word_counts['trump']} versus the number of times he referred to Trump as 'this guy' {word_counts['guy']}")
    print(f"Number of times the speech was 'inaudible' : {word_counts['inaudible']}")    
    print(f"Number of times the speech was 'Hamas' : {word_counts['hamas']}")
    print(f"Number of times the speech was 'Palestinian' : {word_counts['palestinian']} versus the number of times Israel was used' {word_counts['israel']}")
    print(f"Number of times the speech was 'World War Three' : {word_counts['war']}")

    print("\nTop 15 most common words:")
    for word, count in word_counts.most_common(15):
        print(f"{word}: {count}")
    
    return word_counts


In [14]:
trump_words = analyze_speech('TRUMP')


Analysis for TRUMP:
Total words spoken (excluding stop words): 4163
Number of times 'economy' was mentioned: 3
Number of times 'jobs' was mentioned: 9
Number of times 'abortion' was mentioned: 3
Number of times 'climate' was mentioned: 0
Number of times 'golf' was mentioned: 1
Number of times 'border' was mentioned: 24
Number of times 'childcare' was mentioned: 0
Number of times 'child' was mentioned: 1
Number of times 'vaccine' was mentioned: 3
Number of times 'Russia' was mentioned: 15 versus the number of times Ukraine was used' 12
Number of times 'taxes' was mentioned: 7
Number of times Biden or Trump called something 'great': 18
Number of times Biden referred to Trump as 'Trump' 4 versus the number of times he referred to Trump as 'this guy' 10
Number of times the speech was 'inaudible' : 2
Number of times the speech was 'Hamas' : 5
Number of times the speech was 'Palestinian' : 2 versus the number of times Israel was used' 5
Number of times the speech was 'World War Three' : 11


In [15]:
biden_words = analyze_speech('BIDEN')


Analysis for BIDEN:
Total words spoken (excluding stop words): 3580
Number of times 'economy' was mentioned: 10
Number of times 'jobs' was mentioned: 13
Number of times 'abortion' was mentioned: 2
Number of times 'climate' was mentioned: 6
Number of times 'golf' was mentioned: 1
Number of times 'border' was mentioned: 9
Number of times 'childcare' was mentioned: 7
Number of times 'child' was mentioned: 6
Number of times 'vaccine' was mentioned: 0
Number of times 'Russia' was mentioned: 0 versus the number of times Ukraine was used' 8
Number of times 'taxes' was mentioned: 5
Number of times Biden or Trump called something 'great': 4
Number of times Biden referred to Trump as 'Trump' 7 versus the number of times he referred to Trump as 'this guy' 14
Number of times the speech was 'inaudible' : 3
Number of times the speech was 'Hamas' : 4
Number of times the speech was 'Palestinian' : 0 versus the number of times Israel was used' 6
Number of times the speech was 'World War Three' : 14

T

### Selected Words Dataframe

In [16]:
# Create DataFrame
df = pd.DataFrame({
    'TRUMP': trump_words,
    'BIDEN': biden_words
}).fillna(0).astype(int)

# Ensure all requested words are in the DataFrame
requested_words = ['economy', 'jobs', 'abortion', 'climate', 'golf', 'women', 'border', 
                   'childcare', 'vaccine', 'russia', 'taxes', 'great', 'guy', 'inaudible']
for word in requested_words:
    if word not in df.index:
        df.loc[word] = [0, 0]

# Sort DataFrame by total occurrences
df['Total'] = df['TRUMP'] + df['BIDEN']
df = df.sort_values('Total', ascending=False).drop('Total', axis=1)


In [17]:
requested_words = ['economy', 'jobs', 'abortion', 'climate', 'golf', 'healthcare', 'border', 
                   'childcare', 'vaccine', 'ukraine', 'russia', 'taxes', 'great', 'guy', 'inaudible', 'china']

df2 = pd.DataFrame({
    'TRUMP': trump_words,
    'BIDEN': biden_words
}).fillna(0).astype(int)

# only get the requested words
df2 = df2.loc[requested_words]
df2

# # save df2 as json
# import json
# with open('words.json', 'w') as f:
#     json.dump(df2.to_dict(), f)

Unnamed: 0,TRUMP,BIDEN
economy,3,10
jobs,9,13
abortion,3,2
climate,0,6
golf,1,1
healthcare,0,1
border,24,9
childcare,0,7
vaccine,3,0
ukraine,12,8


### Cleaning word count

In [2]:
# import json
import json
# open word-count.json
with open('word-count.json', 'r') as f:
    data = json.load(f)


In [3]:
data

{'columns': ['TRUMP', 'BIDEN'],
 'index': ['000',
  '1',
  '10',
  '100',
  '12',
  '13',
  '14',
  '15',
  '159',
  '16',
  '160',
  '170',
  '18',
  '19',
  '1990s',
  '2',
  '20',
  '200',
  '2010',
  '2035',
  '20th',
  '21',
  '225',
  '235',
  '24',
  '25',
  '28',
  '30',
  '35',
  '38',
  '39',
  '40',
  '400',
  '42',
  '44',
  '5',
  '50',
  '500',
  '51',
  '58',
  '6',
  '60',
  '6th',
  '70',
  '8',
  '800',
  '9',
  'abandoned',
  'ability',
  'able',
  'abortion',
  'about',
  'absolutely',
  'aca',
  'accept',
  'acceptable',
  'accepted',
  'accepting',
  'accidentally',
  'accord',
  'according',
  'accountable',
  'ace',
  'aced',
  'acknowledge',
  'acknowledged',
  'across',
  'act',
  'acts',
  'actually',
  'add',
  'addiction',
  'addition',
  'additional',
  'administration',
  'admired',
  'admits',
  'afghan',
  'afghanistan',
  'afraid',
  'after',
  'again',
  'against',
  'age',
  'agent',
  'agents',
  'ago',
  'agree',
  'agreed',
  'agreement',
  'agric

In [4]:
words_to_keep = [
    "abandoned", "ability", "abortion", "absolutely", "accept", "acceptable", "accountable", "acknowledge", "administration", "admired", "afghan", "afghanistan", "agriculture", "airlift", "alliances", "allies", "america", "american", "americans", "antisemitic", "appeal", "appoint", "approval", "argue", "ashamed", "asia", "asylum", "attack", "audience", "availability", "average", "babies", "baghdadi", "ballistic", "ballooning", "bankrupt", "beautiful", "believed", "beneficiary", "benefited", "bipartisan", "birth", "blacks", "bleach", "blew", "bloodbath", "bounceback", "bounced", "brain", "brandon", "build", "built", "businesses", "cabinet", "cages", "california", "campaign", "candidate", "capitol", "caravans", "career", "carolina", "caucus", "ceasefire", "celsius", "cemetery", "challenged", "championships", "charlottesville", "cheap", "cheapest", "childcare", "china", "chips", "circumstance", "citizens", "climate", "collapsed", "colleagues", "college", "colleges", "communities", "competent", "complainer", "computer", "conference", "congress", "conjecture", "consequence", "conservative", "conservatives", "constitutional", "contest", "contributors", "convicted", "cops", "corporate", "corps", "coverage", "covid", "created", "creating", "crime", "crimes", "criminal", "criticized", "crosstalk", "cuddles", "debacle", "debunk", "debunked", "decimated", "defeat", "deficit", "democracy", "democrat", "democrats", "denounce", "department", "deserve", "deserves", "destroy", "destroyed", "destroying", "devastated", "development", "disinformation", "dislocation", "documentary", "domes", "drained", "dream", "drug", "drugs", "dying", "economic", "economically", "economists", "economy", "education", "elected", "election", "eliminate", "eliminated", "embarrassment", "empire", "emptying", "endorse", "endorsed", "endorsing", "energy", "engaged", "enjoying", "enterprises", "environment", "environmental", "envy", "equip", "equipment", "equipped", "essentially", "establish", "europe", "european", "exaggerating", "exams", "exoneration", "exorbitant", "experts", "exploding", "extensive", "extreme", "facilities", "failing", "families", "fbi", "federal", "felon", "fentanyl", "filmmaker", "fire", "fired", "fires", "florida", "flowing", "foolishness", "force", "foreign", "forest", "forgive", "forgiven", "former", "founding", "founders", "france", "frankly", "fraud", "freefall", "friends", "funded", "funding", "funeral", "games", "garden", "general", "generals", "gentleman", "germany", "glioblastoma", "gorgeous", "gotaways", "government", "governor", "grants", "greed", "groceries", "grow", "growing", "growth", "guarantee", "handicap", "handled", "hbcus", "headaches", "healthcare", "heroes", "hezbollah", "historians", "historic", "historically", "history", "hitler", "horrible", "horribly", "horror", "hospitals", "hostage", "hostages", "hotels", "housing", "humanity", "hungary", "hurt", "hurting", "illegal", "illegally", "immaculate", "immigrant", "impact", "important", "imposed", "incompetence", "increase", "increased", "increasing", "increasingly", "incredible", "incredibly", "independent", "india", "indicted", "indictment", "indictments", "industries", "inflation", "inherited", "initiative", "inject", "innocent", "insane", "insignificant", "institutions", "insulin", "insurance", "intelligence", "invaded", "invented", "invest", "invested", "investment", "involved", "iran", "iraq", "israel", "israeli", "israelis", "jackson", "japan", "joking", "jong", "journal", "juries", "justice", "justices", "kansas", "killed", "killers", "killing", "laboratories", "laden", "laptop", "laureates", "leadership", "legislation", "liberal", "liberals", "liar", "literally", "loans", "location", "loser", "losers", "luxury", "lying", "machinery", "machines", "malfeasance", "manchurian", "mandate", "mandating", "manhattan", "manufacturing", "market", "materials", "medicaid", "medical", "medicare", "members", "mental", "merit", "migrant", "migrants", "military", "millionaires", "minnesota", "minority", "misinformation", "misrepresentation", "missile", "molesting", "morals", "mothers", "murdered", "nazis", "negotiate", "negotiations", "netanyahu", "nuclear", "nursing", "obliterated", "occupied", "occupying", "offense", "officers", "officials", "operations", "opponent", "opportunities", "opportunity", "opposed", "opposite", "organized", "outrageous", "overseas", "palestinian", "palestinians", "pandemic", "patriots", "patrons", "peaceful", "peacefully", "pelosi", "penalties", "pennsylvania", "pharmaceutical", "polluters", "pollution", "population", "portland", "powerful", "precursors", "predators", "pregnant", "prescription", "presidential", "prisons", "progressive", "proposing", "prosecutor", "protections", "provide", "provided", "providing", "pushing", "qaida", "quadrupled", "radical", "radicals", "ranking", "rape", "raped", "raping", "recession", "reduce", "reduced", "reelected", "reform", "refusing", "regulations", "release", "religious", "reputation", "required", "respect", "respected", "responsibility", "responsible", "restore", "retribution", "revenue", "rewarded", "rigged", "rioting", "ripoff", "ripped", "ripping", "ronald", "ronny", "safest", "salamani", "samsung", "scholar", "scholars", "scholarship", "scranton", "seattle", "segregation", "senator", "separated", "separating", "serious", "seriously", "shame", "singing", "sisters", "situation", "smart", "snapped", "solitary", "solvent", "soviet", "spouses", "spurred", "staff", "stake", "statistic", "statues", "statute", "stealing", "strengthen", "student", "stupid", "stupidly", "success", "successful", "sucker", "suckers", "supreme", "swamp", "swastikas", "tariff", "tariffs", "tech", "technology", "terrible", "terror", "terrorist", "terrorists", "texas", "therapeutics", "thousands", "torches", "trafficking", "tremendous", "tremendously", "trillion", "trillionaires", "trillions", "tripled", "troops", "trusts", "tuition", "ukraine", "ukrainian", "unacceptable", "unbelievable", "uncivilized", "unemployment", "unfortunately", "union", "universities", "unsafe", "unselect", "ushered", "vaccine", "veterans", "veto", "virginia", "voluntarily", "volunteerism", "voters", "voting", "wages", "wealthy", "weaponization", "weapons", "whiner", "wonderful", "worried", "worse", "worst", "zelenskyy", "zones"
]


In [5]:
# Filter the data to keep only the requested words
filtered_data = {
    "columns": data["columns"],
    "index": [word for word in data["index"] if word in words_to_keep],
    "data": [row for i, row in enumerate(data["data"]) if data["index"][i] in words_to_keep]
}

# Create a DataFrame from the filtered data
df_filtered = pd.DataFrame(filtered_data["data"], columns=filtered_data["columns"], index=filtered_data["index"])

# Reset the index if needed
df_filtered = df_filtered.reset_index()

# Rename the index column if needed
df_filtered = df_filtered.rename(columns={'index': 'word'})

# Display the first few rows of the filtered DataFrame
print(df_filtered.tail())

NameError: name 'pd' is not defined

In [None]:
# Save the filtered DataFrame as a JSON file

# df_filtered.to_json('filtered_word_count.json', orient='split')
# drop index


## Key Mentions

In [7]:
# import filtered_word_count.json
import json
with open('filtered_word_count.json', 'r') as f:
    data = json.load(f)

# Create a DataFrame from the filtered data
df_filtered = pd.DataFrame(data["data"], columns=data["columns"])
df_filtered.head()


NameError: name 'pd' is not defined

In [71]:
# get TRUMP sort descending
df_sorted = df_filtered.sort_values('TRUMP', ascending=False)
df_sorted.head(20)

# Biden blacks, economy, jobs, border, tax, ukraine, inflation, childcare, woman 

Unnamed: 0,word,TRUMP,BIDEN
43,border,24,9
206,history,23,15
299,money,23,5
197,great,18,4
456,worst,16,6
373,Russia,15,0
430,Ukraine,12,8
171,fired,12,2
450,war,11,14
70,China,11,2


In [80]:
# create a dataset:
top_words = {'TRUMP':{"border": 24, "money": 23, "Russia": 15, "Ukraine": 12, "war": 11, "China": 11, "tax": 9, "jobs": 9, "inflation": 8, "veterans":8},'BIDEN':{"war": 14, "jobs": 13, "economy": 10, "tax": 9, "border": 9, "Ukraine": 8, "inflation": 7, "childcare": 7, "veterans": 6, "climate": 6}}

# create as dataframe
top_words = pd.DataFrame(top_words)
top_words

# save as csv



Unnamed: 0,TRUMP,BIDEN
border,24.0,9.0
money,23.0,
Russia,15.0,
Ukraine,12.0,8.0
war,11.0,14.0
China,11.0,
tax,9.0,9.0
jobs,9.0,13.0
inflation,8.0,7.0
economy,,10.0


In [39]:
# import word-count.json
import json
with open('word-count.json', 'r') as f:
    data2 = json.load(f)

# Create a DataFrame from the filtered data
df2 = pd.DataFrame(data2["data"], columns=data2["columns"], index=data2["index"])
df2.head()


Unnamed: 0,TRUMP,BIDEN
0,4,18
1,2,4
10,5,8
100,5,2
12,1,0


In [63]:
# get TRUMP sort descending
df_sorted2 = df2.sort_values('TRUMP', ascending=False)
# words said more than 10 times
df_sorted2


# Trump border, money, great, russia, ukraine, war,china, security, tax, jobs, inflation
# Biden blacks, economy, jobs, border, tax, ukraine, inflation, childcare, woman 


Unnamed: 0,TRUMP,BIDEN
people,71,38
because,66,27
our,65,19
all,59,45
country,46,14
going,45,42
him,34,12
up,34,10
never,32,5
about,32,29
