In [6]:
import pandas as pd
import re

from nltk.corpus import stopwords
from string import punctuation

from collections import Counter, defaultdict

import glob
from collections import Counter

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /Users/kevinbaum/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


We start by accessing the five files that were pulled during the week on Monday 02/12/2024 and combining them into a DataFrame.

In [7]:
# File paths
files = ['Data/news_0212.csv', 'Data/news_0213.csv', 
         'Data/news_0214.csv', 'Data/news_0215.csv',
        'Data/news_0216.csv']  

# Read each CSV file into a DataFrame and store them in a list
file_dfs = [pd.read_csv(file) for file in files]

# Concatenate all DataFrames into one
combined_df = pd.concat(file_dfs, ignore_index=True)
combined_df


Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,CNN — Chairman of the Joint Chiefs of Staff Ge...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,CNN — Former President Donald Trump has endors...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,Washington CNN — President Joe Biden and King ...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,CNN — Former President Donald Trump on Monday ...
...,...,...,...
348,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
349,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
350,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
351,foxnews,https://www.foxnews.com/politics/democrats-win...,close Video Dems flipping NY House seat threat...


Before proceeding, we remove the duplicates in the DataFrame.

In [8]:
df = combined_df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,CNN — Chairman of the Joint Chiefs of Staff Ge...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,CNN — Former President Donald Trump has endors...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,Washington CNN — President Joe Biden and King ...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,CNN — Former President Donald Trump on Monday ...
...,...,...,...
235,foxnews,https://www.foxnews.com/politics/house-republi...,close Video Rep. Ronny Jackson demands Biden t...
236,foxnews,https://www.foxnews.com/politics/gop-senators-...,close Video Biden and the Democrats just do no...
237,foxnews,https://www.foxnews.com/politics/doj-defends-s...,close Video Former US attorney discusses Speci...
238,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...


### Checking the Results of the Web Scraping

Confirming that the CNN content was scraped successfully

In [9]:
df[df['source']=='cnn'].head()

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,CNN — Chairman of the Joint Chiefs of Staff Ge...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,CNN — Former President Donald Trump has endors...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,Washington CNN — President Joe Biden and King ...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,CNN — Former President Donald Trump on Monday ...


Confirming that the Fox News content was scraped successfully

In [10]:
df[df['source']=='foxnews'].head()

Unnamed: 0,source,url,content
47,foxnews,https://www.foxnews.com/politics/biden-takes-j...,close Video Biden takes jab at special counsel...
48,foxnews,https://www.foxnews.com/politics/rfk-jr-apolog...,close Video RFK Jr. drops surprise campaign ad...
49,foxnews,https://www.foxnews.com/politics/bidens-upcomi...,close Video Biden won't take cognitive test in...
50,foxnews,https://www.foxnews.com/politics/kamala-harris...,close Video Marc Thiessen questions whether Bi...
51,foxnews,https://www.foxnews.com/politics/climate-activ...,close Video Biden’s export suspension on lique...


Let's take a full look at one of the rows for both CNN and Fox to see if there are any obvious steps that stand out that we want to clean up.

In [11]:
# Set pandas to display the full content of a column
# We will do this only temporarily and reset it after
# testing one column
pd.set_option('display.max_colwidth', None)

In [12]:
# Display one row from CNN
print("CNN Article Content:")
print(df[df['source'] == 'cnn'].iloc[0])

CNN Article Content:
source                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [13]:
# Display one row from CNN
print("Fox Article Content:")
print(df[df['source'] == 'foxnews'].iloc[0])

Fox Article Content:
source                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [14]:
# Reset the columns so that we go back to truncating the "content" column
pd.reset_option('display.max_colwidth')

## Data Cleaning, Tokenizing, and Normalizing

### Removing Unwanted Prefixes

We see from looking at the first 5 rows of the CNN and Fox records that the content of the articles starts with "CNN --" or "(city name) CNN" for CNN and "close Video" for Fox. Since this is noise in our attempt to topic model, we will remove this part of the content body using the function below.

In [15]:
# Function to remove prefixes

def remove_prefix(row):
    # Pattern to match "CNN —" if it includes a city's name before it
    cnn_pattern = r'^(?:[\w\s]+\s)?CNN — '
    
    # For CNN, remove pattern if it matches
    if row['source'] == 'cnn':
        return re.sub(cnn_pattern, '', row['content'])
    
    # For Fox News, remove "close Video " prefix
    elif row['source'] == 'foxnews' and row['content'].startswith('close Video '):
        return row['content'][12:]
    
    # Return original content if no prefix to remove
    else:
        return row['content']

df['content'] = df.apply(remove_prefix, axis=1)

In [16]:
# Checking to see how CNN looks without the prefixes
df[df['source']=='cnn']

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,Chairman of the Joint Chiefs of Staff Gen. CQ ...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Former President Donald Trump has endorsed Nor...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,President Joe Biden and King Abdullah II of Jo...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Former President Donald Trump on Monday asked ...
...,...,...,...
220,cnn,https://www.cnn.com/2024/02/15/politics/navy-f...,Members of Congress pressed the CEO of the nat...
221,cnn,https://www.cnn.com/2024/02/16/politics/kamala...,US Vice President Kamala Harris on Friday call...
222,cnn,https://www.cnn.com/2024/01/30/politics/trump-...,New York state Judge Arthur Engoron has the fu...
223,cnn,https://www.cnn.com/2024/02/15/politics/border...,The acting deputy chief of the US Border Patro...


In [17]:
# Checking to see how Fox looks without the prefixes
df[df['source']=='foxnews']

Unnamed: 0,source,url,content
47,foxnews,https://www.foxnews.com/politics/biden-takes-j...,Biden takes jab at special counsel report with...
48,foxnews,https://www.foxnews.com/politics/rfk-jr-apolog...,RFK Jr. drops surprise campaign ad during Supe...
49,foxnews,https://www.foxnews.com/politics/bidens-upcomi...,Biden won't take cognitive test in physical ex...
50,foxnews,https://www.foxnews.com/politics/kamala-harris...,Marc Thiessen questions whether Biden is capab...
51,foxnews,https://www.foxnews.com/politics/climate-activ...,Biden’s export suspension on liquefied natural...
...,...,...,...
235,foxnews,https://www.foxnews.com/politics/house-republi...,Rep. Ronny Jackson demands Biden take cognitiv...
236,foxnews,https://www.foxnews.com/politics/gop-senators-...,Biden and the Democrats just do not care: Sen....
237,foxnews,https://www.foxnews.com/politics/doj-defends-s...,Former US attorney discusses Special Counsel H...
238,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...


### Remove Unwanted First Sentences

We see that some of the CNN articles begin with the following sentences: "A version of this story appeared in CNN’s What Matters newsletter. To get it in your inbox, sign up for free here." We also see that some of the Fox articles begin with the phrase "Welcome to Fox News" in the first sentence. To remove this noise, we will write a function below that handles it. We need to run this function twice in order to fully clean out the noise. Also, once we run the function twice, we need to re-run the "remove_prefix" function again as the prefixes will be present after removing some of the unwanted first sentences. 

In [18]:
def remove_first_sentence(row):
    # Split the content into sentences based on '.', '?', and '!'
    sentences = re.split(r'(?<=[.!?]) +', row['content'])
    
    # Initialize updated_content with the original content in case none of the conditions apply
    updated_content = row['content']
    
    if len(sentences) > 1:  # Check if there's more than one sentence
        first_sentence = sentences[0]  # Get the first sentence
        
        if 'Welcome to Fox News' in first_sentence:
            # Join all sentences except the first one. We start at 2 because the word "D.C." as in "Washington D.C."
            updated_content = ' '.join(sentences[2:])
            
        elif 'A version of this story appeared' in first_sentence:
            # Removing 2 sentences since CNN includes 2 unwanted sentences in this instance.
            updated_content = ' '.join(sentences[2:])
            
        elif first_sentence.strip().startswith("What's Happening?"):
            # Directly check if the first sentence is exactly "What's Happening?" and remove it if so
            updated_content = ' '.join(sentences[1:])

    # Remove sentences containing the phrase 'CLICK HERE TO GET THE FOX NEWS APP'
    updated_sentences = [sentence for sentence in sentences if 'FOX NEWS APP' not in sentence]
    updated_sentences = [sentence for sentence in sentences if 'Foxnews.com' not in sentence]
    updated_sentences = [sentence for sentence in sentences if 'Getty Images' not in sentence]
    updated_sentences = [sentence for sentence in sentences if 'CLICK HERE TO GET THE FOX NEWS APP' not in sentence]

    # Join the updated sentences back into content
    updated_content = ' '.join(updated_sentences)
            
    return updated_content

# Run first iteration of removing the first sentence
df['content'] = df.apply(remove_first_sentence, axis=1)

# Run second iteration to remove  additional noise on some of the rows
df['content'] = df.apply(remove_first_sentence, axis=1)

# Remove prefixes again after the unwanted sentences are removed
df['content'] = df.apply(remove_prefix, axis=1)

Let's check to see how our content looks now without the unwanted first and second sentences found in some of the articles.

In [19]:
# Checking to see how CNN looks without the unwanted first sentences.
df[df['source']=='cnn']

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,Chairman of the Joint Chiefs of Staff Gen. CQ ...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Former President Donald Trump has endorsed Nor...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,President Joe Biden and King Abdullah II of Jo...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Former President Donald Trump on Monday asked ...
...,...,...,...
220,cnn,https://www.cnn.com/2024/02/15/politics/navy-f...,Members of Congress pressed the CEO of the nat...
221,cnn,https://www.cnn.com/2024/02/16/politics/kamala...,US Vice President Kamala Harris on Friday call...
222,cnn,https://www.cnn.com/2024/01/30/politics/trump-...,New York state Judge Arthur Engoron has the fu...
223,cnn,https://www.cnn.com/2024/02/15/politics/border...,The acting deputy chief of the US Border Patro...


In [20]:
# Checking to see how Fox looks without the unwanted first sentences.
df[df['source']=='foxnews']

Unnamed: 0,source,url,content
47,foxnews,https://www.foxnews.com/politics/biden-takes-j...,Biden takes jab at special counsel report with...
48,foxnews,https://www.foxnews.com/politics/rfk-jr-apolog...,RFK Jr. drops surprise campaign ad during Supe...
49,foxnews,https://www.foxnews.com/politics/bidens-upcomi...,Biden won't take cognitive test in physical ex...
50,foxnews,https://www.foxnews.com/politics/kamala-harris...,Marc Thiessen questions whether Biden is capab...
51,foxnews,https://www.foxnews.com/politics/climate-activ...,Biden’s export suspension on liquefied natural...
...,...,...,...
235,foxnews,https://www.foxnews.com/politics/house-republi...,Rep. Ronny Jackson demands Biden take cognitiv...
236,foxnews,https://www.foxnews.com/politics/gop-senators-...,Biden and the Democrats just do not care: Sen....
237,foxnews,https://www.foxnews.com/politics/doj-defends-s...,Former US attorney discusses Special Counsel H...
238,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...


Let's quickly remove references to images embedded into the body content, as it is also noise not needed for topic modeling.

In [21]:
# Remove image info

# Define a regular expression pattern to match content inside parentheses
pattern = r'\s*\([^)]*\)'

# Replace content inside parentheses with an empty string
df['content'] = df['content'].str.replace(pattern, '', regex=True)

Next, we look at the end of the articles as the content will often end with contributing author information or other material that is not relevant to the topic of the body content. We show the dataframe ending previews and then write a function to remove last sentences if they contain information that is not relevant. 

In [22]:
# Set pandas to display the full content of a column
pd.set_option('display.max_colwidth', None)

In [23]:
# Create a new column 'content_end_preview' to show the last part of the content
df['content_end_preview'] = df['content'].apply(lambda x: x[-500:])

In [24]:
# Checking the end of CNN articles
df[['source', 'url', 'content_end_preview']] [df['source'] == 'cnn']

Unnamed: 0,source,url,content_end_preview
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-brown-nato-trump/index.html,"eir objectives,” the chairman said. “At the same time, not looking for a broader conflict with the United States.” There have been at least 170 attacks on US and coalition forces in Iraq, Syria and Jordan since October 17. The Pentagon said Monday that those attacks have resulted in 186 wounded or killed in action — including 130 traumatic brain injuries. Three US soldiers were killed in a drone attack in January on a US outpost in Jordan. This story has been updated with additional information."
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-endorse-michael-whatley-lara-trump-rnc/index.html,"great job in his home state of North Carolina, and is committed to election integrity, which we must have to keep fraud out of our election so it can’t be stolen,” Trump said in a statement. “My very talented daughter-in-law, Lara Trump, has agreed to run as the RNC Co-Chair. Lara is an extremely talented communicator and is dedicated to all that MAGA stands for. She has told me she wants to accept this challenge and would be GREAT!” he also said. This is a developing story and will be updated."
2,cnn,https://www.cnn.com/2024/02/12/politics/senate-foreign-aid-bill-ukraine/index.html,"y be part of the bill, but went on to reject the bipartisan deal amid forceful attacks on the measure by Trump and top House Republicans. Over the weekend, Trump also wrote on Truth Social that the US should stop providing foreign aid unless it is structured as a loan, another sign of the political pressure Republicans continue to face amid efforts to send funding to US allies. This story and headline have been updated with additional developments. CNN’s Kate Sullivan contributed to this report."
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens-meeting-with-jordanian-king-comes-at-flashpoint-in-israel-hamas-war/index.html,"ions toward an agreement would continue despite the Israeli prime minister’s comments, which Blinken said were referencing the “absolute non-starters” in the proposal. The full Hamas response proposes three phases, each lasting 45 days, including the withdrawal of Israeli troops from Gaza, a massive humanitarian effort, and freedom of movement for people throughout Gaza, according to a copy obtained by CNN. CNN’s MJ Lee, Priscilla Alvarez, Betsy Klein and Kevin Liptak contributed to this report."
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-supreme-court-immunity-filing/index.html,"nist ban.” The court may have to decide how it wants to handle the former president’s immunity claim at the same time it is drafting an opinion in the ballot case. Together, the cases have thrust the court into the middle of this year’s presidential election in a way it has largely managed to avoid since its decision in Bush v. Gore effectively decided the 2000 election between former President George W. Bush and former Vice President Al Gore. This story has been updated with additional details."
...,...,...,...
220,cnn,https://www.cnn.com/2024/02/15/politics/navy-federal-congressional-black-caucus/index.html,d a separate analysis of public mortgage data by Senate banking committee staff that also found racial disparities in its lending. Navy Federal is also facing a federal class-action lawsuit from mortgage applicants who cite CNN’s reporting and allege that the credit union discriminated against them. A judge approved a motion to consolidate three separate lawsuits against the credit union into a single case last month. Editor’s Note: This story was update to include a statement from Navy Federal.
221,cnn,https://www.cnn.com/2024/02/16/politics/kamala-harris-trump-nato/index.html,"NATO allies and abandoned our treaty commitments. Imagine if we went easy on Putin. Let alone encouraged him,” Harris said. “History offers a clue. If we stand by while an aggressor invades its neighbor with impunity, they will keep going. In the case of Putin, that means all of Europe would be threatened. If we fail to impose severe consequences on Russia, other authoritarians across the globe would be emboldened,” Harris said. This story has been updated with additional developments on Friday."
222,cnn,https://www.cnn.com/2024/01/30/politics/trump-fraud-trial-verdict-what-to-watch-for/index.html,"oron’s law clerk, leading to the gag order. The judge later extended the order to include Trump’s attorneys from commenting on the judge’s private communications with his law clerk. The order does not limit public criticism of Engoron, the district attorney or other parts of the case. As the trial neared an end in December, Engoron reminisced. “In a strange way, I’m gonna miss this trial,” the judge said. “It’s been an experience.” This story has been updated with the anticipated verdict timing."
223,cnn,https://www.cnn.com/2024/02/15/politics/border-patrol-official-suspended-alleged-misconduct/index.html,"or off duty. Federal privacy laws prohibit discussion of individual cases.” The Washington Post first reported Martinez’s suspension. Martinez, a 31-year Border Patrol veteran, has also served as the chief patrol agent of the Laredo Sector and deputy chief patrol agent of the Rio Grande Valley Sector, according to the CBP . The US Border Patrol is a law enforcement entity under the umbrella of CBP, in the Department of Homeland Security. CNN’s Piper Hudspeth Blackburn contributed to this report."


In [25]:
# Checking the end of Fox News articles
df[['source', 'url', 'content_end_preview']] [df['source'] == 'foxnews']

Unnamed: 0,source,url,content_end_preview
47,foxnews,https://www.foxnews.com/politics/biden-takes-jab-hur-report-joke-memory-returns-speech-one-more-thing-forgot,"ogress."" CLICK TO GET THE FOX NEWS APP ""The recent Washington Post headline summed it up,"" Biden added, quoting the newspaper's story titled, ""Falling Inflation and Rising Growth Give the U.S. the World's Best Recovery."" ""The world's best recovery!"" Biden said. ""It's because you implemented what we did. You made it work."" Danielle Wallace is a reporter for Fox News Digital covering politics, crime, police and more. Story tips can be sent to danielle.wallace@fox.com and on Twitter: @danimwallace."
48,foxnews,https://www.foxnews.com/politics/rfk-jr-apologizes-family-super-bowl-ad-claims-no-involvement,"icks to stop him. The public sees through it all and won’t stand for it."" Kennedy initially sought to challenge President Biden in the 2024 Democratic presidential primary, but the DNC said it would not hold primary debates and stood behind the incumbent president. Fox News' Bradford Betz contributed to this report. Anders Hagstrom is a reporter with Fox News Digital covering national politics and major breaking news events. Send tips to Anders.Hagstrom@Fox.com, or on Twitter: @Hagstrom_Anders."
49,foxnews,https://www.foxnews.com/politics/bidens-upcoming-physical-exam-will-not-include-cognitive-test-white-house-says,"has been my experience with this president,"" she said. Biden's age is a major concern among U.S. voters, 86% of whom say he is too old to serve a second term, according to an ABC poll. A Sunday poll from ABC/Ipsos found that 86% of Americans believe Biden is too old to serve another term, including 73% of Democrats. Anders Hagstrom is a reporter with Fox News Digital covering national politics and major breaking news events. Send tips to Anders.Hagstrom@Fox.com, or on Twitter: @Hagstrom_Anders."
50,foxnews,https://www.foxnews.com/politics/kamala-harris-ready-serve-democrats-sound-alarm-about-bidens-age,"s crying and wet the bed,"" Begala quipped on CNN last Friday. ""This is terrible for Democrats. And anybody with a functioning brain knows that,"" he declared. GOP CAMPAIGN ARM LAUNCHES MEDIA BLITZ AGAINST DEMS WHO OPPOSED VIOLENT CRIME BILL AS CRISIS IN DC SPIRALS Then-Democrat presidential candidate Hillary Clinton makes a concession speech after being defeated by Donald Trump in New York on November 9, 2016. Brandon Gillespie is an associate editor at Fox News. Follow him on X at @BGillespieAL."
51,foxnews,https://www.foxnews.com/politics/climate-activists-arrested-shutting-down-biden-campaign-hq,"power plant electricity generation, push electric vehicles and incentivize the electrification of the residential sector. ""I mean, it literally is the existential threat. It’s even more consequential than nuclear power, nuclear war,"" he added. ""That would be horrible and awful, and it would just make the environment incredibly worse. But it’s about the environment."" The Biden campaign didn't immediately respond to a request for comment. Thomas Catenacci is a politics writer for Fox News Digital."
...,...,...,...
235,foxnews,https://www.foxnews.com/politics/house-republicans-push-biden-take-cognitive-test-hur-report-obvious-mental-decline,". Nick"" when reached by Fox News Digital on Friday morning. Earlier this week, White House press secretary Karine Jean-Pierre told reporters that Biden would not be taking a cognitive test as part of his regular physical exam. Elizabeth Elkind is a reporter for Fox News Digital focused on Congress as well as the intersection of Artificial Intelligence and politics. Previous digital bylines seen at Daily Mail and CBS News. Follow on Twitter at @liz_elkind and send tips to elizabeth.elkind@fox.com"
236,foxnews,https://www.foxnews.com/politics/gop-senators-urge-biden-admin-to-end-racial-discrimination-policy-in-chips-grants-before-it-breaks-the-law,"es to discriminate on the basis of race when making and enforcing contracts."" Sen. Cynthia Lummis, R-Wyo., conducts a news conference Aug. 9. 29. If she fails to rescind the policy, Cruz and his colleagues are demanding that she detail ""the reasons you believe the Guidance does not violate the United States Constitution or Title VI, or induce private parties to violate Section 1981."" Brooke Singman is a political correspondent and reporter for Fox News Digital, Fox News Channel and FOX Business."
237,foxnews,https://www.foxnews.com/politics/doj-defends-special-counsel-report-bidens-memory-consistent-legal-requirement-not-gratuitous,"later revealed to Fox News that it was Biden who brought up his son's 2015 death-not Hur. ""We conclude that no criminal charges are warranted in this matter,"" the report, released Thursday, states. Fox News' David Spunt and Jake Gibson contributed to this report. Sarah Rumpf-Whitten is a breaking news writer for Fox News Digital and Fox Business. She is a native of Massachusetts and is based in Orlando, Florida. Story tips and ideas can be sent to sarah.rumpf@fox.com and on X: @s_rumpfwhitten ."
238,foxnews,https://www.foxnews.com/politics/fox-news-politics-trump-vows-appeal,"run for president, serve as Manchin's VP ...Read more 'COMMONSENSE CONSERVATIVE': Former special forces soldier lands big endorsement in race to flip House seat ...Read more 'RACE OF HIS LIFE' : Dem Sen blasts GOP for not caring about immigration; record comes back to haunt him ...Read more Subscribe now to get Fox News Politics newsletter in your inbox. Get the latest updates from the 2024 campaign trail, exclusive interviews and more on FoxNews.com . This article was written by Fox News staff."


We see that indeed some articles end with information about the authors or otherwise irrelevant information. Below is our function to handle some of the instances. We need to run it multiple times, as each time it is run there is a new last sentence that counts as noise that we want to get rid of in some instances.

In [26]:
def remove_last_sentence(row):
    sentences = row['content'].split('. ')
    
    if len(sentences) > 1:  # Check if there's more than one sentence
        last_sentence = sentences[-1]  # Get the last sentence
        
        if ('This story has been updated with additional information.' in last_sentence or
            'contributed to this' in last_sentence or
            'will be updated' in last_sentence or
            'have been updated' in last_sentence or
            'APP Fox News' in last_sentence or
            'Fox News' in last_sentence or
            'FoxNews.com' in last_sentence or
            '@Fox.com' in last_sentence or
            'Fox News Digital' in last_sentence or
            'Fox News Channel and FOX Business' in last_sentence or

            'This story has been updated with additional reaction' in last_sentence or
            'This report has been updated with additional information' in last_sentence or
            'who covers politics' in last_sentence or
            'follow him on' in last_sentence or
            'Follow him on' in last_sentence or
            '@fox.com' in last_sentence or
            '@Fox.com' in last_sentence or
            'FoxNews.com' in last_sentence or
            'Fox News Digital' in last_sentence or
            'contributed to this' in last_sentence or
            'Politics newsletter' in last_sentence or
            'Fox News Digital' in last_sentence or
            'email' in last_sentence):
                
            updated_content = '. '.join(sentences[:-1])  # Join all sentences except the last one
            return updated_content
    
    return row['content']

# Apply the function to the DataFrame four times
df['content'] = df.apply(remove_last_sentence, axis=1)
df['content'] = df.apply(remove_last_sentence, axis=1)
df['content'] = df.apply(remove_last_sentence, axis=1)
df['content'] = df.apply(remove_last_sentence, axis=1)
df['content_end_preview'] = df['content'].apply(lambda x: x[-500:])

In [27]:
# Checking the end of Fox News articles after we run our function
df[['source', 'url', 'content_end_preview']] [df['source'] == 'foxnews']

Unnamed: 0,source,url,content_end_preview
47,foxnews,https://www.foxnews.com/politics/biden-takes-jab-hur-report-joke-memory-returns-speech-one-more-thing-forgot,"said, referring to companies charging the same amount for a product while reducing quantity. ""I'm calling on corporations to pass their savings on to consumers, for God's sake. We're making real progress."" CLICK TO GET THE FOX NEWS APP ""The recent Washington Post headline summed it up,"" Biden added, quoting the newspaper's story titled, ""Falling Inflation and Rising Growth Give the U.S. the World's Best Recovery."" ""The world's best recovery!"" Biden said. ""It's because you implemented what we did"
48,foxnews,https://www.foxnews.com/politics/rfk-jr-apologizes-family-super-bowl-ad-claims-no-involvement,"ever wars, and chronic disease. RFK Jr. offers us real change along with freedom, trust and hope. Like his uncle and his father, Kennedy is a corruption fighter, and it's no wonder the DNC is trying every old trick and inventing new tricks to stop him. The public sees through it all and won’t stand for it."" Kennedy initially sought to challenge President Biden in the 2024 Democratic presidential primary, but the DNC said it would not hold primary debates and stood behind the incumbent president"
49,foxnews,https://www.foxnews.com/politics/bidens-upcoming-physical-exam-will-not-include-cognitive-test-white-house-says,"and continues to find him to be ""sharp"" and ""on top of things."" ""When we have meetings with him and his staff he is constantly pushing us, trying to get more information, and so that has been my experience with this president,"" she said. Biden's age is a major concern among U.S. voters, 86% of whom say he is too old to serve a second term, according to an ABC poll. A Sunday poll from ABC/Ipsos found that 86% of Americans believe Biden is too old to serve another term, including 73% of Democrats"
50,foxnews,https://www.foxnews.com/politics/kamala-harris-ready-serve-democrats-sound-alarm-about-bidens-age,"Look, I’m a Biden supporter, and I slept like a baby last night: I woke up every two hours crying and wet the bed,"" Begala quipped on CNN last Friday. ""This is terrible for Democrats. And anybody with a functioning brain knows that,"" he declared. GOP CAMPAIGN ARM LAUNCHES MEDIA BLITZ AGAINST DEMS WHO OPPOSED VIOLENT CRIME BILL AS CRISIS IN DC SPIRALS Then-Democrat presidential candidate Hillary Clinton makes a concession speech after being defeated by Donald Trump in New York on November 9, 2016"
51,foxnews,https://www.foxnews.com/politics/climate-activists-arrested-shutting-down-biden-campaign-hq,"n onslaught of environmental regulations to curb fossil fuel power plant electricity generation, push electric vehicles and incentivize the electrification of the residential sector. ""I mean, it literally is the existential threat. It’s even more consequential than nuclear power, nuclear war,"" he added. ""That would be horrible and awful, and it would just make the environment incredibly worse. But it’s about the environment."" The Biden campaign didn't immediately respond to a request for comment"
...,...,...,...
235,foxnews,https://www.foxnews.com/politics/house-republicans-push-biden-take-cognitive-test-hur-report-obvious-mental-decline,"al practices and maiming patients. The White House sent another image of ""Dr. Nick"" when reached by Fox News Digital on Friday morning. Earlier this week, White House press secretary Karine Jean-Pierre told reporters that Biden would not be taking a cognitive test as part of his regular physical exam. Elizabeth Elkind is a reporter for Fox News Digital focused on Congress as well as the intersection of Artificial Intelligence and politics. Previous digital bylines seen at Daily Mail and CBS News"
236,foxnews,https://www.foxnews.com/politics/gop-senators-urge-biden-admin-to-end-racial-discrimination-policy-in-chips-grants-before-it-breaks-the-law,"race of their suppliers. Title VI forbids such discrimination,"" they wrote. ""In addition to instructing the federal government to violate the law, the Guidance also encourages private businesses to discriminate on the basis of race in violation of federal law, specifically Section 1981,"" they continued. ""Section 1981 makes it illegal for private companies to discriminate on the basis of race when making and enforcing contracts."" Sen. Cynthia Lummis, R-Wyo., conducts a news conference Aug. 9. 29"
237,foxnews,https://www.foxnews.com/politics/doj-defends-special-counsel-report-bidens-memory-consistent-legal-requirement-not-gratuitous,"tten notes while eating breakfast alongside senators on January 29, 2015. Two sources later revealed to Fox News that it was Biden who brought up his son's 2015 death-not Hur. ""We conclude that no criminal charges are warranted in this matter,"" the report, released Thursday, states. Fox News' David Spunt and Jake Gibson contributed to this report. Sarah Rumpf-Whitten is a breaking news writer for Fox News Digital and Fox Business. She is a native of Massachusetts and is based in Orlando, Florida"
238,foxnews,https://www.foxnews.com/politics/fox-news-politics-trump-vows-appeal,"he former president inflated his assets and committed fraud. Trump spoke exclusively to Fox News Digital shortly after Engoron's ruling was made public Friday afternoon. ""A crooked New York judge working with the very corrupt attorney general of New York State, who ran on the basis of ‘I will get trump’ before knowing me — before even knowing anything about me — just ruled that I have to pay a fine of $355 million based on absolutely nothing,"" Trump told Fox News Digital. ""No victims. No damages"


In [28]:
# Checking the end of CNN articles after we run our function
df[['source', 'url', 'content_end_preview']] [df['source'] == 'cnn']

Unnamed: 0,source,url,content_end_preview
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-brown-nato-trump/index.html,"r militia groups and others to put pressure, to achieve their objectives,” the chairman said. “At the same time, not looking for a broader conflict with the United States.” There have been at least 170 attacks on US and coalition forces in Iraq, Syria and Jordan since October 17. The Pentagon said Monday that those attacks have resulted in 186 wounded or killed in action — including 130 traumatic brain injuries. Three US soldiers were killed in a drone attack in January on a US outpost in Jordan"
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-endorse-michael-whatley-lara-trump-rnc/index.html,"h﻿as been with me from the beginning, has done a great job in his home state of North Carolina, and is committed to election integrity, which we must have to keep fraud out of our election so it can’t be stolen,” Trump said in a statement. “My very talented daughter-in-law, Lara Trump, has agreed to run as the RNC Co-Chair. Lara is an extremely talented communicator and is dedicated to all that MAGA stands for. She has told me she wants to accept this challenge and would be GREAT!” he also said"
2,cnn,https://www.cnn.com/2024/02/12/politics/senate-foreign-aid-bill-ukraine/index.html,"would have combined the foreign aid with a bipartisan border deal. Republicans had initially demanded that border security be part of the bill, but went on to reject the bipartisan deal amid forceful attacks on the measure by Trump and top House Republicans. Over the weekend, Trump also wrote on Truth Social that the US should stop providing foreign aid unless it is structured as a loan, another sign of the political pressure Republicans continue to face amid efforts to send funding to US allies"
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens-meeting-with-jordanian-king-comes-at-flashpoint-in-israel-hamas-war/index.html,"stage deal in Gaza “delusional.” Secretary of State Antony Blinken previously said negotiations toward an agreement would continue despite the Israeli prime minister’s comments, which Blinken said were referencing the “absolute non-starters” in the proposal. The full Hamas response proposes three phases, each lasting 45 days, including the withdrawal of Israeli troops from Gaza, a massive humanitarian effort, and freedom of movement for people throughout Gaza, according to a copy obtained by CNN"
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-supreme-court-immunity-filing/index.html,"nist ban.” The court may have to decide how it wants to handle the former president’s immunity claim at the same time it is drafting an opinion in the ballot case. Together, the cases have thrust the court into the middle of this year’s presidential election in a way it has largely managed to avoid since its decision in Bush v. Gore effectively decided the 2000 election between former President George W. Bush and former Vice President Al Gore. This story has been updated with additional details."
...,...,...,...
220,cnn,https://www.cnn.com/2024/02/15/politics/navy-federal-congressional-black-caucus/index.html,d a separate analysis of public mortgage data by Senate banking committee staff that also found racial disparities in its lending. Navy Federal is also facing a federal class-action lawsuit from mortgage applicants who cite CNN’s reporting and allege that the credit union discriminated against them. A judge approved a motion to consolidate three separate lawsuits against the credit union into a single case last month. Editor’s Note: This story was update to include a statement from Navy Federal.
221,cnn,https://www.cnn.com/2024/02/16/politics/kamala-harris-trump-nato/index.html,"NATO allies and abandoned our treaty commitments. Imagine if we went easy on Putin. Let alone encouraged him,” Harris said. “History offers a clue. If we stand by while an aggressor invades its neighbor with impunity, they will keep going. In the case of Putin, that means all of Europe would be threatened. If we fail to impose severe consequences on Russia, other authoritarians across the globe would be emboldened,” Harris said. This story has been updated with additional developments on Friday."
222,cnn,https://www.cnn.com/2024/01/30/politics/trump-fraud-trial-verdict-what-to-watch-for/index.html,"oron’s law clerk, leading to the gag order. The judge later extended the order to include Trump’s attorneys from commenting on the judge’s private communications with his law clerk. The order does not limit public criticism of Engoron, the district attorney or other parts of the case. As the trial neared an end in December, Engoron reminisced. “In a strange way, I’m gonna miss this trial,” the judge said. “It’s been an experience.” This story has been updated with the anticipated verdict timing."
223,cnn,https://www.cnn.com/2024/02/15/politics/border-patrol-official-suspended-alleged-misconduct/index.html,"“This is the case whether the alleged misconduct occurs on or off duty. Federal privacy laws prohibit discussion of individual cases.” The Washington Post first reported Martinez’s suspension. Martinez, a 31-year Border Patrol veteran, has also served as the chief patrol agent of the Laredo Sector and deputy chief patrol agent of the Rio Grande Valley Sector, according to the CBP . The US Border Patrol is a law enforcement entity under the umbrella of CBP, in the Department of Homeland Security"


Now that we have finished setting up our remove_last_sentence function, we can remove the "content_end_preview" column and reset the pandas display setting.

In [29]:
# Drop the content end preview column
df.drop(columns=['content_end_preview'], inplace=True)

# Reset the columns so that we go back to truncating the "content" column
pd.reset_option('display.max_colwidth')

### Standardizing Entity Names

Let's start by combining specified word pairs so that we handle cases where two or more words refer to a single entity, such as "Hunter Biden" or "Supreme Court." 

In [30]:
# Combine specified word pairs

df['content'] = df['content'].str.replace(r'\bHunter\s+Biden\b', 'HunterBiden', regex=True)
df['content'] = df['content'].str.replace(r'\bHUNTER\s+Biden\b', 'HunterBiden', regex=True)
df['content'] = df['content'].str.replace(r'\bSouth\s+Carolina\b', 'SouthCarolina', regex=True)
df['content'] = df['content'].str.replace(r'\bSupreme\s+Court\b', 'SupremeCourt', regex=True)
df['content'] = df['content'].str.replace(r'\bsupreme\s+court\b', 'SupremeCourt', regex=True)
df['content'] = df['content'].str.replace(r'\bCourt\s+House\b', 'CourtHouse', regex=True)
df['content'] = df['content'].str.replace(r'\bcourt\s+house\b', 'CourtHouse', regex=True)
df['content'] = df['content'].str.replace(r'\bHouse\s+Representative\b', 'HouseRep', regex=True)
df['content'] = df['content'].str.replace(r'\bhouse\s+representative\b', 'HouseRep', regex=True)
df['content'] = df['content'].str.replace(r'\bHouse\s+Rep\b', 'HouseRep', regex=True)
df['content'] = df['content'].str.replace(r'\bhouse\s+rep\b', 'HouseRep', regex=True)
df['content'] = df['content'].str.replace(r'\bvoters\b', 'voter', regex=True)
df['content'] = df['content'].str.replace(r'\bvotes\b', 'vote', regex=True)
df['content'] = df['content'].str.replace(r'\bdemocratic(?:s)?\b', 'Democrat', case=False, regex=True)
df['content'] = df['content'].str.replace(r'\bDemocrats\b', 'Democrat', regex=True)
df['content'] = df['content'].str.replace(r'\brepublicans\b', 'Republican', regex=True)
df['content'] = df['content'].str.replace(r'\bRepublicans\b', 'Republican', regex=True)
df['content'] = df['content'].str.replace(r'\bwhite\s+house\b', 'WhiteHouse', regex=True)
df['content'] = df['content'].str.replace(r'\bWhite\s+house\b', 'WhiteHouse', regex=True)
df['content'] = df['content'].str.replace(r'\bNew\s+York\b', 'NewYork', regex=True)

We now need to account for variations in Biden's and Trump's names. This is because we want the model to see the different spellings as referring to the same thing. We accomplish this with the following code. 

In [31]:
biden_variations = df['content'].str.findall(
    r'\bPresident\s+Joe\s+Biden\b|'  
    r'\bPresident\s+Biden\b|'         
    r'\bJoe\s+Biden(?:’s)?\b|'             
    r'\bBiden(?:’s|s)?\b|'  
    r'\bBIDEN\b|' 
    r'\bBiden\'s\b'                  
)
# Flatten the list of variations
biden_variations = [item for sublist in biden_variations for item in sublist]

# Count occurrences of each variation
biden_variation_counts = Counter(biden_variations)

# Replace variations of Biden's name with 'Biden' in the content column
df['content'] = df['content'].str.replace(
    r'\bPresident\s+Joe\s+Biden\b|'  
    r'\bPresident\s+Biden\b|'         
    r'\bJoe\s+Biden(?:’s)?\b|'             
    r'\bBiden(?:’s|s)?\b|'  
    r'\bBIDEN\b|' 
    r'\bBiden\'s\b'    
    , 'Biden', regex=True)

print("Occurrences of different variations of Biden's name:")
for variation, count in biden_variation_counts.items():
    print(f"{variation}: {count}")

Occurrences of different variations of Biden's name:
President Joe Biden: 116
Biden: 861
Biden’s: 156
President Biden: 159
Bidens: 23
Joe Biden: 137
Joe Biden’s: 13
President Joe Biden: 1
BIDEN: 105


In [32]:
# Count occurrences of 'Biden' after replacement
biden_count_after = df['content'].str.count('Biden').sum()

print("Occurrences of Biden after replacement:", biden_count_after)

Occurrences of Biden after replacement: 1709


In [33]:
# Find all variations of Trump's name in the content column
trump_variations = df['content'].str.findall(
    r'\bPresident\s+Donald\s+Trump\b|'  
    r'\bPresident\s+Trump\b|'         
    r'\bDonald\s+Trump(?:’s)?\b|'             
    r'\bTrump(?:’s)?\b|'   
    r'\bTRUMP(?:’S)?\b|'  
    r'\bFormer\s+President\s+Donald\s+Trump\b|' 
    r'\bDonald\s+J(?:ohn)?\s+Trump\b'            
)

# Flatten the list of variations
trump_variations = [item for sublist in trump_variations for item in sublist]

# Count occurrences of each variation
trump_variation_counts = Counter(trump_variations)

# Replace variations of Trump's name with 'Trump' in the content column
df['content'] = df['content'].str.replace(
    r'\bPresident\s+Donald\s+Trump\b|'  
    r'\bPresident\s+Trump\b|'         
    r'\bDonald\s+Trump(?:’s)?\b|'             
    r'\bTrump(?:’s)?\b|'   
    r'\bTRUMP(?:’S)?\b|'  
    r'\bFormer\s+President\s+Donald\s+Trump\b|' 
    r'\bDonald\s+J(?:ohn)?\s+Trump\b'               
    , 'Trump', regex=True) 

print("Occurrences of different variations of Trump's name:")
for variation, count in trump_variation_counts.items():
    print(f"{variation}: {count}")

Occurrences of different variations of Trump's name:
President Donald Trump: 78
Trump’s: 380
Trump: 1259
Former President Donald Trump: 32
President Trump: 81
Donald Trump: 90
Donald Trump’s: 15
Donald Trump: 1
TRUMP: 35
TRUMP’S: 2
President Trump: 1


In [34]:
# Count occurrences of 'Trump' after replacement
trump_count_after = df['content'].str.count('Trump').sum()

print("Occurrences of Trump after replacement:", trump_count_after)

Occurrences of Trump after replacement: 1977


### Data Preprocessing Pipeline

Next, we remove stop words and punctuation, and then we tokenize and prepare data for use in the model. 

In [35]:
punctuation = set(punctuation) # speeds up comparison
sw = stopwords.words("english")
extra_sw = ['cnn', 'fox', 'news', 'said', '–', '-', '--', '—','told', 'would', '…read', 'get', 'could', 
            'also', "it’s", 'think', 'time', 'even', 'former', 'party', 'i', '“i', 'she’s', 'says', 
            'images', 'getty', 'im', 'this', 'we', 'it', 'digital', 'the', 'that', 'story', 'doesn']
sw.extend(extra_sw)
whitespace_pattern = re.compile(r"\s+")

def remove_stop(tokens) :
    
    return [t for t in tokens if t.lower() not in sw]

def remove_punctuation(text, punct_set=punctuation) : 
    
    return("".join([ch for ch in text if ch not in punct_set]))

def tokenize(text) : 
    
    return re.split(whitespace_pattern, text)

def prepare(text, pipeline) : 
    
    tokens = str(text)
    
    for transform in pipeline : 
        tokens = transform(tokens)
        
    return(tokens)

pipeline = [str.lower, remove_punctuation, tokenize, remove_stop]

In [36]:
# Tokenize and preprocess each row
df['tokens'] = df['content'].apply(lambda x: prepare(x, pipeline=pipeline))

# Print the resulting dataframe
df.head()

Unnamed: 0,source,url,content,tokens
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,Chairman of the Joint Chiefs of Staff Gen. CQ ...,"[chairman, joint, chiefs, staff, gen, cq, brow..."
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Trump has endorsed North Carolina Republican P...,"[trump, endorsed, north, carolina, republican,..."
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...,"[senate, inching, closer, final, passage, 953,..."
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,Biden and King Abdullah II of Jordan met Monda...,"[biden, king, abdullah, ii, jordan, met, monda..."
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Trump on Monday asked the SupremeCourt to step...,"[trump, monday, asked, supremecourt, step, cha..."


## Basic Descriptive Statistics

Below we write a function that allows us to view the results of our preprocessed data from CNN and Fox News. We find the total words (tokens), unique words, total characters, lexical diversity, and most common words for each news organization.

In [37]:
def descriptive_stats(tokens, num_tokens = 50, verbose=True) :
    """
        Given a list of tokens, print number of tokens, number of unique tokens, 
        number of characters, lexical diversity (https://en.wikipedia.org/wiki/Lexical_diversity), 
        and num_tokens most common tokens. Return a list with the number of tokens, number
        of unique tokens, lexical diversity, and number of characters. 
    
    """
    num_tokens = len(tokens)
    num_unique_tokens = len(set(tokens))  
    lexical_diversity = num_unique_tokens / num_tokens
    num_characters = sum(len(s) for s in tokens)
    
    if verbose :        
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")        
        print (f"The ten most common words are:")
        print(Counter(tokens).most_common(10))
        
    return([num_tokens, num_unique_tokens,
            lexical_diversity,
            num_characters])

In [38]:
# calls to descriptive_stats here

print("CNN News Stats\n")

descriptive_stats(
    [token for tokens in df[df['source'] == 'cnn']['tokens']for token in tokens])

print('\n')
print("FoxNews Stats\n")

descriptive_stats(
    [token for tokens in df[df['source'] == 'foxnews']['tokens']for token in tokens])

CNN News Stats

There are 76965 tokens in the data.
There are 11340 unique tokens in the data.
There are 507542 characters in the data.
The lexical diversity is 0.147 in the data.
The ten most common words are:
[('trump', 1527), ('biden', 741), ('republican', 560), ('house', 459), ('us', 411), ('president', 402), ('election', 396), ('democrat', 360), ('case', 284), ('campaign', 254)]


FoxNews Stats

There are 40271 tokens in the data.
There are 7106 unique tokens in the data.
There are 262325 characters in the data.
The lexical diversity is 0.176 in the data.
The ten most common words are:
[('biden', 679), ('house', 403), ('trump', 330), ('republican', 245), ('president', 224), ('democrat', 193), ('us', 187), ('senate', 167), ('security', 155), ('special', 153)]


[40271, 7106, 0.17645452062278066, 262325]

## Saving the Data

Below we create a csv file to use for modeling.

In [39]:
# save df for next step

df.to_csv('data/cleaned.csv', index=False)