# MSADS509 Final Project M6 UE Wang

In [1]:
import pandas as pd
import re

from nltk.corpus import stopwords
from string import punctuation

from collections import Counter, defaultdict

import glob
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

# Load data from desktop MSADS509_News_Project_Dataset folder

In [2]:
file_list = glob.glob('/Users/UE/Desktop/MSADS509_News_Project_Dataset/news_*.csv')

# Initialize an empty list to store DataFrames
df = []

# Iterate over each file, read it into a DataFrame, and append it to the list
for file in file_list:
    df.append(pd.read_csv(file))

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(df, ignore_index=True)
combined_df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,CNN — Chairman of the Joint Chiefs of Staff Ge...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,CNN — Former President Donald Trump has endors...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,Washington CNN — President Joe Biden and King ...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,CNN — Former President Donald Trump on Monday ...
...,...,...,...
138,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
139,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
140,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
141,foxnews,https://www.foxnews.com/politics/trumps-nato-c...,close Video The media doesn’t allow the public...


In [3]:
# Remove duplicates

df = combined_df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,CNN — Chairman of the Joint Chiefs of Staff Ge...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,CNN — Former President Donald Trump has endors...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,Washington CNN — President Joe Biden and King ...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,CNN — Former President Donald Trump on Monday ...
...,...,...,...
113,foxnews,https://www.foxnews.com/politics/white-house-s...,close Video President Biden on release of clas...
114,foxnews,https://www.foxnews.com/politics/house-gop-tes...,close Video Jean-Pierre defends Biden’s mental...
115,foxnews,https://www.foxnews.com/politics/biden-garners...,close Video It's very awkward to watch Biden t...
116,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...


# Data Cleaning, Tokenizing, and Normalizing

In [4]:
# Function to remove prefixes

def remove_prefix(row):
    if row['source'] == 'cnn' and row['content'].startswith('CNN — '):
        return row['content'][6:]  
    elif row['source'] == 'cnn' and row['content'].startswith('Washington CNN — '):
        return row['content'][17:]  
    elif row['source'] == 'foxnews' and row['content'].startswith('close Video '):
        return row['content'][12:]  
    else:
        return row['content']

df['content'] = df.apply(remove_prefix, axis=1)

df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,Chairman of the Joint Chiefs of Staff Gen. CQ ...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Former President Donald Trump has endorsed Nor...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,President Joe Biden and King Abdullah II of Jo...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Former President Donald Trump on Monday asked ...
...,...,...,...
113,foxnews,https://www.foxnews.com/politics/white-house-s...,President Biden on release of classified docs ...
114,foxnews,https://www.foxnews.com/politics/house-gop-tes...,Jean-Pierre defends Biden’s mental fitness and...
115,foxnews,https://www.foxnews.com/politics/biden-garners...,It's very awkward to watch Biden try to engage...
116,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...


In [5]:
def remove_first_sentence(row):
    sentences = row['content'].split('. ')  # Split the content into sentences
    
    if len(sentences) > 1:  # Check if there's more than one sentence
        first_sentence = sentences[0]  # Get the first sentence
        if 'Welcome to Fox News' in first_sentence:
            updated_content = '. '.join(sentences[1:])  # Join all sentences except the first one
            return updated_content
    return row['content']

# Apply the function to the DataFrame
df['content'] = df.apply(remove_first_sentence, axis=1)

# Display the updated DataFrame
df


Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,Chairman of the Joint Chiefs of Staff Gen. CQ ...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Former President Donald Trump has endorsed Nor...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,President Joe Biden and King Abdullah II of Jo...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Former President Donald Trump on Monday asked ...
...,...,...,...
113,foxnews,https://www.foxnews.com/politics/white-house-s...,President Biden on release of classified docs ...
114,foxnews,https://www.foxnews.com/politics/house-gop-tes...,Jean-Pierre defends Biden’s mental fitness and...
115,foxnews,https://www.foxnews.com/politics/biden-garners...,It's very awkward to watch Biden try to engage...
116,foxnews,https://www.foxnews.com/politics/fox-news-poli...,and updates from the 2024 campaign trail. What...


In [6]:
# Define variations of Biden's name
biden_variations = df['content'].str.findall(
    r'\bPresident\s+Joe\s+Biden\b|'  
    r'\bPresident\s+Biden\b|'         
    r'\bJoe\s+Biden(?:’s)?\b|'             
    r'\bBiden(?:’s)?\b|'   
    r'\bBIDEN\b|' 
    r'\bBiden\'s\b'            
)
# Flatten the list of variations
biden_variations = [item for sublist in biden_variations for item in sublist]

# Count occurrences of each variation
biden_variation_counts = Counter(biden_variations)

# Replace variations of Biden's name with 'Biden' in the content column
df['content'] = df['content'].str.replace(
    r'\bPresident\s+Joe\s+Biden\b|'  
    r'\bPresident\s+Biden\b|'         
    r'\bJoe\s+Biden(?:’s)?\b|'             
    r'\bBiden(?:’s)?\b|'   
    r'\bBIDEN\b|' 
    r'\bBiden\'s\b' 
    , 'Biden', regex=True)

print("Occurrences of different variations of Biden's name:")
for variation, count in biden_variation_counts.items():
    print(f"{variation}: {count}")

Occurrences of different variations of Biden's name:
President Joe Biden: 70
Biden: 723
Biden’s: 116
President Biden: 98
Joe Biden: 87
Joe Biden’s: 10
President Joe Biden: 1
BIDEN: 66


In [7]:
# Count occurrences of 'Biden' after replacement
biden_count_after = df['content'].str.count('Biden').sum()

print("Occurrences of Biden after replacement:", biden_count_after)

Occurrences of Biden after replacement: 1187


In [8]:
# Find all variations of Trump's name in the content column
trump_variations = df['content'].str.findall(
    r'\bPresident\s+Donald\s+Trump\b|'  
    r'\bPresident\s+Trump\b|'         
    r'\bDonald\s+Trump(?:’s)?\b|'             
    r'\bTrump(?:’s)?\b|'   
    r'\bTRUMP(?:’S)?\b|'  
    r'\bFormer\s+President\s+Donald\s+Trump\b|' 
    r'\bDonald\s+J(?:ohn)?\s+Trump\b'            
)

# Flatten the list of variations
trump_variations = [item for sublist in trump_variations for item in sublist]

# Count occurrences of each variation
trump_variation_counts = Counter(trump_variations)

# Replace variations of Trump's name with 'Trump' in the content column
df['content'] = df['content'].str.replace(
    r'\bPresident\s+Donald\s+Trump\b|'  
    r'\bPresident\s+Trump\b|'         
    r'\bDonald\s+Trump(?:’s)?\b|'             
    r'\bTrump(?:’s)?\b|'   
    r'\bTRUMP(?:’S)?\b|'  
    r'\bFormer\s+President\s+Donald\s+Trump\b|' 
    r'\bDonald\s+J(?:ohn)?\s+Trump\b'               
    , 'Trump', regex=True) 

print("Occurrences of different variations of Trump's name:")
for variation, count in trump_variation_counts.items():
    print(f"{variation}: {count}")

Occurrences of different variations of Trump's name:
President Donald Trump: 39
Trump’s: 217
Trump: 734
Former President Donald Trump: 21
President Trump: 57
Donald Trump: 56
Donald Trump’s: 7
Donald Trump: 1
TRUMP: 17


In [9]:
# Count occurrences of 'Trump' after replacement
trump_count_after = df['content'].str.count('Trump').sum()

print("Occurrences of Trump after replacement:", trump_count_after)

Occurrences of Trump after replacement: 1151


In [10]:
# Remove occurrences of "Getty Images"

df['content'] = df['content'].str.replace(r'\(.*?Getty\s+Images.*?\)', '', regex=True)

In [11]:
def remove_last_sentence(row):
    sentences = row['content'].split('. ')
    
    if len(sentences) > 1:  # Check if there's more than one sentence
        last_sentences = '. '.join(sentences[-1:])  # Get the last two sentences
        if 'This story has been updated with additional information.' in last_sentences \
                or 'contributed to this' in last_sentences \
                or 'will be updated' in last_sentences \
                or 'have been updated' in last_sentences \
                or 'APP Fox News' in last_sentences \
                or 'Fox News' in last_sentences \
                or 'Fox News Channel and FOX Business' in last_sentences:
            updated_content = '. '.join(sentences[:-1])  # Join all sentences except the last two
            return updated_content
    return row['content']

# Apply the function to the DataFrame
df['content'] = df.apply(remove_last_sentence, axis=1)

# Display the updated DataFrame
df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,Chairman of the Joint Chiefs of Staff Gen. CQ ...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Trump has endorsed North Carolina Republican P...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,Biden and King Abdullah II of Jordan met Monda...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Trump on Monday asked the Supreme Court to ste...
...,...,...,...
113,foxnews,https://www.foxnews.com/politics/white-house-s...,Biden on release of classified docs report: 'T...
114,foxnews,https://www.foxnews.com/politics/house-gop-tes...,Jean-Pierre defends Biden mental fitness and k...
115,foxnews,https://www.foxnews.com/politics/biden-garners...,It's very awkward to watch Biden try to engage...
116,foxnews,https://www.foxnews.com/politics/fox-news-poli...,and updates from the 2024 campaign trail. What...


In [12]:
def remove_last_sentence2(row):
    sentences = row['content'].split('. ')
    
    if len(sentences) > 1:  # Check if there's more than one sentence
        remove_sentence = '. '.join(sentences[-1:])  
        if 'This story has been updated with additional information' in remove_sentence \
                or 'follow him on' in remove_sentence \
                or 'Follow him on' in remove_sentence \
                or '@fox.com' in remove_sentence:
            updated_content = '. '.join(sentences[:-1])  # Join all sentences except the last one
            return updated_content
    return row['content']

df['content'] = df.apply(remove_last_sentence2, axis=1)

df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,Chairman of the Joint Chiefs of Staff Gen. CQ ...
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Trump has endorsed North Carolina Republican P...
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,Biden and King Abdullah II of Jordan met Monda...
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Trump on Monday asked the Supreme Court to ste...
...,...,...,...
113,foxnews,https://www.foxnews.com/politics/white-house-s...,Biden on release of classified docs report: 'T...
114,foxnews,https://www.foxnews.com/politics/house-gop-tes...,Jean-Pierre defends Biden mental fitness and k...
115,foxnews,https://www.foxnews.com/politics/biden-garners...,It's very awkward to watch Biden try to engage...
116,foxnews,https://www.foxnews.com/politics/fox-news-poli...,and updates from the 2024 campaign trail. What...


In [13]:

punctuation = set(punctuation) # speeds up comparison
sw = stopwords.words("english")
extra_sw = ['cnn', 'fox', 'news', 'said', '–', '-', '--', 'told', 'would', '…read', 'get', 'could', 'also',
           "it’s", 'think', 'time', 'even', 'former']
sw.extend(extra_sw)
whitespace_pattern = re.compile(r"\s+")

def remove_stop(tokens) :
    
    return [t for t in tokens if t.lower() not in sw]

def remove_punctuation(text, punct_set=punctuation) : 
    
    return("".join([ch for ch in text if ch not in punct_set]))

def tokenize(text) : 
    
    return re.split(whitespace_pattern, text)

def prepare(text, pipeline) : 
    
    tokens = str(text)
    
    for transform in pipeline : 
        tokens = transform(tokens)
        
    return(tokens)

pipeline = [str.lower, remove_punctuation, tokenize, remove_stop]

In [14]:
# Tokenize and preprocess each row
df['tokens'] = df['content'].apply(lambda x: prepare(x, pipeline=pipeline))

# Print the resulting dataframe
df.head()

Unnamed: 0,source,url,content,tokens
0,cnn,https://www.cnn.com/2024/02/12/politics/cq-bro...,Chairman of the Joint Chiefs of Staff Gen. CQ ...,"[chairman, joint, chiefs, staff, gen, cq, brow..."
1,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Trump has endorsed North Carolina Republican P...,"[trump, endorsed, north, carolina, republican,..."
2,cnn,https://www.cnn.com/2024/02/12/politics/senate...,The Senate is inching closer to final passage ...,"[senate, inching, closer, final, passage, 953,..."
3,cnn,https://www.cnn.com/2024/02/12/politics/bidens...,Biden and King Abdullah II of Jordan met Monda...,"[biden, king, abdullah, ii, jordan, met, monda..."
4,cnn,https://www.cnn.com/2024/02/12/politics/trump-...,Trump on Monday asked the Supreme Court to ste...,"[trump, monday, asked, supreme, court, step, c..."


# Basic Descriptive Statistics

In [15]:
def descriptive_stats(tokens, num_tokens = 50, verbose=True) :
    """
        Given a list of tokens, print number of tokens, number of unique tokens, 
        number of characters, lexical diversity (https://en.wikipedia.org/wiki/Lexical_diversity), 
        and num_tokens most common tokens. Return a list with the number of tokens, number
        of unique tokens, lexical diversity, and number of characters. 
    
    """
    num_tokens = len(tokens)
    num_unique_tokens = len(set(tokens))  
    lexical_diversity = num_unique_tokens / num_tokens
    num_characters = sum(len(s) for s in tokens)
    
    if verbose :        
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")        
        print (f"The ten most common words are:")
        print(Counter(tokens).most_common(10))
        
    return([num_tokens, num_unique_tokens,
            lexical_diversity,
            num_characters])

In [16]:
# calls to descriptive_stats here

print("CNN News Stats\n")

descriptive_stats(
    [token for tokens in df[df['source'] == 'cnn']['tokens']for token in tokens])

print('\n')
print("FoxNews Stats\n")

descriptive_stats(
    [token for tokens in df[df['source'] == 'foxnews']['tokens']for token in tokens])

CNN News Stats

There are 42408 tokens in the data.
There are 8307 unique tokens in the data.
There are 275630 characters in the data.
The lexical diversity is 0.196 in the data.
The ten most common words are:
[('trump', 906), ('biden', 506), ('president', 254), ('us', 253), ('election', 235), ('court', 201), ('house', 200), ('republican', 164), ('republicans', 158), ('campaign', 155)]


FoxNews Stats

There are 19153 tokens in the data.
There are 4727 unique tokens in the data.
There are 123402 characters in the data.
The lexical diversity is 0.247 in the data.
The ten most common words are:
[('biden', 506), ('house', 187), ('trump', 137), ('hunter', 125), ('president', 108), ('bobulinski', 92), ('senate', 84), ('report', 80), ('special', 74), ('border', 68)]


[19153, 4727, 0.24680206756121756, 123402]