# MSADS509 Final Project M5 UE Wang

## Importing Libraries

In [1]:
import pandas as pd
import re

from nltk.corpus import stopwords
from string import punctuation

from collections import Counter, defaultdict

## Load data from desktop

In [2]:
df = pd.read_csv('/Users/UE/Desktop/MSADS509_news_project.csv')
df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/04/politics/senate...,CNN — Senators unveiled a long-awaited border ...
1,cnn,https://www.cnn.com/2024/02/03/politics/strike...,CNN — The US and UK have conducted strikes on ...
2,cnn,https://www.cnn.com/2024/02/04/politics/senate...,CNN — The Senate’s border deal and foreign aid...
3,cnn,https://www.cnn.com/2024/02/04/politics/read-t...,"CNN — On Sunday night, senators unveiled a lon..."
4,cnn,https://www.cnn.com/2024/02/04/politics/ronna-...,CNN — Former President Donald Trump suggested ...
...,...,...,...
64,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
65,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
66,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
67,foxnews,https://www.foxnews.com/politics/media-meltdow...,close Video Barstool Sports’ Dave Portnoy Resp...


## Data Cleaning and tokenizing

In [3]:
# Function to remove prefixes

def remove_prefix(row):
    if row['source'] == 'cnn' and (row['content'].startswith('CNN — ') or \
                                   row['content'].startswith('Washington CNN — ')):
        return row['content'][6:]  
    elif row['source'] == 'foxnews' and row['content'].startswith('close Video '):
        return row['content'][12:]  
    else:
        return row['content']

df['content'] = df.apply(remove_prefix, axis=1)

df


Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/04/politics/senate...,Senators unveiled a long-awaited border deal a...
1,cnn,https://www.cnn.com/2024/02/03/politics/strike...,The US and UK have conducted strikes on Houthi...
2,cnn,https://www.cnn.com/2024/02/04/politics/senate...,The Senate’s border deal and foreign aid packa...
3,cnn,https://www.cnn.com/2024/02/04/politics/read-t...,"On Sunday night, senators unveiled a long-awai..."
4,cnn,https://www.cnn.com/2024/02/04/politics/ronna-...,Former President Donald Trump suggested in an ...
...,...,...,...
64,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
65,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
66,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
67,foxnews,https://www.foxnews.com/politics/media-meltdow...,Barstool Sports’ Dave Portnoy Responds To Atta...


In [4]:
def remove_last_sentence(row):
    sentences = row['content'].split('. ')
    
    if len(sentences) > 1:  # Check if there's more than one sentence
        last_sentences = '. '.join(sentences[-1:])  # Get the last two sentences
        if 'This story has been updated with additional information.' in last_sentences \
                or 'contributed to this' in last_sentences \
                or 'Fox News Channel and FOX Business' in last_sentences:
            updated_content = '. '.join(sentences[:-1])  # Join all sentences except the last two
            return updated_content
    return row['content']

# Apply the function to the DataFrame
df['content'] = df.apply(remove_last_sentence, axis=1)

# Display the updated DataFrame
df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/04/politics/senate...,Senators unveiled a long-awaited border deal a...
1,cnn,https://www.cnn.com/2024/02/03/politics/strike...,The US and UK have conducted strikes on Houthi...
2,cnn,https://www.cnn.com/2024/02/04/politics/senate...,The Senate’s border deal and foreign aid packa...
3,cnn,https://www.cnn.com/2024/02/04/politics/read-t...,"On Sunday night, senators unveiled a long-awai..."
4,cnn,https://www.cnn.com/2024/02/04/politics/ronna-...,Former President Donald Trump suggested in an ...
...,...,...,...
64,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
65,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
66,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
67,foxnews,https://www.foxnews.com/politics/media-meltdow...,Barstool Sports’ Dave Portnoy Responds To Atta...


In [5]:
def remove_last_sentence2(row):
    sentences = row['content'].split('. ')
    
    if len(sentences) > 1:  # Check if there's more than one sentence
        remove_sentence = '. '.join(sentences[-1:])  
        if 'This story has been updated with additional information' in remove_sentence \
                or 'follow him on' in remove_sentence \
                or 'Follow him on' in remove_sentence \
                or '@fox.com' in remove_sentence:
            updated_content = '. '.join(sentences[:-1])  # Join all sentences except the last one
            return updated_content
    return row['content']

df['content'] = df.apply(remove_last_sentence2, axis=1)

df

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/04/politics/senate...,Senators unveiled a long-awaited border deal a...
1,cnn,https://www.cnn.com/2024/02/03/politics/strike...,The US and UK have conducted strikes on Houthi...
2,cnn,https://www.cnn.com/2024/02/04/politics/senate...,The Senate’s border deal and foreign aid packa...
3,cnn,https://www.cnn.com/2024/02/04/politics/read-t...,"On Sunday night, senators unveiled a long-awai..."
4,cnn,https://www.cnn.com/2024/02/04/politics/ronna-...,Former President Donald Trump suggested in an ...
...,...,...,...
64,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
65,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
66,foxnews,https://www.foxnews.com/politics/fox-news-poli...,Welcome to Fox News’ Politics newsletter with ...
67,foxnews,https://www.foxnews.com/politics/media-meltdow...,Barstool Sports’ Dave Portnoy Responds To Atta...


In [6]:

punctuation = set(punctuation) # speeds up comparison
sw = stopwords.words("english")
extra_sw = ['cnn', 'fox', 'news', 'said', '–']
sw.extend(extra_sw)
whitespace_pattern = re.compile(r"\s+")

def remove_stop(tokens) :
    
    return [t for t in tokens if t.lower() not in sw]

def remove_punctuation(text, punct_set=punctuation) : 
    
    return("".join([ch for ch in text if ch not in punct_set]))

def tokenize(text) : 
    
    return re.split(whitespace_pattern, text)

def prepare(text, pipeline) : 
    
    tokens = str(text)
    
    for transform in pipeline : 
        tokens = transform(tokens)
        
    return(tokens)

pipeline = [str.lower, remove_punctuation, tokenize, remove_stop]


In [7]:
# Tokenize and preprocess each row
df['tokens'] = df['content'].apply(lambda x: prepare(x, pipeline=pipeline))

# Print the resulting dataframe
df.head()

Unnamed: 0,source,url,content,tokens
0,cnn,https://www.cnn.com/2024/02/04/politics/senate...,Senators unveiled a long-awaited border deal a...,"[senators, unveiled, longawaited, border, deal..."
1,cnn,https://www.cnn.com/2024/02/03/politics/strike...,The US and UK have conducted strikes on Houthi...,"[us, uk, conducted, strikes, houthi, targets, ..."
2,cnn,https://www.cnn.com/2024/02/04/politics/senate...,The Senate’s border deal and foreign aid packa...,"[senate’s, border, deal, foreign, aid, package..."
3,cnn,https://www.cnn.com/2024/02/04/politics/read-t...,"On Sunday night, senators unveiled a long-awai...","[sunday, night, senators, unveiled, longawaite..."
4,cnn,https://www.cnn.com/2024/02/04/politics/ronna-...,Former President Donald Trump suggested in an ...,"[former, president, donald, trump, suggested, ..."


## Basic Descriptive Statistics

In [8]:
def descriptive_stats(tokens, num_tokens = 50, verbose=True) :
    """
        Given a list of tokens, print number of tokens, number of unique tokens, 
        number of characters, lexical diversity (https://en.wikipedia.org/wiki/Lexical_diversity), 
        and num_tokens most common tokens. Return a list with the number of tokens, number
        of unique tokens, lexical diversity, and number of characters. 
    
    """

    # Fill in the correct values here. 
    
    num_tokens = len(tokens)
    num_unique_tokens = len(set(tokens))  
    lexical_diversity = num_unique_tokens / num_tokens
    num_characters = sum(len(s) for s in tokens)
    
    if verbose :        
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")        
        print (f"The five most common words are:")
        print(Counter(tokens).most_common(5))
        
    return([num_tokens, num_unique_tokens,
            lexical_diversity,
            num_characters])
    

In [9]:
# calls to descriptive_stats here

print("CNN News Stats\n")

descriptive_stats(
    [token for tokens in df[df['source'] == 'cnn']['tokens']for token in tokens])

print('\n')
print("FoxNews Stats\n")

descriptive_stats(
    [token for tokens in df[df['source'] == 'foxnews']['tokens']for token in tokens])

CNN News Stats

There are 29920 tokens in the data.
There are 7342 unique tokens in the data.
There are 193143 characters in the data.
The lexical diversity is 0.245 in the data.
The five most common words are:
[('trump', 330), ('us', 267), ('biden', 260), ('president', 162), ('would', 145)]


FoxNews Stats

There are 10897 tokens in the data.
There are 3732 unique tokens in the data.
There are 68551 characters in the data.
The lexical diversity is 0.342 in the data.
The five most common words are:
[('biden', 132), ('trump', 101), ('president', 97), ('new', 75), ('house', 63)]


[10897, 3732, 0.3424795815362026, 68551]