In [17]:
import datetime
import random
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import defaultdict
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

from pandas import json_normalize

## Scrape Data from Fox News and CNN

In [13]:
def return_text_if_not_none(element):
    return element.get_text(separator=' ', strip=True) if element else None

current_year = datetime.datetime.now().year

source = {'cnn': "https://www.cnn.com/politics",
          'foxnews': "https://www.foxnews.com/politics"}

news_pages = defaultdict(list)  # Use a list to store URLs and content

for source_name, source_page in source.items():
    
    # request the page and sleep
    r = requests.get(source_page)
    
    time.sleep(5 + 10 * random.random())
    
    soup = BeautifulSoup(r.content, 'html.parser')
    
    links = soup.find_all('a', href=True)
    
    for link in links:
        
        href = link['href']
        # Convert relative URLs to absolute URLs
        full_url = urljoin(source_page, href)
        
        # Check if the link contains "/politics/" and does not contain "/gallery/"
        if "/politics/" in full_url and "/gallery/" not in full_url:
            
            # Check if it's CNN and the URL has the format 'cnn.com/{}/'
            if source_name == 'cnn' and f"cnn.com/{current_year}/" in full_url:
                
                # Fetch the news content
                content_r = requests.get(full_url)
                
                content_soup = BeautifulSoup(content_r.content, 'html.parser')
                
                article_content = return_text_if_not_none(content_soup.find('div', {'class': 'article__content'}))
                
                news_pages[source_name].append({'url': full_url, 'content': article_content})
                
            # Check if it's FOXNEWS and the URL does not contain "/category/"
            elif source_name == 'foxnews' and "/category/" not in full_url:
                
                # Fetch the news content
                content_r = requests.get(full_url)
                
                content_soup = BeautifulSoup(content_r.content, 'html.parser')
                
                article_content = return_text_if_not_none(content_soup.find('div', {'class': 'article-content'}))
                
                news_pages[source_name].append({'url': full_url, 'content': article_content})
# Create a DataFrame

df = pd.DataFrame([(source_name, item['url'], item['content']) for source_name, items in 
                   news_pages.items() for item in items], columns=['source', 'url', 'content'])

df = df.drop_duplicates()

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/09/politics/biden-...,Washington CNN — Thrust into an election year ...
2,cnn,https://www.cnn.com/2024/02/09/politics/john-r...,CNN — Soon after John Roberts took his seat at...
3,cnn,https://www.cnn.com/2024/02/08/politics/analys...,CNN — Donald Trump had his best day of 2024 so...
4,cnn,https://www.cnn.com/2024/02/09/politics/rand-p...,The Senate is working to pass a $95.3 billion ...
5,cnn,https://www.cnn.com/2024/02/08/politics/nevada...,CNN — Donald Trump won Nevada’s Republican pre...


Confirming that the CNN content was scraped successfully

In [20]:
df[df['source']=='cnn'].head()

Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/09/politics/biden-...,Washington CNN — Thrust into an election year ...
2,cnn,https://www.cnn.com/2024/02/09/politics/john-r...,CNN — Soon after John Roberts took his seat at...
3,cnn,https://www.cnn.com/2024/02/08/politics/analys...,CNN — Donald Trump had his best day of 2024 so...
4,cnn,https://www.cnn.com/2024/02/09/politics/rand-p...,The Senate is working to pass a $95.3 billion ...
5,cnn,https://www.cnn.com/2024/02/08/politics/nevada...,CNN — Donald Trump won Nevada’s Republican pre...


In [None]:
Confirming that the CNN content was scraped successfully

In [27]:
df[df['source']=='foxnews'].head()

Unnamed: 0,source,url,content,clean_content,tokens
55,foxnews,https://www.foxnews.com/politics/after-trump-r...,close Video Judges question whether Trump can ...,close Video Judges question whether Trump can ...,"[close, video, judges, question, whether, trum..."
57,foxnews,https://www.foxnews.com/politics/gavin-newsom-...,close Video Dana Perino: Crime in CA is gettin...,close Video Dana Perino Crime in CA is getting...,"[close, video, dana, perino, crime, ca, gettin..."
59,foxnews,https://www.foxnews.com/politics/bipartisan-la...,close Video 'Crisis level': Mark Kelly says ki...,close Video Crisis level Mark Kelly says killi...,"[close, video, crisis, level, mark, kelly, say..."
61,foxnews,https://www.foxnews.com/politics/former-navy-s...,close Video Tim Sheehy says Dems meddling in S...,close Video Tim Sheehy says Dems meddling in S...,"[close, video, tim, sheehy, says, dems, meddli..."
63,foxnews,https://www.foxnews.com/politics/senate-republ...,close Video Lt. Chris Olivarez: Senate bill 'd...,close Video Lt Chris Olivarez Senate bill did ...,"[close, video, lt, chris, olivarez, senate, bi..."


### Cleaning the Data

In [23]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove any punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['clean_content'] = df['content'].apply(clean_text)

### Tokenize and Normalize

In [24]:
stop_words = set(stopwords.words('english'))

def tokenize_and_normalize(text):
    # Tokenize
    tokens = word_tokenize(text)
    # Normalize (lowercase and remove stopwords and punctuation)
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token.isalpha()]
    return tokens

df['tokens'] = df['clean_content'].apply(tokenize_and_normalize)


### Descriptive Statistics

In [25]:
# Total number of articles
total_articles = len(df)

# Total number of tokens (words)
total_tokens = df['tokens'].apply(len).sum()

# Vocabulary size (unique words)
unique_tokens = set(token for tokens in df['tokens'] for token in tokens)
vocabulary_size = len(unique_tokens)

# Average tokens per article
avg_tokens_per_article = total_tokens / total_articles

# Most common tokens
all_tokens = [token for tokens in df['tokens'] for token in tokens]
token_counts = Counter(all_tokens)
most_common_tokens = token_counts.most_common(10)  # Top 10

# Compile statistics into a dictionary
stats = {
    'Total Articles': total_articles,
    'Total Tokens': total_tokens,
    'Vocabulary Size': vocabulary_size,
    'Average Tokens per Article': avg_tokens_per_article,
    'Most Common Tokens': most_common_tokens
}

# Display statistics
for stat, value in stats.items():
    print(f"{stat}: {value}")


Total Articles: 67
Total Tokens: 39155
Vocabulary Size: 7032
Average Tokens per Article: 584.4029850746268
Most Common Tokens: [('biden', 580), ('trump', 566), ('said', 408), ('president', 359), ('report', 202), ('would', 195), ('election', 193), ('bidens', 191), ('house', 190), ('court', 189)]
