# Congressional Text Analysis

The code below analyzes congressional records from the past decade to highlight key trends and examine political polarization by party over time. Speaker analysis is a work in progress (WIP).

## Key Imports

In [43]:
import pandas as pd
import numpy as np
import requests
import PyPDF2
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from collections import Counter
from textblob import TextBlob
import pickle
import matplotlib.pyplot as plt
from textblob import TextBlob
from scipy import stats
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.graph_objects as go
import re
from collections import defaultdict

## Key Data Aggregation and Collection Functions

In [None]:
def get_scrapeops_url(api_key, url):
    ''' build the ScrapeOps proxy URL that wraps a target url with the given API key

    Nothing is fetched here; the returned string is the address to request.
    '''

    query_string = urlencode({'api_key': api_key, 'url': url})
    return 'https://proxy.scrapeops.io/v1/?' + query_string

In [2]:
def get_congressional_record_data(starting_congress, ending_congress, api_key):
    ''' aggregate daily congressional record pdf urls for a range of congresses

    For every congress from starting_congress through ending_congress (inclusive)
    the congress.gov "browse-by-date" page is requested through the ScrapeOps proxy
    and every anchor whose href matches the daily-record PDF pattern is collected.

    Parameters:
        starting_congress (int): first congress number to include
        ending_congress (int): last congress number to include
        api_key (str): ScrapeOps API key passed to get_scrapeops_url

    Returns:
        list of absolute PDF urls (str), in page order, congress by congress.
        A congress whose page does not come back with HTTP 200 contributes no links.
    '''

    # link format changed after the 116th congress: newer hrefs carry two extra
    # three-digit path segments before the file name
    newer_link_pattern = re.compile(r'(/\d{3}/crec/\d{4}/\d{2}/\d{2}/\d{3}/\d{3}/CREC-\d{4}-\d{2}-\d{2}\.pdf)')
    older_link_pattern = re.compile(r'(/\d{3}/crec/\d{4}/\d{2}/\d{2}/CREC-\d{4}-\d{2}-\d{2}\.pdf)')

    all_links = []

    # iterate through congresses
    for congress in range(starting_congress, ending_congress + 1):

        # English ordinal suffix (111th-113th are 'th', 121st is 'st', ...), so the
        # url is also right for congresses that do not end in 'th'
        if 10 <= congress % 100 <= 20:
            suffix = 'th'
        else:
            suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(congress % 10, 'th')

        # produce generic congressional session link associated with this congress
        url = f"https://www.congress.gov/congressional-record/{congress}{suffix}-congress/browse-by-date"

        # scrape data associated with that URL
        r = requests.get(get_scrapeops_url(api_key, url))
        if r.status_code != 200:
            # an error page has no record links; say so instead of silently returning nothing
            print(f"Skipping congress {congress}: HTTP {r.status_code} for {url}")
            continue
        soup = BeautifulSoup(r.content, 'html.parser')

        link_pattern = newer_link_pattern if congress > 116 else older_link_pattern
        pdf_links = soup.find_all('a', href=link_pattern)

        all_links.extend('https://www.congress.gov' + link['href'] for link in pdf_links)

    return all_links

In [3]:
def read_pdf_from_url(pdf_url):
    ''' download a daily congressional record pdf and return its text

    The PDF is written to a uniquely named temporary file, handed to
    read_pdf_text (not defined in this section of the notebook), and the
    temporary file is removed afterwards even if reading fails.

    Parameters:
        pdf_url (str): url of the PDF to download

    Returns:
        whatever read_pdf_text returns for the downloaded file, or None when
        the server does not answer with HTTP 200
    '''
    import os
    import tempfile

    # Send a GET request to the PDF URL
    response = requests.get(pdf_url, stream=True)

    # Bail out early when the download failed
    if response.status_code != 200:
        return None

    # A unique temp file avoids clobbering (or being clobbered by) a leftover
    # or concurrently written "temp_pdf.pdf" in the working directory
    fd, temp_path = tempfile.mkstemp(suffix='.pdf')
    try:
        with os.fdopen(fd, 'wb') as pdf_file:
            pdf_file.write(response.content)

        # Read the text from the downloaded PDF
        return read_pdf_text(temp_path)
    finally:
        # Always clean up, including when read_pdf_text raises
        os.remove(temp_path)

## Text Cleaning Functions

In [4]:
def clean_text(text):
    ''' Turn newlines and hyphens into spaces, then strip every character that is neither a word character nor whitespace '''

    spaced = re.sub(r'\n|-', ' ', text)
    return re.sub(r'[^\w\s]', '', spaced)

In [None]:
def remove_stopwords(words):
    ''' lowercase the tokens, dropping NLTK English stopwords and the literal token 'f' '''

    # NLTK's English stopword list, as a set for fast membership checks
    english_stopwords = set(stopwords.words('english'))

    kept = []
    for token in words:
        # the 'f' exclusion is checked against the token exactly as given
        if token == 'f':
            continue
        lowered = token.lower()
        if lowered not in english_stopwords:
            kept.append(lowered)
    return kept

In [5]:
def is_word(word):
    ''' look the word up in WordNet and return the synsets found; callers use the truthiness of the result '''
    synsets = wordnet.synsets(word)
    return synsets

## Monthly Text Aggregation Functions

In [6]:
def group_pdf_urls(url_list):
    ''' group congressional record pdf urls by month

    Each url is expected to end in a file name shaped like CREC-YYYY-MM-DD.pdf;
    the year and month are taken from that file name.

    Parameters:
        url_list (iterable of str): pdf urls to group

    Returns:
        dict mapping 'YYYY-MM' to the list of urls for that month, in input order
    '''

    pdf_urls_by_month = {}
    # Iterate over the urls that were passed in (not a notebook-level global)
    for pdf_url in url_list:
        # The date lives in the last path component, e.g. 'CREC-2021-01-03.pdf'
        file_name = pdf_url.split('/')[-1]
        name_parts = file_name.split('-')
        year = name_parts[1]
        month = name_parts[2]

        # Construct the month key (YYYY-MM)
        month_key = f"{year}-{month}"
        pdf_urls_by_month.setdefault(month_key, []).append(pdf_url)

    return pdf_urls_by_month

In [7]:
def read_through_pdfs(url_monthly_dct):
    ''' read every pdf url month by month and build a cleaned word list per month

    Parameters:
        url_monthly_dct (dict): month key -> list of pdf urls for that month

    Returns:
        dict mapping each month key to one flat list of the lowercased,
        stopword-free tokens that is_word accepts, across all of that month's pdfs.
        A pdf for which read_pdf_from_url returns None is skipped.
    '''

    monthly_words_dct = {}

    # iterate through month, list of urls by month in dct
    for month_key, month_urls in url_monthly_dct.items():
        print(month_key)  # progress indicator
        monthly_pdf_words = []

        # iterate over each url in that month
        for pdf_url in month_urls:

            # read_pdf_from_url returns None on a failed download; skip that pdf
            # instead of crashing in clean_text
            pdf_text = read_pdf_from_url(pdf_url)
            if pdf_text is None:
                print(f"Skipping {pdf_url}: no text could be read")
                continue

            # clean and tokenize the text, remove stopwords
            cleaned_text = clean_text(pdf_text)
            words = word_tokenize(cleaned_text)
            filtered_words = remove_stopwords(words)

            # keep only tokens found in WordNet, add them to the monthly list of words
            monthly_pdf_words.extend(word for word in filtered_words if is_word(word))
        monthly_words_dct[month_key] = monthly_pdf_words
    return monthly_words_dct

## Word Counting Functions

In [11]:
def count_words(filtered_words, threshold=20):
    ''' return a frequency dct of the words that occur more than `threshold` times

    Parameters:
        filtered_words (iterable): tokens to count
        threshold (int): a word is kept only when its count is strictly greater
            than this value; defaults to 20

    Returns:
        dict mapping word -> count, ordered from most to least frequent
    '''

    word_counts = Counter(filtered_words)

    # most_common() yields (word, count) pairs in descending count order
    return {word: count for word, count in word_counts.most_common() if count > threshold}

In [12]:
def get_monthly_word_counts_df(monthly_words_dct):
    ''' produce a monthly word-frequency df from the monthly words dictionary

    Parameters:
        monthly_words_dct (dict): month key -> list of words for that month

    Returns:
        DataFrame with one row per month (in dictionary order), one integer
        column per word that count_words kept in at least one month (0 where
        the word was not kept that month), plus a 'Month' column holding the
        month keys.
    '''

    months = list(monthly_words_dct.keys())

    # one {word: count} dict per month, containing only the words count_words keeps
    monthly_word_count_lst = [count_words(month_words) for month_words in monthly_words_dct.values()]

    # building from the list of dicts creates the union of all words as columns
    # in one step; words missing from a month come out as NaN, hence the fill
    df = pd.DataFrame(monthly_word_count_lst).fillna(0).astype(int)

    # The label column is 'Month' (capitalised) because the plotting and driver
    # code look it up under that name; a lowercase 'month' label would also
    # overwrite the count column of the word "month" when that word is present
    df['Month'] = months

    display(df.head())
    return df

## Frequency Visualization

In [18]:
def plot_word_frequencies(df, words):
    ''' chart monthly frequencies for the given words; a single word also gets a 12-month rolling average line '''

    plt.figure(figsize=(12, 6))

    # one line per requested word that actually has a column
    for column in (w for w in words if w in df.columns):
        plt.plot(df['Month'], df[column], label=column)

    # the rolling-average trend line is only drawn when exactly one word was requested
    if len(words) == 1 and words[0] in df.columns:
        only_word = words[0]
        smoothed = df[only_word].rolling(window=12).mean()
        plt.plot(df['Month'], smoothed, label=f'{only_word} 12-Month Rolling Avg', linestyle='--')

    plt.xlabel('Month')
    plt.ylabel('Frequency')
    plt.title('Word Frequencies Over Time')
    plt.legend()
    plt.show()


## Sentiment Calculation

In [34]:
def calculate_sentiment(df):
    ''' add a 'Sentiment' column holding each row's frequency-weighted average word polarity

    Every column other than 'Month', 'month' and 'Sentiment' is treated as a
    word whose cell value is that word's frequency for the row. The row score
    is sum(polarity * frequency) / sum(frequency), or 0 when the frequencies
    sum to 0. Polarity comes from TextBlob(word).sentiment.polarity.

    Parameters:
        df (DataFrame): monthly word-count frame; modified in place

    Returns:
        the same DataFrame, with the 'Sentiment' column added or replaced
    '''

    # Columns that are labels or derived values, not word counts. 'Sentiment'
    # is excluded so that calling this function twice gives the same scores.
    non_word_columns = {'Month', 'month', 'Sentiment'}

    # A word's polarity does not depend on the row, so look it up once per word
    polarity_by_word = {}

    sentiment_scores = []

    for _, row in df.iterrows():
        month_sentiment = 0
        total_words = 0

        for word, frequency in row.items():
            if word in non_word_columns:
                continue

            try:
                frequency = float(frequency)
            except (TypeError, ValueError):
                # TypeError covers values such as timestamps, ValueError covers
                # non-numeric strings; report the cell and skip it
                print(f"Non-numeric frequency found: word='{word}', frequency='{frequency}'")
                continue

            if word not in polarity_by_word:
                polarity_by_word[word] = TextBlob(word).sentiment.polarity

            # Accumulate the sentiment weighted by the word's frequency
            month_sentiment += polarity_by_word[word] * frequency
            total_words += frequency

        # Average sentiment for the row; 0 when there is nothing to average
        if total_words != 0:
            sentiment_scores.append(month_sentiment / total_words)
        else:
            sentiment_scores.append(0)

    df['Sentiment'] = sentiment_scores
    return df

In [41]:
def plot_sentiment_over_time(df):
    ''' plot monthly sentiment with a 12-month rolling average and an overall linear trend line

    Parameters:
        df (DataFrame): needs a 'Month' column (datetime, or text parseable as
            '%Y-%m') and a 'Sentiment' column. The caller's frame is left
            unchanged; conversion and sorting happen on a copy.
    '''

    plt.figure(figsize=(12, 6))

    # assign() returns a new frame, so the caller's 'Month' column is not overwritten
    plot_df = df.assign(Month=pd.to_datetime(df['Month'], format='%Y-%m')).sort_values('Month')

    # Plot the original sentiment scores
    plt.plot(plot_df['Month'], plot_df['Sentiment'], label='Sentiment')

    # Calculate and plot the 12-month rolling average
    rolling_avg = plot_df['Sentiment'].rolling(window=12).mean()
    plt.plot(plot_df['Month'], rolling_avg, label='12-Month Rolling Avg', linestyle='--')

    # Overall trend line: regress sentiment on the row position (0, 1, 2, ...).
    # A line needs at least two points, so it is skipped for shorter frames.
    if len(plot_df) >= 2:
        x = list(range(len(plot_df)))
        fit = stats.linregress(x, plot_df['Sentiment'].values)
        trend = [fit.intercept + fit.slope * position for position in x]
        plt.plot(plot_df['Month'], trend, label='Overall Trend Line', linestyle=':')

    # Customize the plot
    plt.xlabel('Month')
    plt.ylabel('Sentiment Score')
    plt.title('Sentiment Over Time')
    plt.legend()

    plt.show()

## Driver Function and Execution

In [9]:
def driver_function(starting_congress, ending_congress, api_key, pkl_exists):
    ''' driver function to load or scrape congressional records and build the monthly word-count df

    Parameters:
        starting_congress (int): first congress to scrape (used when pkl_exists is False)
        ending_congress (int): last congress to scrape (used when pkl_exists is False)
        api_key (str): ScrapeOps API key (used when pkl_exists is False)
        pkl_exists (bool): when True, the monthly words dictionary is loaded
            from 'data.pkl' instead of being scraped again

    Returns:
        (monthly_words_dct, monthly_word_count_df) in both cases; the df has a
        datetime 'Month' column and is sorted by it.

    Note: this function only reads 'data.pkl'; it never writes it.
    '''

    # check if pkl exists to not repeat scraping if done already
    if pkl_exists:
        # pickle.load can run arbitrary code, so only load a data.pkl you produced yourself
        with open('data.pkl', 'rb') as f:
            monthly_words_dct = pickle.load(f)
    else:
        # get urls and group them by month
        pdf_urls = get_congressional_record_data(starting_congress, ending_congress, api_key)
        pdf_urls_by_month = group_pdf_urls(pdf_urls)

        # pull monthly text associated with that month's congressional records
        monthly_words_dct = read_through_pdfs(pdf_urls_by_month)

    # produce monthly count df with words > 20 mentions in that month
    monthly_word_count_df = get_monthly_word_counts_df(monthly_words_dct)

    # the rest of the notebook looks the label column up as 'Month'; accept a
    # frame that labels it 'month' instead
    columns = monthly_word_count_df.columns
    if 'Month' not in columns and 'month' in columns:
        monthly_word_count_df = monthly_word_count_df.rename(columns={'month': 'Month'})

    # convert to datetime, then sort chronologically
    monthly_word_count_df['Month'] = pd.to_datetime(monthly_word_count_df['Month'])
    monthly_word_count_df.sort_values(by='Month', inplace=True)
    return monthly_words_dct, monthly_word_count_df

### Inputs

In [17]:
import os

starting_congress = 113
ending_congress = 118
# The ScrapeOps key is a secret: read it from the SCRAPEOPS_API_KEY environment
# variable instead of committing it to the notebook. It is only used when
# pkl_exists is False (i.e. when the records are actually scraped).
api_key = os.environ.get('SCRAPEOPS_API_KEY', '')
# True -> load the previously scraped words from data.pkl instead of scraping
pkl_exists = True
# words whose monthly frequency is charted
words = ['inflation']

### Execution

In [None]:
# Load (or scrape) the monthly word lists and build the monthly word-count DataFrame
monthly_words_dct, monthly_word_count_df = driver_function(starting_congress, ending_congress, api_key, pkl_exists)
# Chart the monthly frequency of the words listed in `words`
plot_word_frequencies(monthly_word_count_df, words)
# Adds the 'Sentiment' column to monthly_word_count_df in place, so the return value is not kept
calculate_sentiment(monthly_word_count_df)
# Reads the 'Sentiment' column added by the previous call
plot_sentiment_over_time(monthly_word_count_df)

## WIP Code to Analyze Speaker and Party Sentiment

In [None]:
def extract_speakers(text):
    ''' pull (speaker, dialogue) pairs out of raw record text

    A speaker is an all-caps word that follows "Mr. ", "Ms. " or "Mrs. ".
    The dialogue is the rest of the line after that name (up to the next
    newline, or to the end of the text when there is none), stripped of
    surrounding whitespace.

    Parameters:
        text (str): raw text to scan

    Returns:
        list of (speaker, dialogue) tuples in order of appearance
    '''
    # Using regular expression to extract speaker dialogues
    speaker_pattern = re.compile(r'\b(?:Ms\.|Mr\.|Mrs\.) ([A-Z]+)\b')
    dialogues = []

    for match in speaker_pattern.finditer(text):
        start = match.end()
        speaker = match.group(1)

        end = text.find('\n', start)
        if end == -1:
            # no newline after the last speaker: take everything to the end
            # (slicing with -1 would silently drop the final character)
            end = len(text)
        dialogue = text[start:end].strip()

        dialogues.append((speaker, dialogue))

    return dialogues

def determine_party(speaker, rep_list, dem_list):
    ''' return 'Republican' or 'Democrat' for the list that contains the speaker (Republican list checked first), else None '''
    for party, members in (('Republican', rep_list), ('Democrat', dem_list)):
        if speaker in members:
            return party
    return None

def calculate_party_sentiment(monthly_data, republican_congressppl, democratic_congressppl):
    ''' average TextBlob polarity of each party's dialogues, month by month

    monthly_data maps a month to its raw text. Each dialogue found by
    extract_speakers is attributed to a party via determine_party, truncated
    to its first 100 whitespace-separated words and scored. Only months with
    at least one attributed dialogue appear in the result; a party with no
    dialogues in such a month gets 0.
    '''
    scores_by_month = defaultdict(lambda: {'Democrat': [], 'Republican': []})

    for month, text in monthly_data.items():
        for speaker, dialogue in extract_speakers(text):
            party = determine_party(speaker, republican_congressppl, democratic_congressppl)
            if not party:
                # speaker is in neither list
                continue

            # only the first 100 words of the dialogue are scored
            snippet = ' '.join(dialogue.split()[:100])
            polarity = TextBlob(snippet).sentiment.polarity
            scores_by_month[month][party].append(polarity)

    def _mean_or_zero(values):
        return sum(values) / len(values) if values else 0

    average_monthly_sentiments = {}
    for month, party_scores in scores_by_month.items():
        average_monthly_sentiments[month] = {
            'Democrat': _mean_or_zero(party_scores['Democrat']),
            'Republican': _mean_or_zero(party_scores['Republican']),
        }

    return average_monthly_sentiments



In [None]:
def plot_sentiment_over_time(average_monthly_sentiments):
    ''' plot Democrat and Republican monthly sentiment with rolling averages and pre/post-2017 trend lines

    Parameters:
        average_monthly_sentiments (dict): 'YYYY-MM' month key ->
            {'Democrat': score, 'Republican': score}

    For each party and each period (before 2017, 2017 onwards) a linear trend
    line is drawn and its equation and R^2 are printed; a period with fewer
    than two months is skipped.

    NOTE(review): this reuses the name of the single-series
    plot_sentiment_over_time(df) defined earlier in the notebook, so running
    this cell replaces that function. Consider renaming one of them.
    '''
    # Prepare data for plotting
    months = sorted(average_monthly_sentiments)
    df = pd.DataFrame({
        'Month': months,
        'Democrat_Sentiment': [average_monthly_sentiments[month]['Democrat'] for month in months],
        'Republican_Sentiment': [average_monthly_sentiments[month]['Republican'] for month in months],
    })
    df['Month'] = pd.to_datetime(df['Month'], format='%Y-%m')
    df = df.sort_values('Month')

    # Plotting
    plt.figure(figsize=(14, 7))

    plt.plot(df['Month'], df['Democrat_Sentiment'], label='Democrat Sentiment', color='blue')
    plt.plot(df['Month'], df['Republican_Sentiment'], label='Republican Sentiment', color='red')

    # Calculate and plot the 12-month rolling average
    df['Democrat_Rolling_Avg'] = df['Democrat_Sentiment'].rolling(window=12).mean()
    df['Republican_Rolling_Avg'] = df['Republican_Sentiment'].rolling(window=12).mean()

    plt.plot(df['Month'], df['Democrat_Rolling_Avg'], label='Democrat 12-Month Rolling Avg', linestyle='--', color='blue')
    plt.plot(df['Month'], df['Republican_Rolling_Avg'], label='Republican 12-Month Rolling Avg', linestyle='--', color='red')

    # Split the series at the start of 2017 for the two separate trend lines
    cutoff = pd.Timestamp('2017-01-01')
    pre_2017 = df[df['Month'] < cutoff]
    post_2017 = df[df['Month'] >= cutoff]

    for party in ['Democrat', 'Republican']:
        for period, label_suffix in [(pre_2017, 'Pre-2017'), (post_2017, '2017-Present')]:
            # a regression line needs at least two points
            if len(period) < 2:
                continue

            # regress sentiment on the row position within the period
            x = list(range(len(period)))
            y = period[f'{party}_Sentiment'].values
            slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

            trend = [intercept + slope * position for position in x]
            plt.plot(period['Month'], trend, label=f'{party} Trend Line {label_suffix}', linestyle=':')

            # Print the R^2 values and the line of best fit equations
            print(f"{party} {label_suffix} line of best fit: y = {slope:.4f}x + {intercept:.4f}")
            print(f"{party} {label_suffix} R^2: {r_value**2:.4f}")

    plt.xlabel('Month')
    plt.ylabel('Sentiment Score')
    plt.title('Sentiment Over Time by Party')
    plt.legend()
    plt.show()