In [33]:
import pandas as pd
from methods import *
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import datetime
import plotly.express as px

'''
# Ensure NLTK resources are downloaded (only needs to run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
'''

"\n# Ensure NLTK resources are downloaded (only needs to run once)\nnltk.download('punkt')\nnltk.download('stopwords')\nnltk.download('wordnet')\nnltk.download('averaged_perceptron_tagger')\n"

In [None]:
# Accumulator for debate transcripts; a later cell stores one entry per
# formatted date, each holding the article text and its source URL.
debateData = dict()

# Index page listing presidential campaign debates and endorsements.
src = "https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/presidential-campaigns-debates-and-endorsements-0"

In [48]:
# Accumulator reserved for State of the Union data.
# NOTE(review): no cell visible in this notebook writes to it yet.
sotuData = dict()

# Index page listing the annual messages to Congress on the State of the Union.
src2 = "https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union"

In [55]:
# Fetch the State of the Union index page. The timeout keeps a stalled
# connection from hanging the kernel, and raise_for_status() makes an HTTP
# error fail here instead of being parsed as if it were the index page.
response = requests.get(src2, timeout=30)
response.raise_for_status()
content = response.text

# Parse the HTML content
soup = BeautifulSoup(content, "html.parser")

# Find the (first) table in the HTML content; fail with a clear message if the
# page layout changed and there is none.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {src2}")

# Only links whose path starts with this prefix are considered.
SOTU_HREF_PREFIX = '/documents/annual-message-the-congress-the-state-the-union-'
# Links whose trailing number is greater than this are skipped.
# NOTE(review): the comparison treats the last '-'-separated piece of the slug
# as a year; confirm it is not just a de-duplication counter on the site.
SOTU_MAX_SUFFIX = 1960

# Walk every table cell and print the matching hyperlinks (first link per cell).
for row in table.find_all('tr'):
    for cell in row.find_all('td'):
        link = cell.find('a')
        href = link.get('href') if link is not None else None
        if not href or not href.startswith(SOTU_HREF_PREFIX):
            continue
        try:
            suffix = int(href.split('-')[-1])
        except ValueError:
            # Slug does not end in a number, so there is nothing to compare.
            continue
        if suffix <= SOTU_MAX_SUFFIX:
            print(href)

'https://www.presidency.ucsb.edu/ws/index.php?pid=4135'

In [17]:
# Fetch the debates index page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() makes an HTTP error fail here
# instead of being parsed as if it were the index page.
response = requests.get(src, timeout=30)
response.raise_for_status()
content = response.text

# Parse the HTML content
soup = BeautifulSoup(content, "html.parser")

# Find the table containing the dates and hyperlinks; fail with a clear
# message if the page layout changed and there is none.
table = soup.find("table")
if table is None:
    raise ValueError(f"No <table> element found at {src}")

# Maps the date text found in a row to that row's first hyperlink (or None).
data = {}

# Matches text containing "<1-2 digits>, <4 digits>"; compiled once for all rows.
date_text_pattern = re.compile(r"\d{1,2}, \d{4}")

# Iterate over each row in the table
for row in table.find_all("tr"):
    # First string in the row that contains a date-like fragment, and the
    # first anchor in the row.
    date = row.find(string=date_text_pattern)
    hyperlink = row.find("a")

    if date:
        # str() detaches the key from the parse tree; .get() yields None for
        # an anchor that has no href instead of raising KeyError.
        data[str(date)] = hyperlink.get("href") if hyperlink is not None else None

In [20]:
def formatDate(date_string):
    """Convert a date written as "Month D, YYYY" into a "YYYY-MM-DD" string.

    Parameters
    ----------
    date_string : str
        Exactly three whitespace-separated tokens: a full English month name,
        a day number (any commas in it are dropped), and a year.

    Returns
    -------
    str
        The date re-parsed with "%Y-%m-%d" and formatted with "%Y-%m-%d".

    Raises
    ------
    ValueError
        If the token count is not three, the day is not an integer, or the
        assembled string is not a valid "%Y-%m-%d" date.
    KeyError
        If the first token is not one of the twelve full month names.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    # Month name -> zero-padded month number ("01" .. "12").
    month_numbers = {
        name: f"{position:02d}"
        for position, name in enumerate(month_names, start=1)
    }

    month_name, day_token, year = date_string.split()
    day_number = int(day_token.replace(",", ""))
    month = month_numbers[month_name]

    iso_layout = "%Y-%m-%d"
    parsed = datetime.datetime.strptime(f"{year}-{month}-{day_number}", iso_layout)
    return parsed.strftime(iso_layout)



def verifyDate(date_string):
    """Report whether `date_string` has the shape "Month D, YYYY".

    The check is purely textual: a full English month name, one space, a
    one- or two-digit day, a comma and a space, then a four-digit year. It
    does not check that the day exists in that month.

    Parameters
    ----------
    date_string : str
        Text to test.

    Returns
    -------
    bool
        True when the whole string fits the pattern, otherwise False.
    """
    matched = re.match(
        r"^(January|February|March|April|May|June|July|August|September|October|November|December) [0-9]{1,2}, [0-9]{4}$",
        date_string,
    )
    return matched is not None

In [23]:
# For each table entry that has both a link and a well-formed "Month D, YYYY"
# date, call getArticleData on the link and store the result in debateData
# under the formatted date.
# NOTE(review): getArticleData is not defined in this notebook (presumably it
# comes from the `methods` module); it is assumed to return the article text.
# NOTE(review): entries that share a calendar date map to the same key, so a
# later one replaces an earlier one.
for date, article_url in data.items():
    # Identity test is the idiomatic None check and cannot be fooled by a
    # custom __eq__/__ne__.
    if article_url is not None and verifyDate(date):
        text = getArticleData(article_url)
        debateData[formatDate(date)] = {"text": text, "source": article_url}

2024-06-27
2024-01-10
2023-12-06
2023-11-08
2023-09-27
2023-08-23
2020-10-22
2020-09-29
2020-10-07
2020-03-15
2020-02-25
2020-02-19
2020-02-07
2020-01-14
2020-12-19
2019-11-20
2019-10-15
2019-09-12
2019-07-31
2019-07-30
2019-06-27
2019-06-26
2016-10-19
2016-10-09
2016-09-26
2016-10-04
2016-04-14
2016-03-09
2016-03-06
2016-02-11
2016-02-04
2016-01-25
2016-01-17
2015-12-19
2015-11-14
2015-10-13
2016-03-10
2016-03-03
2016-02-25
2016-02-13
2016-02-06
2016-01-28
2016-01-14
2015-12-15
2015-11-10
2015-10-28
2015-09-16
2015-08-06
2012-10-22
2012-10-16
2012-10-03
2012-10-11
2012-02-22
2012-01-26
2012-01-23
2012-01-19
2012-01-16
2012-01-08
2012-01-07
2011-12-15
2011-12-10
2011-11-22
2011-11-12
2011-11-09
2011-10-18
2011-10-11
2011-09-22
2011-09-12
2011-09-07
2011-09-05
2011-08-11
2011-06-13
2008-10-15
2008-10-07
2008-09-26
2008-10-02
2008-04-16
2008-02-26
2008-02-21
2008-01-31
2008-01-21
2008-01-15
2008-01-05
2007-12-13
2007-12-04
2007-11-15
2007-10-30
2007-09-26
2007-09-09
2007-08-19
2007-08-07

In [39]:
# SENTIMENT ANALYSIS

# Rebuild debateData in ascending key order. The keys are the "YYYY-MM-DD"
# strings produced by formatDate, so string order is chronological order.
debateData = dict(sorted(debateData.items()))

# Row number -> source URL, date, and the first two values returned by
# overall_sentiment for that article's text.
# NOTE(review): overall_sentiment is not defined in this notebook; its return
# value is only assumed to be indexable with [0] (by word) and [1] (by sentence).
sentDict = {}
# enumerate() supplies the row number, so no manual counter is needed and the
# builtin `id` is no longer rebound for the rest of the notebook.
for row_id, (date, val) in enumerate(debateData.items()):
    scores = overall_sentiment(val.get("text"))
    sentDict[row_id] = {"source": val.get("source"),
                        "date": date,
                        "by word (sum)": scores[0],
                        "by sentence (sum)": scores[1]}

In [43]:
# Turn the sentiment dictionary into a frame indexed by parsed dates
df = (
    pd.DataFrame.from_dict(sentDict, orient='index')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

# Line plot of both score columns against the date index (Plotly Express)
score_columns = ['by word (sum)', 'by sentence (sum)']
axis_labels = {'value': 'Sentiment Score (%)', 'variable': 'Sentiment Type'}
fig = px.line(df, x=df.index, y=score_columns,
              title='Sentiment Analysis Over Time',
              labels=axis_labels)

# Interactive extras: range slider on the x axis and a unified hover label
fig.update_xaxes(rangeslider_visible=True)
fig.update_layout(hovermode='x unified')

# Event dates expressed as milliseconds since the Unix epoch
x1, x2 = (pd.to_datetime(day).timestamp() * 1000
          for day in ('2021-01-06', '2022-02-24'))

# One dashed vertical marker per event: (x position, colour, caption, anchor)
event_markers = (
    (x1, 'red', "Capitol Hill Riots", "top left"),
    (x2, 'blue', "Russian Invasion of Ukraine", "top right"),
)
for marker_x, marker_color, caption, anchor in event_markers:
    fig.add_vline(x=marker_x, line_width=2, line_dash='dash', line_color=marker_color,
                  annotation_text=caption, annotation_position=anchor)

# Render the figure
fig.show()