In [1]:
import pandas as pd
from methods import *
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# One-time setup: the NLTK resources below only need to be downloaded once.
# Uncomment and run these lines on a fresh environment.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

# Page handed to urlExtractor to collect the links to the debate transcripts.
src = "https://www.debates.org/voter-education/debate-transcripts"
# Filled in a later cell: one entry per transcript, each a dict with the keys
# "date", "text" and "source".
debateData = {}

In [2]:
# urlExtractor(src) returns more links than are needed; the hardcoded [10:57]
# slice keeps only the entries that pertain to the debate transcripts on debates.org.
urlsTemp = urlExtractor(src)[10:57]
# Presumably the extracted links are site-relative paths, so the host is
# prepended to turn each one into an absolute URL.
urls = ["https://www.debates.org" + url for url in urlsTemp]

In [4]:
# For each article URL, call getArticleData and add the result to debateData,
# keyed by the URL's position in `urls`. The date and text are converted to
# their appropriate formats with formatDate and wordProcessor.
# enumerate supplies the key so that no counter variable named `id` is created,
# which would shadow the builtin id() for every later cell.
for article_id, article_url in enumerate(urls):
    tempData = getArticleData(article_url)  # used below as [0] -> date, [1] -> text
    debateData[article_id] = {
        "date": formatDate(tempData[0]),
        "text": wordProcessor(tempData[1]),
        "source": article_url,
    }

In [None]:
# for each entry in debate data, perform the topic analysis

In [None]:
# One row per debateData entry; the dictionary keys become the index.
data_df = pd.DataFrame.from_dict(debateData, orient="index")

# Vectorization: TF-IDF over the processed text, dropping English stop words and
# terms found in more than 95% of the documents or in fewer than 2 of them.
vectorizer = TfidfVectorizer(stop_words="english", min_df=2, max_df=0.95)
tfidf = vectorizer.fit_transform(data_df["text"])

# Topic model: NMF with 5 components, fitted on the TF-IDF matrix. The variable
# is called `lda`, but the estimator in use is NMF (LatentDirichletAllocation is
# the imported alternative).
lda = NMF(n_components=5, random_state=0).fit(tfidf)

# Display topics
def display_topics(model, feature_names, no_top_words):
    """Print every topic of a fitted model together with its top terms.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a weight's position to its term.
        no_top_words: Number of highest-weighted terms to print per topic.

    Returns:
        None. For each topic, a ``Topic <n>:`` line is printed, followed by a
        line with the terms joined by single spaces, highest weight first.
    """
    for topic_number, weights in enumerate(model.components_):
        print(f"Topic {topic_number}:")
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked_positions = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[position] for position in ranked_positions]
        print(" ".join(top_terms))

# Print the ten highest-weighted terms of every topic in the fitted model,
# using the vocabulary learned by the vectorizer to turn positions into terms.
no_top_words = 10
tf_feature_names = vectorizer.get_feature_names_out()
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)