<a href="https://colab.research.google.com/github/jinkim-com/qm2coursework/blob/main/NLP_Trump_2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
%%capture
!pip install spacy
!pip install scattertext
!pip install tika
!pip install spacytextblob
!pip install gensim
!pip install scikit-learn
!pip install torch
!pip install scipy
!python -m spacy download en_core_web_sm
!pip install contractions

import spacy
import json
import pylab
from IPython.core.display import display, HTML
import nltk
from tika import parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from spacytextblob.spacytextblob import SpacyTextBlob
import re
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import string
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import string

# Download stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))



%matplotlib inline
pylab.rcParams['figure.figsize'] = (10., 8.)
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('spacytextblob')
nltk.download('punkt_tab')


# Linking files and downloads from GitHub


In [2]:
url = "https://raw.githubusercontent.com/jinkim-com/qm2coursework/refs/heads/main/Trump_2020.csv"
df = pd.read_csv(url)

# Cleaning the Data

**Our data may still has some unwanted characters such as \n. This means we need to clean the data:**


In [21]:
# Removing all isntances of \n

df['Speech'] = df['Speech'].str.replace('\n', '', regex=False)

We also need to check that all the speeches are before the election (and are therefore part of the campaign). We can then remove unwanted speeches from the data.

In [22]:
df.sort_values(by=['Date'], inplace=True, ascending=False)
df.head()

Unnamed: 0,Speech,Date
19,"Well, thank you very much and I’m thrilled to ...",8/9/20
18,"Well, thank you very much. Thank you.Thank you...",8/9/20
34,"Hello, everybody. Hello, Duluth. Hello, Duluth...",30/9/20
1,I worked so hard for this state. I worked so ...,30/1/20
17,"So thank you Pennsylvania, very much. I’m thri...",3/9/20


# Using contexual embedding techniques to find the words associated with a main word

We are using a pre-trained model in order to find the words most associated with our voter issues. These words are then classified as the regular expressions (RegEx) that we are searching for within the speeches. Instead of using Word2Vec, we are using GloVe, which is more lightweight. This means it is faster and a smaller download which is good for GitHub, but nonetheless effective.

In [25]:
from gensim.downloader import load

# Load the GloVe model (100-dimensional)
glove_model = load("glove-wiki-gigaword-100")



In [26]:
# Size of the vocabulary
print(f"Vocabulary size: {len(glove_model.index_to_key)}")

# Dimension of word vectors
print(f"Vector size: {glove_model.vector_size}")


Vocabulary size: 400000
Vector size: 100


In [27]:
def format_results(results):
    return "\n".join([f"{i+1}. {word} (Similarity: {similarity:.4f})"
                      for i, (word, similarity) in enumerate(results)])

# Function to find top 10 similar words to a single word
def find_similar_single(word, topn=10):
    if word in glove_model:
        results = glove_model.most_similar(positive=[word], topn=topn)
        print(f"Top {topn} words similar to '{word}':\n" + format_results(results)+ "\n")
        return [word for word, _ in results]
    else:
        print(f"Word '{word}' not found in the model vocabulary.")

# Function to find top 10 similar words to a combination of two words
def find_similar_combination(word1, word2, topn=10):
    missing_words = [word for word in [word1, word2] if word not in glove_model]
    if missing_words:
        print(f"The following words are not in the vocabulary: {', '.join(missing_words)}")
        return [word for word, _ in results]

    results = glove_model.most_similar(positive=[word1, word2], topn=topn)
    print((f"Top {topn} words similar to the combination of '{word1}' + '{word2}':\n"
            + format_results(results))+"\n")

In [28]:
economy_top10 = find_similar_single('economy')
healthcare_top10 = find_similar_single('healthcare')
crime_top10 = find_similar_single('crime')
foreign_policy_top10 = find_similar_combination('foreign', 'policy')
immigration_top10 = find_similar_single('immigration')

Top 10 words similar to 'economy':
1. economic (Similarity: 0.8279)
2. growth (Similarity: 0.7947)
3. recession (Similarity: 0.7692)
4. economies (Similarity: 0.7545)
5. recovery (Similarity: 0.7491)
6. downturn (Similarity: 0.7462)
7. slowdown (Similarity: 0.7368)
8. slowing (Similarity: 0.7281)
9. sector (Similarity: 0.7247)
10. crisis (Similarity: 0.7136)

Top 10 words similar to 'healthcare':
1. health (Similarity: 0.7659)
2. care (Similarity: 0.7575)
3. services (Similarity: 0.6494)
4. medicare (Similarity: 0.6475)
5. medical (Similarity: 0.6411)
6. insurance (Similarity: 0.6283)
7. education (Similarity: 0.6249)
8. provider (Similarity: 0.6174)
9. providers (Similarity: 0.6123)
10. medicaid (Similarity: 0.6097)

Top 10 words similar to 'crime':
1. criminal (Similarity: 0.7859)
2. crimes (Similarity: 0.7696)
3. murder (Similarity: 0.6961)
4. trafficking (Similarity: 0.6934)
5. terrorism (Similarity: 0.6890)
6. homicide (Similarity: 0.6839)
7. corruption (Similarity: 0.6802)
8. cri

# Finding the frequency of regular expressions affliated with voter issues

In [None]:
# We created a function that will take a list of words which are affliated with topics of voter importance and display a table of how many times they collectively occcur in each speech.

def count_words(words) :
    # Creating a table which shows the date and frequency of the words being used
    df[f'{words[0]}'] = df['Speech'].apply(lambda x: sum(len(re.findall(r'\b' + word + r'\b', x, re.IGNORECASE)) for word in words))
    print(df[['Date',f'{words[0]}']])
    capitalised_words = [word.capitalize() for word in words]
    title = capitalised_words[0]

    # Creating the graph
    df['Date'] = pd.to_datetime(df['Date']) # Convert 'Date' column to datetime
    df['Month'] = df['Date'].dt.month
    monthly=df.groupby('Month')[f'{words[0]}'].sum()
    monthly.plot()
    plt.xlabel("Month")
    plt.ylabel(f"No. of times the {title} was mentioned")
    plt.title(f"Frequency of {title} mentioned in Trump's campaign speeches in 2016")


In [None]:
count_words(economy_top10)

In [None]:
terrorism_words = ['terrorism', 'terrorists', 'attacks']
count_words(terrorism_words)

In [None]:
supreme_court_words = ['Supreme Court', 'Supreme Court Justice', 'Stras']
count_words(supreme_court_words)

In [None]:
healthcare_words = ['Healthcare', 'Affordable Care Act', 'COVID', 'vaccine', 'coronavirus', 'Obamacare', 'health insurance']
count_words(healthcare_words)

In [None]:
foreign_policy_words = ['Foreign Policy', 'tariffs', 'China', 'Putin', 'Xi']
count_words(foreign_policy_words)

# Finding average sentence length

In [None]:
# We created a function to find the length of each sentence
def sentence_length():
    for speech in df['Speech']:
        all_sentence_lengths = []
        doc = nlp(speech)
        for sentence in doc.sents:
            # Exclude the full stop from the word count
            num_words = len([token for token in sentence if not token.is_punct])
            all_sentence_lengths.append(num_words)

    return all_sentence_lengths

# Creating a table of summary statistics for the sentence lengths
word_count = pd.Series(sentence_length())
summary_stats = word_count.describe().to_frame().T
summary_stats.index = ['Sentence Length (Words)']

print(summary_stats)

KeyboardInterrupt: 

In [None]:
def avg_sentence_length_per_speech(df):
    speech_sentence_lengths = []
    for speech in df['Speech']:
        sentences = nltk.sent_tokenize(speech)  # Sentence segmentation with NLTK
        sentence_lengths = [len(nltk.word_tokenize(sentence)) for sentence in sentences]
        avg_length = np.mean(sentence_lengths) if sentence_lengths else 0  # Handle empty speeches
        speech_sentence_lengths.append(avg_length)

    return pd.Series(speech_sentence_lengths, index=df.index)

avg_lengths_series = avg_sentence_length_per_speech(df)

# We then created a new column in the dataframe with the Average Sentence Length per Speech
df['Mean Sentence Length (Words)'] = avg_lengths_series
# df = df.drop(columns=['Mean Sentence Length(Words)'])

sent_summary_stats = df['Mean Sentence Length (Words)'].describe().to_frame().T
sent_summary_stats.index = ['Mean Sentence Length (Words)']


In [None]:
sent_summary_stats





# Analysing Distinguishing Terms


In [None]:
df['Month'] = df['Date'].dt.month
df.head()

In [None]:
import scattertext as st

# create a corpus of text from the dataframe
corpus = st.CorpusFromPandas(df, # load the dataframe
                             category_col='Month', # indicate which column contains the category we want to distinguish by
                             text_col='Speech', # indicate which column stores the text to be analyzed
                             nlp=nlp).build() # load the NLP models used for analysis

# remove stopwords from the corpus of text
corpus=corpus.remove_terms(nlp.Defaults.stop_words, ignore_absences=True)

# now, we create the scatterplot
html = st.produce_scattertext_explorer(
                   corpus, # load the corpus
                   category="8", # indicate which category value we want to compare against all others; in this case, all rows in which "REX TILLERSON" is the speaker
                   category_name='August', # set the label on the plot as "Rex Tillerson"
                   not_category_name='Others', # set the label on the plot for all other speakers as "Others"
                   width_in_pixels=1000) #set the width

In [None]:
# display the plot
display(HTML(html))

# Sentiment Analysis

In [None]:
def get_sentiment_scores_batch(texts):
    docs = nlp.pipe(texts)  # Process all texts in batch
    polarity_scores = [doc._.blob.polarity for doc in docs]
    return polarity_scores
# Apply the function to the 'Speech' column and create new columns
df['Polarity'] = get_sentiment_scores_batch(df['Speech']) # Assign the results directly to 'Polarity' column
df.head()


In [None]:
sorted_polarity = df.sort_values(by=['Polarity'], ascending=False)
sorted_polarity.head()

In [None]:
# Ensure 'Date' column is of datetime type:
df['Date'] = pd.to_datetime(df['Date'])

# Create the scatter plot:
plt.figure(figsize=(10, 6))  # Adjust figure size as needed
plt.scatter(df['Date'], df['Polarity'], label='Polarity')

# Customize the plot:
plt.xlabel('Date')
plt.ylabel('Polarity Score')
plt.title('Polarity Scores Over Time (Scatter Plot)')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()

# Display the plot:
plt.show()

# Topic Modelling: LDA Method

In [None]:
all_speeches_text = " ".join(df['Speech'].tolist())

In [None]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
!pip install contractions

# Sample multi-document corpus
documents = df['Speech']

expanded_documents = [expand_contractions(doc) for doc in documents]

# Preprocessing and vectorization pipeline
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X = vectorizer.fit_transform(documents)

# Apply NMF with adjusted max_iter and tol
nmf = NMF(n_components=10, max_iter=1000, tol=1e-4, init='nndsvd', random_state=42)  # Increased max_iter and set init method
nmf.fit(X)

# Get the feature names (words)
words = vectorizer.get_feature_names_out()

# Function to find the words most affiliated with a given input word
def find_affiliated_words(input_word, nmf, vectorizer, n_top_words=10):
    if input_word not in vectorizer.get_feature_names_out():
        print(f"The word '{input_word}' is not in the vocabulary.")
        return

    word_index = vectorizer.get_feature_names_out().tolist().index(input_word)
    topic_word_dist = nmf.components_

    topic_weights = topic_word_dist[:, word_index]
    most_affiliated_topic = np.argmax(topic_weights)

    print(f"The word '{input_word}' is most affiliated with Topic #{most_affiliated_topic + 1}")

    # Get the top words for this topic
    top_word_indices = topic_word_dist[most_affiliated_topic].argsort()[:-n_top_words - 1:-1]
    top_words = [words[i] for i in top_word_indices]

    output = print(f"Words most affiliated with '{input_word}': {top_words}")

    return output



find_affiliated_words("economy", nmf, vectorizer)

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2

NameError: name 'contractions' is not defined

In [12]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import contractions

# Function to expand contractions before passing to CountVectorizer
def expand_contractions(text):
    return contractions.fix(text)

# Sample multi-document corpus
documents = df['Speech']

# Expand contractions in the documents
expanded_documents = [expand_contractions(doc) for doc in documents]

# Preprocessing and vectorization pipeline
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X = vectorizer.fit_transform(expanded_documents)

# Apply NMF with adjusted max_iter and tol
nmf = NMF(n_components=5, max_iter=1000, tol=1e-4, init='nndsvd', random_state=42)  # Increased max_iter and set init method
nmf.fit(X)

# Get the feature names (words)
words = vectorizer.get_feature_names_out()

# Function to find the words most affiliated with a given input word
def find_affiliated_words(input_word, nmf, vectorizer, n_top_words=10):
    if input_word not in vectorizer.get_feature_names_out():
        print(f"The word '{input_word}' is not in the vocabulary.")
        return

    word_index = vectorizer.get_feature_names_out().tolist().index(input_word)
    topic_word_dist = nmf.components_

    topic_weights = topic_word_dist[:, word_index]
    most_affiliated_topic = np.argmax(topic_weights)

    print(f"The word '{input_word}' is most affiliated with Topic #{most_affiliated_topic + 1}")

    # Get the top words for this topic
    top_word_indices = topic_word_dist[most_affiliated_topic].argsort()[:-n_top_words - 1:-1]
    top_words = [words[i] for i in top_word_indices]

    return top_words

# Check with 'economy'
top_words_for_economy = find_affiliated_words("economy", nmf, vectorizer)
print("Words most affiliated with 'economy':", top_words_for_economy)

The word 'economy' is most affiliated with Topic #2
Words most affiliated with 'economy': ['biden', 'china', 'thank', 'america', 'know', 'country', 'want', 'years', 'joe', 'great']


In [20]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
import numpy as np
import contractions

# Load SpaCy model for POS tagging
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])  # Disable unnecessary components for speed

# Function to expand contractions
def expand_contractions(text):
    return contractions.fix(text)

# Function to extract nouns and pronouns using SpaCy's pipe()
def filter_nouns_and_pronouns(documents):
    filtered_texts = []
    for doc in nlp.pipe(documents, batch_size=50):  # Process documents in batches
        filtered_tokens = [token.text for token in doc if token.pos_ in {"NOUN", "PRON"} and not token.is_stop]
        filtered_texts.append(" ".join(filtered_tokens))
    return filtered_texts

# Sample multi-document corpus
documents = df['Speech']

# Preprocessing pipeline: expand contractions and filter nouns/pronouns
expanded_documents = [expand_contractions(doc) for doc in documents]
filtered_documents = filter_nouns_and_pronouns(expanded_documents)

# Vectorization
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X = vectorizer.fit_transform(filtered_documents)

# Apply NMF with adjusted max_iter and tol
nmf = NMF(n_components=5, max_iter=1000, tol=1e-4, init='nndsvd', random_state=42)
nmf.fit(X)

# Get the feature names (words)
words = vectorizer.get_feature_names_out()

# Function to find the words most affiliated with a given input word
def find_affiliated_words(input_word, nmf, vectorizer, n_top_words=10):
    if input_word not in vectorizer.get_feature_names_out():
        print(f"The word '{input_word}' is not in the vocabulary.")
        return

    word_index = vectorizer.get_feature_names_out().tolist().index(input_word)
    topic_word_dist = nmf.components_

    topic_weights = topic_word_dist[:, word_index]
    most_affiliated_topic = np.argmax(topic_weights)

    print(f"The word '{input_word}' is most affiliated with Topic #{most_affiliated_topic + 1}")

    # Get the top words for this topic
    top_word_indices = topic_word_dist[most_affiliated_topic].argsort()[:-n_top_words - 1:-1]
    top_words = [words[i] for i in top_word_indices]

    return top_words



The word 'economy' is most affiliated with Topic #4
Words most affiliated with 'economy': ['country', 'years', 'jobs', 'year', 'lot', 'world', 'deal', 'time', 'history', 'job']
The word 'health' is most affiliated with Topic #3
Words most affiliated with 'health: ['people', 'years', 'country', 'party', 'lot', 'time', 'year', 'way', 'wall', 'healthcare']
The word 'immigration' is most affiliated with Topic #3
Words most affiliated with 'immigration: ['people', 'years', 'country', 'party', 'lot', 'time', 'year', 'way', 'wall', 'healthcare']


In [29]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
import numpy as np
import contractions

# Load SpaCy model for POS tagging
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])  # Disable unnecessary components for speed

# Function to expand contractions
def expand_contractions(text):
    return contractions.fix(text)

# Function to extract nouns and pronouns using SpaCy's pipe()
def filter_nouns_and_pronouns(documents):
    filtered_texts = []
    for doc in nlp.pipe(documents, batch_size=50):  # Process documents in batches
        filtered_tokens = [token.text for token in doc if token.pos_ in {"NOUN", "PRON"} and not token.is_stop]
        filtered_texts.append(" ".join(filtered_tokens))
    return filtered_texts

# Sample multi-document corpus
documents = df['Speech']

# Preprocessing pipeline: expand contractions and filter nouns/pronouns
expanded_documents = [expand_contractions(doc) for doc in documents]
filtered_documents = filter_nouns_and_pronouns(expanded_documents)

# Vectorization
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X = vectorizer.fit_transform(filtered_documents)

# Apply NMF with adjusted max_iter and tol
nmf = NMF(n_components=5, max_iter=1000, tol=1e-4, init='nndsvd', random_state=42)
nmf.fit(X)

# Get the feature names (words)
words = vectorizer.get_feature_names_out()

# Function to find the words most affiliated with a list of input words
def find_affiliated_words(input_words, nmf, vectorizer, n_top_words=10):
    results = {}
    topic_word_dist = nmf.components_

    for input_word in input_words:
        if input_word not in vectorizer.get_feature_names_out():
            results[input_word] = f"'{input_word}' is not in the vocabulary."
            continue

        word_index = vectorizer.get_feature_names_out().tolist().index(input_word)
        topic_weights = topic_word_dist[:, word_index]
        most_affiliated_topic = np.argmax(topic_weights)

        # Get the top words for this topic
        top_word_indices = topic_word_dist[most_affiliated_topic].argsort()[:-n_top_words - 1:-1]
        top_words = [words[i] for i in top_word_indices]

        results[input_word] = {
            "Most Affiliated Topic": most_affiliated_topic + 1,
            "Top Words": top_words
        }

    return results

# Example: Check with a list of words
input_words = economy_top10
top_words_for_list = find_affiliated_words(input_words, nmf, vectorizer)

# Display results
for word, details in top_words_for_list.items():
    if isinstance(details, str):  # Handle missing words
        print(f"{word}: {details}")
    else:
        print(f"{word} is most affiliated with Topic #{details['Most Affiliated Topic']}")
        print(f"Top words: {', '.join(details['Top Words'])}")
        print()


economic: 'economic' is not in the vocabulary.
growth is most affiliated with Topic #1
Top words: people, years, way, thing, country, guy, job, time, ballots, state

recession is most affiliated with Topic #3
Top words: people, years, country, party, lot, time, year, way, wall, healthcare

economies is most affiliated with Topic #4
Top words: country, years, jobs, year, lot, world, deal, time, history, job

recovery is most affiliated with Topic #1
Top words: people, years, way, thing, country, guy, job, time, ballots, state

downturn: 'downturn' is not in the vocabulary.
slowdown: 'slowdown' is not in the vocabulary.
slowing: 'slowing' is not in the vocabulary.
sector is most affiliated with Topic #4
Top words: country, years, jobs, year, lot, world, deal, time, history, job

crisis is most affiliated with Topic #1
Top words: people, years, way, thing, country, guy, job, time, ballots, state

