In [31]:
import pandas as pd
from data import presidential_utils
from data.data_utils import load_spacy_sentencizer
import numpy as np
import string
import spacy
from spacy.pipeline import Sentencizer
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")
# Custom list of punctuation characters handed to the sentencizer component.
config = {"punct_chars": ['!', '.', '?', '...', ';', ':', '(', ')']}
nlp.add_pipe("sentencizer", config=config)
from spacy.lang.en import stop_words
# Rebinds the name `stop_words` from the imported module to its STOP_WORDS collection,
# so the module itself is no longer reachable under this name afterwards.
stop_words = stop_words.STOP_WORDS

In [32]:
# NOTE(review): absolute, machine-specific path — consider making this configurable.
pres_path = "/data/laviniad/presidential"

print("Now loading data...")
# Build the presidential frame from the path above using the project helper.
presidential_df = presidential_utils.load_full_df_from_raw(pres_path)

Now loading data...


 19%|████████████▊                                                     | 19/98 [00:05<00:21,  3.75it/s]


In [33]:
## load models etc
import json
import pickle as pkl
import pprint

# JSON keyword file; it is parsed into `kc_keywords` in a later cell.
# NOTE(review): absolute, user-specific path — consider making this configurable.
kc_keywords_path = '/home/laviniad/projects/religion_in_congress/src/multi-feature-use/kevincoekeywords/full.json'
# Tuple of two keyword list files (relative paths), each read with get_keywords.
keyword_strs = 'keywords_from_coca.txt', 'keywords_from_congress.txt'

def get_keywords(keyword_path):
    """Read a keyword list from a text file holding one keyword per line.

    Parameters
    ----------
    keyword_path : str or path-like
        Location of the keyword file.

    Returns
    -------
    list of str
        Keywords in file order with surrounding whitespace stripped. Lines that
        are empty after stripping are skipped: an empty-string keyword is never
        meaningful and matches at every position under substring counting
        (``"abc".count("")`` is 4).
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(keyword_path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [keyword for keyword in stripped_lines if keyword]

# Read both keyword files, in the order they are listed in `keyword_strs`.
keywords_coca, keywords_congress = (get_keywords(path) for path in keyword_strs[:2])

In [34]:
# Parse the keyword JSON into `kc_keywords`. The encoding is given explicitly because
# JSON text is UTF-8, whereas open() otherwise falls back to the platform's locale encoding.
with open(kc_keywords_path, 'r', encoding='utf-8') as f:
    kc_keywords = json.load(f)

In [35]:
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

In [36]:
# Seed term; the list is extended below with terms pulled from WordNet.
TOK = ['god']
wnl = WordNetLemmatizer()
# Noun synsets for the lemmatized seed term.
syn = wn.synsets(wnl.lemmatize(TOK[0]), pos=wn.NOUN)
# Work from the first synset returned.
actual = syn[0]
# Instance hyponyms of that synset, with the final entry dropped.
hyponyms = actual.instance_hyponyms()[:-1] # last tends to be... sketch
# Flatten the lemma names of those hyponym synsets into a list of strings
# (this rebinds `hyponyms` from synsets to plain names).
lemmas_of_hyponyms = [s.lemmas() for s in hyponyms]
hyponyms = [str(s.name()) for instance in lemmas_of_hyponyms for s in instance]
TOK += hyponyms
# Add the synset's own lemma names; str.capitalize() upper-cases the first
# character and lower-cases the rest.
TOK += [str(s.name()).capitalize() for s in actual.lemmas()]
# Exclude 'Allah', swap underscores for spaces, and de-duplicate.
TOK = list(set([s.replace('_', ' ') for s in TOK if s != 'Allah']))
# Add a lower-cased copy of every term except 'Creator', 'Maker' and 'Lord'.
temp = []
for t in TOK:
    if t != 'Creator' and t != 'Maker' and t!= 'Lord':
        temp.append(t.lower())
TOK += temp
# De-duplicate again; set iteration order is arbitrary, so the printed order carries no meaning.
TOK = list(set(TOK))
print(f"Synonym set: {TOK}")

Synonym set: ['Creator', 'Lord', 'almighty', 'Divine', 'god', 'divine', 'Jehovah', 'Supreme being', 'jehovah', 'supreme being', 'god almighty', 'godhead', 'Maker', 'God', 'God Almighty', 'Almighty', 'Godhead']


In [37]:
# Switch between the corpus-derived keyword list (True) and the `kc_keywords` JSON list (False).
USE_MY_KEYWORDS = True

if USE_MY_KEYWORDS:
    # Keywords present in BOTH corpus lists, plus the lower-cased WordNet synonym set.
    shared_keywords = set(keywords_coca) & set(keywords_congress)
    synonym_keywords = {t.lower() for t in TOK}
    # Set difference instead of list.remove(): remove() raises ValueError whenever one of
    # these entries is not in the list (e.g. a keyword file changed), which would abort the cell.
    excluded_keywords = {'god almighty', 'flesh', 'apostles'}
    candidate_keywords = (shared_keywords | synonym_keywords) - excluded_keywords
    # Drop every keyword that contains 'god' as a substring, except the bare word 'god' itself.
    full_keywords = [k for k in candidate_keywords if k == 'god' or 'god' not in k]
else:
    # Flatten every value list of the JSON mapping, padding each entry with a space on both sides.
    full_keywords = [' ' + e + ' ' for s in kc_keywords.values() for e in s]

In [38]:
## expand keywords
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader
from tqdm.notebook import tqdm

# NOTE(review): this binds a second name to the same list object as `full_keywords`
# (no copy is made), so in-place changes to either are visible through both names.
keyword_temp = full_keywords
model_name = 'word2vec-google-news-300'
# Load the named pretrained vectors through gensim's downloader.
word2vec_model = gensim.downloader.load(model_name)

KeyboardInterrupt: 

In [28]:
def calculate_centroid(keywords, model):
    """Return the mean embedding of the keywords that `model` has a vector for.

    Parameters
    ----------
    keywords : iterable of str
        Candidate words; those missing from `model` are ignored.
    model : mapping-like
        Anything supporting ``word in model`` and ``model[word]``, such as a
        gensim ``KeyedVectors`` instance.

    Returns
    -------
    numpy.ndarray or None
        Element-wise mean of the found vectors, or None when no keyword is found.
    """
    # Membership is tested with `word in model` rather than `model.vocab`: the `vocab`
    # attribute was removed from KeyedVectors in gensim 4, while `in` works on 3.x and 4.x.
    vectors = [model[word] for word in keywords if word in model]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)
    

def expand_keywords(keywords, model, centroid, topn=5):
    """Map each keyword to the words `model.most_similar` returns for it.

    Parameters
    ----------
    keywords : iterable of str
        Words to expand.
    model : object
        Must provide ``most_similar(word, topn=...)`` returning ``(word, score)`` pairs.
    centroid : any
        Accepted for interface compatibility; not used by this function.
    topn : int, default 5
        Number of neighbours requested per keyword.

    Returns
    -------
    dict
        ``{keyword: [neighbour, ...]}``. A keyword for which the lookup raises
        KeyError is reported on stdout and mapped to an empty list.
    """
    neighbours_by_keyword = {}
    for term in keywords:
        try:
            hits = model.most_similar(term, topn=topn)
        except KeyError:
            print(f"Word '{term}' not in vocabulary.")
            neighbours_by_keyword[term] = []
            continue
        # Keep only the neighbour word and discard its similarity score.
        neighbours_by_keyword[term] = [neighbour for neighbour, _score in hits]

    return neighbours_by_keyword

In [27]:
# Keywords that are skipped when expanding (they still contribute to the centroid below).
non_specific = ['romans', 'maker', 'glory', 'savior', 'creator', 'sins', 'verses', 'thou', 'gospel', 'lord']
# Mean vector over all of keyword_temp; expand_keywords accepts it but does not use it.
centroid = calculate_centroid(keyword_temp, word2vec_model)
# Look up the 5 most similar words for every keyword not listed in non_specific.
expanded_keywords = expand_keywords([k for k in keyword_temp if k not in non_specific], word2vec_model, centroid, topn=5)

# Print one "keyword: [neighbours]" line per expanded keyword.
for keyword, similar_words in expanded_keywords.items():
    print(f"{keyword}: {similar_words}")

Word 'ephesians' not in vocabulary.
Word 'supreme being' not in vocabulary.
Word 'jehovah' not in vocabulary.
Word 'corinthians' not in vocabulary.
Word 'israelites' not in vocabulary.
Word 'hebrews' not in vocabulary.
Word 'philippians' not in vocabulary.
ephesians: []
salvation: ['deliverance', 'resurrection', 'eternal_salvation', 'eternal_destiny', 'savior']
scripture: ['Scripture', 'scriptures', 'Scriptures', 'Bible', 'New_Testament']
supreme being: []
god: ['gods', 'deity', 'God', 'diety', 'goddess']
eternal: ['everlasting', 'eternity', 'earthly', 'eternal_bliss', 'blessedness']
resurrection: ['rebirth', 'miraculous_resurrection', 'salvation', 'bodily_resurrection', 'Jesus_resurrection']
psalm: ['Psalm', 'Psalms', 'psalms', 'prophet_Jeremiah', 'verse']
righteousness: ['righteous', 'holiness', 'Ps_##:#', 'Matt._#:##-##', 'lovingkindness']
christ: ['jesus', 'derek', 'cohen', 'francis', 'moses']
righteous: ['righteousness', 'effectual_fervent_prayer', 'virtuous', 'godly', 'self_right

In [None]:
# Drop the bare 'god' keyword from the list in place. The membership guard makes this cell
# safe to re-run: an unguarded list.remove() raises ValueError once the entry is already gone.
if 'god' in full_keywords:
    full_keywords.remove('god')

In [44]:
from collections import Counter
import nltk

def get_lexical_overlap(speech):
    """Return the keyword hit count of `speech` divided by its token count.

    Each entry of the module-level `full_keywords` list is counted with
    ``str.count``, i.e. as a case-sensitive, non-overlapping *substring* match
    on the raw text. The total is divided by the number of
    ``nltk.word_tokenize`` tokens as a rough length normalisation.

    Parameters
    ----------
    speech : str
        Text to score.

    Returns
    -------
    float or int
        Hits per token; 0 when the text is empty or yields no tokens.
    """
    if len(speech) == 0:
        return 0

    n_tokens = len(nltk.word_tokenize(speech))
    # Text that is non-empty but tokenizes to nothing (e.g. whitespace only) would
    # otherwise divide by zero.
    if n_tokens == 0:
        return 0

    hits = sum(speech.count(keyword) for keyword in full_keywords)
    return hits / n_tokens # roughly normalize

In [45]:
from tqdm.notebook import tqdm
# Register tqdm with pandas so that Series/DataFrame objects gain `progress_apply`.
tqdm.pandas()

In [None]:
# Keep only the rows flagged `is_president`. The explicit .copy() yields an independent frame,
# so the column assignment below is not a write onto a slice of the unfiltered frame
# (which is what triggers pandas' SettingWithCopyWarning).
presidential_df = presidential_df[presidential_df['is_president']].copy()
# Score every speech text with get_lexical_overlap, showing a progress bar.
presidential_df['lexical'] = presidential_df['text'].progress_apply(get_lexical_overlap)

In [None]:
# Build the year columns as one chain that returns a new frame at every step, rather than
# assigning columns onto a boolean-filtered frame (chained assignment / SettingWithCopyWarning).
presidential_df = (
    presidential_df
    # Element-wise int() conversion of the year values.
    .assign(year=lambda df: df['year'].map(int))
    # Keep years strictly before 2022.
    .loc[lambda df: df['year'] < 2022]
    # True for years of the form 4k + 1 (1981, 1985, ...); computed on the whole column at once.
    .assign(inaugural_year=lambda df: (df['year'] - 1) % 4 == 0)
)

In [None]:
# Restrict the analysis frame to rows with year 1980 or later.
res = presidential_df[presidential_df['year'] >= 1980]

In [None]:
# Preview the first rows of the filtered frame.
res.head()

In [38]:
# Drop the rows whose index labels hold the largest and the smallest `lexical` value.
# NOTE(review): because `res` is rebound to the result, this cell is not idempotent —
# every re-run removes the then-current extremes as well.
res = res.drop([res.lexical.idxmax(),res.lexical.idxmin()])

In [None]:
# (first year of term, speaker name) pairs in chronological order; the final entry
# only serves as the upper bound for the last named term.
_term_starts = [
    (1977, 'Jimmy Carter'),
    (1981, 'Ronald Reagan'),
    (1989, 'George Bush'),
    (1993, 'William J. Clinton'),
    (2001, 'George W. Bush'),
    (2009, 'Barack Obama'),
    (2017, 'Donald J. Trump'),
    (2021, 'Joseph R. Biden'),
    (2025, 'unknown'),
]
# Lookup from speaker name to the first year of that term.
presidents = {name: first_year for first_year, name in _term_starts}
# All start years in ascending order.
year_vals = sorted(presidents.values())

def during_term(row_data):
    """Tell whether a row's year lies within its speaker's term.

    The term runs from the speaker's start year in `presidents` (inclusive) to
    the next larger start year in `year_vals` (exclusive).

    Parameters
    ----------
    row_data : mapping-like
        Must provide 'speaker' and 'year' entries (e.g. a DataFrame row).

    Returns
    -------
    bool
        True when ``start <= year < next_start``.

    Raises
    ------
    KeyError
        If the speaker is not a key of `presidents`.
    IndexError
        If the speaker has the latest start year, so no later one exists.
    """
    term_start = presidents[row_data['speaker']]
    term_end = year_vals[year_vals.index(term_start) + 1]

    return term_start <= row_data['year'] < term_end

In [None]:
# Row-wise flag: does the row's year fall inside its speaker's own term?
res['during_term'] = res.apply(during_term, axis=1)

In [None]:
# Keep only the rows flagged as falling inside the speaker's term.
res = res[res['during_term']]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fixed colour per party value used for the hue groups.
palette = {'Democratic': 'blue', 'Republican': 'red'}

sns.set(rc={'figure.figsize':(12,8)}, style='whitegrid')
# Point plot of `lexical` against year, one series per party; errorbar=None turns error bars off.
sns.pointplot(data=res, y='lexical', x='year', hue='party', palette=palette, errorbar=None)
#sns.lineplot(data=res, y='lexical', x='year', color='black', ci=None)
#sns.pointplot(y=pres_num[:,3], x=pres_num[:,0], hue=pres_num[:,4])#, linestyle='none', errorbar=None)
plt.xticks(rotation=75)
plt.ylabel('Proportion of religious keywords')
plt.xlabel('Year')
# Place the legend outside the axes, to the right of the plot area.
plt.legend(title='Party', bbox_to_anchor=(1.05, 0.8), loc='upper left', borderaxespad=0)
plt.show()

In [None]:
sns.set(rc={'figure.figsize':(12,8)}, style='whitegrid')
# Same data as a point plot with join=False, and with no errorbar argument, so seaborn's
# default error bars apply.
# NOTE(review): no `palette` is passed here, so party colours come from seaborn's defaults.
# NOTE(review): `join` is deprecated in newer seaborn releases — confirm against the installed version.
sns.pointplot(data=res, y='lexical', x='year', hue='party', join=False)
#sns.pointplot(y=pres_num[:,3], x=pres_num[:,0], hue=pres_num[:,4])#, linestyle='none', errorbar=None)
plt.xticks(rotation=75)
plt.ylabel('Proportion of religious keywords')
plt.xlabel('Year')
# Place the legend outside the axes, to the right of the plot area.
plt.legend(title='Party', bbox_to_anchor=(1.05, 0.8), loc='upper left', borderaxespad=0)
plt.show()