In [None]:
# import packages
import csv
import json
import re
from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from nltk.tokenize import sent_tokenize
from tqdm.notebook import tqdm

from data import presidential_utils, congress_utils
from data.bible_utils import comp_bible_helper

In [None]:
from transformers import RobertaForMaskedLM, RobertaTokenizer, pipeline

In [None]:
# load congressional data
# The raw Congressional Record is read with remove_procedural_speeches=True,
# using the index file below as nonprocedural_indices_path.
nonproc_indices = '/data/laviniad/congress_errata/nonprocedural_indices.json'
congress_df = congress_utils.load_full_df_from_raw(
    '/data/corpora/congressional-record/',
    remove_procedural_speeches=True,
    nonprocedural_indices_path=nonproc_indices,
)

In [None]:
# load popular bible verses

# Matches one angle-bracketed tag: '<', at least one non-'>' character, then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return `text` with every substring that matches TAG_RE deleted.

    Only the tags themselves are removed; the text between them is kept.
    """
    without_tags = re.sub(TAG_RE, '', text)
    return without_tags

bible_df = comp_bible_helper()
pop_verses = pd.read_csv('/home/laviniad/projects/religion_in_congress/data/most_popular_verses.csv')
n = 250
# Row 0 of the popularity table is the 'UNKNOWN' placeholder, so skip it and
# keep the next n citations.
pop_citations = list(pop_verses['verse'].iloc[1:n+1])

# Citations are lower-cased before matching.
# NOTE(review): this assumes the citations in most_popular_verses.csv are already
# lower-case; if they are not, nothing will match -- confirm against the CSV.
bible_df['Verse'] = bible_df['Verse'].str.lower()

# Vectorised membership test; .copy() makes the filtered frame independent so that
# adding the 'text' column below does not write into a slice of the full Bible
# frame (which triggers SettingWithCopyWarning).
bible_df = bible_df[bible_df['Verse'].isin(pop_citations)].copy()
bible_df['text'] = bible_df['King James Bible'].apply(remove_tags)

# `verses` holds the tag-stripped verse text; `verse_to_citation` maps that text
# back to its (lower-cased) citation.
verses = bible_df['text']
verse_to_citation = dict(zip(verses, bible_df['Verse']))

In [19]:
# load embedding model

USE_FINETUNED = True

BASE_MODEL_NAME = 'roberta-large'
FINETUNED_CHECKPOINT = '/data/laviniad/sermons-ir/modeling/presidential_pretrained/checkpoint-110000'

# The original branches were swapped: USE_FINETUNED = True loaded the stock
# 'roberta-large' weights and the checkpoint was only reachable with False.
model_source = FINETUNED_CHECKPOINT if USE_FINETUNED else BASE_MODEL_NAME
model = RobertaForMaskedLM.from_pretrained(model_source, output_hidden_states=True).to('cpu')
# Inference only: eval mode turns off dropout so repeated runs give the same embeddings.
model.eval()

# The tokenizer is the same in both cases, so it is loaded once.
# RoBERTa reserves the first two position ids (position ids start at padding_idx + 1),
# so the longest usable sequence is max_position_embeddings - 2; using the raw
# config value lets over-long inputs through and they fail in the position embedding.
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-large',
    model_max_length=model.config.max_position_embeddings - 2,
    truncation=True,
)

In [None]:
# embed popular bible verses
# Maps the position of each verse in `verses` to the last-layer hidden state of its
# first token (the <s>/CLS position), kept as a (1, hidden_size) tensor.

verse_idx_to_embedding = {}

# no_grad: these are inference-only forward passes. Without it every stored
# embedding keeps its whole autograd graph alive and cannot be converted to NumPy.
with torch.no_grad():
    for idx, verse in enumerate(tqdm(verses, total=len(verses))):
        tokenized_input = tokenizer(verse, return_tensors="pt", padding=True, truncation=True)

        # one verse per forward pass; doesn't really need to be efficient
        output = model(**tokenized_input)
        verse_idx_to_embedding[idx] = output.hidden_states[-1][:, 0, :]

In [None]:
# Select every utterance whose speaker tag is 'Mr. PENCE'.
is_pence = congress_df['speaker'] == 'Mr. PENCE'
pence_df = congress_df[is_pence]
print(f"Number of Mike Pence utterances: {len(pence_df.index)}")

# Flatten all utterances into one list of sentences, in utterance order.
sentence_list = []

for _, utterance in tqdm(pence_df.iterrows(), total=len(pence_df.index)):
    sentence_list.extend(sent_tokenize(utterance['text']))

In [22]:
from sentence_transformers import SentenceTransformer, util

# Sentence-level similarity with the `all-mpnet-base-v2` sentence-transformers model.
# It is bound to its own name so the RoBERTa `model` loaded earlier is not overwritten
# (the CLS-embedding cells further down still call that `model`).
sentence_model = SentenceTransformer('all-mpnet-base-v2')

pence_embeddings = sentence_model.encode(sentence_list)
verse_embeddings = sentence_model.encode(list(verses))
# cos_sim[i][j] is the cosine similarity between sentence i and verse j.
cos_sim = util.cos_sim(pence_embeddings, verse_embeddings)

# Rank every (sentence, verse) pair by similarity, highest first. The matrix is
# flattened row-major and sorted once, instead of indexing the tensor one element
# at a time in a nested Python loop; flat position p maps back to
# (p // num_verses, p % num_verses).
num_verses = cos_sim.shape[1]
flat_scores = cos_sim.flatten()
ranked_positions = flat_scores.argsort(descending=True).tolist()
flat_scores = flat_scores.tolist()

# Each entry is [score, sentence index, verse index]; score is a plain float.
all_sentence_combinations = [
    [flat_scores[pos], pos // num_verses, pos % num_verses]
    for pos in ranked_positions
]

# Materialise the verse texts once rather than on every printed line.
verse_texts = list(verses)

print("Top-25 most similar pairs:")
for score, i, j in all_sentence_combinations[0:25]:
    print("{} \t {} \t {:.4f}".format(sentence_list[i], verse_texts[j], score))

Downloading .gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Top-25 most similar pairs:
No greater love has a man than this, that he should lay down his life for his friends. 	 Greater love hath no man than this, that a man lay down his life for his friends. 	 0.9404
It is written, ``No greater love has a man than this that he should lay down his life for his friends.'' 	 Greater love hath no man than this, that a man lay down his life for his friends. 	 0.9202
It is written that no greater love has a man than this that he should lay down his life for his friends. 	 Greater love hath no man than this, that a man lay down his life for his friends. 	 0.8910
The Bible tells us that God has not given us the spirit of fear, but a spirit of power and love, and of a sound mind. 	 For God hath not given us the spirit of fear; but of power, and of love, and of a sound mind. 	 0.8872
Several millennia ago the words were written that a man should leave his father and mother and cleave to his wife and the two shall become one flesh. 	 Therefore shall a man 

In [24]:
# Export the 500 most similar (Pence sentence, Bible verse) pairs to CSV.
verses = list(verses)

records = []
for score, i, j in all_sentence_combinations[:500]:
    records.append({'pence_sentence': sentence_list[i],
                    'bible_verse': verses[j],
                    # float(): the score may be a 0-d tensor, which would otherwise be
                    # written to the CSV as the string 'tensor(0.9404)'.
                    'similarity': float(score)
    })

# Build the frame once from the list of records, then write it out.
results_df = pd.DataFrame(records)
results_df.to_csv('/home/laviniad/projects/religion_in_congress/data/pence_bible_similarity.csv')

IsADirectoryError: [Errno 21] Is a directory: '/home/laviniad/projects/religion_in_congress/'

In [23]:
# Show the 25 highest-ranked pairs as: sentence, verse, similarity (4 decimals).
verse_texts = list(verses)
for _score, sent_idx, verse_idx in all_sentence_combinations[:25]:
    line = "{} \t {} \t {:.4f}".format(
        sentence_list[sent_idx], verse_texts[verse_idx], cos_sim[sent_idx][verse_idx]
    )
    print(line)

No greater love has a man than this, that he should lay down his life for his friends. 	 Greater love hath no man than this, that a man lay down his life for his friends. 	 0.9404
It is written, ``No greater love has a man than this that he should lay down his life for his friends.'' 	 Greater love hath no man than this, that a man lay down his life for his friends. 	 0.9202
It is written that no greater love has a man than this that he should lay down his life for his friends. 	 Greater love hath no man than this, that a man lay down his life for his friends. 	 0.8910
The Bible tells us that God has not given us the spirit of fear, but a spirit of power and love, and of a sound mind. 	 For God hath not given us the spirit of fear; but of power, and of love, and of a sound mind. 	 0.8872
Several millennia ago the words were written that a man should leave his father and mother and cleave to his wife and the two shall become one flesh. 	 Therefore shall a man leave his father and his mo

In [None]:
# embed pence (speaker tag: 'Mr. PENCE')
# For each utterance, maps its dataframe index to a list of (1, hidden_size) tensors:
# the last-layer hidden state at the first token of every sentence.
# NOTE: `model` and `tokenizer` must be the RoBERTa pair here (the code reads
# `output.hidden_states`), not a sentence-transformers model.

import sys

sentence_batch_size = 4

pence_df = congress_df[congress_df['speaker'] == 'Mr. PENCE']
print(f"Number of Mike Pence utterances: {len(pence_df.index)}")
df_idx_to_embedding = {}

# Send inputs to whatever device the model lives on. The original hard-coded
# 'cuda:1' for the inputs while the model was placed on 'cpu', which is a
# device mismatch at the forward pass.
device = model.device

# no_grad: inference only. Without it each stored embedding retains the autograd
# graph of its batch, which is the likely source of the memory growth the manual
# `del` statements were working around.
with torch.no_grad():
    for idx, row in tqdm(pence_df.iterrows(), total=len(pence_df.index)):
        sentences = sent_tokenize(row['text'])

        for batch_start in range(0, len(sentences), sentence_batch_size):
            # slicing past the end is safe, so no explicit min() is needed
            batch_sentences = sentences[batch_start:batch_start + sentence_batch_size]

            tokenized_input = tokenizer(batch_sentences, return_tensors="pt",
                                        padding=True, truncation=True
                                       ).to(device)

            output = model(**tokenized_input)
            cls_token_representation = output.hidden_states[-1][:, 0, :].to('cpu')

            # Append in place; rebuilding the list with `+` on every batch is quadratic.
            df_idx_to_embedding.setdefault(idx, []).extend(
                cls_token_representation.split(1, dim=0)
            )

        if idx % 50 == 0:
            # Currently unused; kept for ad-hoc inspection. sys.getsizeof reports only
            # the dict object itself, not the tensors it references.
            size = sys.getsizeof(df_idx_to_embedding)

In [None]:
# reshape pence embeddings to be one long list
# Every sentence gets a running global index; two dicts map that index to the
# dataframe index of its utterance and to its embedding.
flat_pairs = [
    (df_idx, sentence_embedding)
    for df_idx, utterance_embeddings in df_idx_to_embedding.items()
    for sentence_embedding in utterance_embeddings
]

sentence_idx_to_df_idx = {pos: df_idx for pos, (df_idx, _) in enumerate(flat_pairs)}
sentence_idx_to_embedding = {pos: emb for pos, (_, emb) in enumerate(flat_pairs)}

# total number of sentences indexed
count = len(flat_pairs)

In [None]:
# compute pairwise similarity

from sklearn.metrics.pairwise import cosine_similarity
import torch


def _stack_embeddings(embeddings):
    """Stack an iterable of per-item embedding tensors into one 2-D NumPy array.

    Each tensor is detached and flattened to 1-D first, so both (1, hidden) and
    (hidden,) inputs are accepted. sklearn needs a 2-D numeric array; a list of
    (1, hidden) tensors would be read as 3-D, and tensors that still require grad
    cannot be converted to NumPy at all.
    """
    return torch.stack([e.detach().reshape(-1) for e in embeddings]).cpu().numpy()


sentence_matrix = _stack_embeddings(sentence_idx_to_embedding.values())
verse_matrix = _stack_embeddings(verse_idx_to_embedding.values())

# now similarity_matrix[i,j] corresponds to similarity between sentence i and verse j
similarity_matrix = cosine_similarity(sentence_matrix, verse_matrix)

In [None]:
# evaluate most similar pairs
# For every sentence i, find the verse j with the highest similarity.

# similarity_matrix comes from sklearn as a NumPy array, which torch.argmax does not
# accept; as_tensor wraps it (and is a no-op if it is already a tensor).
most_similar_indices = torch.as_tensor(similarity_matrix).argmax(dim=1)

# list of (sentence index, best verse index) tuples with plain Python ints
most_similar_pairs = list(enumerate(most_similar_indices.tolist()))