In [None]:
!pip install datasets

In [3]:
import pandas as pd
import os
import re
import argparse
from tqdm import tqdm

from rapidfuzz import fuzz, process
from data.bible_utils import comp_bible_helper
from data.congress_utils import induce_party_and_state
from data.data_utils import load_god_synonyms, get_lexical_overlap

import numpy as np
from scipy.spatial.distance import cosine

from transformers import AutoTokenizer
from datasets import Dataset
import nltk
import torch

tqdm.pandas()

from data import congress_utils
from src.references.train_biencoder import BiModel

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"

#parser = argparse.ArgumentParser()
#parser.add_argument('--model_dir', default="/data/laviniad/sermons-ir/modeling/tuned_mpnet/model.zip", type=str)
#parser.add_argument('--input', default="/data/corpora/congressional-record/", type=str)
#parser.add_argument('--out_dir', default="/data/laviniad/sermons-ir/modeling/mpnet_results/")
#parser.add_argument('--congress_errata_path', default="/data/laviniad/congress_errata/", type=str)
#parser.add_argument('--device', default="2", type=str)
#parser.add_argument('--debug', action="store_true")
#args = parser.parse_args()
args = {'model_dir': "/data/laviniad/sermons-ir/modeling/tuned_mpnet/model.zip",
        'input': "/data/corpora/congressional-record/",
        'out_dir': "/data/laviniad/sermons-ir/modeling/mpnet_results/",
        'congress_errata_path': "/data/laviniad/congress_errata/",
        'device': "2",
        'debug': True
       }

DEVICE = 'cuda:' + args['device'] if torch.cuda.is_available() else 'cpu'

KEYWORD_FILTER_THRESHOLD = 0.0005

# load congressional data
print("Loading congressional data")
congressional_df = congress_utils.load_full_df_from_raw(args['input'])
congressional_df = induce_party_and_state(congressional_df)

if args['debug']:
    congressional_df = congressional_df.sample(10000)
    print("In debug mode; sampled 10000 documents from congressional record")

# load keywords
print("Loading keywords")
keywords_path = '/home/laviniad/projects/religion_in_congress/src/multi-feature-use/'
keyword_strs = keywords_path + 'keywords_from_coca.txt', keywords_path + 'keywords_from_congress.txt'

def get_keywords(keyword_path):
    with open(keyword_path) as f:
        keyword_set = [l.strip() for l in f.readlines()]
    return keyword_set

keywords_coca = get_keywords(keyword_strs[0])
keywords_congress = get_keywords(keyword_strs[1])

god_synonyms = load_god_synonyms()
full_keywords = list(set(keywords_coca).intersection(set(keywords_congress)).union(set([t.capitalize() for t in god_synonyms])))
full_keywords.remove('flesh')
temp = []
for i in full_keywords:
    if not ('god' in i and i != 'god'):
        temp.append(i) # avoid all lowercased god mentions

full_keywords = temp
words_of_concern = ['verse', 'verses', 'thou', 'Verse', 'Verses', 'Thou'] # not good signals
for w in words_of_concern:
    if w in full_keywords:
        full_keywords.remove(w)

# retrieve the speeches that contain these keywords and create new df with rows containing sentence + verse + sermon idx of sentence
print("Filtering congressional data by keywords")
def filter(speech, threshold):
    overlap = get_lexical_overlap(speech, full_keywords)
    return overlap > threshold

congressional_df['religious'] = congressional_df['text'].apply(lambda x: filter(x, KEYWORD_FILTER_THRESHOLD))
filtered_df = congressional_df[congressional_df['religious']]
print(f"Number of religious speeches: {len(filtered_df.index)}")

infer_df = []
for idx, row in tqdm(filtered_df.iterrows()):
    sentence_list = nltk.sent_tokenize(row['text'])
    for i,s in enumerate(sentence_list):
        infer_df.append({
            'sermon_idx': idx,
            'index_in_sermon': i,
            'text': s
        })

infer_df = pd.DataFrame(infer_df)
print(f"Number of potentially religious sentences: {len(infer_df.index)}")

# load model
print("Loading model...")
full_model = BiModel(device=DEVICE).to(DEVICE)
full_model.load_state_dict(torch.load(args['model_dir']))
model = full_model.model
model.eval()
biTokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# load verses
print("Loading verses...")
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

bible_df = comp_bible_helper()
pop_verses = pd.read_csv('/home/laviniad/projects/religion_in_congress/data/most_popular_verses.csv')
n = 250 # VERY generous
pop_citations = list(pop_verses['verse'].iloc[1:n+1])
bible_df['King James Bible'] = bible_df['King James Bible'].apply(remove_tags) # KJV in this df has italics etc
bible_df['Verse'] = bible_df['Verse'].apply(lambda x: x.lower())
limited_bible_df = bible_df[bible_df['Verse'].apply(lambda x: x in pop_citations)]
limited_verses = limited_bible_df['King James Bible']
verse_df = [{'text': t['King James Bible'], 'citation': t['citation']} for idx,t in limited_bible_df.iterrows()]
limited_verse_to_citation = dict(zip(limited_verses, limited_bible_df['Verse']))
limited_citation_to_verse = {v.lower(): k for k,v in limited_verse_to_citation.items()}

Loading congressional data
On file 0


100%|██████████████████████████████████████████████████████████████████████████████████████████| 106493/106493 [00:00<00:00, 170303.89it/s]


On file 1


100%|██████████████████████████████████████████████████████████████████████████████████████████| 190933/190933 [00:01<00:00, 175686.53it/s]


On file 2


100%|████████████████████████████████████████████████████████████████████████████████████████████| 94162/94162 [00:00<00:00, 206183.81it/s]


On file 3


100%|██████████████████████████████████████████████████████████████████████████████████████████| 207487/207487 [00:00<00:00, 216271.72it/s]


On file 4


100%|██████████████████████████████████████████████████████████████████████████████████████████| 183503/183503 [00:00<00:00, 215197.85it/s]


On file 5


100%|██████████████████████████████████████████████████████████████████████████████████████████| 130265/130265 [00:00<00:00, 228412.63it/s]


On file 6


100%|██████████████████████████████████████████████████████████████████████████████████████████| 197548/197548 [00:00<00:00, 215442.23it/s]


On file 7
On file 8


100%|██████████████████████████████████████████████████████████████████████████████████████████| 139281/139281 [00:00<00:00, 225796.61it/s]


On file 9


100%|██████████████████████████████████████████████████████████████████████████████████████████| 127268/127268 [00:00<00:00, 214713.74it/s]


On file 10


100%|██████████████████████████████████████████████████████████████████████████████████████████| 179989/179989 [00:00<00:00, 211750.78it/s]


On file 11


100%|██████████████████████████████████████████████████████████████████████████████████████████| 271528/271528 [00:01<00:00, 225796.31it/s]


On file 12


100%|████████████████████████████████████████████████████████████████████████████████████████████| 48007/48007 [00:00<00:00, 237146.87it/s]


On file 13


100%|██████████████████████████████████████████████████████████████████████████████████████████| 210379/210379 [00:01<00:00, 207644.42it/s]


On file 14


100%|██████████████████████████████████████████████████████████████████████████████████████████| 196111/196111 [00:00<00:00, 221269.06it/s]


On file 15


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 218453.33it/s]


In debug mode; sampled 10000 documents from congressional record
Loading keywords
Synonym set: ['God', 'God Almighty', 'Supreme being', 'Lord', 'Divine', 'supreme being', 'Creator', 'god almighty', 'Almighty', 'godhead', 'Godhead', 'jehovah', 'Jehovah', 'almighty', 'god', 'Maker', 'divine']
Filtering congressional data by keywords
Number of religious speeches: 208


208it [00:00, 2034.22it/s]

Number of potentially religious sentences: 6297
Loading model...





Loading verses...


In [4]:
# embed congress sentences
print("Creating Congress data loader")
congressDataset = Dataset.from_pandas(infer_df)
congressDataset = congressDataset.map(lambda x: biTokenizer(x["text"], max_length=512, padding="max_length", truncation=True))
        
for col in ['input_ids', 'attention_mask']:
    congressDataset = congressDataset.rename_column(col, 'text'+'_'+col)
            
congressDataset.set_format(type='torch')
congressLoader = torch.utils.data.DataLoader(congressDataset, batch_size=4, shuffle=False)

Creating Congress data loader


Map:   0%|          | 0/6297 [00:00<?, ? examples/s]

In [19]:
print("Embedding Congress sentences")
congress_result_tuples = []
for batch in tqdm(congressLoader):
    input_ids = batch["text_input_ids"].to(DEVICE)
    attention_masks = batch["text_attention_mask"].to(DEVICE)
    embedding = model(input_ids, attention_mask=attention_masks)[0]
    print(f"Embeddings: {embedding.size()}")
    mean_pooled = full_model.mean_pooling(embedding, attention_masks).to('cpu')
    print(f"Shape of mean pooled: {mean_pooled.size()}")
    print(f"Batch: {batch}")
    for b in range(0, 4):
        print(b)
        congress_result_tuples.append((batch['text'][b], batch['sermon_idx'][b], mean_pooled[b]))
    

Embedding Congress sentences


  0%|▏                                                                                                    | 2/1575 [00:00<01:36, 16.35it/s]

Embeddings: torch.Size([4, 512, 768])
Shape of mean pooled: torch.Size([4, 768])
Batch: {'sermon_idx': tensor([2133635, 2133635, 2133635, 2133635]), 'index_in_sermon': tensor([0, 1, 2, 3]), 'text': ['Mr. JONES of North Carolina.', 'Mr. Speaker, I thank the gentleman for yielding me the time, and I want to thank the chairman of this committee and the ranking member for working with me on H.R.', '1799, the Fallen Heroes Immigrant Spouse Fairness Act.', 'Mr. Speaker, this came to my attention when I attended the funeral of a Marine who was killed in Operation Iraqi Freedom.'], 'text_input_ids': tensor([[    0,  2724,  1016,  ...,     1,     1,     1],
        [    0,  2724,  1016,  ...,     1,     1,     1],
        [    0, 13843,  1014,  ...,     1,     1,     1],
        [    0,  2724,  1016,  ...,     1,     1,     1]]), 'text_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
0
1
2


  1%|▌                                                                                                    | 8/1575 [00:00<01:16, 20.55it/s]

Shape of mean pooled: torch.Size([4, 768])
Batch: {'sermon_idx': tensor([1336926,   51815,   51815,   51815]), 'index_in_sermon': tensor([2, 0, 1, 2]), 'text': ['Mr. WILSON of South Carolina led the Pledge of Allegiance as follows: I pledge allegiance to the Flag of the United States of America, and to the Republic for which it stands, one nation under God, indivisible, with liberty and justice for all.', 'Mr. JOHNSON of Louisiana.', "Mr. Chairman, I appreciate my colleague's presentation here.", 'It is clear and concise, and he raises important points.'], 'text_input_ids': tensor([[   0, 2724, 1016,  ...,    1,    1,    1],
        [   0, 2724, 1016,  ...,    1,    1,    1],
        [   0, 2724, 1016,  ...,    1,    1,    1],
        [   0, 2013, 2007,  ...,    1,    1,    1]]), 'text_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
0
1
2
3
Embeddings: torch.Size([4, 512, 768])
Sh

  1%|▋                                                                                                   | 10/1575 [00:00<01:23, 18.69it/s]


Embeddings: torch.Size([4, 512, 768])
Shape of mean pooled: torch.Size([4, 768])
Batch: {'sermon_idx': tensor([51815, 51815, 51815, 51815]), 'index_in_sermon': tensor([19, 20, 21, 22]), 'text': ['I said this on the floor in September--many of us have--and I will say it again today, the United States is blessed because our land is filled with an abundance of natural resources.', 'My own congressional district back in Louisiana is home of one of the largest natural gas reserves in the country.', 'We believe, we insist that we have the means and the responsibility to use those God-given resources to create jobs, foster economic growth, and pave the way to an era of American energy dominance.', 'Oppressive policies like the ones before us today have been our own worst enemy.'], 'text_input_ids': tensor([[    0,  1049,  2060,  ...,     1,     1,     1],
        [    0,  2030,  2223,  ...,     1,     1,     1],
        [    0,  2061,  2907,  ...,     1,     1,     1],
        [    0, 28562, 

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 2; 23.69 GiB total capacity; 21.93 GiB already allocated; 3.69 MiB free; 22.70 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:
# embed bible verses
print("Creating Bible verse data loader")
verseDataset = Dataset.from_pandas(verse_df)
verseDataset = verseDataset.map(lambda x: biTokenizer(x["text"], max_length=512, padding="max_length", truncation=True))
        
for col in ['input_ids', 'attention_mask']:
    verseDataset = verseDataset.rename_column(col, 'text'+'_'+col)
            
verseDataset.set_format(type='torch')
verseLoader = torch.utils.data.DataLoader(verseDataset, batch_size=4, shuffle=False)

print("Embedding Bible verses")
verse_result_tuples = []
for batch in tqdm(verseLoader):
    input_ids = batch["text_input_ids"].to(DEVICE)
    attention_masks = batch["text_attention_mask"].to(DEVICE)
    embedding = model(input_ids, attention_mask=attention_masks)[:,0]
    mean_pooled = BiModel.mean_pooling(embedding, attention_masks).to('cpu')
    print(f"Shape of embedding batch: {embedding.size()}")
    for b in range(0, len(batch)):
        verse_result_tuples.append((batch[b]['text'], batch[b]['citation'], embedding[b]))

# create new df of references given the above pairs
result_df = []
print("Finding most similar Bible verses")
for congress_tuple in tqdm(congress_result_tuples):
    embedding = congress_tuple[2]
    
    similarities = [1 - cosine(embedding, verse[2]) for verse in verse_result_tuples]
    max_similarity_index = np.argmax(similarities)
    cosine_sim = similarities[max_similarity_index]
    verse_tuple = verse_result_tuples[max_similarity_index]
    result_df.append({
        'sermon_idx': congress_tuple[1], # comes from congress_utils.load_full_df_from_raw(args.input) indices
        'text': congress_tuple[0],
        'most_similar_verse': verse_tuple[1],
        'cosine_similarity': cosine_sim,
        'verse_citation': verse_tuple[1],
    })

# dump
if not args['debug']:
    result_df.to_csv(args['out_dir'] + 'results.csv')
    print(f"Dumped result_df to {args['out_dir'] + 'results.csv'}")
else:
    print("In debug mode; did not dump results")

Embedding Congress sentences


  0%|                                                                                                             | 0/1575 [00:00<?, ?it/s]


TypeError: tuple indices must be integers or slices, not tuple