In [1]:
import pandas as pd

# Input locations are kept in their own constants so the DataFrame names
# below only ever refer to DataFrames.
SPEECH_JSON_PATH = "/data/laviniad/congress_errata/congress_df.json"
REFERENCE_CSV_PATH = "/data/laviniad/sermons-ir/merged_references.csv"

# Speech table (JSON) and reference table (CSV); both carry a `bio_id`
# column that later cells use to match rows to speakers.
speech_df = pd.read_json(SPEECH_JSON_PATH)
reference_df = pd.read_csv(REFERENCE_CSV_PATH)

In [2]:
# Show which columns the speech table provides.
speech_df.columns

Index(['bio_id', 'congress_num', 'year', 'date', 'chamber', 'speaker', 'text',
       'month_code', 'month', 'gender', 'party', 'state', 'is_in_senate',
       'enclosing_county', 'perc_adherents', 'religion', 'is_republican',
       'church_in_bio', 'dw_nom_1', 'dw_nom_2', 'lexical', 'num_general',
       'num_christian', 'length', 'binary_lex', 'full_state',
       'state_perc_white', 'state_perc_black'],
      dtype='object')

In [3]:
# Number of rows in the reference table.
reference_df.shape[0]

1514

In [4]:
# Use the project's shared tokenizer (an NLTK RegexpTokenizer) to split text into tokens.

from data.data_utils import tokenizer


def count_god_mentions_in_text(text, word_tokenizer=None):
    """Count case-insensitive occurrences of the token "god" in one text.

    Args:
        text: The text to scan. Anything that is not a string (for example
            ``None`` or a float NaN standing in for a missing value) is
            treated as containing no mentions instead of raising.
        word_tokenizer: Optional object exposing ``tokenize(str)`` that
            returns an iterable of string tokens. Defaults to the
            module-level ``tokenizer``.

    Returns:
        int: Number of tokens that equal "god" after lower-casing. Only
        whole-token matches count, so a token such as "godly" is ignored;
        how forms like "God's" are split depends on the tokenizer.
    """
    # Missing values cannot be tokenized; they contribute zero mentions.
    if not isinstance(text, str):
        return 0
    if word_tokenizer is None:
        word_tokenizer = tokenizer
    # Single pass over the tokens; no intermediate lower-cased list needed.
    return sum(1 for token in word_tokenizer.tokenize(text) if token.lower() == "god")

Loaded 100 keywords from /home/laviniad/projects/sermons-ir/src/multi-feature-use/keywords_from_coca.txt
Loaded 150 keywords from /home/laviniad/projects/sermons-ir/src/multi-feature-use/keywords_from_congress_FINAL.txt


In [5]:
# Goal: one row per speaker with name, chamber, state, gender, party, total words spoken, total religious keywords used, total mentions of God, and total biblical references.
from tqdm.notebook import tqdm


columns = ["bio_id", "speaker_name", "chamber", "state", "gender", "party", "total_words", "total_keywords", "total_god_mentions", "total_biblical_references"]

# Count reference rows per speaker once, instead of re-filtering the whole
# reference table inside the loop for every speaker.
reference_counts = reference_df["bio_id"].value_counts()

rows = []
# groupby partitions the speeches in a single pass rather than running one
# full boolean filter per speaker. sort=False keeps speakers in order of first
# appearance (the order unique() would give). Rows with a missing bio_id are
# dropped by groupby; an equality filter on a missing id selects no rows, so
# such an id would not have produced a result row in a filter-based loop either.
for b, speaker_df in tqdm(speech_df.groupby("bio_id", sort=False)):
    # Descriptive fields are taken from the speaker's first row only; a speaker
    # whose chamber/state/party differs across rows is reported with the first.
    speaker_name = speaker_df["speaker"].iloc[0]
    chamber = speaker_df["chamber"].iloc[0]
    state = speaker_df["state"].iloc[0]
    gender = speaker_df["gender"].iloc[0]
    party = speaker_df["party"].iloc[0]

    total_words = speaker_df["length"].sum()
    # Keyword total is the sum of the two precomputed keyword-count columns.
    total_keywords = speaker_df["num_christian"].sum() + speaker_df["num_general"].sum()
    total_god_mentions = speaker_df["text"].apply(count_god_mentions_in_text).sum()
    # Speakers absent from the reference table have zero references.
    total_biblical_references = int(reference_counts.get(b, 0))

    rows.append([b, speaker_name, chamber, state, gender, party, total_words, total_keywords, total_god_mentions, total_biblical_references])


# One row per speaker, columns in the order declared above.
results = pd.DataFrame(rows, columns=columns)

  0%|          | 0/1615 [00:00<?, ?it/s]

In [6]:
# Total number of "God" mentions summed over all speakers.
results['total_god_mentions'].sum()

54402

In [7]:
# Number of speakers with at least one "God" mention.
int((results['total_god_mentions'] > 0).sum())

1424

In [8]:
# Report how many speakers there are and what share of them never say "God".
speaker_total = len(results)
never_mask = results['total_god_mentions'] == 0
never_total = len(results[never_mask])
print("Number of unique speakers: ", speaker_total)
print("Proportion who never say God: ", never_total / speaker_total)

Number of unique speakers:  1614
Proportion who never say God:  0.11771995043370508


In [9]:
# Write the per-speaker summary table to CSV, omitting the row index.
results.to_csv("/data/laviniad/congress_errata/congress_speakers.csv", index=False)