In [2]:
import nomic
import json
# Read the Nomic API token from a local JSON secrets file and use it to
# authenticate this session.
api_key_path = "/home/ubuntu/api_keys.json"

with open(api_key_path, 'r') as key_file:
    api_keys = json.load(key_file)

# The token is stored under the 'nomic' entry of the secrets file.
key = api_keys['nomic']
nomic.login(key)

#- Scientific programming
import numpy as np
import pandas as pd
from tqdm import tqdm

#- Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#- Data
from nomic import AtlasDataset

#- Processing
from fuzzywuzzy import fuzz

# Open the Atlas dataset of congressional tweets and work with its first map.
dataset = AtlasDataset('hivemind/tweets-from-members-of-us-congress-from-all-time-updated-2024-05-29')
first_map = dataset.maps[0]

# Latent embeddings of the map and the accompanying metadata table; later
# cells pair embds[i] with row i of df.
embds = first_map.embeddings.latent
df = first_map.data.df

[32m2024-06-10 15:58:20.214[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m__init__[0m:[36m829[0m - [1mLoading existing dataset `hivemind/tweets-from-members-of-us-congress-from-all-time-updated-2024-05-29`.[0m
100%|██████████████████████████████████████████████████████████████████████████████| 129/129 [00:00<00:00, 20436.06it/s]


In [3]:
#- In future we will add relevant tweet indices based on tags

# A single np.unique pass gives, for every distinct member name (sorted):
#   - first_tweet_rows:        row position of that member's first tweet,
#   - member_index_per_tweet:  for each tweet, the index of its author in
#                              congress_name_list,
#   - tweets_per_congress_person: number of tweets by that member.
# This replaces a linear list.index() search per tweet and a full-frame
# boolean mask per member.
congress_name_list, first_tweet_rows, member_index_per_tweet, tweets_per_congress_person = np.unique(
    list(df['name']), return_index=True, return_inverse=True, return_counts=True
)
congress_name_list = list(congress_name_list)

# The sums below pair embds[i] with the i-th row of df, while the divisor
# comes from df alone; a length mismatch would silently give wrong averages.
if len(embds) != len(df):
    raise ValueError(
        f'embds has {len(embds)} rows but df has {len(df)} rows; they must align row-for-row'
    )

# Handle and party are taken from each member's first tweet. Rows are
# addressed by position (to_numpy) so the result does not depend on the
# DataFrame's index labels.
congress_handles = df['twitter_lower'].to_numpy()[first_tweet_rows].tolist()
party_by_congress_person = [p.lower() for p in df['party'].to_numpy()[first_tweet_rows]]

# Accumulate each tweet's embedding into its author's row, in tweet order.
# tqdm is the first zip argument so the bar is exhausted (and closed) cleanly.
average_embds = np.zeros((len(congress_name_list), embds.shape[1]))
for embd, member_index in zip(tqdm(embds), member_index_per_tweet):
    average_embds[member_index] += embd

# Sum -> mean: divide every member's row by that member's tweet count.
average_embds = np.divide(average_embds, tweets_per_congress_person.reshape(-1, 1))
unique_party_labels, party_counts = np.unique(party_by_congress_person, return_counts=True)

100%|██████████████████████████████████████████████████████████████████████| 3037316/3037316 [01:49<00:00, 27639.32it/s]


In [4]:
data_path = '/home/ubuntu/congressional-twin/data/congress_votes/congress_votes_118-2024_h142.csv'

# Stem used to name the output file: the CSV's base name without extension.
# (The previous split on '.csv.' -- with a trailing dot -- never matched, so
# '.csv' leaked into the stem and therefore into the saved file's name.)
csv_basename = data_path.split('/')[-1]
file_name = csv_basename[:-len('.csv')] if csv_basename.endswith('.csv') else csv_basename

# header=1: column names are read from the second line of the file.
df_vote_result = pd.read_csv(data_path, header=1)

matched_name = []
matched_ratio = []
average_embd = []
handles = []
party = []

# For every voter, pick the tweet-dataset member whose name has the highest
# token-sort similarity, and carry over that member's metadata.
# NOTE(review): the best-scoring candidate is always accepted, however low the
# score. In the displayed head a voter is paired with a different person at a
# ratio of 59 -- consider filtering on 'matched-ratio' before using the result.
for voter_name in df_vote_result['name']:
    ratios = [fuzz.token_sort_ratio(voter_name, name) for name in congress_name_list]
    highest_ratio_index = int(np.argmax(ratios))  # first maximum wins on ties

    matched_name.append(congress_name_list[highest_ratio_index])
    matched_ratio.append(ratios[highest_ratio_index])  # reuse the score instead of recomputing it
    average_embd.append(average_embds[highest_ratio_index])
    handles.append(congress_handles[highest_ratio_index])
    party.append(party_by_congress_person[highest_ratio_index])

df_vote_result['matched-name'] = matched_name
df_vote_result['matched-ratio'] = matched_ratio
df_vote_result['twitter-handle'] = handles
# NOTE(review): the displayed head suggests the vote CSV already has a 'party'
# column; this assignment replaces it with the matched member's party --
# confirm that is intended.
df_vote_result['party'] = party
df_vote_result['average-embd'] = average_embd

df_vote_result.head()

Unnamed: 0,person,state,district,vote,name,party,matched-name,matched-ratio,twitter-handle,average-embd
0,400004,AL,4,Yea,Rep. Robert Aderholt [R],r,"Aderholt, Robert",83,robert_aderholt,"[0.020525227366267024, 0.04287387229300834, -0..."
1,400030,GA,2,Not Voting,Rep. Sanford Bishop [D],d,"Bishop Jr., Sanford",86,sanfordbishop,"[0.011686148968609896, 0.034835316918113014, -..."
2,400033,OR,3,Yea,Rep. Earl Blumenauer [D],d,Emanuel Cleaver D-MO,59,repcleaver,"[0.014774418466661748, 0.04408360926692325, -0..."
3,400052,TX,26,Yea,Rep. Michael Burgess [R],r,"Burgess, Michael",83,michaelcburgess,"[0.019034768800315616, 0.05050808603658616, -0..."
4,400057,CA,41,Yea,Rep. Ken Calvert [R],r,"Calvert, Ken",79,kencalvert,"[0.020587250114365935, 0.04766320684325197, -0..."


In [5]:
# Save the vote table, now carrying the matched member metadata and average
# embeddings, next to the other derived data files.
# NOTE(review): the 'average-embd' cells hold NumPy arrays, which to_csv writes
# as their string form -- confirm downstream readers parse that format.
output_path = f'/home/ubuntu/congressional-twin/data/data/{file_name}_with_average_embedding_and_metadata.csv'
df_vote_result.to_csv(output_path)