In [10]:
import nomic
import json
# Location of the JSON file that maps service names to API keys.
api_key_path = "/home/ubuntu/api_keys.json"

# Parse the whole key file first, then pick out the entry stored under 'nomic'.
with open(api_key_path, 'r') as key_file:
    api_keys = json.loads(key_file.read())
key = api_keys['nomic']

# Authenticate this session with the key read above.
nomic.login(key)
#- Scientific programming
import numpy as np
import pandas as pd
from tqdm import tqdm

#- Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#- Data
from nomic import AtlasDataset

#- Processing
from fuzzywuzzy import fuzz
from unidecode import unidecode


# Open the existing Atlas dataset of congressional tweets.
dataset = AtlasDataset('hivemind/tweets-from-members-of-us-congress-from-all-time-updated-2024-05-29')

# Both the latent embeddings and the tabular data come from the first map.
first_map = dataset.maps[0]
embds = first_map.embeddings.latent
df = first_map.data.df

[32m2024-06-10 15:22:27.686[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m__init__[0m:[36m829[0m - [1mLoading existing dataset `hivemind/tweets-from-members-of-us-congress-from-all-time-updated-2024-05-29`.[0m
100%|██████████| 129/129 [00:00<00:00, 26554.04it/s]


In [11]:
#- In future we will add relevant tweet indices based on tags

# Aggregate the tweet-level data to one entry per congress member.
# A single np.unique call yields, for every distinct name:
#   - first_tweet_pos: position of that member's first tweet in `df`
#   - tweet_to_member: for every tweet, the row of its author in congress_name_list
#   - tweets_per_congress_person: number of tweets by that member
# This replaces a linear `list.index` scan per tweet and one full boolean
# mask over `df` per member, and it is purely positional, so it stays aligned
# with `embds` even if `df` does not carry a default RangeIndex.
congress_name_list, first_tweet_pos, tweet_to_member, tweets_per_congress_person = np.unique(
    list(df['name']), return_index=True, return_inverse=True, return_counts=True
)
congress_name_list = list(congress_name_list)

# Handle and party are taken from each member's first tweet, as before.
handles_by_tweet = df['twitter_lower'].tolist()
party_by_tweet = df['party'].tolist()
congress_handles = [handles_by_tweet[pos] for pos in first_tweet_pos]
party_by_congress_person = [party_by_tweet[pos].lower() for pos in first_tweet_pos]

# Sum the embeddings per member in tweet order, then divide by the tweet count.
average_embds = np.zeros((len(congress_name_list), embds.shape[1]))

for i, embd in enumerate(tqdm(embds)):
    average_embds[tweet_to_member[i]] += embd

average_embds = np.divide(average_embds, tweets_per_congress_person.reshape(-1, 1))
unique_party_labels, party_counts = np.unique(party_by_congress_person, return_counts=True)

100%|██████████| 3037316/3037316 [01:12<00:00, 41865.29it/s]


In [12]:
# Build one row per congress member: handle, name and mean tweet embedding,
# then attach the columns of the first `df` row found for each handle.
# `congress_name_list` ends up duplicating the merged `name` column.
data = {'twitter_lower': congress_handles, 'congress_name_list': congress_name_list}

one_row_per_handle = df.drop_duplicates(subset='twitter_lower')
df_congress = (
    pd.DataFrame(data)
    .assign(average_embds=average_embds.tolist())
    .merge(one_row_per_handle, on='twitter_lower', how='left')
)
df_congress.head()


Unnamed: 0,twitter_lower,congress_name_list,average_embds,name,party,years,chamber,state,postedAt,source,tweetId
0,repfinkenauer,Abby Finkenauer D-IA,"[0.0059132452736815905, 0.043169012535101525, ...",Abby Finkenauer D-IA,D,2019_2020,House,IA,2020-08-26 20:49:46.024,PolitWoops,1298724338238025728
1,repspanberger,Abigail Spanberger D-VA,"[0.013804460712461475, 0.03647619804269963, -0...",Abigail Spanberger D-VA,D,2019_2020,House,VA,2019-09-23 13:15:00.318,PolitWoops,1176122795119525888
2,repkinzinger,Adam Kinzinger R-IL,"[0.006521038611271089, 0.037537520951817276, -...",Adam Kinzinger R-IL,R,2017_2018,House,IL,2017-07-06 01:38:49.456,PolitWoops,882775812155592704
3,repadamschiff,Adam Schiff D-CA,"[0.020784396506911152, 0.048458186244812865, -...",Adam Schiff D-CA,D,2011_2012,House,CA,2012-10-02 16:30:01.074,PolitWoops,253169977962659840
4,repadamsmith,Adam Smith D-WA,"[0.014037557097547627, 0.045134131921029684, -...",Adam Smith D-WA,D,2021_2022,House,WA,2022-03-30 16:15:06.969,PolitWoops,1509202569985413120


In [15]:
# Roll-call vote CSV to process (the path is specific to the local checkout).
data_path = '.../congressional-twin/data/congress_votes/congress_votes_118-2024_h142.csv'

# Vote identifier: whatever sits between the last 'congress_votes_' and '.csv'.
file_name = data_path.rpartition('congress_votes_')[2].partition('.csv')[0]

# header=1: the column names are read from the second line of the file.
df_vote_result = pd.read_csv(data_path, header=1)

df_vote_result.head()

Unnamed: 0,person,state,district,vote,name,party
0,400004,AL,4,Yea,Rep. Robert Aderholt [R],Republican
1,400030,GA,2,Not Voting,Rep. Sanford Bishop [D],Democrat
2,400033,OR,3,Yea,Rep. Earl Blumenauer [D],Democrat
3,400052,TX,26,Yea,Rep. Michael Burgess [R],Republican
4,400057,CA,41,Yea,Rep. Ken Calvert [R],Republican


In [16]:
# Add empty-string placeholder columns that the name-matching step fills in.
for placeholder_col in ('matched-name', 'twitter-handle', 'party2', 'state2', 'average-embd'):
    df_vote_result[placeholder_col] = ''

In [17]:


# Link every roll-call row to a member profile in df_congress.
#
# Candidates are restricted to members with the same party letter and state;
# within that group a candidate matches when its (ASCII-folded) name contains
# one of the voter's name tokens as a literal substring.
#
# Fixes relative to the previous version of this cell:
#   * the match/no-match decision is made once per row, after all tokens have
#     been tried, instead of being overwritten by every token in turn (which
#     let a later non-matching token erase an earlier match);
#   * tokens are tried surname-first, so a shared first name is only used as
#     a fallback;
#   * tokens are matched literally (regex=False), so characters such as '.'
#     or '(' in a name are no longer interpreted as a regular expression;
#   * a name without the 'Rep. ' prefix no longer raises IndexError.

# Generational suffixes carry no identifying information and are skipped.
NAME_SUFFIXES = {'jr', 'sr', 'ii', 'iii', 'iv'}

# Target column in df_vote_result -> source column in df_congress.
MATCH_COLUMNS = {
    'matched-name': 'congress_name_list',
    'average-embd': 'average_embds',
    'twitter-handle': 'twitter_lower',
    'party2': 'party',
    'state2': 'state',
}

for i, row in df_vote_result.iterrows():
    # "Rep. First Last [X]" -> "First Last", folded to ASCII.
    name = unidecode(row['name'].split(' [')[0].split('Rep. ')[-1])
    party_val = row['party'][0]  # first letter, compared against df_congress['party']
    state_val = row['state']

    same_party_state = (df_congress['party'] == party_val) & (df_congress['state'] == state_val)
    candidate_names = df_congress.loc[same_party_state, 'congress_name_list'].apply(unidecode)

    # split() without arguments also drops empty tokens, which would
    # otherwise match every candidate.
    name_parts = [
        napa for napa in name.split()
        if napa.rstrip('.,').lower() not in NAME_SUFFIXES
    ]

    matched_index = None
    for napa in reversed(name_parts):
        hits = candidate_names.str.contains(napa, regex=False)
        if hits.any():
            # First candidate (in df_congress order) containing this token.
            matched_index = hits[hits].index[0]
            break

    if matched_index is None:
        for vote_col in MATCH_COLUMNS:
            df_vote_result.at[i, vote_col] = np.nan
    else:
        member = df_congress.loc[matched_index]
        for vote_col, congress_col in MATCH_COLUMNS.items():
            df_vote_result.at[i, vote_col] = member[congress_col]

df_vote_result.head()

  if candidate_names.str.contains(napa).any():
  index = candidate_names[candidate_names.str.contains(napa)].index[0]


Unnamed: 0,person,state,district,vote,name,party,matched-name,twitter-handle,party2,state2,average-embd
0,400004,AL,4,Yea,Rep. Robert Aderholt [R],Republican,"Aderholt, Robert",robert_aderholt,R,AL,"[0.020525227366267024, 0.04287387229300834, -0..."
1,400030,GA,2,Not Voting,Rep. Sanford Bishop [D],Democrat,"Bishop Jr., Sanford",sanfordbishop,D,GA,"[0.011686148968609896, 0.034835316918113014, -..."
2,400033,OR,3,Yea,Rep. Earl Blumenauer [D],Democrat,,,,,
3,400052,TX,26,Yea,Rep. Michael Burgess [R],Republican,"Burgess, Michael",michaelcburgess,R,TX,"[0.019034768800315616, 0.05050808603658616, -0..."
4,400057,CA,41,Yea,Rep. Ken Calvert [R],Republican,"Calvert, Ken",kencalvert,R,CA,"[0.020587250114365935, 0.04766320684325197, -0..."


In [18]:
# Write the matched vote table next to the other project data, named after the vote id.
output_csv = f'~/congressional-twin/data/data/{file_name}.csv'
df_vote_result.to_csv(output_csv)