In [8]:
import nomic
import json
# Read the Nomic API token from a local JSON secrets file and authenticate.
# The file is expected to hold a JSON object with a 'nomic' entry.
api_key_path = "/home/ubuntu/api_keys.json"

with open(api_key_path, 'r') as key_file:
    key = json.load(key_file)['nomic']

nomic.login(key)
#- Scientific programming
import numpy as np
import pandas as pd
from tqdm import tqdm

#- Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#- Data
from nomic import AtlasDataset

#- Processing
from fuzzywuzzy import fuzz
from unidecode import unidecode
import re
import datetime

# Open the Atlas dataset and work with its first map only.
dataset = AtlasDataset('hivemind/tweets-from-members-of-us-congress-from-all-time-updated-2024-06-10')
atlas_map = dataset.maps[0]

# Latent embeddings, the per-tweet metadata table and the per-tweet tag table.
embds = atlas_map.embeddings.latent
df = atlas_map.data.df
tags_df = atlas_map.tags.df

# Attach the tag columns to the metadata, joining on the tweet id.
# NOTE(review): later cells index `df` by the row position of `embds`; this
# assumes the merge neither drops nor reorders rows -- TODO confirm.
df = df.merge(tags_df, on='tweetId')
df.head()

[32m2024-06-11 20:10:42.941[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m__init__[0m:[36m829[0m - [1mLoading existing dataset `hivemind/tweets-from-members-of-us-congress-from-all-time-updated-2024-06-10`.[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 25749.62it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 6246.28it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 6362.18it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:00<00:00, 591.15it/s]


Unnamed: 0,name,party,twitter_lower,years,chamber,state,postedAt,source,tweetId,<115><2017><senate><179>,<118><2024><house><142>
0,Josh Gottheimer D-NJ,D,repjoshg,2021_2022,House,NJ,2022-05-29 13:22:24.171,PolitWoops,1530902377196314624,False,False
1,Ed Markey D-MA,D,senmarkey,2011_2012,Senate,MA,2011-05-13 21:06:26.255,PolitWoops,69146505528868864,False,False
2,Debbie Stabenow D-MI,D,senstabenow,2021_2022,Senate,MI,2022-03-02 01:14:27.293,PolitWoops,1498829050919411712,False,False
3,Ted Cruz R-TX,R,sentedcruz,2019_2020,Senate,TX,2019-10-31 16:48:33.055,PolitWoops,1189947274526384128,False,False
4,Dan Bishop R-NC,R,repdanbishop,2021_2022,House,NC,2021-08-25 00:15:49.917,PolitWoops,1430322991376834560,False,False


In [9]:
# Roll-call file for the vote being analysed; `file_name` is the part of the
# path between 'congress_votes_' and '.csv' (e.g. '118-2024_h142').
data_path = '../congress_votes/congress_votes_118-2024_h142.csv'
file_name = data_path.split('congress_votes_')[-1].split('.csv')[0]

# Reading with the default header makes the file's first line the column
# labels; the first label is the text that carries the vote timestamp.
temp_ = pd.read_csv(data_path)
vote_info_str = list(temp_.columns)[0]

# Timestamp of the vote, written as YYYY-MM-DDTHH:MM:SS inside that text.
pattern = r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}'
datetime_regex = re.compile(pattern)
match = datetime_regex.search(vote_info_str)

# Fail with a readable message instead of "'NoneType' has no attribute 'group'"
# when the file does not carry a timestamp where it is expected.
if match is None:
    raise ValueError(
        f"No timestamp of the form YYYY-MM-DDTHH:MM:SS found in the first "
        f"header field of {data_path!r}: {vote_info_str!r}"
    )

vote_datetime_obj = datetime.datetime.strptime(match.group(), '%Y-%m-%dT%H:%M:%S')

In [10]:
# Per-member mean tweet embedding, computed twice: over every tweet posted no
# later than the vote ('global') and over the subset of those tweets carrying
# the vote's tag ('issue-specific').
tag_name = '<118><2024><house><142>'

congress_name_list = np.unique(list(df['name']))
congress_name_list = list(congress_name_list)
n_members = len(congress_name_list)

# name -> row position in the accumulators; replaces a linear
# `congress_name_list.index(...)` scan on every tweet.
name_to_index = {cn: idx for idx, cn in enumerate(congress_name_list)}

tweets_per_congress_person = {
    'global': np.zeros(n_members),
    'issue-specific': np.zeros(n_members)
}

average_embds = {
    'global': np.zeros((n_members, embds.shape[1])),
    'issue-specific': np.zeros((n_members, embds.shape[1]))
}

# Handle from the first row seen for each name, found in one pass over the
# frame instead of one full boolean scan per member.
first_row_per_name = df.drop_duplicates(subset='name')
handle_by_name = dict(zip(first_row_per_name['name'], first_row_per_name['twitter_lower']))
congress_handles = [handle_by_name[cn] for cn in congress_name_list]

# -1 marks members for whom no tweet at or before the vote has been seen.
party_by_congress_person = [-1 for cp in congress_name_list]

# Pull the columns out once; per-element Series lookups inside a loop over
# millions of rows are far slower than array indexing.
# NOTE(review): row i of `df` is assumed to describe row i of `embds`, as in
# the original positional lookups -- confirm the merge preserves that.
tweet_names = df['name'].to_numpy()
tweet_parties = df['party'].to_numpy()
tweet_is_tagged = df[tag_name].to_numpy()
posted_after_vote = (df['postedAt'] > vote_datetime_obj).to_numpy()

for i, embd in enumerate(tqdm(embds)):
    # Only tweets posted up to the moment of the vote are counted.
    if posted_after_vote[i]:
        continue

    congress_name_index = name_to_index[tweet_names[i]]

    tweets_per_congress_person['global'][congress_name_index] += 1
    average_embds['global'][congress_name_index] += embd

    if tweet_is_tagged[i]:
        tweets_per_congress_person['issue-specific'][congress_name_index] += 1
        average_embds['issue-specific'][congress_name_index] += embd

    if party_by_congress_person[congress_name_index] == -1:
        party_by_congress_person[congress_name_index] = tweet_parties[i].lower()

# Turn the sums into means. Members with zero counted tweets produce 0/0 = NaN
# rows; that is intentional (the vote-matching cell tests for NaN), so the
# divide-by-zero RuntimeWarning is silenced rather than the NaNs removed.
with np.errstate(divide='ignore', invalid='ignore'):
    average_embds = {
        k: np.divide(average_embds[k], tweets_per_congress_person[k].reshape(-1, 1))
        for k in average_embds
    }
unique_party_labels, party_counts = np.unique(party_by_congress_person, return_counts=True)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2904271/2904271 [03:27<00:00, 14011.47it/s]
  average_embds = {k: np.divide(average_embds[k],  tweets_per_congress_person[k].reshape(-1, 1)) for k in average_embds}


In [11]:
# Collect everything into one DataFrame with one row per member of Congress.
# The merge below pulls in the first tweet row found for each handle, so the
# per-tweet columns it brings along (postedAt, tweetId, tags, ...) are redundant.
data = {'twitter_lower': congress_handles, 'congress_name_list': congress_name_list}
first_tweet_per_handle = df.drop_duplicates(subset='twitter_lower')

df_congress = (
    pd.DataFrame(data)
    .assign(
        # one Python list per row, so each cell holds a whole embedding vector
        average_embds_global=average_embds['global'].tolist(),
        average_embds_issue_specific=average_embds['issue-specific'].tolist(),
    )
    .merge(first_tweet_per_handle, on='twitter_lower', how='left')
)
df_congress.head()

Unnamed: 0,twitter_lower,congress_name_list,average_embds_global,average_embds_issue_specific,name,party,years,chamber,state,postedAt,source,tweetId,<115><2017><senate><179>,<118><2024><house><142>
0,repfinkenauer,Abby Finkenauer D-IA,"[0.0059132452736815905, 0.043169012535101525, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",Abby Finkenauer D-IA,D,2019_2020,House,IA,2020-06-18 15:00:50.141,PolitWoops,1273631764053987328,False,False
1,repspanberger,Abigail Spanberger D-VA,"[0.013804457920879136, 0.03647619804269963, -0...","[0.009033125097101385, 0.053407079523259945, -...",Abigail Spanberger D-VA,D,2021_2022,House,VA,2022-04-16 14:37:39.608,PolitWoops,1515338638091030528,False,False
2,repkinzinger,Adam Kinzinger R-IL,"[0.0065210342063088894, 0.03753754738159047, -...","[0.010206903110850941, 0.038581553372469816, -...",Adam Kinzinger R-IL,R,2017_2018,House,IL,2018-11-15 16:25:46.373,PolitWoops,1063105789286666240,False,False
3,repadamschiff,Adam Schiff D-CA,"[0.020784396506911152, 0.048458186244812865, -...","[0.01086626554790296, 0.049639375586258735, -0...",Adam Schiff D-CA,D,2019_2020,House,CA,2019-11-19 20:26:15.846,PolitWoops,1196887433289359360,False,False
4,repadamsmith,Adam Smith D-WA,"[0.014037557642971887, 0.045134131921029684, -...","[-0.0011352598667144775, 0.05262286927964952, ...",Adam Smith D-WA,D,2017_2018,House,WA,2018-08-01 20:44:57.706,PolitWoops,1024757902551261184,False,False


In [16]:
# Same roll-call file as above; `file_name` is the part of the path between
# 'congress_votes_' and '.csv'.
data_path = '../congress_votes/congress_votes_118-2024_h142.csv'
file_name = data_path.rpartition('congress_votes_')[2].partition('.csv')[0]

# header=1: the column names are on the file's second line.
df_vote_result = pd.read_csv(data_path, header=1)

df_vote_result.head()

Unnamed: 0,person,state,district,vote,name,party
0,400004,AL,4,Yea,Rep. Robert Aderholt [R],Republican
1,400030,GA,2,Not Voting,Rep. Sanford Bishop [D],Democrat
2,400033,OR,3,Yea,Rep. Earl Blumenauer [D],Democrat
3,400052,TX,26,Yea,Rep. Michael Burgess [R],Republican
4,400057,CA,41,Yea,Rep. Ken Calvert [R],Republican


In [17]:
# Link every roll-call row to a member in `df_congress` and copy over that
# member's name, handle, party, state and mean embeddings. Rows for which no
# member can be found keep NaN in all of these columns.
matched_columns = [
    'matched-name',
    'twitter-handle',
    'matched-party',
    'matched-state',
    'average-embd-global',
    'average-embd-issue-specific',
]
for col in matched_columns:
    # object dtype so a cell can hold a string or an entire embedding list
    df_vote_result[col] = pd.Series(np.nan, index=df_vote_result.index, dtype=object)

# Leading title in the roll-call name, e.g. "Rep. " (or "Sen. " for senate votes).
title_prefix = re.compile(r'^(?:Rep|Sen)\.\s+')

for i, row in df_vote_result.iterrows():
    # "Rep. Robert Aderholt [R]" -> "Robert Aderholt", transliterated to ASCII
    name = unidecode(title_prefix.sub('', row['name'].split(' [')[0]))
    party_val = row['party'][0]  # "Republican" -> "R", "Democrat" -> "D"
    state_val = row['state']

    # Only members of the same party and state are considered.
    candidate_names = df_congress.loc[
        (df_congress['party'] == party_val) & (df_congress['state'] == state_val),
        'congress_name_list',
    ].apply(unidecode)
    if candidate_names.empty:
        continue

    # split() without an argument drops empty tokens, which would otherwise
    # be contained in every candidate name.
    name_parts = name.split()

    # Literal substring test per token (regex=False): tokens such as "Jr." or
    # "(Nickname)" must not be interpreted as regular expressions.
    token_hits = [candidate_names.str.contains(napa, regex=False) for napa in name_parts]
    # Number of tokens each candidate contains; used to break ties.
    tokens_matched = sum(hits.astype(int) for hits in token_hits)

    # Try tokens from last to first so the surname decides whenever it is
    # present, and stop at the first token that hits. The previous version
    # kept looping and let a later non-matching token overwrite an earlier
    # match with NaN. Among the candidates containing the deciding token, the
    # one sharing the most tokens wins (first one on a tie), which separates
    # members with the same surname.
    # NOTE(review): when the surname is absent this falls back to earlier
    # tokens (e.g. the first name); spot-check those matches.
    index = None
    for hits in reversed(token_hits):
        if hits.any():
            index = tokens_matched[hits].idxmax()
            break
    if index is None:
        continue

    matched = df_congress.loc[index]
    df_vote_result.at[i, 'matched-name'] = matched['congress_name_list']
    df_vote_result.at[i, 'twitter-handle'] = matched['twitter_lower']
    df_vote_result.at[i, 'matched-party'] = matched['party']
    df_vote_result.at[i, 'matched-state'] = matched['state']
    df_vote_result.at[i, 'average-embd-global'] = matched['average_embds_global']

    # A member without any tagged tweet has an all-NaN mean vector; leave the
    # cell as a single NaN in that case. The check reads the matched row
    # itself rather than indexing a separate array by position.
    issue_embd = matched['average_embds_issue_specific']
    if not np.isnan(issue_embd[0]):
        df_vote_result.at[i, 'average-embd-issue-specific'] = issue_embd

df_vote_result.head()

  if candidate_names.str.contains(napa).any():
  index = candidate_names[candidate_names.str.contains(napa)].index[0]
  if candidate_names.str.contains(napa).any():
  index = candidate_names[candidate_names.str.contains(napa)].index[0]
  if candidate_names.str.contains(napa).any():
  if candidate_names.str.contains(napa).any():
  index = candidate_names[candidate_names.str.contains(napa)].index[0]


Unnamed: 0,person,state,district,vote,name,party,matched-name,twitter-handle,matched-party,matched-state,average-embd-global,average-embd-issue-specific
0,400004,AL,4,Yea,Rep. Robert Aderholt [R],Republican,"Aderholt, Robert",robert_aderholt,R,AL,"[0.017503820146833146, 0.03896857670375279, -0...","[0.006040811538696289, 0.042439937591552734, -..."
1,400030,GA,2,Not Voting,Rep. Sanford Bishop [D],Democrat,"Bishop Jr., Sanford",sanfordbishop,D,GA,"[0.018433302640914917, 0.049648284912109375, -...","[0.017269134521484375, 0.0095062255859375, -0...."
2,400033,OR,3,Yea,Rep. Earl Blumenauer [D],Democrat,,,,,,
3,400052,TX,26,Yea,Rep. Michael Burgess [R],Republican,"Burgess, Michael",michaelcburgess,R,TX,"[0.021851594874400967, 0.05065598708904342, -0...","[0.02545155649599822, 0.08420595915421196, -0...."
4,400057,CA,41,Yea,Rep. Ken Calvert [R],Republican,"Calvert, Ken",kencalvert,R,CA,"[0.018314135260879993, 0.05678512156009674, -0...","[0.020268772777758147, 0.06673792788856908, -0..."


In [18]:
# Write the matched vote table to CSV, named after the vote id in `file_name`.
output_csv_path = f'~/congressional-twin/data/data/{file_name}.csv'
df_vote_result.to_csv(output_csv_path)