In [6]:
import nomic
import json
api_key_path = "/home/ubuntu/api_keys.json"

with open(api_key_path, 'r') as j:
    key = json.loads(j.read())['nomic']

nomic.login(key)

from nomic import AtlasDataset
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

dataset = AtlasDataset('hivemind/tweets-from-members-of-us-congress-from-all-time-updated-2024-05-21')

embds = dataset.maps[0].embeddings.latent
df = dataset.maps[0].data.df

[32m2024-06-06 19:42:15.364[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m__init__[0m:[36m829[0m - [1mLoading existing dataset `hivemind/tweets-from-members-of-us-congress-from-all-time-updated-2024-05-21`.[0m
381it [02:39,  2.38it/s]                                                                                                
100%|██████████████████████████████████████████████████████████████████████████████| 129/129 [00:00<00:00, 21018.77it/s]


In [2]:
congress_persons, tweets_per_congress_person = np.unique(list(df['twitter_lower']), return_counts=True)
congress_persons = list(congress_persons)

average_embds = np.zeros((len(congress_persons), embds.shape[1]))
party_by_congress_person = [-1 for cp in congress_persons]

for i, embd in enumerate(tqdm(embds)):
    congress_person_index = congress_persons.index(df['twitter_lower'][i])
    average_embds[congress_person_index] += embd

    if party_by_congress_person[congress_person_index] == -1:
        party_by_congress_person[congress_person_index] = df['party'][i].lower()

average_embds = np.divide(average_embds, tweets_per_congress_person.reshape(-1, 1))
unique_party_labels, party_counts = np.unique(party_by_congress_person, return_counts=True)

 15%|██████████▌                                                            | 453420/3037316 [00:14<01:24, 30675.48it/s]


KeyboardInterrupt: 

In [None]:
from graspologic.embed import ClassicalMDS
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

CMDS = ClassicalMDS(n_components=20)
cmds_embds = CMDS.fit_transform(average_embds)
singular_values = CMDS.singular_values_

CMDS = ClassicalMDS()
cmds_embds = CMDS.fit_transform(average_embds)

LDA = LinearDiscriminantAnalysis()
LDA.fit(cmds_embds, party_by_congress_person)
lda_embds = LDA.transform(cmds_embds)

colors = {
    'd': 'b',
    'r': 'r',
    'i': 'g'
         }

fig, ax = plt.subplots(1,1)
ax.scatter(range(len(singular_values)), singular_values)

In [None]:
fig, ax = plt.subplots(1,2, figsize=(10,5))
color_by_congress_person = [colors[p] for p in party_by_congress_person]
boldness_by_congress_person = tweets_per_congress_person / max(tweets_per_congress_person)

ax[0].scatter(cmds_embds[:, 0], cmds_embds[:, 1], 
              c=color_by_congress_person, 
              alpha=boldness_by_congress_person)
ax[0].set_title('CMDS', fontsize=14)
xlim, ylim = ax[0].get_xlim(), ax[0].get_ylim()

for label,color in colors.items():
    ax[0].scatter(1e5, 1e5, label=label, color=color)

ax[0].set_xlim(*xlim)
ax[0].set_ylim(*ylim)

ax[0].legend()

ax[1].scatter(lda_embds[:, 0], lda_embds[:, 1], c=color_by_congress_person, alpha=boldness_by_congress_person)
ax[1].set_title('LDA o CMDS', fontsize=14)

for ax_ in ax:
    ax_.set_yticks([])
    ax_.set_xticks([])

fig.suptitle('Representations of Congressional Tweeters \nbased on average embedding of tweets', fontsize=16, y=1.05)

In [9]:
dataset.maps[0].data.df

100%|██████████████████████████████████████████████████████████████████████████████| 129/129 [00:00<00:00, 21331.17it/s]


Unnamed: 0,name,party,twitter_lower,years,chamber,state,postedAt,source,tweetId
0,Sean Maloney D-NY,D,repseanmaloney,2021_2022,House,NY,2021-01-11 13:57:00.270,PolitWoops,1348629988589793280
1,Brian Fitzpatrick R-PA,R,repbrianfitz,2021_2022,House,PA,2021-04-15 13:00:15.852,PolitWoops,1382680168792211456
2,Michael Bennet D-CO,D,senbennetco,2015_2016,Senate,CO,2016-05-06 18:11:51.163,PolitWoops,728648485201915904
3,Jim Risch R-ID,R,senatorrisch,2017_2018,Senate,ID,2017-01-03 23:04:48.118,PolitWoops,816420071769776128
4,Joseph Manchin D-WV,D,sen_joemanchin,2019_2020,Senate,WV,2019-04-25 21:15:00.255,PolitWoops,1121523023104708608
...,...,...,...,...,...,...,...,...,...
3032307,Val Demings D-FL,D,repvaldemings,2019_2020,House,FL,2019-12-12 02:07:01.054,PolitWoops,1204945719750250496
3032308,Val Demings D-FL,D,repvaldemings,2019_2020,House,FL,2019-07-26 16:29:23.446,PolitWoops,1154790829753556992
3032309,Jodey Arrington R-TX,R,reparrington,2021_2022,House,TX,2021-07-29 16:27:23.209,PolitWoops,1420783018793443328
3032310,Jimmy Gomez D-CA,D,repjimmygomez,2021_2022,House,CA,2021-01-25 15:04:25.852,PolitWoops,1353720387109052416


In [10]:
dataset.maps[0].data.df.columns

100%|██████████████████████████████████████████████████████████████████████████████| 129/129 [00:00<00:00, 21362.33it/s]


Index(['name', 'party', 'twitter_lower', 'years', 'chamber', 'state',
       'postedAt', 'source', 'tweetId'],
      dtype='object')

In [None]:
dir(data_.dataset)