In [20]:
import pickle
from tqdm import tqdm
import os

from gensim.models.word2vec import Word2Vec
import numpy as np
from sklearn.cross_decomposition import CCA
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords, wordnet

# Worldview & Ideology Analysis

This notebook contains examples of how to perform the analysis from "Aligning Multidimensional Worldviews and Discovering Ideological Differences" (Milbauer et al., 2021)

## Loading the trained embeddings

First, we load the trained embeddings, and quickly examine them to see if they make sense.
We are using small text samples (500k tokens), so embeddings may not be very good.

In [53]:
# Load the two community-specific Word2Vec models.
model_a = Word2Vec.load('models/politics.word2vec.model')
model_b = Word2Vec.load('models/the_donald.word2vec.model')
# Alternative: models pretrained on more data.
# model_a = Word2Vec.load('models/politics.big.model')
# model_b = Word2Vec.load('models/the_donald.big.model')

# Sanity check: print each model's nearest neighbours of the query words,
# with a blank line separating the two listings.
posWords = ['biden']
negWords = []
for position, model in enumerate((model_a, model_b)):
    if position:
        print()
    for neighbour in model.wv.most_similar(positive=posWords, negative=negWords):
        print(neighbour)

('harris', 0.7471194267272949)
('warren', 0.7145416736602783)
('sanders', 0.6976892948150635)
('bernie', 0.6945799589157104)
('joe_biden', 0.6156193017959595)
('kamala', 0.6149285435676575)
('buttigieg', 0.5869153141975403)
('candidate', 0.5764375925064087)
('joe', 0.5390542149543762)
('nomination', 0.5072815418243408)

('harris', 0.5104426145553589)
('creepy_joe', 0.48085713386535645)
('joe_biden', 0.4680749475955963)
('joe', 0.4410553574562073)
('sniffing', 0.4272039234638214)
('warren', 0.42609065771102905)
('sleepy_joe', 0.4177272915840149)
('kamala', 0.4112250804901123)
('nominee', 0.3884388506412506)
('hillary', 0.38795745372772217)


## Aligning the embeddings

First, we find the overlapping vocabulary of the two models, and use this to construct an embedding matrix for each model.

In [54]:
# Word sets of each model; the list order is whatever the set yields and it
# fixes the row order of mtxA_ / mtxB_ (and of a2idx / idx2b) below.
words_a = set(model_a.wv.vocab.keys())
words_b = set(model_b.wv.vocab.keys())
vocab_a = list(words_a)
vocab_b = list(words_b)

# Words known to both models (alphabetical) and words known to either model.
shared_vocab = sorted(words_a & words_b)
combo_vocab = words_a | words_b

# Lookup tables: shared word -> row of mtxA/mtxB, model-A word -> row of mtxA_,
# and row of mtxB_ -> model-B word.
w2idx = dict(zip(shared_vocab, range(len(shared_vocab))))
a2idx = dict(zip(vocab_a, range(len(vocab_a))))
idx2b = dict(enumerate(vocab_b))


def _embedding_matrix(model, words):
    """Stack the vectors ``model.wv[w]`` for each word in ``words`` as rows."""
    return np.vstack([model.wv[w] for w in words])


# Shared-vocabulary matrices (rows aligned across the two models) ...
mtxA = _embedding_matrix(model_a, shared_vocab)
mtxB = _embedding_matrix(model_b, shared_vocab)
# ... and full-vocabulary matrices (one row per word of each model).
mtxA_ = _embedding_matrix(model_a, vocab_a)
mtxB_ = _embedding_matrix(model_b, vocab_b)

We then select only the N most common words as anchors to train our alignment. (If you're using the big model, this won't quite work because the vocabularies are different.)

In [55]:
# Load the word counts with a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code; only load a counts
# file that comes from a trusted source.
with open('data/counts.pkl', 'rb') as counts_file:
    counts = pickle.load(counts_file)

# Number of anchor words used to fit the alignment.
n = 5000
# Shared words that have a count, ordered by (count, word) descending; keep
# the first n of them.
topN = sorted(
    (w for w in w2idx if w in counts),
    key=lambda w: (counts[w], w),
    reverse=True,
)[:n]
# Row indices of the anchor words in the shared-vocabulary matrices.
idxs = [w2idx[w] for w in topN]

In [56]:
# Rows of each shared-vocabulary matrix for the anchor words selected in idxs.
anchorA, anchorB = mtxA[idxs], mtxB[idxs]

Next, we use two different techniques for aligning the embeddings: SVD and CCA

In [57]:
def align_svd(source, target):
    """Compute an orthogonal map from the ``source`` space to the ``target`` space.

    Takes the SVD of ``source.T @ target`` and multiplies its two orthogonal
    factors together, discarding the singular values (the orthogonal
    Procrustes solution).

    Parameters
    ----------
    source, target : 2-D arrays with the same number of rows; row ``i`` of
        each is expected to describe the same item.

    Returns
    -------
    Square matrix ``T``; apply it on the right (``source @ T``) to move
    source rows into the target space.
    """
    cross = source.T @ target
    # np.linalg.svd returns the right factor already transposed.
    left, _, right_h = np.linalg.svd(cross)
    return left @ right_h

# Fit the map on the anchor words only, then apply it to every word of
# model A; model B's vectors are used unchanged.
svd = align_svd(anchorA, anchorB)
svdA, svdB = mtxA_ @ svd, mtxB_

In [58]:
def align_cca(source, target):
    """Fit a CCA model relating ``source`` rows to ``target`` rows.

    The number of components equals the number of columns of ``source`` and
    the fit is allowed up to 2000 iterations.

    Parameters
    ----------
    source, target : 2-D arrays passed straight to ``CCA.fit``.

    Returns
    -------
    The fitted ``CCA`` estimator.
    """
    model = CCA(n_components=source.shape[1], max_iter=2000)
    # fit() returns the estimator itself.
    return model.fit(source, target)

# Fit CCA on the anchor words, then project both shared-vocabulary matrices.
cca = align_cca(anchorA, anchorB)
projections = cca.transform(mtxA, mtxB)
ccaA, ccaB = projections

In [59]:
def build_translator(a, b, a2idx, idx2b):
    """Build a nearest-neighbour lookup from rows of ``a`` to rows of ``b``.

    The full cosine-similarity matrix between ``a`` and ``b`` and the
    per-row ranking are computed once, when the translator is built.

    Parameters
    ----------
    a, b : 2-D arrays of row vectors, passed to ``cosine_similarity``.
    a2idx : mapping from a word to its row index in ``a``.
    idx2b : mapping from a row index in ``b`` to its word.

    Returns
    -------
    A function ``translator(w, k=1)`` returning ``(words, similarities)``:
    the ``k`` words of ``b`` most similar to ``w``, most similar first, and
    their similarity scores. Raises ``KeyError`` if ``w`` is not in ``a2idx``.
    """
    sims = cosine_similarity(a, b)
    # argsort is ascending; flip each row to get most-similar-first order.
    ranked = np.fliplr(np.argsort(sims, axis=1))

    def translator(w, k=1):
        row = a2idx[w]
        top = ranked[row, :k]
        return [idx2b[col] for col in top], sims[row, top]

    return translator

In [60]:
# Translate from model A's (SVD-aligned) space into model B's space.
translator = build_translator(a=svdA, b=svdB, a2idx=a2idx, idx2b=idx2b)

## Exploring the Alignment

We now explore three different ways of using the aligned embeddings to explore the worldview and ideology of the two communities.

In [71]:
# Five closest model-B words to model A's 'democrat', with their similarities.
translator(w='democrat', k=5)

(['democrat', 'republican', 'dem', 'democrats', 'republicans'],
 array([0.61647093, 0.58910996, 0.51375484, 0.4719858 , 0.46580005],
       dtype=float32))

### Misalignment

In [62]:
# Collect shared words whose nearest neighbour in the other model is a
# different word, together with the similarity of that neighbour.
misaligned = []
scores = []

for w in shared_vocab:
    # Query the translator once and unpack both results (previously the same
    # lookup was performed twice per word).
    words, sims = translator(w)
    w_, s = words[0], sims[0]
    if w != w_:
        misaligned.append((w, w_))
        scores.append(s)

# Fraction of the shared vocabulary that does not translate to itself.
print(len(misaligned) / len(shared_vocab))

0.3664901664145234


In [64]:
# Show the 20 misaligned pairs with the highest similarity score.
ranked_misaligned = sorted(zip(misaligned, scores), key=lambda item: item[1], reverse=True)
for pair, score in ranked_misaligned[:20]:
    print(pair, score)

('performed_automatically', 'please_contact') 0.8923226
('moderators', 'please_contact') 0.8301286
('``', "''") 0.7827312
('&', 'gt') 0.74673975
('bot', 'performed_automatically') 0.7402881
(';', 'gt') 0.71963507
('though', 'but') 0.7046928
('citizenship_question', 'census') 0.68586487
('amp', ';') 0.68398106
('action', 'performed_automatically') 0.6676772
('couple', 'few') 0.6567316
('disagree', 'agree') 0.64628285
('dems', 'democrats') 0.6362802
('supreme_court', 'scotus') 0.61996275
('republican', 'democrat') 0.6085014
('dumb', 'stupid') 0.60647255
('26_times', 'lolita_express') 0.6013237
('capitalism', 'communism') 0.5988106
('jeffrey_epstein', 'epstein') 0.59700453
('illegal_immigrants', 'illegals') 0.5922674


### Antonyms

In [65]:
def get_antonyms(vocab):
    """Collect WordNet antonym pairs for the words in ``vocab``.

    For every synset of every word, each lemma that has at least one antonym
    contributes the pair ``(word, name_of_first_antonym)``. Progress over
    ``vocab`` is shown with tqdm.

    Parameters
    ----------
    vocab : iterable of words to look up in WordNet.

    Returns
    -------
    A set of ``(word, antonym)`` tuples.
    """
    pairs = set()
    for word in tqdm(vocab):
        pairs.update(
            (word, lemma.antonyms()[0].name())
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
            if lemma.antonyms()
        )
    return pairs

# Antonym pairs for every word that appears in either model's vocabulary.
antonyms = get_antonyms(vocab=combo_vocab)

100%|██████████| 11909/11909 [00:00<00:00, 42394.39it/s]


In [66]:
# Print the misaligned pairs that are WordNet antonyms of each other.
# The antonym set is directional -- (word, antonym of word) -- so the pair is
# checked in both orientations. The second test previously rebuilt the same
# tuple as the first and therefore never matched anything new.
for mPair in misaligned:
    if mPair in antonyms or (mPair[1], mPair[0]) in antonyms:
        print(mPair)

('civilian', 'military')
('decrease', 'increase')
('disagree', 'agree')
('disrespect', 'respect')
('illogical', 'logical')
('inaccurate', 'accurate')
('indirectly', 'directly')
('ineffective', 'effective')
('intolerant', 'tolerant')
('invalid', 'valid')
('liability', 'asset')
('sell', 'buy')
('sells', 'buy')
('unreasonable', 'reasonable')
('unwilling', 'willing')
('weakness', 'strength')
('west', 'east')


### Translation / Conceptual Homomorphisms

In [67]:
# Words that model A knows but model B does not.
unique_vocab = [w for w in model_a.wv.vocab if w not in model_b.wv.vocab]

In [68]:
# For each A-only word, record its closest model-B word and the similarity.
translations, scores = [], []
for source_word in unique_vocab:
    candidates, similarities = translator(source_word)
    translations.append((source_word, candidates[0]))
    scores.append(similarities[0])

In [69]:
# Show the 20 translations with the highest similarity score.
ranked_translations = sorted(zip(translations, scores), key=lambda item: item[1], reverse=True)
for pair, score in ranked_translations[:20]:
    print(pair, score)

('instructions_provided', 'performed_automatically') 0.71877486
('permanent_ban', 'performed_automatically') 0.69331694
('rule_violations', 'performed_automatically') 0.63353837
('wishing_death/physical', 'performed_automatically') 0.594555
('fully_participate', 'please_contact') 0.5898004
('rulebreaking_content', 'performed_automatically') 0.5775635
('`_youtu.be', '`') 0.55210274
('spam_domain', 'performed_automatically') 0.5434005
('/r/politics_within', 'performed_automatically') 0.52550036
('troll_accusations', 'performed_automatically') 0.51061064
('whitelisting', 'performed_automatically') 0.4963802
('blatant_spam', 'performed_automatically') 0.48971322
('confederate_flag', 'flag') 0.48527563
('excluding_indians', 'persons') 0.48497242
('site_administrators', 'link_shortener') 0.48107997
('following_reason', 'submission') 0.48058963
('alan_dershowitz', 'epstein') 0.48009375
('drinking_water', 'water') 0.47866067
('breaking_channel', 'link_shortener') 0.47774062
('nonreputable_/', 