In [262]:
import pickle
from tqdm import tqdm
import os

from gensim.models.word2vec import Word2Vec
import numpy as np
from sklearn.cross_decomposition import CCA
from sklearn.metrics.pairwise import cosine_similarity
import json
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
import random

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Worldview & Ideology Analysis

This notebook contains examples of how to perform the analysis from "Aligning Multidimensional Worldviews and Discovering Ideological Differences" (Milbauer et al., 2021).

## If editing in Google Colab: mount Google Drive, clone the GitHub repo (if not already cloned), and go to the cloned folder

In [None]:
# Mount Google Drive at /content/drive.
# `google.colab` is imported in this cell rather than in the top import cell
# (NOTE(review): presumably so the notebook still imports cleanly outside Colab — confirm).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# One-time setup, left commented out; uncomment the lines you need:
# %cd drive/MyDrive/Colab_Notebooks #go to main Colab folder
# ! git clone 'https://github.com/limyuzheng88/worldview-ideology.git' #clone
# ! git pull #if already cloned repo, then just pull

# The path is relative, so this only works from the directory that contains `drive/`
# (NOTE(review): presumably /content right after the mount cell — confirm).
%cd drive/MyDrive/Colab_Notebooks/worldview-ideology/
!ls

## Load corpora

In [64]:
# Names of the two communities being compared; they double as file stems.
source_a = 'subreddit_askmen'
source_b = 'subreddit_askwomen'

# Each corpus is held as a list of raw lines (trailing newlines are kept).
with open('./corpus/{}.txt'.format(source_a), encoding='utf-8') as f:
    corpus_a = list(f)
with open('./corpus/{}.txt'.format(source_b), encoding='utf-8') as f:
    corpus_b = list(f)

## Loading the trained embeddings

First, we load the trained embeddings, and quickly examine them to see if they make sense.
We are using small text samples (500k tokens), so embeddings may not be very good.

In [122]:
# Load the two trained Word2Vec models (r/politics and r/the_donald, going by the file names).
# NOTE: a later cell reassigns model_a / model_b to the models for source_a / source_b,
# so everything downstream uses whichever of the two loading cells ran last.
model_a = Word2Vec.load('models/politics.word2vec.model')
model_b = Word2Vec.load('models/the_donald.word2vec.model')
# pretrained on more data
# model_a = Word2Vec.load('models/politics.big.model')
# model_b = Word2Vec.load('models/the_donald.big.model')

# Sanity check: print each model's nearest neighbours of the query words.
# Words in posWords pull the query towards them, words in negWords push it away.
posWords = ['democrat']
negWords = []
for x in model_a.wv.most_similar(positive=posWords, negative=negWords):
    print(x)
print()
for x in model_b.wv.most_similar(positive=posWords, negative=negWords):
    print(x)

('republican', 0.6307386159896851)
('dem', 0.6165011525154114)
('democrats', 0.6043610572814941)
('democratic', 0.5978788137435913)
('dems', 0.49818018078804016)
('liberal', 0.48974037170410156)
('candidate', 0.4867892265319824)
('party', 0.4780939519405365)
('republicans', 0.477538526058197)
('progressive', 0.4756127595901489)

('dem', 0.6980773210525513)
('republican', 0.6605644226074219)
('democratic', 0.6189409494400024)
('democrats', 0.6140726804733276)
('party', 0.6009922623634338)
('dems', 0.48170965909957886)
('liberal', 0.4369395971298218)
('leftist', 0.42886313796043396)
('left', 0.4251934587955475)
('candidates', 0.4231654405593872)


In [54]:
# MY DATASET

# Replace model_a / model_b with the models stored under the source_a / source_b names.
model_a = Word2Vec.load('data/models/{}.model'.format(source_a))
model_b = Word2Vec.load('data/models/{}.model'.format(source_b))
# pretrained on more data
# NOTE(review): the two commented-out paths below point at the politics / the_donald
# models, not at this dataset — presumably copied from the earlier loading cell.
# model_a = Word2Vec.load('models/politics.big.model')
# model_b = Word2Vec.load('models/the_donald.big.model')

# Sanity check: nearest neighbours of (men + porn - women) in each model.
posWords = ['men', 'porn']
negWords = ['women']
for x in model_a.wv.most_similar(positive=posWords, negative=negWords):
    print(x)
print()
for x in model_b.wv.most_similar(positive=posWords, negative=negWords):
    print(x)

('addiction', 0.674181342124939)
('watching', 0.5972264409065247)
('watch', 0.5872642993927002)
('watched', 0.5676982402801514)
('modern', 0.561260461807251)
('fantasy', 0.5315399765968323)
('content', 0.5298962593078613)
('argue', 0.5205428600311279)
('becomes', 0.5109003186225891)
('sexuality', 0.5108972787857056)

('creepy', 0.5877805948257446)
('attracted', 0.5398934483528137)
('pleasure', 0.5357645750045776)
('flirting', 0.5247712135314941)
('man', 0.5183247327804565)
('sexually', 0.5093643665313721)
('watching', 0.5009644031524658)
('dudes', 0.49731189012527466)
('hitting', 0.4970484673976898)
('anime', 0.4925317168235779)


## Aligning the embeddings

First, we find the overlapping vocabulary of the two models, and use this to construct an embedding matrix for each model.

In [23]:
# Vocabulary of each model as a sorted list. Sorting (instead of relying on set
# iteration order, which changes with string hash randomization) keeps the row
# order of mtxA_ / mtxB_ and the a2idx / idx2b maps identical across kernel restarts.
vocab_a = sorted(model_a.wv.vocab.keys())
vocab_b = sorted(model_b.wv.vocab.keys())

# Words known to both models (sorted list) and words known to either model (set).
shared_vocab = sorted(set(vocab_a) & set(vocab_b))
combo_vocab = set(vocab_a) | set(vocab_b)

# Index maps:
#   w2idx : word -> row of mtxA / mtxB (shared vocabulary)
#   a2idx : word -> row of mtxA_       (model A's full vocabulary)
#   idx2b : row of mtxB_ -> word       (model B's full vocabulary)
w2idx = {w: i for i, w in enumerate(shared_vocab)}
a2idx = {w: i for i, w in enumerate(vocab_a)}
idx2b = {i: w for i, w in enumerate(vocab_b)}

# Embedding matrices, one row per word: shared vocabulary only (mtxA, mtxB)
# and each model's full vocabulary (mtxA_, mtxB_).
mtxA = np.vstack([model_a.wv[w] for w in shared_vocab])
mtxB = np.vstack([model_b.wv[w] for w in shared_vocab])
mtxA_ = np.vstack([model_a.wv[w] for w in vocab_a])
mtxB_ = np.vstack([model_b.wv[w] for w in vocab_b])

We then select only the N most common words as anchors to train our alignment. (If you're using the big model, this won't quite work because the vocabularies are different.)

In [24]:
# Word counts used to rank anchor candidates. The file is opened in a `with`
# block so the handle is closed as soon as the pickle has been read.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# a counts.pkl that you produced yourself.
with open('data/counts.pkl', 'rb') as f:
    counts = pickle.load(f)
n = 5000
# Rank the shared-vocabulary words (w2idx is built from shared_vocab) that have a
# count by (count, word), highest first, and keep the first n as anchors.
topN = sorted((w for w in w2idx if w in counts), key=lambda w: (counts[w], w), reverse=True)[:n]
# Rows of mtxA / mtxB that correspond to the anchor words.
idxs = [w2idx[w] for w in topN]

In [25]:
# Anchor rows: the same words, in the same order, taken from each model's matrix.
anchorA = mtxA[idxs]
anchorB = mtxB[idxs]

Next, we use two different techniques for aligning the embeddings: SVD and CCA.

In [26]:
def align_svd(source, target):
    """Return the orthogonal matrix that maps rows of `source` onto rows of `target`.

    The matrix is U @ Vh, where U and Vh come from the SVD of
    source^T @ target (the orthogonal Procrustes solution).

    Args:
        source: 2-D array, one row per anchor word.
        target: 2-D array with the same number of rows as `source`.

    Returns:
        2-D array T; apply it to source-space rows with `rows.dot(T)`.
    """
    cross = source.T @ target
    # np.linalg.svd returns the right factor already transposed (Vh).
    left, _singular_values, right_h = np.linalg.svd(cross)
    return left @ right_h

# Learn the map from the anchor rows only, then apply it to every row of
# model A's full-vocabulary matrix. Model B's matrix is used unchanged.
svd = align_svd(anchorA, anchorB)
svdA = mtxA_.dot(svd)
svdB = mtxB_

In [27]:
def align_cca(source, target):
    """Fit a CCA model on `source` and `target` and return the fitted model.

    The number of components is set to the number of columns of `source`,
    and the fit is allowed up to 2000 iterations.

    Args:
        source: 2-D array, one row per anchor word.
        target: 2-D array passed as the second view to `CCA.fit`.

    Returns:
        The fitted `CCA` instance.
    """
    fitted = CCA(n_components=source.shape[1], max_iter=2000)
    fitted.fit(source, target)
    return fitted

# Fit CCA on the anchor rows, then project both shared-vocabulary matrices with it.
# NOTE(review): ccaA / ccaB are not referenced again in the cells visible here —
# the translator is built from the SVD alignment (svdA / svdB).
cca = align_cca(anchorA, anchorB)
ccaA, ccaB = cca.transform(mtxA, mtxB)

In [28]:
def build_translator(a, b, a2idx, idx2b):
    """Build a nearest-neighbour lookup from rows of `a` to rows of `b`.

    The full cosine-similarity matrix between `a` and `b`, and the ranking of
    every row of it, are computed once when this function is called.

    Args:
        a: 2-D array of source embeddings, one row per word.
        b: 2-D array of target embeddings, one row per word.
        a2idx: mapping from a source word to its row index in `a`.
        idx2b: mapping from a row index of `b` to the target word.

    Returns:
        A function `translator(w, k=1)` returning `(words, scores)`: the `k`
        target words most similar to `w`, best first, and their cosine
        similarities. Looking up a word that is missing from `a2idx` fails in
        whatever way `a2idx[w]` fails.
    """
    similarity = cosine_similarity(a, b)
    # For each row, column indices ordered from most to least similar.
    ranking = np.argsort(similarity, axis=1)[:, ::-1]

    def translator(w, k=1):
        row = a2idx[w]
        best = ranking[row, :k]
        return [idx2b[col] for col in best], similarity[row, best]

    return translator

In [29]:
# Translator from model A's vocabulary (SVD-aligned rows) to model B's vocabulary.
translator = build_translator(svdA, svdB, a2idx, idx2b)

## Exploring the Alignment

We now explore three different ways of using the aligned embeddings to explore the worldview and ideology of the two communities.

In [92]:
# Five model-B words closest to model A's 'democrat', with their cosine similarities.
translator('democrat', k=5)

(['democrat', 'republican', 'dem', 'democrats', 'republicans'],
 array([0.6164709 , 0.5891098 , 0.5137549 , 0.4719857 , 0.46580008],
       dtype=float32))

In [45]:
# MY DATASET
# Five model-B words closest to model A's 'lol', with their cosine similarities.
translator('lol', k=5)

(['lol', 'haha', 'yeah', ':face_with_tears_of_joy:', 'idk'],
 array([0.79610276, 0.7740769 , 0.7501412 , 0.74681777, 0.7448173 ],
       dtype=float32))

'aa'

In [269]:
# `lemmatizer` is not defined in any other visible cell; create it here so this
# cell also runs on a fresh kernel.
lemmatizer = WordNetLemmatizer()
# Lemmatize each token of a tiny sample string.
[lemmatizer.lemmatize(token) for token in word_tokenize('aa dd')]

['aa', 'dd']

In [270]:
# MY DATASET
class color:
    """ANSI escape sequences used to style printed text.

    Wrap text as `color.BOLD + color.BLUE + text + color.END`; END resets the
    styling so that following output is unaffected.
    """

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset all styling
    END = '\033[0m'

# function to get example sentences that contain said word  
def get_example_sentences(word, corpus, n_sentences=1, seed=None):
    """Return up to `n_sentences` randomly chosen corpus entries that contain `word`.

    Args:
        word: text to look for. Matching is plain substring containment, so
            'sex' also matches an entry that contains 'sexuality'.
        corpus: iterable of strings to search.
        n_sentences: number of entries wanted. When fewer entries match, all
            matching entries are returned instead of raising ValueError.
        seed: seed for the sampler. The same seed gives the same selection;
            None gives a fresh random selection on every call.

    Returns:
        A list of matching entries in random order (empty when nothing matches).
    """
    matches = [s for s in corpus if word in s]
    # A private generator gives the same draw as seeding the module-level one,
    # without resetting the global `random` state for the rest of the notebook.
    rng = random.Random(seed)
    # random.sample raises ValueError when asked for more items than are available.
    return rng.sample(matches, min(n_sentences, len(matches)))
    
# Settings shared by the example-sentence printouts.
n_sentences = 1
seed = None

# Show the source word in context (bold blue) in corpus_a.
word_a = 'sex'
word_a_highlighted = '{}{}{}'.format(color.BOLD + color.BLUE, word_a, color.END)
print('\'{}\' in source_a ({} example sentence below):'.format(word_a_highlighted, n_sentences))
for s in get_example_sentences(word_a, corpus_a, n_sentences=n_sentences, seed=seed):
    print(s.replace(word_a, word_a_highlighted))

# Query the translator once and reuse the (words, scores) result.
word_b = translator(word_a, k=5)
print('translates to source_b words:\n')
# Show each translated word (bold red) with its score and in context in corpus_b.
for w in zip(word_b[0], word_b[1]):
    word_b_highlighted = '{}{}{}'.format(color.BOLD + color.RED, w[0], color.END)
    print(word_b_highlighted + ', aligment score: {}'.format(w[1]))
    for s in get_example_sentences(w[0], corpus_b, n_sentences=n_sentences, seed=seed):
        print(s.replace(w[0], word_b_highlighted))

'[1m[94msex[0m' in source_a (1 example sentence below):
it really depends on your relationship with the cheater and how severe it is . typically ( like most people here saying ‘ no ' ) , it 's not ideal to trust a cheater . and there is every reason not to . however , the only time they deserve a second chance is when they repent and seek forgiveness ( not just with you but with themselves ) . and show a great deal of effort to improve and restore the relationship . also , this is marginal , but if their cheating is also not a byproduct of their making ; they happen to be a [1m[94msex[0m addict and need serious mental treatment to help express their emotions and vent healthily rather than through [1m[94msex[0m . when they should n't be trusted is when their cheating is more than just that . if they happen to lie , deceive , manipulate and much more ... cut them out . do n't give them a second chance . if these people are willing to cheat , make you feel like shit , and see no 

### Misalignment

In [62]:
# A shared word is "misaligned" when its best translation is a different word.
misaligned = []
scores = []

for w in shared_vocab:
    # Call the translator once per word and read both the word and its score from it.
    t = translator(w)
    w_ = t[0][0]
    s = t[1][0]
    if w != w_:
        misaligned.append((w, w_))
        scores.append(s)

# Fraction of the shared vocabulary that is misaligned.
print(len(misaligned) / len(shared_vocab))

0.3664901664145234


In [46]:
# MY DATASET
# A shared word is "misaligned" when its best translation is a different word.
misaligned = []
scores = []

for w in shared_vocab:
    # Call the translator once per word and read both the word and its score from it.
    t = translator(w)
    w_ = t[0][0]
    s = t[1][0]
    if w != w_:
        misaligned.append((w, w_))
        scores.append(s)

# Fraction of the shared vocabulary that is misaligned.
print(len(misaligned) / len(shared_vocab))

0.38506024096385544


In [64]:
# The 20 misaligned pairs with the highest translation score, best first.
for pair, score in sorted(
    zip(misaligned, scores),
    key=lambda item: item[1],
    reverse=True,
)[:20]:
    print(pair, score)

('performed_automatically', 'please_contact') 0.8923226
('moderators', 'please_contact') 0.8301286
('``', "''") 0.7827312
('&', 'gt') 0.74673975
('bot', 'performed_automatically') 0.7402881
(';', 'gt') 0.71963507
('though', 'but') 0.7046928
('citizenship_question', 'census') 0.68586487
('amp', ';') 0.68398106
('action', 'performed_automatically') 0.6676772
('couple', 'few') 0.6567316
('disagree', 'agree') 0.64628285
('dems', 'democrats') 0.6362802
('supreme_court', 'scotus') 0.61996275
('republican', 'democrat') 0.6085014
('dumb', 'stupid') 0.60647255
('26_times', 'lolita_express') 0.6013237
('capitalism', 'communism') 0.5988106
('jeffrey_epstein', 'epstein') 0.59700453
('illegal_immigrants', 'illegals') 0.5922674


In [47]:
# MY DATASET
# The 20 misaligned pairs with the highest translation score, best first.
for pair, score in sorted(
    zip(misaligned, scores),
    key=lambda item: item[1],
    reverse=True,
)[:20]:
    print(pair, score)

('=/', 'compose') 0.93772894
('automatically', 'compose') 0.9272409
('performed', 'compose') 0.9270565
('subreddit', 'compose') 0.91976225
('bot', 'compose') 0.9151716
('concerns', 'compose') 0.909357
('moderators', 'compose') 0.9054626
('[', ']') 0.8976162
('action', 'compose') 0.89207244
('message', 'performed') 0.87207484
('shorts', 'pants') 0.86476004
('sister', 'brother') 0.8642143
('shirts', 'leggings') 0.8639887
('please', 'compose') 0.85940146
('cheap', 'buy') 0.85615104
('food', 'meal') 0.8556203
('6', '5') 0.854324
('buying', 'buy') 0.85293144
('r', 'compose') 0.85048825
('play', 'games') 0.84840715


### Antonyms

In [48]:
def get_antonyms(vocab):
    """Collect (word, antonym) pairs for the words in `vocab` from WordNet.

    For every lemma of every synset that WordNet returns for a word, the first
    listed antonym (when there is one) is paired with the queried word itself,
    not with the lemma's name. A tqdm progress bar tracks the pass over `vocab`.

    Args:
        vocab: iterable of words to look up.

    Returns:
        A set of (word, antonym_name) tuples.
    """
    pairs = set()
    for word in tqdm(vocab):
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                opposites = lemma.antonyms()
                if opposites:
                    pairs.add((word, opposites[0].name()))
    return pairs

# Antonym pairs for every word that appears in either model's vocabulary.
antonyms = get_antonyms(combo_vocab)

100%|██████████| 2828/2828 [00:09<00:00, 310.07it/s] 


In [66]:
# Print the misaligned pairs that WordNet lists as antonyms in either direction:
# (word, translation) or (translation, word).
for mPair in misaligned:
    if mPair in antonyms or (mPair[1], mPair[0]) in antonyms:
        print(mPair)

('civilian', 'military')
('decrease', 'increase')
('disagree', 'agree')
('disrespect', 'respect')
('illogical', 'logical')
('inaccurate', 'accurate')
('indirectly', 'directly')
('ineffective', 'effective')
('intolerant', 'tolerant')
('invalid', 'valid')
('liability', 'asset')
('sell', 'buy')
('sells', 'buy')
('unreasonable', 'reasonable')
('unwilling', 'willing')
('weakness', 'strength')
('west', 'east')


In [49]:
# MY DATASET
# Print the misaligned pairs that WordNet lists as antonyms in either direction:
# (word, translation) or (translation, word).
for mPair in misaligned:
    if mPair in antonyms or (mPair[1], mPair[0]) in antonyms:
        print(mPair)

('boy', 'girl')
('disagree', 'agree')
('expensive', 'cheap')
('forward', 'back')
('give', 'take')
('light', 'dark')
('male', 'female')
('more', 'less')
('particular', 'general')
('positive', 'negative')
('second', 'first')
('sell', 'buy')
('sister', 'brother')
('small', 'big')
('unfortunately', 'luckily')
('white', 'black')
('wife', 'husband')


In [129]:
# Share of misaligned pairs that are antonyms in either direction:
# (word, translation) or (translation, word).
n_antonym_pairs = sum(
    1 for mPair in misaligned
    if mPair in antonyms or (mPair[1], mPair[0]) in antonyms
)
print("{}% misaligned pairs from 'misaligned', in 'antonyms' set".format(n_antonym_pairs / len(misaligned) * 100))

0.5847953216374269% misaligned pairs from 'misaligned', in 'antonyms' set


In [50]:
# MY DATASET
# Share of misaligned pairs that are antonyms in either direction:
# (word, translation) or (translation, word).
n_antonym_pairs = sum(
    1 for mPair in misaligned
    if mPair in antonyms or (mPair[1], mPair[0]) in antonyms
)
print("{}% misaligned pairs from 'misaligned', in 'antonyms' set".format(n_antonym_pairs / len(misaligned) * 100))

2.127659574468085% misaligned pairs from 'misaligned', in 'antonyms' set


### Translation / Conceptual Homomorphisms

In [51]:
# Words that model A knows and model B does not, in model A's vocabulary order.
unique_vocab = [w for w in model_a.wv.vocab if w not in model_b.wv.vocab]

In [52]:
# Best model-B translation, and its score, for each word unique to model A.
# NOTE: this rebinds `scores`, which the misalignment cells also use.
translations, scores = [], []
for w in unique_vocab:
    t = translator(w)
    best_word, best_score = t[0][0], t[1][0]
    translations.append((w, best_word))
    scores.append(best_score)

In [69]:
# The 20 translations with the highest score, best first.
for pair, score in sorted(
    zip(translations, scores),
    key=lambda item: item[1],
    reverse=True,
)[:20]:
    print(pair, score)

('instructions_provided', 'performed_automatically') 0.71877486
('permanent_ban', 'performed_automatically') 0.69331694
('rule_violations', 'performed_automatically') 0.63353837
('wishing_death/physical', 'performed_automatically') 0.594555
('fully_participate', 'please_contact') 0.5898004
('rulebreaking_content', 'performed_automatically') 0.5775635
('`_youtu.be', '`') 0.55210274
('spam_domain', 'performed_automatically') 0.5434005
('/r/politics_within', 'performed_automatically') 0.52550036
('troll_accusations', 'performed_automatically') 0.51061064
('whitelisting', 'performed_automatically') 0.4963802
('blatant_spam', 'performed_automatically') 0.48971322
('confederate_flag', 'flag') 0.48527563
('excluding_indians', 'persons') 0.48497242
('site_administrators', 'link_shortener') 0.48107997
('following_reason', 'submission') 0.48058963
('alan_dershowitz', 'epstein') 0.48009375
('drinking_water', 'water') 0.47866067
('breaking_channel', 'link_shortener') 0.47774062
('nonreputable_/', 

In [254]:
# MY DATASET

# The 20 highest-scoring translations, each shown with example sentences from
# both corpora (source word in bold blue, translated word in bold red).
for pair, score in sorted(
    zip(translations, scores),
    key=lambda item: item[1],
    reverse=True,
)[:20]:
    word_a_highlighted = '{}{}{}'.format(color.BOLD + color.BLUE, pair[0], color.END)
    word_b_highlighted = '{}{}{}'.format(color.BOLD + color.RED, pair[1], color.END)
    print(word_a_highlighted + ' & ' + word_b_highlighted + ', aligment score: {}'.format(score))

    # Source word with its example sentences from corpus_a.
    print('{}:'.format(word_a_highlighted))
    for s in get_example_sentences(pair[0], corpus_a, n_sentences=n_sentences, seed=seed):
        print(s.replace(pair[0], word_a_highlighted))

    # Translated word with its example sentences from corpus_b.
    print('{}:'.format(word_b_highlighted))
    for s in get_example_sentences(pair[1], corpus_b, n_sentences=n_sentences, seed=seed):
        print(s.replace(pair[1], word_b_highlighted))
    

[1m[94maskmen[0m & [1m[91mcompose[0m, aligment score: 0.9154050946235657
[1m[94maskmen[0m:
your submission was removed by a computer . this could be for a number of reasons , most of which are summarized in the rules text on the right . in most of these cases , the computer is right , and we will not overturn its decision . if you have re-read your question and still think this is a failure of the automated filter , message us with an actual reason as to why the computer is wrong . if you just say that you think the computer is wrong without any reasoning , we will ignore you . * i am a bot , and this action was performed automatically . please [ contact the moderators of this subreddit ] ( / message / compose / ? to =/ r / [1m[94maskmen[0m ) if you have any questions or concerns . *

[1m[91mcompose[0m:
hello digital_curations . thank you for participating in / r / askwomen . however , your submission has been removed , because we do not allow personal advice or evaluati

your submission was removed by a computer . this could be for a number of reasons , most of which are summarized in the rules text on the right . in most of these cases , the computer is right , and we will not overturn its decision . if you have re-read your question and still think this is a failure of the automated [1m[94mfilter[0m , message us with an actual reason as to why the computer is wrong . if you just say that you think the computer is wrong without any reasoning , we will ignore you . * i am a bot , and this action was performed automatically . please [ contact the moderators of this subreddit ] ( / message / compose / ? to =/ r / askmen ) if you have any questions or concerns . *

[1m[91mcompose[0m:
hello kaleyt 21 . thank you for participating in / r / askwomen . however , your submission has been removed , because we do not allow personal advice or evaluation submissions . you can always go to / r / askwomenadvice , / r / relationships for romantic / non-romantic