In [1]:
########################################################################################################
####################################### FOR TESTING ####################################################
########################################################################################################

In [2]:
#!/usr/bin/env python3

In [56]:
from gensim.models import Word2Vec
from procrustes import smart_procrustes_align_gensim
import numpy as np
import pandas as pd
from plotnine import ggplot, aes, geom_col, theme_classic, scale_fill_manual, labs, element_text, theme
from spacy.lang.en import English
from scipy.spatial.distance import cosine
import os
from statistics import harmonic_mean
nlp = English(pipeline=[])
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7bd84b2e1790>

In [4]:
def get_sentences(text: str) -> list[list[str]]:
    """Split the specified text into sentences, consisting of text tokens."""

    sents = []

    # We process the text in chunks by paragraph, ensuring that a sentence
    # never crosses a paragraph boundary:
    for para in text.split("\n\n"):
        doc = nlp(para.replace("\n", " "))
        for sent in doc.sents:
            tokens = [
                token.text.lower().strip() for token in sent if not token.is_space
            ]
            sents.append(tokens)

    return sents

In [5]:
def similarity_print(vec, target: str, subName: str, n: int = 10):

    print("Top " + str(n) + " words similar to " + target + " in " + subName + ":")

    for word, value in vec.most_similar(target, topn=n):

        print(f"{value: .2f} {word}")

In [43]:
def tokenize(corpus_path : str, corpora_path="corpora"):
    """
    Return a tokenized version of the corpus at corpus_path
    """

    # Read and tokenize corpus from disc
    with open(os.path.join(corpora_path, corpus_path)) as f:
        raw = f.read()
        tokenized = get_sentences(raw)
    
    return tokenized

In [37]:
def get_aligned_embedding_spaces(corpus1: str, corpus2: str):
    """
    Create an embedding space for corpus1 and corpus 2,
    align corpus 2 to corpus 1, and return their embeddings
    """

    # TODO: ignore words with low frequencies
    # TODO: find ideal threshold

    model1 = Word2Vec(corpus1, epochs=25, window=4)
    model2 = Word2Vec(corpus2, epochs=25, window=4)

    print("Word2Vec models generated")

    model2_aligned = smart_procrustes_align_gensim(model1, model2)

    print("Embedding spaces aligned")

    embeddings1 = model1.wv
    embeddings2 = model2_aligned.wv

    return (embeddings1, embeddings2)

In [None]:
# Read and tokenize dem corpus from disc
with open("corpora/demText.txt") as f:
    demText = f.read()
    demSents = get_sentences(demText)

# Read and tokenize rep corpus from disc
with open("corpora/repText.txt") as f:
    repText = f.read()
    repSents = get_sentences(repText)

# TODO: ignore words with low frequencies
# TODO: find ideal threshold

demModel = Word2Vec(demSents, epochs=25, window=4)
repModel = Word2Vec(repSents, epochs=25, window=4)

print("Word2Vec models generated")

repModelAligned = smart_procrustes_align_gensim(demModel, repModel)

print("Embedding spaces aligned")

demVec = demModel.wv
repVec = repModelAligned.wv

In [None]:
def get_least_similar(embeddings1, embeddings2, topn=10, all=False) -> dict:
    """
    Return the topn words with the lowest cosine similarity
    across the two embedding spaces and their similarity score.
    """

    # Get list of tokens used in both embedding spaces (intersection of both vocabularies)
    vocab = list(set(embeddings1.index_to_key) & set(embeddings2.index_to_key)) 

    # Give option to get all tokens
    if all:
        topn = len(vocab)

    # Filter to only tokens that are in both datasets
    vec1 = {token: embeddings1[token] for token in vocab}
    vec2 = {token: embeddings2[token] for token in vocab}

    # Calculate each token's cosine similarity across the two embedding spaces
    similarities = {token: 1 - cosine(vec1[token], vec2[token]) for token in vocab}

    # Lower index --> less similar
    least_sim = sorted(similarities, key=lambda x: similarities[x], reverse=True)

    # Get the vectors of the 10 least similar words across embedding spaces
    bottom_n_words = least_sim[:topn]
    bottom_n_sims = {token: similarities[token] for token in bottom_n_words}

    return bottom_n_sims




In [None]:
import os
corpora_filepath = "corpora"
corpora = os.listdir(corpora_filepath)
corpus_comparisons = []

# Maybe a way to do all of this with matrix operations?
for corpus1 in corpora:
    for corpus2 in corpora:
        # No point in comparing a corpora's similarity to itself
        if corpus1 == corpus2 or {corpus1, corpus2} in [c["corpora"] for c in corpus_comparisons]:
            pass
        else:
            print(f"COMPARING: {set((corpus1, corpus2))}")
            
            tokens1 = tokenize(corpus1)
            tokens2 = tokenize(corpus2)
            embeddings1, embeddings2 = get_aligned_embedding_spaces(tokens1, tokens2)
            least_similar = get_least_similar(embeddings1, embeddings2, topn=100)
            corpus_comparisons.append({
                "corpora" : {corpus1, corpus2}, # Since order is not supposed to matter
                "similarities" : least_similar
            })

            print(f"CORPUS 1: {corpus1}")
            print(f"CORPUS 2: {corpus2}")
            for token in least_similar:
                print(f"TOKEN: {token}    DISSIMILARITY: {least_similar[token]}")


COMPARING: {'demText.txt', 'repText.txt'}


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

Word2Vec models generated
7259 7259
7259 7259
Embedding spaces aligned
CORPUS 1: demText.txt
CORPUS 2: repText.txt
TOKEN: be    SIMILARITY: 0.8905577659606934
TOKEN: trying    SIMILARITY: 0.8851844668388367
TOKEN: to    SIMILARITY: 0.8813861608505249
TOKEN: have    SIMILARITY: 0.8812063932418823
TOKEN: you    SIMILARITY: 0.8754894137382507
TOKEN: do    SIMILARITY: 0.873787522315979
TOKEN: someone    SIMILARITY: 0.8710473775863647
TOKEN: a    SIMILARITY: 0.8700002431869507
TOKEN: as    SIMILARITY: 0.868139386177063
TOKEN: give    SIMILARITY: 0.8661353588104248
TOKEN: want    SIMILARITY: 0.8657065033912659
TOKEN: agree    SIMILARITY: 0.863441526889801
TOKEN: tell    SIMILARITY: 0.862470805644989
TOKEN: know    SIMILARITY: 0.8610430955886841
TOKEN: think    SIMILARITY: 0.8603705167770386
TOKEN: many    SIMILARITY: 0.8589439392089844
TOKEN: does    SIMILARITY: 0.8584671020507812
TOKEN: win    SIMILARITY: 0.8566277623176575
TOKEN: guys    SIMILARITY: 0.856235921382904
TOKEN: more    SIMILAR

In [None]:
# Collect all of the corpus<-->corpus dissimilarities under each token

token_ids = {}

for comparison in corpus_comparisons:
    
    for token in comparison["similarities"]:

        if token not in token_ids.keys():
            token_ids[token] = []

        token_ids[token].append(comparison["similarities"][token]) # Adds adds the dissimilarity number for that token

# Find the harmonic mean for those dissimilarities
# We find the harmonic mean so that words that vary 
# across certain communities and not others are not
# weighted too highly

whole_corpora_dissimilarity = {id: harmonic_mean(token_ids[id]) for id in token_ids}

sorted_dis = sorted(whole_corpora_dissimilarity, key=lambda x: whole_corpora_dissimilarity[x], reverse=True)

# Get the ten highest means
top_dis = sorted_dis[:10]

for token in top_dis:
    print(f"TOKEN: {token}    DISSIMILARITY: {whole_corpora_dissimilarity[token]}")




TOKEN: accurate    DISSIMILARITY: 0.3114924430847168
TOKEN: responsibility    DISSIMILARITY: 0.3113376498222351
TOKEN: defending    DISSIMILARITY: 0.3111327290534973
TOKEN: indictment    DISSIMILARITY: 0.3110048770904541
TOKEN: recent    DISSIMILARITY: 0.31099194288253784
TOKEN: types    DISSIMILARITY: 0.3109530806541443
TOKEN: steroids    DISSIMILARITY: 0.3108091950416565
TOKEN: kicked    DISSIMILARITY: 0.31079423427581787
TOKEN: muslims    DISSIMILARITY: 0.3104955554008484
TOKEN: overhaul    DISSIMILARITY: 0.31049036979675293


In [None]:
TARGET = "taxes"
words = []
values = []
communities = []

for word, value in demVec.most_similar(TARGET, topn=10):
    words.append(word)
    values.append(value)
    communities.append("r/democrats")

for word, value in repVec.most_similar(TARGET, topn=10):
    words.append(word)
    values.append(value)
    communities.append("r/republicans")

df = pd.DataFrame({
    "Community" : communities,
    "Word" : words,
    "Similarity" : values
})

In [None]:
p = (ggplot(df, aes(x="Word", y="Similarity", fill="Community")) +
 geom_col(position="dodge") +
 scale_fill_manual(values={"r/republicans" : "red", "r/democrats" : "blue"}) +
 labs(title=f'Semantic Similarity between "{TARGET}" and Related Words') +
 theme(axis_text_x=element_text(angle=45)))

p.show()

p.save("taxes.png")

similarity_print(demVec, TARGET, "democrats")
similarity_print(repVec, TARGET, "republicans")