In [None]:
import fix_notebook_imports

from src import util

import collections
from pprint import pprint
import itertools
import multiprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_context("poster")

import scipy
import scipy.stats
import gensim


In [None]:
# Default number of words taken on each side of a target word when building
# context windows; also reused as the Word2Vec `window` later in the notebook.
CONTEXT_WINDOW_SIZE = 3

In [None]:
def check_for_left_right_error_exception(left_only, right_only):
    """Reject the contradictory request for both a left-only and a right-only context.

    Args:
        left_only: Truthy when only the words before the target should be used.
        right_only: Truthy when only the words after the target should be used.

    Raises:
        ValueError: If both flags are truthy. ``ValueError`` is a subclass of
            ``Exception``, so handlers written for the previous bare
            ``Exception`` keep working.
    """
    if left_only and right_only:
        raise ValueError("Only one can be true: left_only or right_only")

In [None]:
def get_contexts_for_target_word(target_word, context_window_size=CONTEXT_WINDOW_SIZE, left_only=False, right_only=False):
    """Collect the context window around every occurrence of ``target_word``.

    Matching is case-insensitive. For each occurrence in ``util.PARAGRAPHS`` the
    context is the up-to-``context_window_size`` words before and/or after the
    occurrence (the occurrence itself is excluded). Windows are clipped at
    paragraph boundaries and empty contexts are dropped.

    Args:
        target_word: Word whose surroundings are collected.
        context_window_size: Maximum number of words taken on each side.
        left_only: Keep only the words preceding the target.
        right_only: Keep only the words following the target.

    Returns:
        A list of contexts, each a slice (or concatenation of two slices) of a
        paragraph.

    Raises:
        Whatever ``check_for_left_right_error_exception`` raises when both
        ``left_only`` and ``right_only`` are set.
    """
    check_for_left_right_error_exception(left_only, right_only)

    target_lower = target_word.lower()
    contexts = []
    for paragraph in util.PARAGRAPHS:
        for j, word in enumerate(paragraph):
            if word.lower() != target_lower:
                continue
            start = max(0, j - context_window_size)
            # Slice ends are exclusive, so the bound is len(paragraph); the
            # previous len(paragraph) - 1 silently dropped a paragraph's final
            # word from every right-hand context.
            end = min(len(paragraph), j + context_window_size + 1)
            left = paragraph[start:j]
            right = paragraph[j + 1:end]
            if left_only:
                context = left
            elif right_only:
                context = right
            else:
                context = left + right
            if len(context) > 0:
                contexts.append(context)

    return contexts

In [None]:
def get_contexts_containing_context_word(context_word, context_window_size=CONTEXT_WINDOW_SIZE, left_only=False, right_only=False):
    """Collect every context window in the corpus that contains ``context_word``.

    A window is built around each position of each paragraph in
    ``util.PARAGRAPHS`` (the word at that position is excluded) and kept when
    ``context_word`` appears in it, compared case-insensitively.

    Args:
        context_word: Word that must appear inside the window.
        context_window_size: Maximum number of words taken on each side.
        left_only: Use only the words preceding each position.
        right_only: Use only the words following each position.

    Returns:
        A list of the matching contexts.

    Raises:
        Whatever ``check_for_left_right_error_exception`` raises when both
        ``left_only`` and ``right_only`` are set.
    """
    check_for_left_right_error_exception(left_only, right_only)

    needle = context_word.lower()
    contexts = []
    for paragraph in util.PARAGRAPHS:
        for j in range(len(paragraph)):
            start = max(0, j - context_window_size)
            # Slice ends are exclusive, so the bound is len(paragraph); the
            # previous len(paragraph) - 1 excluded a paragraph's final word
            # from every right-hand context.
            end = min(len(paragraph), j + context_window_size + 1)
            left = paragraph[start:j]
            right = paragraph[j + 1:end]
            if left_only:
                context = left
            elif right_only:
                context = right
            else:
                context = left + right
            if any(w.lower() == needle for w in context):
                contexts.append(context)

    return contexts

In [None]:
def get_readable_contexts_for_target_word(target_word, context_window_size=CONTEXT_WINDOW_SIZE, left_only=False, right_only=False):
    """Render each occurrence of ``target_word`` as a human-readable string.

    Every case-insensitive occurrence in ``util.PARAGRAPHS`` yields one string
    in which the target is marked as ``<<target_word>>`` and surrounded by its
    space-joined neighbouring words. Unlike ``get_contexts_for_target_word``,
    occurrences with an empty window are still reported.

    Args:
        target_word: Word to highlight; the spelling passed in is the one shown.
        context_window_size: Maximum number of words shown on each side.
        left_only: Show only the words preceding the target.
        right_only: Show only the words following the target.

    Returns:
        A list of strings, one per occurrence.

    Raises:
        Whatever ``check_for_left_right_error_exception`` raises when both
        ``left_only`` and ``right_only`` are set.
    """
    check_for_left_right_error_exception(left_only, right_only)

    target_lower = target_word.lower()
    readable_contexts = []
    for paragraph in util.PARAGRAPHS:
        for j, word in enumerate(paragraph):
            if word.lower() != target_lower:
                continue
            start = max(0, j - context_window_size)
            # Slice ends are exclusive, so the bound is len(paragraph); the
            # previous len(paragraph) - 1 hid a paragraph's final word.
            end = min(len(paragraph), j + context_window_size + 1)
            left = " ".join(paragraph[start:j])
            right = " ".join(paragraph[j + 1:end])
            if left_only:
                readable_context = f"{left} <<{target_word}>>"
            elif right_only:
                readable_context = f"<<{target_word}>> {right}"
            else:
                readable_context = f"{left} <<{target_word}>> {right}"
            readable_contexts.append(readable_context)

    return readable_contexts

In [None]:
def get_shared_context_words_counter(target_word_1, target_word_2, context_window_size=CONTEXT_WINDOW_SIZE, left_only=False, right_only=False):
    """Compare the context words that two target words have in common.

    Args:
        target_word_1: First target word.
        target_word_2: Second target word.
        context_window_size: Maximum number of words taken on each side.
        left_only: Use only the words preceding each target occurrence.
        right_only: Use only the words following each target occurrence.

    Returns:
        An ``OrderedDict`` mapping each shared context word to a pair
        ``(share_1, share_2)``: the word's fraction of all context-word
        occurrences for each target, rounded to 3 decimals. Entries are ordered
        by descending ``share_1 + share_2``.
    """
    counter_1 = get_context_word_counter_for_target_word(target_word_1, context_window_size, left_only=left_only, right_only=right_only)
    counter_2 = get_context_word_counter_for_target_word(target_word_2, context_window_size, left_only=left_only, right_only=right_only)

    # Totals are loop-invariant; computing them once avoids re-summing both
    # counters for every shared key. They are only divided by when a shared key
    # exists, in which case both totals are positive.
    total_1 = sum(counter_1.values())
    total_2 = sum(counter_2.values())

    shared_words = counter_1.keys() & counter_2.keys()
    counts = [
        (key, (round(counter_1[key] / total_1, 3), round(counter_2[key] / total_2, 3)))
        for key in shared_words
    ]
    counts.sort(key=lambda item: sum(item[1]), reverse=True)

    return collections.OrderedDict(counts)

In [None]:
def get_context_word_counter_for_target_word(target_word, context_window_size=CONTEXT_WINDOW_SIZE, left_only=False, right_only=False):
    """Tally how often each word occurs across all contexts of ``target_word``.

    The contexts come from ``get_contexts_for_target_word`` with the same
    window and side arguments; a word is counted once per appearance.

    Returns:
        A ``collections.Counter`` keyed by context word.
    """
    tally = collections.Counter()
    for window in get_contexts_for_target_word(target_word, context_window_size, left_only=left_only, right_only=right_only):
        for neighbour in window:
            tally[neighbour] += 1

    return tally

In [None]:
def get_target_word_to_contexts_dict(context_window_size=CONTEXT_WINDOW_SIZE, left_only=False, right_only=False):
    """Map every word in ``util.VOCAB`` to the contexts surrounding its occurrences.

    Each value is the result of ``get_contexts_for_target_word`` for that word
    with the given window and side arguments.
    """
    return {
        vocab_word: get_contexts_for_target_word(vocab_word, context_window_size, left_only=left_only, right_only=right_only)
        for vocab_word in util.VOCAB
    }


In [None]:
def get_context_word_to_contexts_dict(context_window_size=CONTEXT_WINDOW_SIZE, left_only=False, right_only=False):
    """Map every word in ``util.VOCAB`` to the contexts in which it appears.

    Each value is the result of ``get_contexts_containing_context_word`` for
    that word with the given window and side arguments.
    """
    return {
        vocab_word: get_contexts_containing_context_word(vocab_word, context_window_size, left_only=left_only, right_only=right_only)
        for vocab_word in util.VOCAB
    }


In [None]:
def get_target_word_probability_dict(context_window_size=CONTEXT_WINDOW_SIZE):
    """Compute the relative frequency of every vocabulary word in the corpus.

    Args:
        context_window_size: Unused; kept so the signature matches the other
            probability builders in this notebook.

    Returns:
        A ``defaultdict(float)`` mapping each word in ``util.VOCAB`` to its
        occurrence count in ``util.WORDS`` divided by ``len(util.WORDS)``.
        Unknown keys read as ``0.0``.
    """
    target_word_counter = collections.Counter(util.WORDS)
    num_target_words = len(util.WORDS)

    target_word_probability_dict = collections.defaultdict(float)
    for target_word in util.VOCAB:
        # The previous `dict[word] /= num_target_words` divided the
        # defaultdict's initial 0.0, so every probability came out as 0.0 and
        # the counter above was never consulted.
        target_word_probability_dict[target_word] = target_word_counter[target_word] / num_target_words

    return target_word_probability_dict


In [None]:
def get_context_word_probability_dict(context_window_size=CONTEXT_WINDOW_SIZE, left_only=False, right_only=False):
    """Estimate, per word, the fraction of contexts in which that word appears.

    For every vocabulary word's contexts, each distinct word of a context adds
    ``1 / len(util.WORDS)`` to that word's entry, so a word is counted at most
    once per context.

    Returns:
        A ``defaultdict(float)`` keyed by context word.
    """
    probabilities = collections.defaultdict(float)
    corpus_size = len(util.WORDS)
    for vocab_word in util.VOCAB:
        windows = get_contexts_for_target_word(vocab_word, context_window_size, left_only=left_only, right_only=right_only)
        for window in windows:
            # De-duplicate so that a repeated word counts once for this window.
            for distinct_word in set(window):
                probabilities[distinct_word] += (1 / corpus_size)

    return probabilities

In [None]:
def get_context_word_probability_given_target_word_dict(context_window_size=CONTEXT_WINDOW_SIZE, left_only=False, right_only=False):
    """Estimate how often each context word accompanies each target word.

    For every vocabulary word's contexts, each distinct word of a context adds
    ``1 / count(target_word in util.WORDS)`` to the ``(target_word,
    context_word)`` entry.

    Returns:
        A ``defaultdict(float)`` keyed by ``(target_word, context_word)``.
    """
    conditional = collections.defaultdict(float)
    occurrences = collections.Counter(util.WORDS)
    for vocab_word in util.VOCAB:
        windows = get_contexts_for_target_word(vocab_word, context_window_size, left_only=left_only, right_only=right_only)
        for window in windows:
            # De-duplicate so that a repeated word counts once for this window.
            for distinct_word in set(window):
                conditional[(vocab_word, distinct_word)] += (1 / occurrences[vocab_word])

    return conditional


In [None]:
# Target word -> list of contexts, built three ways: both sides of the target,
# words to the left only, and words to the right only. These globals are the
# inputs to the binomial significance scan further down.
target_word_to_contexts_dict = get_target_word_to_contexts_dict()
target_word_to_left_contexts_dict = get_target_word_to_contexts_dict(left_only=True)
target_word_to_right_contexts_dict = get_target_word_to_contexts_dict(right_only=True)


In [None]:
# Context word -> list of contexts containing it (both sides, left only, right only).
# NOTE(review): each call rescans every position of util.PARAGRAPHS once per
# vocabulary word, so this cell is likely the slowest one on a large corpus.
context_word_to_contexts_dict = get_context_word_to_contexts_dict()
context_word_to_left_contexts_dict = get_context_word_to_contexts_dict(left_only=True)
context_word_to_right_contexts_dict = get_context_word_to_contexts_dict(right_only=True)


In [None]:
# Per-word probability of appearing in a context (both sides, left only, right only).
# `context_word_probability_dict` is read as a global by the binomial test and
# by the analysis cells below.
context_word_probability_dict = get_context_word_probability_dict()
left_context_word_probability_dict = get_context_word_probability_dict(left_only=True)
right_context_word_probability_dict = get_context_word_probability_dict(right_only=True)


In [None]:
# Per-word probability table for target words, built from util.WORDS and util.VOCAB.
target_word_probability_dict = get_target_word_probability_dict()


In [None]:
# (target word, context word) -> conditional probability of seeing the context
# word in a context of the target word (both sides, left only, right only).
context_word_probability_given_target_word_dict = get_context_word_probability_given_target_word_dict()
left_context_word_probability_given_target_word_dict = get_context_word_probability_given_target_word_dict(left_only=True)
right_context_word_probability_given_target_word_dict = get_context_word_probability_given_target_word_dict(right_only=True)


In [None]:
def get_binomial_probability_of_at_least_k_contexts_containing_context_word(contexts, context_word, verbose=False, context_word_probabilities=None):
    """Binomial tail probability of seeing ``context_word`` in this many contexts.

    With ``n = len(contexts)``, ``k`` = number of contexts containing
    ``context_word`` (exact, case-sensitive membership) and ``p`` = the word's
    prior probability, returns ``P(X >= k)`` for ``X ~ Binomial(n, p)``.

    Args:
        contexts: Contexts to test, each a collection of words.
        context_word: Word whose presence is counted.
        verbose: When true, print the probability and the binomial parameters.
        context_word_probabilities: Optional mapping from context word to prior
            probability. Defaults to the notebook-level
            ``context_word_probability_dict``; pass the left-only or right-only
            table when ``contexts`` were built with the matching side flag.

    Returns:
        ``1.0`` when the word never occurs, otherwise the upper-tail probability.
    """
    if context_word_probabilities is None:
        context_word_probabilities = context_word_probability_dict

    p = context_word_probabilities[context_word]
    n = len(contexts)
    k = sum(1 for context in contexts if context_word in context)

    if k == 0:
        prob = 1.0
    else:
        # sf(k - 1) is P(X > k - 1) = P(X >= k). Unlike 1 - cdf(k - 1), it does
        # not collapse to exactly 0.0 once the tail drops below float epsilon,
        # which matters because results are thresholded and sorted by it.
        prob = scipy.stats.binom.sf(k - 1, n, p)

    if verbose:
        print(f"Binomial probability (assuming independence) of at least as many contexts containing the given context word as was witnessed: {prob}")
        print("Binomial parameters:")
        print(f"\tp: {round(p, 5)}")
        print(f"\tn: {n}")
        print(f"\tk: {k}")

    return prob

In [None]:
def get_statistically_unlikely_target_word_context_word_pairs(target_word_to_contexts_dict, threshold=1e-5, min_occurrences=10):
    """Find (target, context) word pairs that co-occur more than chance predicts.

    Every vocabulary word occurring more than ``min_occurrences`` times in
    ``util.WORDS`` is tested against each distinct word of its contexts with
    the binomial tail test; pairs whose probability is below ``threshold`` are
    kept.

    Args:
        target_word_to_contexts_dict: Mapping from target word to its contexts.
        threshold: Pairs with a probability strictly below this are reported.
        min_occurrences: Target words must occur strictly more often than this.

    Returns:
        A list of ``(target_word, context_word, probability)`` tuples sorted by
        ascending probability.
    """
    results = []
    print(f"Exploring {len(util.VOCAB)} different words...")

    # Count the corpus once instead of rescanning util.WORDS for every
    # vocabulary word.
    occurrence_counts = collections.Counter(util.WORDS)

    for target_word in util.VOCAB:
        if occurrence_counts[target_word] <= min_occurrences:
            continue
        contexts = target_word_to_contexts_dict[target_word]
        unique_context_words = {context_word for context in contexts for context_word in context}
        for context_word in unique_context_words:
            prob = get_binomial_probability_of_at_least_k_contexts_containing_context_word(contexts, context_word)
            if prob < threshold:
                results.append((target_word, context_word, prob))

    results.sort(key=lambda result: result[2])
    print("Done.")

    return results

In [None]:
# Run the significance scan on the two-sided, left-only and right-only contexts.
# NOTE(review): the binomial helper is called here without an explicit prior,
# so all three scans are scored against the global two-sided
# `context_word_probability_dict` — confirm this is intended for the
# left/right variants.
statistically_unlikely_target_word_context_word_pairs = get_statistically_unlikely_target_word_context_word_pairs(target_word_to_contexts_dict)
statistically_unlikely_target_word_left_context_word_pairs = get_statistically_unlikely_target_word_context_word_pairs(target_word_to_left_contexts_dict)
statistically_unlikely_target_word_right_context_word_pairs = get_statistically_unlikely_target_word_context_word_pairs(target_word_to_right_contexts_dict)


In [None]:
# Display the (target word, context word, probability) triples, lowest probability first.
statistically_unlikely_target_word_context_word_pairs


In [None]:
# Print the left-only results, a blank separator line, then the right-only results.
pprint(statistically_unlikely_target_word_left_context_word_pairs)
print("")
pprint(statistically_unlikely_target_word_right_context_word_pairs)

In [None]:
def analyze_target_word_context_word_pair(target_word, context_word):
    """Print a short report on how strongly ``context_word`` accompanies ``target_word``.

    The report shows the context word's prior probability, its probability
    given the target word, and the verbose output of the binomial tail test run
    on the target word's two-sided contexts. All tables are read from notebook
    globals.
    """
    print(f"Target word: {target_word}")
    print(f"Context word: {context_word}")
    print("")
    print(f"Prior probability: {round(context_word_probability_dict[context_word], 5)}")
    print(f"Posterior probability: {round(context_word_probability_given_target_word_dict[(target_word, context_word)], 5)}")
    print("")

    # The helper prints the binomial parameters itself; its return value is
    # intentionally not used here.
    contexts_of_target = target_word_to_contexts_dict[target_word]
    get_binomial_probability_of_at_least_k_contexts_containing_context_word(
        contexts_of_target,
        context_word,
        verbose=True,
    )


In [None]:
# Worked example: how often does a word appear in its own context?
# Target and context are deliberately the same word here.
target_word = "qokeedy"
context_word = "qokeedy"

analyze_target_word_context_word_pair(target_word, context_word)


In [None]:
# A word counts as "likely to repeat" when the probability of finding it in its
# own context exceeds MULTIPLIER times its prior context probability.
MULTIPLIER = 3

# `.get(..., 0.0)` mirrors the defaultdict(float) default without inserting new
# keys into the shared tables. The former bare `except: pass` was removed: these
# lookups cannot raise, so it could only hide genuine errors (e.g. a table that
# was never built) behind an empty result.
words_likely_to_repeat_in_same_context = [
    word
    for word in util.VOCAB
    if context_word_probability_given_target_word_dict.get((word, word), 0.0)
    > MULTIPLIER * context_word_probability_dict.get(word, 0.0)
]

# Share of the vocabulary that meets the criterion.
ratio = len(words_likely_to_repeat_in_same_context) / len(util.VOCAB)
print(ratio)

words_likely_to_repeat_in_same_context

In [None]:
# Word2Vec hyperparameters. The window matches the context window used by the
# count-based analysis above so the two views of "context" are comparable.
vector_size = 100
window = CONTEXT_WINDOW_SIZE
min_count = 3
epochs = 40

# NOTE(review): `size=` and `iter=` look like the pre-4.0 gensim keyword names
# (gensim 4 uses `vector_size=` and `epochs=`) — confirm the pinned gensim version.
# NOTE(review): no seed is passed and several workers are used, so the trained
# embeddings (and every result derived from them) may differ between runs.
model = gensim.models.word2vec.Word2Vec(
    util.PARAGRAPHS, 
    workers=multiprocessing.cpu_count(), 
    size=vector_size, 
    window=window, 
    min_count=min_count,
    iter=epochs
)

In [None]:
def get_probability_of_pattern(pattern):
    """Return the fraction of ``util.VOCAB`` words that contain ``pattern``.

    The substring test is case-insensitive.
    """
    matching_words = 0
    for vocab_word in util.VOCAB:
        if pattern.lower() in vocab_word.lower():
            matching_words += 1

    return matching_words / len(util.VOCAB)

In [None]:
def get_binomial_probability_of_at_least_k_similar_word_embeddings_containing_pattern(word, pattern, model, topn=10, verbose=False):
    """Binomial tail probability that this many neighbours of ``word`` contain ``pattern``.

    With ``n = topn``, ``k`` = number of the ``topn`` most similar embeddings
    whose word contains ``pattern`` (case-insensitive) and ``p`` = the fraction
    of the vocabulary containing ``pattern``, returns ``P(X >= k)`` for
    ``X ~ Binomial(n, p)``.

    Args:
        word: Reference word; it is lower-cased before being looked up.
        pattern: Substring searched for in the neighbouring words.
        model: Trained embedding model exposing ``model.wv``.
        topn: Number of nearest neighbours to inspect.
        verbose: When true, print the probability and the binomial parameters.

    Returns:
        ``1.0`` when no neighbour matches or the word has no embedding,
        otherwise the upper-tail probability.
    """
    p = get_probability_of_pattern(pattern)
    n = topn

    # Test membership on the same (lower-cased) key that is queried. Checking
    # `word` but querying `word.lower()` could raise KeyError or skip a word
    # that does have an embedding.
    query = word.lower()
    needle = pattern.lower()
    if query in model.wv:
        neighbours = model.wv.most_similar(query, topn=n)
        k = sum(1 for neighbour, _similarity in neighbours if needle in neighbour.lower())
    else:
        k = 0

    if k == 0:
        prob = 1.0
    else:
        # sf(k - 1) is P(X >= k) and stays accurate for very small tails, where
        # 1 - cdf(k - 1) rounds to 0.0.
        prob = scipy.stats.binom.sf(k - 1, n, p)

    if verbose:
        print(f"Binomial probability (assuming independence) of at least as many similar word embeddings containing the pattern as was witnessed: {prob}")
        print("Binomial parameters:")
        print(f"\tp: {round(p, 5)}")
        print(f"\tn: {n}")
        print(f"\tk: {k}")

    return prob

In [None]:
def get_statistically_unlikely_patterns_in_similar_word_embeddings(model, threshold=1e-3):
    """Find substrings of a word that its nearest embeddings share more than chance predicts.

    For every word in ``util.VOCAB``, each distinct non-empty substring is used
    as a pattern and scored with the binomial tail test over the word's most
    similar embeddings.

    Args:
        model: Trained embedding model passed through to the binomial helper.
        threshold: Patterns with a probability strictly below this are reported.

    Returns:
        A list of ``(word, pattern, probability)`` tuples sorted by ascending
        probability.
    """
    results = []
    print(f"Exploring {len(util.VOCAB)} different words...")
    for word in util.VOCAB:
        explored_patterns = set()
        # Slice ends are exclusive, so y must be able to reach len(word);
        # range(len(word)) never produced a substring ending on the word's
        # last character.
        for x, y in itertools.combinations(range(len(word) + 1), r=2):
            pattern = word[x:y]
            if pattern in explored_patterns:
                continue
            explored_patterns.add(pattern)
            prob = get_binomial_probability_of_at_least_k_similar_word_embeddings_containing_pattern(word, pattern, model)
            if prob < threshold:
                results.append((word, pattern, prob))
    results.sort(key=lambda result: result[2])
    print("Done.")

    return results

In [None]:
# Scan every vocabulary word for substrings over-represented among its nearest embeddings.
# NOTE(review): the nearest-neighbour query is repeated for every substring of
# every word, so this cell may take a long time on a large vocabulary.
statistically_unlikely_patterns_in_similar_word_embeddings = get_statistically_unlikely_patterns_in_similar_word_embeddings(model)


In [None]:
# Display the (word, pattern, probability) triples, lowest probability first.
statistically_unlikely_patterns_in_similar_word_embeddings


In [None]:
def analyze_pattern_in_similar_word_embeddings(word, pattern):
    """Print a report on how often ``pattern`` appears among the nearest embeddings of ``word``.

    The report shows the pattern's frequency in the vocabulary, the most
    similar embeddings of the word, and the verbose output of the binomial tail
    test. The embedding model is read from the notebook-level ``model``.

    Args:
        word: Reference word; it is lower-cased before being looked up, which
            matches the lookup performed by the binomial helper.
        pattern: Substring to look for in the neighbouring words.
    """
    print(f"Reference word: {word}")
    print(f"Pattern: {pattern}")
    print("")
    print(f"Probability of pattern occurrence: {round(get_probability_of_pattern(pattern), 5)}")
    print("")
    print("Most similar word embeddings:")
    # Show the same neighbours that the statistic below is computed from, and
    # print an empty list instead of raising KeyError halfway through the
    # report when the word has no embedding.
    query = word.lower()
    if query in model.wv:
        pprint(model.wv.most_similar(query))
    else:
        pprint([])
    print("")
    get_binomial_probability_of_at_least_k_similar_word_embeddings_containing_pattern(word, pattern, model, verbose=True)


In [None]:
# Worked example: is the substring "ho" over-represented among the neighbours of "tchol"?
analyze_pattern_in_similar_word_embeddings("tchol", "ho")


In [None]:
# Worked example: is the substring "ched" over-represented among the neighbours of "lkchedy"?
analyze_pattern_in_similar_word_embeddings("lkchedy", "ched")
