## In-place synonym creation

This notebook loads a dataset, tokenizes it, and, based on criteria supplied by the user, replaces some tokens with artificially created synonyms. This should assist in validating word embeddings: a word and its invented synonym should show up close together.

In [1]:
# Quick cell to make the Jupyter notebook use the full screen width, so that long
# token tuples printed further down are easier to read.
# `display` and `HTML` are imported from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [2]:
import numpy as np
import vectorizers
import textmap.tokenizers
import nltk
from nltk.tokenize import TweetTokenizer 
nltk.download('stopwords')
from nltk.corpus import stopwords
import random
import string
from sklearn.preprocessing import normalize
import pickle

from src.data import Dataset

  "The stanza library could not be imported StanzaTokenizer will not be available."
  "The SpaCy library could not be imported SpaCyTokenizer will not be available."
  "The tokenizers library could not be imported SpaCyTokenizer will not be available."
[nltk_data] Downloading package stopwords to /home/john/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Dataset

In [3]:
# Load the 'reddit_comment_tree_text' dataset through the Dataset class imported from src.data.
dataset = Dataset.load('reddit_comment_tree_text')

In [4]:
# Take the 'body' entry of dataset['data'] and materialise it as a NumPy array
# (presumably one comment text per element — confirm against the dataset schema).
data = np.array(dataset['data']['body'])

In [5]:
# Sanity check: confirm that the conversion above produced a numpy.ndarray.
type(data)

numpy.ndarray

### Tokenize data

In [6]:
%%time
# Tokenize `data` with textmap's NLTKTweetTokenizer. This is the slow step of the
# notebook: the recorded run took roughly three minutes of wall time.
tokens = textmap.tokenizers.NLTKTweetTokenizer().fit_transform(data)

CPU times: user 2min 56s, sys: 1.24 s, total: 2min 57s
Wall time: 2min 57s


### Token replacement function

In [8]:
def synonym_token_replace(
        tokens,
        ignored_tokens=None,
        excluded_token_regex=None,
        min_frequency=None,
        max_frequency=None,
        min_occurrences=None,
        max_occurrences=None,
        min_document_frequency=None,
        max_document_frequency=None,
        min_document_occurrences=None,
        max_document_occurrences=None,
        num_candidates=25,
        replace_probability=[0.3]):
    """
    Replace a random sample of tokens with artificially created synonyms.

    A token dictionary is built from ``tokens`` and pruned according to the filtering
    parameters; up to ``num_candidates`` of the surviving tokens are then sampled at random.
    Every occurrence of a sampled token is rewritten as ``<original_token>_$$<k>``, where
    ``k`` is the index of one of the weights in ``replace_probability``, drawn at random
    according to those weights.

    Note that the weights are normalized to sum to 1, so a sampled token is *always*
    replaced by one of its synonyms; the unmodified token never survives in the output.
    With a single weight (the default) every occurrence becomes ``<original_token>_$$0``.

    The function draws from both the ``random`` module (candidate sampling) and
    ``numpy.random`` (synonym choice); seed both for reproducible output.

    :param tokens: a tuple of tuples of tokenized documents

    :param ignored_tokens: a set of tokens to prune from token dictionary

    :param excluded_token_regex: a regex pattern to identify tokens to prune from token dictionary

    :param min_frequency: float - The minimum frequency of occurrence allowed for tokens. Tokens that occur
        less frequently than this will be pruned.

    :param max_frequency: float - The maximum frequency of occurrence allowed for tokens. Tokens that occur
        more frequently than this will be pruned.

    :param min_occurrences: int - A constraint on the minimum number of occurrences for a token to be considered
        valid. If None then no constraint will be applied.

    :param max_occurrences: int - A constraint on the maximum number of occurrences for a token to be considered
        valid. If None then no constraint will be applied.

    :param min_document_frequency: A constraint on the minimum frequency of documents with occurrences for a
        token to be considered valid (presumably a fraction of documents - confirm against
        ``prune_token_dictionary``). If None then no constraint will be applied.

    :param max_document_frequency: A constraint on the maximum frequency of documents with occurrences for a
        token to be considered valid (presumably a fraction of documents - confirm against
        ``prune_token_dictionary``). If None then no constraint will be applied.

    :param min_document_occurrences: int - A constraint on the minimum number of documents with occurrences for a
        token to be considered valid. If None then no constraint will be applied.

    :param max_document_occurrences: int - A constraint on the maximum number of documents with occurrences for a
        token to be considered valid. If None then no constraint will be applied.

    :param num_candidates: int - The number of candidate tokens to be replaced with synonyms. If fewer
        tokens survive pruning, all of them are used and a notice is printed.

    :param replace_probability: list - Non-negative relative weights, one per synonym to create. They are
        normalized to sum to 1; the k-th weight is the share of occurrences rewritten as
        ``<original_token>_$$<k>``.

    :return: a tuple of tuples of tokenized documents containing new synonyms in place of original tokens

    :raises ValueError: if ``replace_probability`` is empty, contains negative or non-finite
        values, or sums to zero.
    """

    # Validate and normalize the weights before doing any expensive work, so that a bad
    # argument fails immediately instead of part-way through the corpus.
    weights = np.asarray(replace_probability, dtype=float).ravel()
    if (
        weights.size == 0
        or not np.isfinite(weights).all()
        or np.any(weights < 0)
        or weights.sum() <= 0
    ):
        raise ValueError(
            "replace_probability must contain non-negative, finite weights with a positive sum"
        )
    norm_prob = weights / weights.sum()
    n_synonyms = norm_prob.size

    # flatten tuple of tuples and get token dictionary
    token_dict, token_freq, n_tokens = vectorizers._vectorizers.construct_token_dictionary_and_frequency(
        vectorizers.utils.flatten(tokens))

    # prune token dictionary depending on parameters supplied by user
    # returns a dictionary of candidate tokens for replacement (the pruned frequencies are not needed)
    candidate_dict, _ = vectorizers._vectorizers.prune_token_dictionary(
        token_dict,
        token_freq,
        ignored_tokens=ignored_tokens,
        excluded_token_regex=excluded_token_regex,
        min_frequency=min_frequency,
        max_frequency=max_frequency,
        min_occurrences=min_occurrences,
        max_occurrences=max_occurrences,
        min_document_frequency=min_document_frequency,
        max_document_frequency=max_document_frequency,
        min_document_occurrences=min_document_occurrences,
        max_document_occurrences=max_document_occurrences,
        total_tokens=n_tokens,
        total_documents=len(tokens),
    )

    # take a random sample of tokens from the candidate dictionary; random.sample raises
    # when asked for more items than exist, so cap the request at the pool size
    candidate_pool = list(candidate_dict.keys())
    sample_size = min(num_candidates, len(candidate_pool))
    if sample_size < num_candidates:
        print(
            f"Only {len(candidate_pool)} tokens satisfy the pruning criteria "
            f"(requested {num_candidates}); using all of them."
        )
    candidate_tokens = random.sample(candidate_pool, sample_size)
    print("Candidates for replacement:")
    print(candidate_tokens)

    # Build each candidate's synonyms once. The dict doubles as a constant-time membership
    # test, replacing a list scan for every token in the corpus.
    synonym_table = {
        candidate: [f"{candidate}_$${idx}" for idx in range(n_synonyms)]
        for candidate in candidate_tokens
    }

    # Iterate over documents and rebuild each one, swapping candidate tokens for one of
    # their synonyms. Drawing an index (rather than drawing from the list of strings)
    # keeps the output tokens plain `str` instead of `numpy.str_`.
    new_doc_list = []
    for doc in tokens:
        new_doc = []
        for token in doc:
            variants = synonym_table.get(token)
            if variants is None:
                new_doc.append(token)
            else:
                new_doc.append(variants[np.random.choice(n_synonyms, p=norm_prob)])
        # change each document back to a tuple as it is finished
        new_doc_list.append(tuple(new_doc))

    return tuple(new_doc_list)

In [9]:
# synonym_token_replace draws from both `random` (candidate sampling) and `numpy.random`
# (synonym choice); seed both so that re-running the notebook reproduces the same result.
SYNONYM_SEED = 42
random.seed(SYNONYM_SEED)
np.random.seed(SYNONYM_SEED)

# Candidates are tokens occurring between 1000 and 2000 times, excluding moderation
# placeholders, English stopwords and punctuation. Three synonyms are created per candidate,
# with relative weights 0.3 / 0.2 / 0.4.
new_tokens = synonym_token_replace(
        tokens,
        min_occurrences=1000,max_occurrences=2000,
        ignored_tokens={'deleted','removed'} | set(stopwords.words('english')) | set(string.punctuation),
        replace_probability=[0.3,0.2,0.4]
        )

Candidates for replacement:
['handful', 'fans', 'operation', 'borders', 'factors', 'mid', 'responses', 'congressmen', 'forgotten', 'lucky', 'damned', 'reaction', 'damaging', 'genuine', 'scholars', 'contempt', 'expense', 'scam', 'station', 'fell', 'assets', 'responding', 'breath', 'trail', 'flooding']


In [10]:
# Number of documents in the tuple returned by synonym_token_replace.
len(new_tokens)

195200

In [18]:
# Among the first 1000 documents, print each one that contains at least one
# generated synonym (recognisable by the "_$$" marker), followed by a separator line.
for doc in new_tokens[:1000]:
    if any("_$$" in token for token in doc):
        print(doc)
        print("=============================================")

('lol', "reddit's", 'favorite', 'candidate', 'is', 'an', 'old', 'white', 'guy', '(', 'bernie', ')', '>', 'as', 'the', 'house', 'impeachment', 'hearings', 'move', 'forward', ',', 'sen', '.', 'john', 'kennedy', ',', 'r-la', '.', ',', 'argued', 'that', 'ukraine', 'tried', 'to', 'undermine', 'donald', 'trump', 'in', 'the', '2016', 'election', ',', 'so', 'trump', 'had', 'good', 'reason', 'to', 'be', 'wary', 'of', 'ukraine', 'and', 'delay', 'nearly', '$', '400', 'million', 'in', 'military', 'aid', '.', '>', '>', 'there', '’', 's', 'no', 'question', 'that', 'some', 'ukraine', 'officials', 'publicly', 'objected', 'to', 'comments', 'then-candidate', 'trump', 'made', 'about', 'russia', 'annexing', 'crimea', ',', 'about', '10,000', 'square', 'miles', 'of', 'what', 'had', 'been', 'ukrainian', 'territory', '.', 'trump', 'cast', 'the', 'russian', 'land', 'grab', 'essentially', 'as', 'a', 'done', 'deal', '.', '>', '>', 'but', 'kennedy', 'said', 'the', 'ukrainian', 'response', 'went', 'beyond', 'a', '