## Append sentence with replacement synonym

This notebook loads a dataset, tokenizes it, and based on criteria from the user replaces candidate tokens in the corpus with a token indicating it is the original token/sentence and, based upon some probability, appends an identical sentence to the document with the token replaced with a token that indicates it is an appended token/sentence.  This should assist in validating word embeddings if a sentence and it's appended synonym sentence show up close together.

In [1]:
#Quick cell to make jupyter notebook use the full screen width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [59]:
import numpy as np
import vectorizers
import textmap.tokenizers
import nltk
from nltk.tokenize import TweetTokenizer 
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import random
import string
from sklearn.preprocessing import normalize
import pickle
import copy

from src.data import Dataset

[nltk_data] Downloading package stopwords to /home/john/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/john/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load Dataset

In [3]:
dataset = Dataset.load('reddit_comment_tree_text')

In [4]:
data = np.array(dataset['data']['body'])

In [5]:
type(data)

numpy.ndarray

### Tokenize data

In [80]:
%%time
corpus = textmap.tokenizers.NLTKTweetTokenizer(tokenize_by='sentence').fit_transform(data)

CPU times: user 5min 2s, sys: 5.2 s, total: 5min 7s
Wall time: 5min 7s


In [127]:
pickle.dump(corpus, open("/home/john/Code/NLP_Utilities/corpus", "wb"))

### Token replacement function

In [121]:
def synonym_sentence_append(
        corpus,
        ignored_tokens=None,
        excluded_token_regex=None,
        min_frequency=None,
        max_frequency=None,
        min_occurrences=None,
        max_occurrences=None,
        min_document_frequency=None,
        max_document_frequency=None,
        min_document_occurrences=None,
        max_document_occurrences=None,
        num_candidates=25,
        tokens_to_replace=None,
        replace_probability=0.3):

    """

    :param corpus: a tuple of tuples of tokenized sentences

    :param ignored_tokens: a set of tokens to prune from token dictionary

    :param excluded_token_regex: a regex pattern to identify tokens to prune from token dictionary

    :param min_frequency: float - The minimum frequency of occurrence allowed for tokens. Tokens that occur
        less frequently than this will be pruned.

    :param max_frequency: float - The maximum frequency of occurrence allowed for tokens. Tokens that occur
        more frequently than this will be pruned.

    :param min_occurrences: int - A constraint on the minimum number of occurrences for a token to be considered
        valid. If None then no constraint will be applied.

    :param max_occurrences: int - A constraint on the maximum number of occurrences for a token to be considered
        valid. If None then no constraint will be applied.

    :param min_document_frequency: int - A constraint on the minimum frequency of documents with occurrences for a
        token to be considered valid. If None then no constraint will be applied.

    :param max_document_frequency: int - A constraint on the maximum frequency of documents with occurrences for a
        token to be considered valid. If None then no constraint will be applied.

    :param min_document_occurrences: int - A constraint on the minimum number of documents with occurrences for a
        token to be considered valid. If None then no constraint will be applied.

    :param max_document_occurrences: int - A constraint on the maximum number of documents with occurrences for a
        token to be considered valid. If None then no constraint will be applied.


    :param num_candidates: int - The number of candidate tokens to be replaced with synonyms

    :param tokens_to_replace: list - A list of tokens to be replaced with synonyms.  If None, the other parameters
        will be used to select tokens to replace

    :param replace_probability: float - the probability a new synonym sentence will be added to the corpus

    :return:  a tuple of tuples of tokenized sentences containing new synonym in place of original tokens and
        a list of the words that were replaced

    """

    #check if tokens to be replaced are supplied, if not, choose tokens depending on parameters from user
    if not tokens_to_replace:

        # flatten tuple of tuples and get token dictionary
        token_dict, token_freq, n_tokens = vectorizers._vectorizers.construct_token_dictionary_and_frequency(
            vectorizers.utils.flatten(corpus))

        # prune token dictionary depending on parameters supplied by user
        # returns a dictionary of candidate tokens for replacement
        candidate_dict, candidate_freq = vectorizers._vectorizers.prune_token_dictionary(
            token_dict,
            token_freq,
            ignored_tokens=ignored_tokens,
            excluded_token_regex=excluded_token_regex,
            min_frequency=min_frequency,
            max_frequency=max_frequency,
            min_occurrences=min_occurrences,
            max_occurrences=max_occurrences,
            min_document_frequency=min_document_frequency,
            max_document_frequency=max_document_frequency,
            min_document_occurrences=min_document_occurrences,
            max_document_occurrences=max_document_occurrences,
            total_tokens=n_tokens,
            total_documents=len(corpus),
        )

        # take a random sample of tokens from the candidate dictionary
        tokens_to_replace = random.sample(list(candidate_dict.keys()), num_candidates)


    print("Tokens for replacement:")
    print(tokens_to_replace)

    new_corpus = []

    for sent in corpus:
        word_changed = False
        sent = list(sent)

        # check each token by index and create a deep copy with the changed word at that index and add new sentence to new corpus
        for idx,token in enumerate(sent):
            if token in tokens_to_replace:
                new_sent=copy.deepcopy(sent)
                new_sent[idx] = f"{token}_$$0"
                new_corpus.append(new_sent)
                word_changed = True

                # depending on probability, add another copy of the new sentence with the second replacement synonym
                if random.random() <= replace_probability:
                    added_sent=copy.deepcopy(sent)
                    added_sent[idx] = f"{token}_$$1"
                    new_corpus.append(added_sent)


        # if no words were changed, just add the original sentence to the new corpus
        if not word_changed:
            new_corpus.append(sent)
            
        
    # change dataset back to tuple of tuples before returning
    new_corpus_tuple = tuple(tuple(sent) for sent in new_corpus)


    return new_corpus_tuple, tokens_to_replace


In [118]:
%%time
new_corpus, replaced_tokens = synonym_sentence_append(
        corpus,
        min_occurrences=1000,max_occurrences=2000,
        ignored_tokens={'deleted','removed'} | set(stopwords.words('english')) | set(string.punctuation)
        )

Tokens for replacement:
['distinction', 'documented', 'protected', 'pm', 'muslims', 'incredible', 'supposedly', 'kick', 'targeted', 'unnecessary', 'breath', 'careful', 'bully', 'coward', 'rejected', 'blackout', 'forcing', 'mail', 'rolls', 'requirements']
CPU times: user 42.3 s, sys: 330 ms, total: 42.6 s
Wall time: 42.6 s


In [111]:
new_corpus[:10]

(('official',
  'meetings',
  'with',
  'adversaries',
  'by',
  'presidents',
  ',',
  'secretary',
  'of',
  'states',
  'and',
  'members',
  'of',
  'congress',
  'are',
  'not',
  'secret',
  '...',
  'unless',
  'of',
  'course',
  'you',
  'are',
  'donald',
  'trump_$$0',
  '...',
  'or',
  'tulsi',
  'gabbard',
  '.'),
 ('official',
  'meetings',
  'with',
  'adversaries',
  'by',
  'presidents',
  ',',
  'secretary',
  'of',
  'states',
  'and',
  'members',
  'of',
  'congress',
  'are',
  'not',
  'secret',
  '...',
  'unless',
  'of',
  'course',
  'you',
  'are',
  'donald',
  'trump_$$1',
  '...',
  'or',
  'tulsi',
  'gabbard',
  '.'),
 ('https://www.nbcnews.com/politics/elections/bernie-sanders-camp-fix-was-against-clinton-n817501',
  'http://nymag.com/intelligencer/2019/06/bernie-sanders-2016-rigged-wont-pledge-support-winner.html',
  'i',
  'am',
  'sure',
  "i'll",
  'find',
  'some',
  'more',
  'sources',
  'if',
  'you',
  'insist',
  '.'),
 ('so',
  'she_$$0',
 

In [119]:
len(corpus)

4284273

In [120]:
len(new_corpus)

4293246

In [98]:
for idx,sent in enumerate(new_corpus):
    for token in sent:
        if "_$$1" in token:
            print(new_corpus[idx])
            print()
        break

In [124]:
for sent in new_corpus[100:5000]:
    for token in sent:
        if "_$$" in token:
            print(sent)
            print("=============================================")
            

('>', '>', '"', "it's", 'been', 'well', 'documented_$$0', 'in', 'the', 'financial', 'times', ',', 'in', 'politico', ',', 'in', 'the', 'economist', ',', 'in', 'the', 'washington', 'examiner', ',', 'even', 'on', 'cbs', ',', 'that', 'the', 'prime', 'minister', 'of', 'ukraine', ',', 'the', 'interior', 'minister', ',', 'the', 'ukrainian', 'ambassador', 'to', 'the', 'united', 'states', ',', 'the', 'head', 'of', 'the', 'ukrainian', 'anti-corruption', 'league', ',', 'all', 'meddled', 'in', 'the', 'election', 'on', 'social', 'media', 'and', 'otherwise', ',', '"', 'he', 'said', '.')
('[', 'x', ']', 'rolls_$$0', 'back', 'protective', 'regulations', 'for', 'the', 'american', 'people', '.')
('[', 'x', ']', 'rolls_$$1', 'back', 'protective', 'regulations', 'for', 'the', 'american', 'people', '.')
('because', 'people', 'buy', 'a', 'huge', 'amount', 'of', 'stuff', 'right', 'before', 'tariffs', 'kick_$$0', 'in', ',', "that's", 'the', 'boom', ',', 'the', 'decline', 'happens', 'after', 'when', 'people', 

In [69]:

corpus = list(list(sent) for sent in tokens[:3])

In [70]:
corpus[2]

['so',
 'she',
 'volunteered',
 'to',
 'serve',
 'in',
 'iraq',
 ',',
 'a',
 'war',
 'she',
 "didn't",
 'agree',
 'with',
 '.']

In [71]:
tokens_to_replace = ['trump', 'secret', 'iraq', 'she']
prob = 1

In [75]:
new_corpus = []

#iterate over sentences in the corpus of tokenized sentences
for sent in corpus:
    word_changed = False
    sent = list(sent)

    # check each token by index and create a deep copy with the changed word at that index and add new sentence to new corpus
    for idx,token in enumerate(sent):
        if token in tokens_to_replace:
            new_sent=copy.deepcopy(sent)
            new_sent[idx] = f"{token}_$$0"
            print(new_sent)
            new_corpus.append(new_sent)
            word_changed = True
            
            # depending on probability, add another copy of the new sentence with the second replacement synonym
            if random.random() <= prob:
                added_sent=copy.deepcopy(sent)
                added_sent[idx] = f"{token}_$$1"
                new_corpus.append(new_sent)
                print(added_sent)
                
            print()
    
    # if no words were changed, just add the original sentence to the new corpus
    if not word_changed:
        new_corpus.append(sent)
        print(sent)
        print()
        


    
    

['official', 'meetings', 'with', 'adversaries', 'by', 'presidents', ',', 'secretary', 'of', 'states', 'and', 'members', 'of', 'congress', 'are', 'not', 'secret_$$0', '...', 'unless', 'of', 'course', 'you', 'are', 'donald', 'trump', '...', 'or', 'tulsi', 'gabbard', '.']
['official', 'meetings', 'with', 'adversaries', 'by', 'presidents', ',', 'secretary', 'of', 'states', 'and', 'members', 'of', 'congress', 'are', 'not', 'secret_$$1', '...', 'unless', 'of', 'course', 'you', 'are', 'donald', 'trump', '...', 'or', 'tulsi', 'gabbard', '.']

['official', 'meetings', 'with', 'adversaries', 'by', 'presidents', ',', 'secretary', 'of', 'states', 'and', 'members', 'of', 'congress', 'are', 'not', 'secret', '...', 'unless', 'of', 'course', 'you', 'are', 'donald', 'trump_$$0', '...', 'or', 'tulsi', 'gabbard', '.']
['official', 'meetings', 'with', 'adversaries', 'by', 'presidents', ',', 'secretary', 'of', 'states', 'and', 'members', 'of', 'congress', 'are', 'not', 'secret', '...', 'unless', 'of', 'cou

In [None]:
        
    
    """
    # for each word to be replaced, see if it exists, if so, change it and add sentence to new corpus
    for token in tokens_to_replace:
        if token in sent:
            new_sent = [word if word != token else f"{word}_$$0" for word in sent]
            print(new_sent)
            word_changed=True
            new_corpus.append(new_sent)
            
            # check probability to see is an additional new copy should also be added with a new synonym
            if random.random() <= prob:
                added_sent = [word if "_$$0" not in word else word.replace("_$$0", "_$$1") for word in new_sent]
                new_corpus.append(added_sent)
                print(added_sent)
                
    # if no words were changed in the sentence, just add the original sentence to the new corpus    
    if not word_changed:
        new_corpus.append(sent)
        print(sent)
    
    
    """
    
    
    
    
    
    """
    #reconstruct the sentence, replacing any of the tokens_to_replace
    for token in sent:
        if token not in tokens_to_replace:
            new_sent.append(token)
        else:
            new_sent.append(f"{token}_$$0")
            possible_add = True  # flag indicating to check whether to append an extra new sentence to the corpus
            
    new_corpus.append(new_sent)
    
    #if the sentence had contained a replaceable token, check probability to append an additional new sentence
    if possible_add:
        if random.random() <= prob:
            added_sent = [token if "_$$0" not in token else token.replace("_$$0", "_$$1") for token in new_sent]
            new_corpus.append(added_sent)

   """