<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Functions" data-toc-modified-id="Functions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Functions</a></span></li><li><span><a href="#experiments" data-toc-modified-id="experiments-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>experiments</a></span></li></ul></div>

Exercise types this notebook aims to generate:

1. Fill in the missing word from a list of choices
2. Sentence structure
3. Choose the correct sentence
4. Missing words, parts of speech

***pyinflect's Tags***

pos_type = 'A'
* JJ      Adjective
* JJR     Adjective, comparative
* JJS     Adjective, superlative
* RB      Adverb
* RBR     Adverb, comparative
* RBS     Adverb, superlative

pos_type = 'N'
* NN      Noun, singular or mass
* NNS     Noun, plural

pos_type = 'V'
* VB      Verb, base form
* VBD     Verb, past tense
* VBG     Verb, gerund or present participle
* VBN     Verb, past participle
* VBP     Verb, non-3rd person singular present
* VBZ     Verb, 3rd person singular present
* MD      Modal

In [2]:
import pathlib
import random

import gensim.downloader as api
import numpy as np
import pandas as pd
import spacy
from pyinflect import getAllInflections
from sentence_splitter import SentenceSplitter, split_text_into_sentences

## Functions

In [2]:
def text_to_sentences(text):
    """
    Split English text into sentences and return them as a DataFrame.

    Args: text (str). Returns: a DataFrame with a single column ``'raw'``
    holding one sentence per row, in the order the splitter produced them.
    """
    sentence_list = SentenceSplitter(language='en').split(text)
    return pd.DataFrame(data=sentence_list, columns=['raw'])

In [3]:
def select_word(sentence, max_options=4):
    """
    Pick a random noun or verb from a sentence and build answer options for it.

    Args:
        sentence (str): Text to analyse with the notebook-level spaCy pipeline ``nlp``.
        max_options (int): Maximum number of options to return (default 4).
            Values below 1 are treated as 1 so the correct answer is never dropped.

    Returns:
        tuple: ``(selected_word, options)``. ``selected_word`` is the chosen spaCy
        token; ``options`` is a shuffled list of unique word forms that always
        contains the token's lower-cased text. When the sentence has no lower-case
        token tagged as a noun or verb, ``(np.nan, np.nan)`` is returned instead.
    """
    doc = nlp(sentence)
    candidate_tokens = [
        token for token in doc
        if not token.is_punct and not token.is_space
        and token.is_lower and token.tag_.startswith(('V', 'N'))  # , 'A'
    ]
    if not candidate_tokens:
        return np.nan, np.nan

    selected_word = random.choice(candidate_tokens)
    # The first letter of the fine-grained tag ('V' or 'N') is passed as pyinflect's pos_type.
    inflections = getAllInflections(selected_word.lemma_, pos_type=selected_word.tag_[0])

    # The correct answer goes first so the truncation below can never remove it.
    unique_values = [selected_word.lower_]

    # Add the naive "lemma + s" plural to check the user's knowledge of noun pluralization.
    if selected_word.pos_.startswith('N'):
        s_form = selected_word.lemma_ + 's'
        if s_form not in unique_values:
            unique_values.append(s_form)

    # Append every inflected form not seen yet, preserving first-seen order.
    for forms in inflections.values():
        for form in forms:
            if form not in unique_values:
                unique_values.append(form)

    # Keep only the leading entries; later inflections are discarded.
    del unique_values[max(max_options, 1):]

    random.shuffle(unique_values)
    return selected_word, unique_values

In [None]:
# spaCy English pipeline; select_word and missing_word call it as the global `nlp`.
nlp = spacy.load('en_core_web_sm')

# Vectors loaded by name through gensim's downloader API.
# NOTE(review): `model` is not referenced by any other cell visible here — confirm it is still needed.
model = api.load("glove-wiki-gigaword-100")

In [None]:
# Source text for the exercises; the path is relative to the notebook's working directory.
file_name = "Little_Red_Cap_ Jacob_and_Wilhelm_Grimm.txt"

# Read the text from the file
text = pathlib.Path(file_name).read_text(encoding='utf-8')
# NOTE(review): this DataFrame is not used by the visible cells; the same CSV is read
# again into the list `ogden_basic_english` in the next cell — confirm it is still needed.
basic_english = pd.read_csv('ogden_basic_english.csv')

In [3]:
# Flatten column '0' of the CSV into a plain Python list; missing_word tests lemma membership against it.
ogden_basic_english = pd.read_csv('ogden_basic_english.csv')['0'].values.tolist()

In [4]:
# Display the loaded word list as a sanity check.
ogden_basic_english

['a',
 'able',
 'about',
 'account',
 'acid',
 'across',
 'act',
 'addition',
 'adjustment',
 'advertisement',
 'after',
 'again',
 'against',
 'agreement',
 'air',
 'all',
 'almost',
 'among',
 'amount',
 'amusement',
 'and',
 'angle',
 'angry',
 'animal',
 'answer',
 'ant',
 'any',
 'apparatus',
 'apple',
 'approval',
 'arch',
 'argument',
 'arm',
 'army',
 'art',
 'as',
 'at',
 'attack',
 'attempt',
 'attention',
 'attraction',
 'authority',
 'automatic',
 'awake',
 'baby',
 'back',
 'bad',
 'bag',
 'balance',
 'ball',
 'band',
 'base',
 'basin',
 'basket',
 'bath',
 'be',
 'are',
 'beautiful',
 'because',
 'bed',
 'bee',
 'before',
 'behavior',
 'belief',
 'bell',
 'bent',
 'berry',
 'between',
 'bird',
 'birth',
 'bit',
 'bite',
 'bitter',
 'black',
 'blade',
 'blood',
 'blow',
 'blue',
 'board',
 'boat',
 'body',
 'boiling',
 'bone',
 'book',
 'boot',
 'bottle',
 'box',
 'boy',
 'brain',
 'brake',
 'branch',
 'brass',
 'bread',
 'breath',
 'brick',
 'bridge',
 'bright',
 'broken'

In [283]:
# Build the working frame: one sentence per row in the 'raw' column.
df = text_to_sentences(text)

## experiments

In [612]:
# Label every row with the same exercise type string.
df['type'] = 'select_word'

In [613]:
# select_word returns a (token, options) pair per sentence — or (nan, nan) when it finds no
# candidate — and zip(*...) transposes those pairs into the two new columns.
df['object'], df['options'] = zip(*df['raw'].apply(select_word))

  result = np.asarray(values, dtype=dtype)


In [708]:
def missing_word(sentence):
    """
    Pick a random token from a sentence whose lemma is in the Basic English word list.

    Args:
        sentence (str): Text to analyse with the notebook-level spaCy pipeline ``nlp``.

    Returns:
        A randomly chosen spaCy token that is lower-case, is neither punctuation
        nor whitespace, and whose lemma appears in ``ogden_basic_english``;
        ``np.nan`` when no token qualifies.
    """
    # Build the lookup once per call so each token costs a hash lookup, not a list scan.
    basic_words = set(ogden_basic_english)
    candidate_tokens = [
        token for token in nlp(sentence)
        if not token.is_punct and not token.is_space
        and token.is_lower and token.lemma_ in basic_words
    ]
    if not candidate_tokens:
        return np.nan

    return random.choice(candidate_tokens)

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForMaskedLM

# Loaded (model, tokenizer) pairs keyed by model name, so repeated calls in one
# kernel session do not reload the weights each time.
_MASKED_LM_CACHE = {}


def _load_masked_lm(model_name):
    """Return the cached ``(model, tokenizer)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _MASKED_LM_CACHE:
        _MASKED_LM_CACHE[model_name] = (
            DistilBertForMaskedLM.from_pretrained(model_name),
            DistilBertTokenizer.from_pretrained(model_name),
        )
    return _MASKED_LM_CACHE[model_name]


def generate_incorrect_variations(sentence, top_k=4, model_name='distilbert-base-uncased'):
    """
    Fill the mask token in a sentence with the model's ``top_k`` highest-scoring tokens.

    Note that the candidates are the model's *most likely* fills, so they can
    include the original word; callers that need wrong answers only should
    filter the result.

    Args:
        sentence (str): Text containing the tokenizer's mask token (``[MASK]``).
            Only the first occurrence is filled.
        top_k (int): Number of variations to return (default 4).
        model_name (str): Pre-trained DistilBERT checkpoint to load.

    Returns:
        list[str]: ``top_k`` sentences rebuilt from the tokenizer's tokens, ordered
        from highest to lowest score for the masked position.

    Raises:
        ValueError: If ``sentence`` contains no mask token.
    """
    masked_lm, tokenizer = _load_masked_lm(model_name)

    # Tokenize the sentence and find the masked token
    tokenized_text = tokenizer.tokenize(sentence)
    if tokenizer.mask_token not in tokenized_text:
        raise ValueError(
            f"sentence must contain the mask token {tokenizer.mask_token!r}: {sentence!r}"
        )
    masked_index = tokenized_text.index(tokenizer.mask_token)

    # Wrap the ids in [CLS] ... [SEP], the input layout BERT-style models are trained on.
    token_ids = (
        [tokenizer.cls_token_id]
        + tokenizer.convert_tokens_to_ids(tokenized_text)
        + [tokenizer.sep_token_id]
    )
    tokens_tensor = torch.tensor([token_ids])

    with torch.no_grad():
        predictions = masked_lm(tokens_tensor)[0]
    # +1 because the leading [CLS] shifts every token one position to the right.
    predicted_token_ids = predictions[0, masked_index + 1].topk(top_k).indices.tolist()

    # Rebuild one sentence per predicted token, leaving the token list itself untouched.
    variations = []
    for predicted_token in tokenizer.convert_ids_to_tokens(predicted_token_ids):
        filled_tokens = list(tokenized_text)
        filled_tokens[masked_index] = predicted_token
        variations.append(tokenizer.convert_tokens_to_string(filled_tokens))

    return variations

# Example usage: the sentence must contain the literal "[MASK]" placeholder that
# generate_incorrect_variations looks up in the tokenized text.
initial_sentence = "Do you like [MASK]?"
incorrect_variations = generate_incorrect_variations(initial_sentence)
print(incorrect_variations)


In [None]:
# The sentence needs a "[MASK]" placeholder; without one the mask lookup raises ValueError.
generate_incorrect_variations('She decided to eat that [MASK]')