In [5]:
import nltk
nltk.download('punkt')
# tokenize the text
tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")

# generate bigrams
bigrams = nltk.ngrams(tokens, 2)

# print the bigrams
print(list(bigrams))

[('The', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps'), ('jumps', 'over'), ('over', 'the'), ('the', 'lazy'), ('lazy', 'dog')]


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
import nltk
from nltk import ngrams
from nltk.probability import ConditionalFreqDist

# define the input text
text = "The quick brown fox jumps over the lazy dog"

# tokenize the text and convert the tokens to lowercase
tokens = [w.lower() for w in nltk.word_tokenize(text)]

# generate the trigrams
trigrams = ngrams(tokens, 3)

# compute the conditional frequency distribution of the next word given the previous two words
cfd = ConditionalFreqDist([((w1, w2), w3) for w1, w2, w3 in trigrams])

# print the raw frequency of the trigram "the quick brown"
print(cfd[('the', 'quick')]['brown'])

# predict the next word in the sequence "the quick"
next_word_dist = cfd[('the', 'quick')]

# print the predicted next word and its probability
print(next_word_dist.max())
print(next_word_dist.freq(next_word_dist.max()))

1
brown
1.0


In [7]:
import torchtext.legacy.data
from torchtext.data.utils import ngrams_iterator

# define the input text
text = "The quick brown fox jumps over the lazy dog"

# define the tokenizer
tokenizer = lambda x: x.split()

# define the field that will be used to tokenize the text
field = torchtext.legacy.data.Field(sequential=True, tokenize=tokenizer)

# tokenize the text
tokens = field.tokenize(text)

# create the bigrams from the tokenized text
bigrams = list(ngrams_iterator(tokens, 2))[len(tokens):]

# convert the bigrams to a list of tuples
bigrams = [tuple(bigram.split()) for bigram in bigrams]

# print the bigrams
print(bigrams)

[('The', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps'), ('jumps', 'over'), ('over', 'the'), ('the', 'lazy'), ('lazy', 'dog')]


In [8]:
import torchtext
import torchtext.legacy.data

# define the input text
text = "The quick brown fox jumps over the lazy dog"

# define the tokenizer
tokenizer = lambda x: x.split()

# define the field that will be used to tokenize the text
field = torchtext.legacy.data.Field(sequential=True, tokenize=tokenizer, lower=False)

# tokenize the text
tokens = field.tokenize(text)

# create the bigrams from the tokenized text
bigrams = list(torchtext.data.utils.ngrams_iterator(tokens, 2))[len(tokens):]

# convert the bigrams to a list of tuples
bigrams = [tuple(bigram.split()) for bigram in bigrams]

print(bigrams)

[('The', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps'), ('jumps', 'over'), ('over', 'the'), ('the', 'lazy'), ('lazy', 'dog')]


In [9]:
def sort_key(example):
    # return the length of the first bigram in the example
    return len(example[0][0])

In [10]:
import torchtext

# define the input text
text = "The quick brown fox jumps over the lazy dog"

# define the tokenizer
tokenizer = lambda x: x.split()

# tokenize the text
tokens = tokenizer(text)

# create the bigrams from the tokenized text
bigrams = list(torchtext.data.utils.ngrams_iterator(tokens, 2))[len(tokens):]

# convert the bigrams to a list of tuples
bigrams = [tuple(bigram.split()) for bigram in bigrams]

print(bigrams)
# define a field for the bigrams
bigram_field = torchtext.legacy.data.Field(sequential=True, tokenize=tokenizer, use_vocab=True)


# define an example for each bigram
examples = [torchtext.legacy.data.Example.fromlist([bigram], [('bigram', bigram_field)]) for bigram in bigrams]

# define a dataset from the examples and the bigram field
dataset = torchtext.legacy.data.Dataset(examples, [('bigram', bigram_field)])

# define an iterator for the dataset
iterator = torchtext.legacy.data.BucketIterator(dataset, batch_size=2, train=False, sort=False)

# print the bigrams in the first batch
# print('bigrams in the first batch:', next(iter(iterator)).bigram)

# for example in next(iter(iterator)):
#     print('bigrams in the first batch:', example.bigram)

[('The', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps'), ('jumps', 'over'), ('over', 'the'), ('the', 'lazy'), ('lazy', 'dog')]


In [20]:
import torch
import numpy as np

torch.manual_seed(42)
np.random.seed(42)

def generate_sequence():
    sequence = ["I", "had", "a", "sandwich", "for", "lunch", "yesterday", "and", "it", "was", "delicious"]
    return sequence

class NGramModel(torch.nn.Module):
    def __init__(self, num_words, num_hidden, n=2):
        super().__init__()
        self.n = n
        self.embedding = torch.nn.Embedding(num_words, num_hidden)
        self.linear = torch.nn.Linear(num_hidden * n, num_words)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1)
        return self.linear(x)

In [21]:
def train_model(model, sequence, word_to_idx, learning_rate=1e-2, epochs=100):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()
    losses = []
    accuracies = []

    for epoch in range(1, epochs + 1):
        total_loss = 0
        total_accuracy = 0

        # Loop through the words in the sequence
        for i in range(len(sequence) - model.n):
            # Convert the current n-gram and next word to tensors
            current_ngram = torch.tensor([word_to_idx[word] for word in sequence[i:i+model.n]], dtype=torch.long)
            next_word = torch.tensor([word_to_idx[sequence[i + model.n]]], dtype=torch.long)

            # Predict the probability of the next word
            logits = model(current_ngram.unsqueeze(0))

            # Compute the cross-entropy loss and update the totals
            loss = criterion(logits, next_word)
            total_loss += loss.item()

            # Compute the accuracy and update the totals
            _, predicted = torch.max(logits, dim=1)
            total_accuracy += (predicted == next_word).sum().item()

            # Backpropagate the loss and update the model parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Compute the average loss and accuracy for the epoch and update the relevant lists
        avg_loss = total_loss / (len(sequence) - model.n)
        avg_accuracy = total_accuracy / (len(sequence) - model.n)
        losses.append(avg_loss)
        accuracies.append(avg_accuracy)

        if epoch % 10 == 0:
            print(f"Epoch {epoch}: loss = {avg_loss:.4f}, accuracy = {avg_accuracy:.4f}")

    return model, losses, accuracies


In [22]:
# Generate the training data
sequence = generate_sequence()
# Create a vocabulary of unique words in the sequence
vocab = list(set(sequence))

# Create a mapping from word to index
word_to_idx = {word: index for index, word in enumerate(vocab)}

# Create a mapping from index to word
index_to_word = {index: word for index, word in enumerate(vocab)}

# Define the number of words, the number of hidden units, and the value of n
num_words = len(set(sequence))
num_hidden = 10
n = 3

# Initialize the n-gram model
model = NGramModel(num_words, num_hidden, n)

# Train the model
model, losses, accuracies = train_model(model,sequence,word_to_idx,1e-2,10)

Epoch 10: loss = 0.0569, accuracy = 1.0000


In [23]:
word_to_idx

{'sandwich': 0,
 'was': 1,
 'yesterday': 2,
 'lunch': 3,
 'I': 4,
 'and': 5,
 'it': 6,
 'had': 7,
 'a': 8,
 'for': 9,
 'delicious': 10}

In [24]:
index_to_word

{0: 'sandwich',
 1: 'was',
 2: 'yesterday',
 3: 'lunch',
 4: 'I',
 5: 'and',
 6: 'it',
 7: 'had',
 8: 'a',
 9: 'for',
 10: 'delicious'}

In [25]:

_ngram = torch.tensor([word_to_idx['had'],word_to_idx['a'],word_to_idx["sandwich"]], dtype=torch.long)

In [26]:
_logits = model(_ngram.unsqueeze(0))

In [27]:
index_to_word[torch.max(_logits, dim=1)[1].item()]

'for'