In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import tiktoken
import re
import torch
import torch.nn as nn
import torch.optim as optim
import datasets
import math

In [60]:
train_dataset = datasets.load_from_disk("tiny_stories_train").select(range(5))
val_dataset = datasets.load_from_disk("tiny_stories_val").select(range(5))

#converting to list
train_stories = [i for i in train_dataset['text']]
val_stories = [i for i in val_dataset['text']]


len(train_stories), len(val_stories)

(5, 5)

In [53]:
train_stories[0], val_stories[0]

('One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.',
 'Spot. Spot saw the shiny car and said, "Wow, Kitty, your car is so bright and clean!" Kitty smiled and replied, "Thank you, Spot. I polish it every day."\n\nAfter playing with the car, Kitty and Spot felt thirsty. They found a small pond with clear water. They drank the water and felt v

In [54]:


# Load or create SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load('sentencepiece.model')  # Load an existing model
# or
# sp.train('--input=data/train.txt --model_prefix=sentencepiece --vocab_size=32000')  # Train a new model

# Tokenize stories
def tokenize_stories(stories):
    tokenized_stories = []
    for story in stories:
        tokens = sp.encode_as_ids(story)
        tokenized_stories.append(tokens)
    return tokenized_stories

# Build vocabulary (not needed as SentencePiece handles vocabulary)
# Convert stories to token IDs
def stories_to_ids(tokenized_stories):
    input_ids = []
    labels = []
    for story in tokenized_stories:
        story_ids = [sp.bos_id()] + story + [sp.eos_id()]
        input_ids.append(story_ids[:-1])  # Input sequence without the <eos> token
        labels.append(story_ids[1:])  # Label sequence without the <bos> token
    return input_ids, labels

# Preprocess data
train_tokenized_stories = tokenize_stories(train_stories)
val_tokenized_stories = tokenize_stories(val_stories)
print(f"train_tokenized_stories is \n length {len(train_tokenized_stories)}\n{train_tokenized_stories[0]}")
train_ids_stories, train_labels = stories_to_ids(train_tokenized_stories)
val_ids_stories, val_labels = stories_to_ids(val_tokenized_stories)
print(f"train_ids_stories is \n length {len(train_ids_stories)}\n{train_ids_stories[0]}")
print(f"train_labels is \n length {len(train_labels)}\n{train_labels[0]}")

ModuleNotFoundError: No module named 'sentencepiece'

In [51]:
# Tokenize stories
def tokenize_stories(stories):
    tokenized_stories = []
    for story in stories:
        tokens = re.findall(r"\w+|[^\w\s]", story.lower())
        tokenized_stories.append(tokens)
    return tokenized_stories

# Build vocabulary
def build_vocab(tokenized_stories):
    vocab = {}
    for story in tokenized_stories:
        for token in story:
            if token not in vocab:
                vocab[token] = len(vocab)
    vocab['<pad>'] = len(vocab)
    vocab['<sos>'] = len(vocab)
    vocab['<eos>'] = len(vocab)
    return vocab

# Convert stories to token IDs
def stories_to_ids(tokenized_stories, vocab):
    input_ids = []
    labels = []
    for story in tokenized_stories:
        story_ids = [vocab['<sos>']] + [vocab[token] for token in story] + [vocab['<eos>']]
        input_ids.append(story_ids[:-1])  # Input sequence without the <eos> token
        labels.append(story_ids[1:])  # Label sequence without the <bos> token
    return input_ids, labels

# Preprocess data
train_tokenized_stories = tokenize_stories(train_stories)
val_tokenized_stories = tokenize_stories(val_stories)
print(f"train_tokenized_stories is \n length {len(train_tokenized_stories)}\n{train_tokenized_stories[0]}")

vocab = build_vocab(train_tokenized_stories + val_tokenized_stories)
print(f"The length of vocab is \n{len(vocab)}\nThe vocab keys are \n{vocab.keys()}")

train_ids_stories, train_labels = stories_to_ids(train_tokenized_stories, vocab)
val_ids_stories, val_labels = stories_to_ids(val_tokenized_stories, vocab)
print(f"train_ids_stories is \n length {len(train_ids_stories)}\n{train_ids_stories[0]}")
print(f"train_labels is \n length {len(train_labels)}\n{train_labels[0]}")

train_tokenized_stories is 
 length 5
['one', 'day', ',', 'a', 'little', 'girl', 'named', 'lily', 'found', 'a', 'needle', 'in', 'her', 'room', '.', 'she', 'knew', 'it', 'was', 'difficult', 'to', 'play', 'with', 'it', 'because', 'it', 'was', 'sharp', '.', 'lily', 'wanted', 'to', 'share', 'the', 'needle', 'with', 'her', 'mom', ',', 'so', 'she', 'could', 'sew', 'a', 'button', 'on', 'her', 'shirt', '.', 'lily', 'went', 'to', 'her', 'mom', 'and', 'said', ',', '"', 'mom', ',', 'i', 'found', 'this', 'needle', '.', 'can', 'you', 'share', 'it', 'with', 'me', 'and', 'sew', 'my', 'shirt', '?', '"', 'her', 'mom', 'smiled', 'and', 'said', ',', '"', 'yes', ',', 'lily', ',', 'we', 'can', 'share', 'the', 'needle', 'and', 'fix', 'your', 'shirt', '.', '"', 'together', ',', 'they', 'shared', 'the', 'needle', 'and', 'sewed', 'the', 'button', 'on', 'lily', "'", 's', 'shirt', '.', 'it', 'was', 'not', 'difficult', 'for', 'them', 'because', 'they', 'were', 'sharing', 'and', 'helping', 'each', 'other', '.', 'a

In [46]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Ensure that the model dimension (d_model) is divisible by the number of heads
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        # Initialize dimensions
        self.d_model = d_model # Model's dimension
        self.num_heads = num_heads # Number of attention heads
        self.d_k = d_model // num_heads # Dimension of each head's key, query, and value
        
        # Linear layers for transforming inputs
        self.W_q = nn.Linear(d_model, d_model) # Query transformation
        self.W_k = nn.Linear(d_model, d_model) # Key transformation
        self.W_v = nn.Linear(d_model, d_model) # Value transformation
        self.W_o = nn.Linear(d_model, d_model) # Output transformation
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Calculate attention scores
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        # Apply mask if provided (useful for preventing attention to certain parts like padding)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        
        # Softmax is applied to obtain attention probabilities
        attn_probs = torch.softmax(attn_scores, dim=-1)
        
        # Multiply by values to obtain the final output
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        # Reshape the input to have num_heads for multi-head attention
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        # Combine the multiple heads back to original shape
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        
        # Combine heads and apply output transformation
        output = self.W_o(self.combine_heads(attn_output))
        return output
    
    
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))
    

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
    

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x
    

class StoryCompletionTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(StoryCompletionTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout, max_seq_length)
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, tgt):
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return tgt_mask

    def forward(self, tgt):
        tgt_mask = self.generate_mask(tgt)
        tgt_embedded = self.dropout(self.positional_encoding(self.embedding(tgt)))

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, None, None, tgt_mask)

        output = self.fc(dec_output)
        return output

# Hyperparameters
vocab_size = len(vocab)
d_model = 50
num_heads = 5
num_layers = 2
d_ff = 32
max_seq_length = 100
dropout = 0.1
epochs = 5

model = StoryCompletionTransformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Training
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    for story in train_ids_stories:
        tgt_input = torch.tensor(story[:-1]).unsqueeze(0)  # (1, seq_length)
        tgt_output = torch.tensor(story[1:]).unsqueeze(0)  # (1, seq_length)
        output = model(tgt_input)
        loss = criterion(output.contiguous().view(-1, vocab_size), tgt_output.contiguous().view(-1))
        loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")



TypeError: linear(): argument 'input' (position 1) must be Tensor, not NoneType

In [None]:
# Inference
model.eval()

def complete_story(start_sentence, max_length=100):
    start_tokens = [vocab['<sos>']] + [vocab[token] for token in tokenize_stories([start_sentence])[0]]
    story = torch.tensor(start_tokens).unsqueeze(0)  # (1, seq_length)
    for _ in range(max_length):
        output = model(story)
        next_token_probs = output[:, -1, :].squeeze().softmax(dim=0)
        next_token = torch.multinomial(next_token_probs, num_samples=1).item()
        if next_token == vocab['<eos>']:
            break
        story = torch.cat((story, torch.tensor([[next_token]])), dim=1)
    story_tokens = [vocab.inv_vocab[token.item()] for token in story.squeeze()]
    return ' '.join(story_tokens[1:-1])

start_sentence = "Once upon a time"
completed_story = complete_story(start_sentence)
print(completed_story)