In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import sentencepiece as spm

from torch.utils.data import Dataset, DataLoader

import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
full_train_df = pd.read_json('train_data.json', lines=True)

In [3]:
full_train_df.head()

Unnamed: 0,text
0,"One day, a little girl named Lily found a need..."
1,"Once upon a time, there was a little car named..."
2,"One day, a little fish named Fin was swimming ..."
3,"Once upon a time, in a land full of trees, the..."
4,"Once upon a time, there was a little girl name..."


In [4]:
train_df = full_train_df[:1]
train_df

Unnamed: 0,text
0,"One day, a little girl named Lily found a need..."


In [6]:
def generate_inputs_and_labels(df, sp):
    inputs = []
    labels = []

    for index, row in df.iterrows():
        text = row['text']
        tokens = text.split()  # Tokenize the text by splitting on whitespace
        
        input_sequence = tokens # Generate input by adding start of sentence token at the beginning
        label_sequence = tokens # Generate label by adding end of sentence token at the end

        inputs.append(input_sequence)
        labels.append(label_sequence)

    # Convert input and label sequences to strings
    input_strings = [' '.join(sequence) for sequence in inputs]
    label_strings = [' '.join(sequence) for sequence in labels]

    # Tokenize input strings and add <sos> token at the beginning
    tokenized_inputs = []
    input_ids = []
    for sequence in input_strings:
        tokenized_sequence = sp.encode_as_pieces(sequence)
        tokenized_sequence = ['<sos>'] + tokenized_sequence  # Add <sos> token manually
        input_ids.append([sp.piece_to_id('<sos>')] + sp.encode_as_ids(sequence))  # Get token IDs
        tokenized_inputs.append(tokenized_sequence)

    # Tokenize label strings and add </sos> token at the end
    tokenized_labels = []
    label_ids = []
    for sequence in label_strings:
        tokenized_sequence = sp.encode_as_pieces(sequence)
        tokenized_sequence.append('</sos>')  # Add </sos> token manually at the end
        label_ids.append(sp.encode_as_ids(sequence) + [sp.piece_to_id('</sos>')])  # Get token IDs
        tokenized_labels.append(tokenized_sequence)

    # Print tokenized input and label sequences
    for i in range(len(inputs)):
        print("Input Text:", input_strings[i])
        print("Tokenized Input:", tokenized_inputs[i])
        print("Input IDs:", input_ids[i])
        print("Label Text:", label_strings[i])
        print("Tokenized Label:", tokenized_labels[i])
        print("Label IDs:", label_ids[i])

    return input_ids, label_ids


In [8]:
# Example usage
sp = spm.SentencePieceProcessor()
sp.load('small_m.model')

source_ids, target_ids = generate_inputs_and_labels(train_df, sp)

Input Text: One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt. Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt." Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.
Tokenized Input: ['<sos>', '▁one', '▁day', ',', '▁a', '▁little', '▁girl', '▁name', 'd', '▁lily', '▁found', '▁a', '▁needle', '▁in', '▁her', '▁room', '.', '▁she', '▁knew', '▁it', '▁was', '▁difficult', '▁to', '▁play', '▁with', '▁it', '▁', 'because', '▁it', '▁was', '▁sharp', '.', '▁lily', '▁

In [9]:
# Create a custom dataset
class CustomDataset(Dataset):
    def __init__(self, source_ids, target_ids):
        self.source_ids = source_ids
        self.target_ids = target_ids
        
    def __len__(self):
        return len(self.source_ids)
    
    def __getitem__(self, idx):
        source_sequence = self.source_ids[idx]
        target_sequence = self.target_ids[idx]
        return source_sequence, target_sequence

# Transformer

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import gensim
import numpy as np
import random
import math

# Define a custom dataset class to process data from DataFrame
import torch
from torch.utils.data import Dataset

class DataFrameDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
        self.sos_token = '<sos>'
        self.eos_token = '</sos>'

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        input_seq, target_seq = self.process_text(row['text'])
        return input_seq, target_seq

    def process_text(self, text):
        tokens = self.tokenizer.encode_as_pieces(text)
        input_seq = [self.sos_token] + tokens[:-1]
        target_seq = tokens
        return input_seq, target_seq

    def get_tokens_for_row(self, idx):
        row = self.df.iloc[idx]
        input_seq, target_seq = self.process_text(row['text'])
        return input_seq, target_seq

def collate_fn(batch):
    src_seqs, tgt_seqs = zip(*batch)
    src_seqs = pad_sequence(src_seqs, padding_value=0, batch_first=True)
    tgt_seqs = pad_sequence(tgt_seqs, padding_value=0, batch_first=True)
    return src_seqs, tgt_seqs

# Load tokenizer and embedding model
def load_tokenizer():
    # Load your tokenizer model here
    sp = spm.SentencePieceProcessor()
    sp.load('small_m.model')
    return sp

def load_embedding_model():
    # Load your embedding model here
    return Word2Vec.load("small_word2vec.model")

# Define embedding layer
class PretrainedEmbeddingLayer(nn.Module):
    def __init__(self, embedding_model):
        super(PretrainedEmbeddingLayer, self).__init__()
        self.embedding_model = embedding_model
        
    def forward(self, x):
        embeddings = []
        for idx in x:
            token = tokenizer.index_word.get(idx.item(), '')  
            if token in self.embedding_model:
                embeddings.append(torch.tensor(self.embedding_model[token]))
            else:
                embeddings.append(torch.zeros(self.embedding_model.vector_size))
        return torch.stack(embeddings)


In [326]:
# Load and process data from DataFrame
df = train_df[:1] # Load your DataFrame here

# Load tokenizer and embedding model
tokenizer = load_tokenizer()
embedding_model = load_embedding_model()

# Define embedding layer using pre-trained embeddings
embedding_layer = PretrainedEmbeddingLayer(embedding_model)

text = df['text'][0]
print(text)

One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.


In [13]:
# Create an instance of DataFrameDataset
dataset = DataFrameDataset(df, tokenizer)

# Choose an index to print tokens for
index = 0  # Change this to any index you want to inspect

# Get the tokens for the chosen index
source_tokens, target_tokens = dataset.get_tokens_for_row(index)

# Print the tokens
print("Source Tokens:", source_tokens)
print("Target Tokens:", target_tokens)


Source Tokens: ['<sos>', '▁one', '▁day', ',', '▁a', '▁little', '▁girl', '▁name', 'd', '▁lily', '▁found', '▁a', '▁needle', '▁in', '▁her', '▁room', '.', '▁she', '▁knew', '▁it', '▁was', '▁difficult', '▁to', '▁play', '▁with', '▁it', '▁', 'because', '▁it', '▁was', '▁sharp', '.', '▁lily', '▁wanted', '▁to', '▁share', '▁the', '▁needle', '▁with', '▁her', '▁mom', ',', '▁so', '▁she', '▁could', '▁sew', '▁a', '▁button', '▁on', '▁her', '▁shirt', '.', '\n\n', 'lily', '▁went', '▁to', '▁her', '▁mom', '▁and', '▁said', ',', '▁"', 'mom', ',', '▁i', '▁found', '▁this', '▁needle', '.', '▁can', '▁you', '▁share', '▁it', '▁with', '▁me', '▁and', '▁sew', '▁my', '▁shirt', '?"', '▁her', '▁mom', '▁smiled', '▁and', '▁said', ',', '▁"', 'yes', ',', '▁lily', ',', '▁we', '▁can', '▁share', '▁the', '▁needle', '▁and', '▁fix', '▁your', '▁shirt', '."', '\n\n', 'to', 'ge', 'ther', ',', '▁they', '▁shared', '▁the', '▁needle', '▁and', '▁sew', 'ed', '▁the', '▁button', '▁on', '▁lily', "'", 's', '▁shirt', '.', '▁it', '▁was', '▁not',

In [14]:
# Define a function to get embeddings and similar words
def get_embedding_and_similar_words(word):
    # Get embeddings of the word
    encoded_word = tokenizer.encode_as_pieces(word)
    word_str = ''.join(encoded_word)
    
    if word_str in embedding_model.wv.key_to_index:
        # Get embedding
        embedding = embedding_model.wv[word_str]
        #print(f'Embedding for "{word}": {embedding}')

        # Get similar words
        similar_words = embedding_model.wv.most_similar(word_str)
        print(f'Similar words to "{word}": {similar_words}')
    else:
        print(f'Word "{word}" not found in the embedding model.')


# Example usage
word_to_check = 'girl'
get_embedding_and_similar_words(word_to_check)

Similar words to "girl": [('▁boy', 0.9662375450134277), ('▁name', 0.863869845867157), ('▁years', 0.8347034454345703), ('▁old', 0.8179454207420349), ('▁three', 0.8172581195831299), ('▁who', 0.8083735108375549), ('▁live', 0.7911251187324524), ('▁little', 0.7740851640701294), ('▁nosy', 0.7686184048652649), ('d', 0.7683912515640259)]


# Transformer

In [283]:

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Ensure that the model dimension (d_model) is divisible by the number of heads
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        # Initialize dimensions
        self.d_model = d_model # Model's dimension
        self.num_heads = num_heads # Number of attention heads
        self.d_k = d_model // num_heads # Dimension of each head's key, query, and value
        
        # Linear layers for transforming inputs
        self.W_q = nn.Linear(d_model, d_model) # Query transformation
        self.W_k = nn.Linear(d_model, d_model) # Key transformation
        self.W_v = nn.Linear(d_model, d_model) # Value transformation
        self.W_o = nn.Linear(d_model, d_model) # Output transformation
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Calculate attention scores
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        # Apply mask if provided (useful for preventing attention to certain parts like padding)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        
        # Softmax is applied to obtain attention probabilities
        attn_probs = torch.softmax(attn_scores, dim=-1)
        
        # Multiply by values to obtain the final output
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        # Reshape the input to have num_heads for multi-head attention
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        # Combine the multiple heads back to original shape
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        
        # Combine heads and apply output transformation
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [284]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [295]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        self.d_model = d_model
        self.max_seq_length = max_seq_length
        pe = self._init_pe(max_seq_length, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_length = x.size(1)
        pe = self.pe[:seq_length, :] if seq_length <= self.max_seq_length else self._init_pe(seq_length, self.d_model)
        return x + pe

    def _init_pe(self, seq_length, d_model):
        pe = torch.zeros(seq_length, d_model)
        position = torch.arange(0, seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        return pe


In [296]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [297]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, decoder_input, decoder_mask):
        attn_output = self.self_attn(decoder_input, decoder_input, decoder_input, decoder_mask)
        x = self.norm1(decoder_input + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [298]:
class Transformer(nn.Module):
    def __init__(self, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, tgt):
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return tgt_mask

    def forward(self, decoder_input):
        decoder_mask = self.generate_mask(decoder_input)
        decoder_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(decoder_input)))

        dec_output = decoder_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, decoder_mask)

        output = self.fc(dec_output)
        return output


In [299]:
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

#transformer = Transformer(src_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)
#transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Initialize the model
transformer = Transformer(tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)


In [303]:
print(transformer)

Transformer(
  (decoder_embedding): Embedding(5000, 512)
  (positional_encoding): PositionalEncoding()
  (decoder_layers): ModuleList(
    (0-5): 6 x DecoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (fc): Linear(in_features=512, out_features=5000, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [323]:
# Convert lists to tensors
source_data = torch.tensor(source_ids)
target_data = torch.tensor(target_ids)

# Print shapes
print("Source data shape:", source_data.shape)
print("Target data shape:", target_data.shape)

print(source_data)
print(target_data)

Source data shape: torch.Size([1, 165])
Target data shape: torch.Size([1, 165])
tensor([[   0,   38,   28,    6,    8,   37,   53,   86,   34,   31,  119,    8,
         1614,   21,   14,  198,    3,   11,  185,   12,    9, 1455,    7,   54,
           24,   12,   19,  230,   12,    9, 1316,    3,   31,   59,    7,  259,
            4, 1614,   24,   14,   43,    6,   23,   11,   94, 2599,    8, 1293,
           32,   14,  802,    3,   31,   68,    7,   14,   43,    5,   18,    6,
           16,  749,    6,   49,  119,  149, 1614,    3,   66,   25,  259,   12,
           24,  145,    5, 2599,  140,  802,   82,   14,   43,   76,    5,   18,
            6,   16,  257,    6,   31,    6,   96,   66,  259,    4, 1614,    5,
          524,  129,  802,   46,  104,    6,   13,  678,    4, 1614,    5, 2599,
           20,    4, 1293,   32,   31,   17,   15,  802,    3,   12,    9,   60,
         1455,   36,   64,   19,  230,   13,   50, 1901,    5,  820,  183,  125,
            3,  167,   13,  5

In [327]:

transformer = Transformer( tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)


criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

for epoch in range(1000):
    optimizer.zero_grad()
    output = transformer(source_data)
    
    # Adjust target_data to have the same length as model output
    target_data_adjusted = target_data[:, :output.size(1)]
    
    loss = criterion(output.contiguous().view(-1, tgt_vocab_size), target_data_adjusted.contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 8.684040069580078
Epoch: 2, Loss: 6.700284004211426
Epoch: 3, Loss: 5.907226085662842
Epoch: 4, Loss: 4.79470682144165
Epoch: 5, Loss: 4.811295509338379
Epoch: 6, Loss: 4.1880879402160645
Epoch: 7, Loss: 3.098968029022217
Epoch: 8, Loss: 2.5268588066101074
Epoch: 9, Loss: 2.1185877323150635
Epoch: 10, Loss: 1.527596354484558
Epoch: 11, Loss: 1.1251120567321777
Epoch: 12, Loss: 0.854336142539978
Epoch: 13, Loss: 0.6587713956832886
Epoch: 14, Loss: 0.5733023881912231
Epoch: 15, Loss: 0.4774659276008606
Epoch: 16, Loss: 0.4077913761138916
Epoch: 17, Loss: 0.33671510219573975
Epoch: 18, Loss: 0.2796070873737335
Epoch: 19, Loss: 0.2266368418931961
Epoch: 20, Loss: 0.19065441191196442
Epoch: 21, Loss: 0.15619532763957977
Epoch: 22, Loss: 0.12294583022594452
Epoch: 23, Loss: 0.10402785241603851
Epoch: 24, Loss: 0.08071884512901306
Epoch: 25, Loss: 0.06430551409721375
Epoch: 26, Loss: 0.05896078050136566
Epoch: 27, Loss: 0.042945388704538345
Epoch: 28, Loss: 0.03618289157748222

In [332]:
import sentencepiece as spm

def word_to_token_id(word, sp_model):
    # Convert word to token ID using SentencePiece
    return sp_model.piece_to_id(word)

def generate_text(model, sp_model, starting_word, ending_word, max_length=50):
    # Convert starting and ending words to token IDs
    starting_token_id = word_to_token_id(starting_word, sp_model)
    if starting_token_id is None:
        raise ValueError(f"Starting word '{starting_word}' not found in vocabulary.")
    ending_token_id = word_to_token_id(ending_word, sp_model)
    if ending_token_id is None:
        raise ValueError(f"Ending word '{ending_word}' not found in vocabulary.")
    
    generated_sequence = [starting_token_id]
    with torch.no_grad():
        for _ in range(max_length):
            input_tensor = torch.tensor([generated_sequence])
            output = model(input_tensor)
            predicted_token = output.argmax(-1)[:,-1].item()
            generated_sequence.append(predicted_token)
            if predicted_token == ending_token_id:
                break
    
    # Convert token IDs to words using SentencePiece
    generated_text = sp_model.decode_ids(generated_sequence)
    return generated_text


# Example usage:
starting_word = "needle in"
ending_word = "</sos>"
generated_sequence = generate_text(transformer, sp, starting_word, ending_word)
print("Generated sequence:", generated_sequence)

Generated sequence:  ⁇  day, a little girl named lily found a needle in her room. she knew it was difficult to play with it because it was sharp. lily wanted to share the needle with her mom, so she could sew a button on her shirt.


In [None]:
One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing 
the needle and fixing her shirt. They both felt happy because they had shared and worked together