In [1]:
import torch
import string
import torch.nn as nn
import torch.nn.functional as F
import time
import math
import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Current device:", device)

Current device: cuda


In [5]:
file = open("input.txt","r",encoding="utf-8")
text = file.read()
text = text.replace("\n" , " ").lower()
punctuation_chars = string.punctuation
text = ''.join(char for char in text if char not in punctuation_chars)

In [6]:
tokens = text.split(" ")
vocab = list(set(tokens))

In [7]:
for i in tqdm.tqdm(vocab):
    if tokens.count(i) < 5:
        tokens.remove(i)
vocab = list(set(tokens))

100%|██████████| 12849/12849 [00:58<00:00, 220.01it/s]


In [8]:
vocab_to_idx = {}
idx_to_vocab = {}
vocab_size = len(vocab)
for idx,v in enumerate(vocab):
    vocab_to_idx[v] = idx
    idx_to_vocab[idx] = v

In [9]:
tokens_num = []
for i in tokens:
    tokens_num.append(vocab_to_idx[i])

In [10]:
x = []
y = []
x_num = []
y_num = []
max_len = 10
for i in range(len(tokens) - max_len - 1):
    x.append(tokens[i:max_len+i])
    y.append(tokens[max_len+i])
    x_num.append(tokens_num[i:max_len+i])
    y_num.append(tokens_num[max_len+i])

In [11]:
for i in range(10):
    print(x[i])
    print(y[i])
for i in range(10):
    print(x_num[i])
    print(y_num[i])

['first', 'citizen', 'before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak']

['citizen', 'before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '']
all
['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', 'all']
speak
['we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', 'all', 'speak']
speak
['proceed', 'any', 'further', 'hear', 'me', 'speak', '', 'all', 'speak', 'speak']

['any', 'further', 'hear', 'me', 'speak', '', 'all', 'speak', 'speak', '']
first
['further', 'hear', 'me', 'speak', '', 'all', 'speak', 'speak', '', 'first']
citizen
['hear', 'me', 'speak', '', 'all', 'speak', 'speak', '', 'first', 'citizen']
you
['me', 'speak', '', 'all', 'speak', 'speak', '', 'first', 'citizen', 'you']
are
['speak', '', 'all', 'speak', 'speak', '', 'first', 'citizen', 'you', 'are']
all
[3949, 6506, 2275, 1298, 5625, 2177, 4241, 2515, 6106, 2949]
0
[6506, 2275, 1298, 5625, 2177, 4241, 2515, 6106, 2949, 0]
2190
[2275, 1298, 5625, 2177, 4241, 251

In [12]:
dmodel = 512
heads = 4
batch_size = 32
max_len = 10
shape = (batch_size,max_len,dmodel)
sentence = torch.Tensor(x_num).long()
label = torch.Tensor(y_num).long()

In [13]:
batch = []
for i in range(sentence.shape[0]//32):
    if i == 0:
        batch.append([sentence[0:32],label[0:32]])
    else:
        batch.append([sentence[i*32:(i+1)*32],label[i*32:(i+1)*32]])

In [20]:
class PositionalEncoding(nn.Module):
    '''
    Converts the vector embedding of a batch of sequences to their positional encoding vectors.

    Arguments:
            shape : shape of embedding vector => tuple(batch_size, max_len, dmodel)
            device : device to perform the computation on (e.g., 'cpu' or 'cuda')

    Returns:
            positional encoded vector

    '''
    def __init__(self, shape, device='cpu'):
        super(PositionalEncoding, self).__init__()
        self.max_len = shape[1]
        self.dmodel = shape[2]
        self.device = device

        position = torch.arange(0, self.max_len, device=self.device).float().unsqueeze(1)

        div_term = torch.exp(torch.arange(0, self.dmodel, 2, device=self.device).float() * -(math.log(10000.0) / self.dmodel))

        pos_enc = torch.zeros((1, self.max_len, self.dmodel), device=self.device)
        pos_enc[0, :, 0::2] = torch.sin(position * div_term)
        pos_enc[0, :, 1::2] = torch.cos(position * div_term)

        self.pos_enc = pos_enc

    def forward(self, x):
        x = x + self.pos_enc[:, :x.size(1), :]
        return x


In [29]:


class MultiHeadAttention(nn.Module):
    '''
    Multi-Head Attention mechanism for transformer models.

    Arguments:
        dmodel: Dimension of the model
        heads: Number of attention heads

    Methods:
        forward(x): Perform multi-head attention on the input tensor x
    '''
    def __init__(self, dmodel, heads):
        super(MultiHeadAttention, self).__init__()

        self.dmodel = dmodel
        self.heads = heads
        self.head_size = dmodel // heads

        self.k_linear = nn.Linear(dmodel, dmodel)
        self.q_linear = nn.Linear(dmodel, dmodel)
        self.v_linear = nn.Linear(dmodel, dmodel)
        self.out_linear = nn.Linear(dmodel, dmodel)

    def split_heads(self, x, batch_size):
        '''
        Split the last dimension into (heads, head_size) and transpose to shape (batch_size, heads, seq_len, head_size).
        '''
        return x.view(batch_size, -1, self.heads, self.head_size).transpose(1, 2)

    def attention(self, k, q, v):
        '''
        Compute the attention weights and apply them to the value vectors.
        '''
        d_k = q.size(-1)
        scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32, device=q.device))
        attn = F.softmax(scores, dim=-1)
        return torch.matmul(attn, v)

    def forward(self, x):
        '''
        Perform the multi-head attention mechanism on the input tensor x.
        '''
        batch_size = x.size(0)

        K = self.split_heads(self.k_linear(x), batch_size)  # Key: What can I offer
        Q = self.split_heads(self.q_linear(x), batch_size)  # Query: What am I looking for
        V = self.split_heads(self.v_linear(x), batch_size)  # Value: What I actually offer

        attn_output = self.attention(K, Q, V)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.dmodel)

        return self.out_linear(attn_output)


In [15]:
class AddAndNorm(nn.Module):
    '''
    Add and Layer Normalization module for transformer models.

    Arguments:
        dmodel: Dimension of the model

    Methods:
        forward(x, residual): Add the input tensor x and the residual tensor, then apply layer normalization
    '''
    def __init__(self, dmodel):
        super(AddAndNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(dmodel)

    def forward(self, x, residual):
        '''
        Add the input tensor x and the residual tensor, then apply layer normalization.

        Arguments:
            x: Input tensor
            residual: Residual tensor to be added to the input tensor

        Returns:
            Tensor after addition and layer normalization
        '''
        return self.layer_norm(x + residual)


In [16]:
class FeedForward(nn.Module):
    '''
    Position-wise Feed-Forward Network for transformer models with dropout.

    Arguments:
        dmodel: Dimension of the model
        dropout: Dropout probability

    Methods:
        forward(x): Apply the feed-forward network with dropout on the input tensor x
    '''
    def __init__(self, dmodel, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(dmodel, dmodel)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dmodel, dmodel)

    def forward(self, x):
        '''
        Apply the feed-forward network with dropout on the input tensor x.

        Arguments:
            x: Input tensor

        Returns:
            Tensor after applying the feed-forward network and dropout
        '''
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


In [113]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    '''
    Transformer Encoder implementation.

    Arguments:
        vocab_size: Size of the vocabulary
        shape: Shape of the input tensor (batch_size, max_len, dmodel)
        heads: Number of attention heads

    Methods:
        forward(x): Forward pass through the encoder
    '''
    def __init__(self, vocab_size, shape, heads=4):
        super(Encoder, self).__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, shape[2])
        self.positional_encoding = PositionalEncoding(shape,device=device)
        self.multi_headed_attention = MultiHeadAttention(shape[2], heads)
        self.add_and_norm1 = AddAndNorm(shape[2])
        self.feed_forward = FeedForward(dmodel=shape[2])
        self.add_and_norm2 = AddAndNorm(shape[2])
        self.linear = nn.Linear(shape[2], 512)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        out = self.token_embedding_table(x)
        residual = self.positional_encoding(out)
        out = self.multi_headed_attention(residual)

        residual = self.add_and_norm1(out, residual)

        out = self.feed_forward(residual)
        out = self.add_and_norm2(out, residual)

        out = self.linear(out)

        return out


In [125]:


class Pretraining(nn.Module):
    '''
    Pretraining model for next word prediction using a transformer encoder.

    Arguments:
        vocab_size: Size of the vocabulary
        shape: Shape of the input tensor (batch_size, max_len, dmodel)
        heads: Number of attention heads

    Methods:
        forward(x): Forward pass through the pretraining model
        predict_next_word(x): Predict the next word for the input sequence
    '''
    def __init__(self, vocab_size, shape, heads=4):
        super(Pretraining, self).__init__()
        self.encoder = Encoder(vocab_size, shape, heads)
        self.linear = nn.Linear(shape[2] * shape[1], vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        out = self.encoder(x)
        out = out.view(out.size(0), -1) #torch.Size([Batch,time*dmodel])
        out = self.linear(out)
        return out


In [126]:

model = Pretraining(vocab_size,shape)
criterition = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
import random
random.shuffle(batch)
model = model.to(device)


for epoch in range(10):
    losses = []
    running_loss = 0.0
    model.train()

    for b in tqdm.tqdm(batch):
        inputs, targets = b[0].to(device), b[1].to(device)
        optimizer.zero_grad()
        res = model(inputs)
        loss  = criterition(res ,targets)
        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        losses.append(loss)

    # Calculate and print the average loss for the epoch
    average_loss = running_loss / len(batch)
    print(f'Epoch {epoch + 1}, Average Loss: {average_loss}')

100%|██████████| 6259/6259 [01:49<00:00, 57.42it/s]


Epoch 1, Average Loss: 6.145278875735225


100%|██████████| 6259/6259 [01:49<00:00, 57.30it/s]


Epoch 2, Average Loss: 5.546202123688897


100%|██████████| 6259/6259 [01:49<00:00, 57.37it/s]


Epoch 3, Average Loss: 4.801016961448317


100%|██████████| 6259/6259 [01:49<00:00, 57.35it/s]


Epoch 4, Average Loss: 4.067565264879352


100%|██████████| 6259/6259 [01:49<00:00, 57.39it/s]


Epoch 5, Average Loss: 3.4987538750321363


100%|██████████| 6259/6259 [01:48<00:00, 57.75it/s]


Epoch 6, Average Loss: 3.064599224145581


100%|██████████| 6259/6259 [01:48<00:00, 57.63it/s]


Epoch 7, Average Loss: 2.7542555458840137


100%|██████████| 6259/6259 [01:48<00:00, 57.58it/s]


Epoch 8, Average Loss: 2.508637972791501


100%|██████████| 6259/6259 [01:49<00:00, 57.10it/s]


Epoch 9, Average Loss: 2.2968764330826925


100%|██████████| 6259/6259 [01:48<00:00, 57.73it/s]

Epoch 10, Average Loss: 2.127277320919564





In [130]:
#softmax res
res = nn.Softmax(dim=-1)(res)

In [164]:
sentence = 'we welcome to the book of writing making working can'
tokens = sentence.split(" ")
tokens_num = []
for i in tokens:
    tokens_num.append(vocab_to_idx[i])

out = model(torch.tensor(tokens_num).unsqueeze(0).to(device))
out = nn.Softmax(dim=-1)(out)
out = torch.argmax(out,dim=-1).item()
print(idx_to_vocab[out])

it
