In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the scaled dot-product attention function with masking
def scaled_dot_product_attention(query, key, value, mask=None):
    matmul_qk = torch.matmul(query, key.transpose(-2, -1))
    d_k = query.size(-1)
    scaled_attention_logits = matmul_qk / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    
    if mask is not None:
        scaled_attention_logits += (vocab_size * -1e9)
    
    attention_weights = F.softmax(scaled_attention_logits, dim=-1)
    output = torch.matmul(attention_weights, value)
    
    return output, attention_weights

# Modify MultiHeadAttention to accept mask
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads
        
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        
        self.fc = nn.Linear(d_model, d_model)
        
    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)
        
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        query = self.split_heads(self.wq(query), batch_size)
        key = self.split_heads(self.wk(key), batch_size)
        value = self.split_heads(self.wv(value), batch_size)
        
        output, attention_weights = scaled_dot_product_attention(query, key, value, mask)
        
        output = output.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
        output = self.fc(output)
        
        return output

# Modify TransformerEncoderLayer to accept mask
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model),
        )
        
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        attn_output = self.mha(x, x, x, mask)
        out1 = self.layernorm1(x + self.dropout(attn_output))
        
        ffn_output = self.ffn(out1)
        out2 = self.layernorm2(out1 + self.dropout(ffn_output))
        
        return out2

# Modify TransformerLanguageModel to accept mask
class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerLanguageModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.encoder = TransformerEncoderLayer(d_model, num_heads, d_ff, dropout)
        self.decoder = nn.Linear(d_model, vocab_size)
        
    def forward(self, x, mask=None):
        x = self.embedding(x)
        x = self.encoder(x, mask)
        logits = self.decoder(x)
        
        return logits

# Function to create padding masks
def create_padding_mask(seq):
    return (seq == vocab_size).unsqueeze(1).unsqueeze(1)

# Hyperparameters
vocab_size = 500
d_model = 512
num_heads = 8
d_ff = 10000
dropout = 0

# Example usage with masks
# model = TransformerLanguageModel(vocab_size, d_model, num_heads, d_ff, dropout)
# Assume `input_batch` is your input tensor of shape [batch_size, seq_length]
# input_batch = ...
# mask = create_padding_mask(input_batch)
# output = model(input_batch, mask=mask)


In [32]:
# folder_path = 'datasets/vietnamese/vietnamese/output/'
# number_of_files = 2
data = []

# counter = 0
# file_names = os.listdir(folder_path)[:number_of_files]
# for file_name in file_names:
#     print(file_name)
#     with open(folder_path + file_name, 'r', encoding='utf-8') as file:
#         data.append(file.read())
#     counter += 1
#     if counter % 10 == 0:
#         print(counter, 'files processed')

filename = 'datasets/vietnamese/vietnamese/output/Đợi Anh Đến Đông Tàn - Đường Châu.txt'

with open(filename, 'r', encoding='utf-8') as file:
    for line in file.readlines():
        data.append(line)





In [34]:
from tqdm import tqdm
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

chunk_len = 100  # Change chunk length to 50
process_data = []

print(data[3])

for text in data:
    chunks = [text[i:i + chunk_len] for i in range(0, len(text), chunk_len)]
    for chunk in chunks:
        if len(chunk) == chunk_len:
            process_data.append(chunk)
        else:
            # Pad the remaining sequence with spaces
            chunk += ' ' * (chunk_len - len(chunk))
            process_data.append(chunk)

tokenized_text = [list(line) for line in process_data]
max_len = max([len(line) for line in tokenized_text])
print('Max length of sentence:', max_len)

vocab = set(char for line in tokenized_text for char in line)
print('Vocab size:', len(vocab))

char_to_id = {char: idx for idx, char in enumerate(vocab)}
id_to_char = {idx: char for idx, char in enumerate(vocab)}
id_to_char[len(vocab)] = ' '  # Add space to the end of vocab

# Create input-output pairs
input_sequences = []
label_sequences = []

# Convert tokenized text to IDs
tokenized_ids = [[char_to_id[char] for char in line] for line in tokenized_text]

for sequence in tokenized_ids:
    for i in range(1, len(sequence)):  # start from 1 because we need a pair (input, label)
        input_seq = sequence[:i]
        label_seq = sequence[1:i + 1]  # Shifted by one position to the right
        
        # Pad input and label sequence with zeros to match max_len
        input_seq += [len(vocab)] * (max_len - len(input_seq))
        label_seq += [len(vocab)] * (max_len - len(label_seq))
        
        input_sequences.append(input_seq)
        label_sequences.append(label_seq)

# Convert lists to PyTorch tensors
input_sequences = torch.LongTensor(input_sequences)
label_sequences = torch.LongTensor(label_sequences)

# Create DataLoader for training
batch_size = 1000
train_data = TensorDataset(input_sequences, label_sequences)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)




Tập 1

Max length of sentence: 100
Vocab size: 152


In [35]:
print('Number of training samples:', len(input_sequences))
print(input_sequences[40])
print(id_to_char[103])
test_seq = input_sequences[230].tolist()
decode_sequence = [id_to_char[idx] for idx in test_seq]
print(''.join(decode_sequence))

Number of training samples: 189981
tensor([150,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,
         69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,
         69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69,  69, 152,
        152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152,
        152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152,
        152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152,
        152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152, 152,
        152, 152])
?
Đợi Anh Đến Đông Tàn
                                                                               


In [36]:
from torch import optim
from torch.utils.data import DataLoader

# Initialize model, loss, and optimizer
model = TransformerLanguageModel(vocab_size, d_model, num_heads, d_ff, dropout)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
epochs = 5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Assuming train_loader yields batches of shape [batch_size, sequence_length]
for epoch in range(epochs):
    for batch in train_loader:
        input_data, labels = batch
        input_data, labels = input_data.to(device), labels.to(device)

        # print(input_data.shape, labels.shape)

    
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_data)

        outputs_reshaped = outputs.view(-1, vocab_size)  # Reshape to [64*50, 500]
        labels_reshaped = labels.view(-1)  # Reshape to [64*50]

        
        # print("Output shape:", outputs.shape)  # Should print something like [64, 100, 110]
        # print("Labels shape:", labels.shape)  # Should print something like [64, 100]

        # Reshape for the loss function
        outputs_reshaped = outputs.view(-1, vocab_size)  # Reshape to [64*100, 110]
        labels_reshaped = labels.view(-1)  # Reshape to [64*100]

        # Compute loss
        loss = criterion(outputs_reshaped, labels_reshaped)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    print('Epoch:', epoch+1, 'Loss:', loss.item())


RuntimeError: unique_by_key: failed to synchronize: cudaErrorMemoryAllocation: out of memory

In [21]:
def generate_text(start_text, generate_length, model, char_to_id, id_to_char):
    model.eval()  # Set the model to evaluation mode
    
    # Convert starting text to tensor
    input_text = [char_to_id[c] for c in start_text]
    input_tensor = torch.tensor([input_text]).to(device)
    
    # Initialize generated text with the start_text
    generated_text = start_text
    
    with torch.no_grad():  # No need to track the gradients
        for _ in range(generate_length):
            # Forward pass
            output = model(input_tensor)
            
            # Get the predicted token (we take the last token here)
            probabilities = nn.functional.softmax(output[0, -1, :], dim=0)
            predicted_token = torch.multinomial(probabilities, 1).item()
            
            # Append predicted character to the generated text
            generated_text += id_to_char[predicted_token]
            
            # Add the new token to the input sequence
            new_input = torch.tensor([[predicted_token]]).to(device)
            input_tensor = torch.cat([input_tensor, new_input], 1)

    return generated_text

# Define some starting text and the length of generated text
start_text = "Đám "
generate_length = 100

# Assuming `model` is your trained model, and char_to_id and id_to_char are your dictionaries
generated_text = generate_text(start_text, generate_length, model, char_to_id, id_to_char)
print(generated_text)


Đám lêu Đámám Đám lêu lêu lêu lêu lêu lêu lêu lêu lêu lêu c lêu m, lêu lêu lêu m, đi lêu lêu lêu lêu lêu
