In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np

# Fix the global PyTorch RNG seed so weight init and shuffling are repeatable
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f37a5c8d6f0>

In [2]:
class CustomDataset(Dataset):
    """Next-word prediction dataset built from whitespace-separated sentences.

    Every sentence contributes exactly one sample: the first ``window_size``
    tokens (as vocabulary indices) form the input, and the token that follows
    them is the target.

    Args:
        sentences: Iterable of sentence strings.
        word_dict: Mapping from token to integer index. It must contain the
            ``"<PAD>"`` token.
        window_size: Number of input tokens per sample.

    Raises:
        KeyError: If ``"<PAD>"`` is missing from ``word_dict``.
    """

    def __init__(self, sentences, word_dict, window_size=3):
        self.sentences = sentences
        self.word_dict = word_dict
        self.window_size = window_size
        self.PAD_TOKEN = "<PAD>"

        # Preprocessing: short sentences are left-padded, long ones truncated.
        self.X, self.y = self.make_batch()

    def make_batch(self):
        """Convert ``self.sentences`` into parallel lists of inputs and targets.

        Returns:
            A tuple ``(X, y)`` where ``X`` is a list of ``window_size``-long
            index lists and ``y`` is a list of target indices.
        """
        pad_idx = self.word_dict[self.PAD_TOKEN]
        _X = []
        _y = []

        for sentence in self.sentences:
            words = sentence.split()

            # Left-pad sentences that are shorter than the window.
            missing = self.window_size - len(words)
            if missing > 0:
                words = [self.PAD_TOKEN] * missing + words

            # Unknown tokens fall back to the padding index.
            X = [self.word_dict.get(word, pad_idx) for word in words[:self.window_size]]

            # The target is the token right after the window. When there is
            # none, or it is out of vocabulary, use the padding index so the
            # target is handled the same way as the inputs instead of raising.
            if len(words) > self.window_size:
                y = self.word_dict.get(words[self.window_size], pad_idx)
            else:
                y = pad_idx

            _X.append(X)
            _y.append(y)

        return _X, _y

    def __len__(self):
        """Return the number of samples (one per sentence)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of ``torch.long`` tensors ``(X, y)``."""
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.long)

In [3]:
# RNN model definition with embedding layer
class RNN(nn.Module):
    """Embedding -> single-layer ``nn.RNN`` -> linear classifier over the vocabulary.

    Args:
        vocab_size: Number of distinct tokens; used for both the embedding
            table and the output layer width.
        hidden_size: Width of the recurrent hidden state.
        embedding_dim: Width of the token embeddings (defaults to 128).
    """

    def __init__(self, vocab_size, hidden_size, embedding_dim=128):
        super().__init__()
        # Stored so init_hidden() does not depend on a notebook-level global.
        self.hidden_size = hidden_size
        # Embedding layer for converting word indices to vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)
        # Not applied in forward(): the model returns raw logits, which is what
        # nn.CrossEntropyLoss consumes. Kept so the attribute remains available.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, hidden, X):
        """Compute vocabulary logits for the token following each input sequence.

        Args:
            hidden: Initial hidden state, as produced by ``init_hidden``.
            X: Integer tensor of token indices with the batch dimension first
                and the sequence dimension second.

        Returns:
            Tensor of unnormalised scores with one row per batch element and
            ``vocab_size`` columns.
        """
        # Convert word indices to embedding vectors
        X = self.embedding(X)

        # nn.RNN is built with its default sequence-first layout, so swap the
        # batch and sequence dimensions.
        X = X.transpose(0, 1)

        hidden_vectors, hidden = self.rnn(X, hidden)
        # Output of the final time step
        last_hidden_vector = hidden_vectors[-1]

        output = self.fc(last_hidden_vector)
        return output

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state for ``batch_size`` sequences.

        The tensor has shape ``(1, batch_size, hidden_size)`` and is created
        with the same device and dtype as the model parameters.
        """
        return self.embedding.weight.new_zeros(1, batch_size, self.hidden_size)

In [4]:
# Training function for RNN
def train(model, dataloader, epochs, vocab_size):
    """Train ``model`` with Adam (lr=0.01) and cross-entropy loss.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and callable as
            ``model(hidden, batch_X)``, returning class logits.
        dataloader: Iterable yielding ``(batch_X, batch_y)`` tensor pairs.
        epochs: Number of full passes over ``dataloader``.
        vocab_size: Unused; accepted for backward compatibility with
            existing callers.

    Every 100th epoch the loss of that epoch's last batch is printed.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # predict() switches the model to eval mode; make sure a later call to
    # train() runs in training mode again.
    model.train()

    for epoch in tqdm(range(epochs)):
        # Reset each epoch so an empty dataloader cannot leave ``loss`` unbound.
        loss = None

        for batch_X, batch_y in dataloader:
            # A fresh zero hidden state per batch; the last batch may be smaller.
            hidden = model.init_hidden(batch_X.size(0))
            output = model(hidden, batch_X)

            loss = criterion(output, batch_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if epoch % 100 == 0 and loss is not None:
            print(f'Epoch: {epoch:04d}, Loss: {loss.item():.6f}')

In [12]:
# Prediction function for short sentences
def predict(model, input_sentence, word_dict, number_dict, window_size=3):
    """Predict the word that follows ``input_sentence``.

    The input is shaped like the training samples: it is reduced to its last
    ``window_size`` tokens and left-padded with ``"<PAD>"`` when shorter.
    Unknown tokens are mapped to the padding index.

    Args:
        model: Object exposing ``eval()``, ``init_hidden(batch_size)`` and
            callable as ``model(hidden, X)``, returning class logits.
        input_sentence: Whitespace-separated context string.
        word_dict: Mapping from token to index; must contain ``"<PAD>"``.
        number_dict: Mapping from index back to token.
        window_size: Number of context tokens fed to the model.

    Returns:
        A string of the form
        ``"Input Sentence: <input> -> Predicted Word: <word>"``.
    """
    model.eval()

    pad_idx = word_dict["<PAD>"]
    words = input_sentence.split()

    # Keep the most recent ``window_size`` tokens; they are the context that
    # immediately precedes the word being predicted.
    context = words[max(len(words) - window_size, 0):]
    input_indices = [word_dict.get(w, pad_idx) for w in context]
    # Left-pad short (or empty) inputs, mirroring the dataset preprocessing.
    input_indices = [pad_idx] * (window_size - len(input_indices)) + input_indices
    input_tensor = torch.tensor([input_indices], dtype=torch.long)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # Initial RNN hidden state (batch size is 1)
        hidden = model.init_hidden(1)
        output = model(hidden, input_tensor)

    # Index of the highest-scoring word
    predicted_idx = output.argmax(dim=1).item()
    predicted_word = number_dict[predicted_idx]

    return f"Input Sentence: {input_sentence} -> Predicted Word: {predicted_word}"

In [13]:
# Function to load dataset from file
def load_dataset(file_path):
    """Read a UTF-8 text file and return its non-empty lines as sentences.

    Leading and trailing whitespace is stripped from every line. Lines that
    are empty after stripping are skipped, so blank lines in the file do not
    become empty sentences.

    Args:
        file_path: Path of the text file, one sentence per line.

    Returns:
        List of sentence strings in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [sentence for sentence in stripped_lines if sentence]

In [24]:
# Load dataset and prepare word dictionaries
sentences = load_dataset('./dataset.txt')

# Build the vocabulary. The padding token is merged through a set union so it
# appears exactly once even if the corpus already contains the literal "<PAD>";
# a duplicate entry would leave vocab_size smaller than the largest index.
texts = sorted(set(" ".join(sentences).split()) | {"<PAD>"})

# Forward (token -> index) and reverse (index -> token) lookups
word_dict = {text: index for index, text in enumerate(texts)}
number_dict = dict(enumerate(texts))
vocab_size = len(word_dict)
hidden_size = 32
window_size = 3
batch_size = 32

# Initialize the dataset and DataLoader
dataset = CustomDataset(sentences, word_dict, window_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
print(f'Dataset length : {len(dataset)}')

Dataset length : 50


In [22]:
# Initialize and train the model
model = RNN(vocab_size=vocab_size, hidden_size=hidden_size)
train(
    model=model,
    dataloader=dataloader,
    epochs=1000,
    vocab_size=vocab_size,
)

  2%|▏         | 22/1000 [00:00<00:04, 218.30it/s]

Epoch: 0000, Loss: 4.433485


 12%|█▏        | 121/1000 [00:00<00:04, 175.94it/s]

Epoch: 0100, Loss: 0.001807


 23%|██▎       | 229/1000 [00:01<00:04, 177.54it/s]

Epoch: 0200, Loss: 0.000813


 32%|███▏      | 323/1000 [00:01<00:03, 186.75it/s]

Epoch: 0300, Loss: 0.000304


 43%|████▎     | 427/1000 [00:02<00:02, 207.40it/s]

Epoch: 0400, Loss: 0.000313


 53%|█████▎    | 531/1000 [00:02<00:01, 246.42it/s]

Epoch: 0500, Loss: 0.000168


 64%|██████▎   | 635/1000 [00:03<00:01, 253.83it/s]

Epoch: 0600, Loss: 0.000082


 74%|███████▍  | 745/1000 [00:03<00:00, 266.00it/s]

Epoch: 0700, Loss: 0.000108


 83%|████████▎ | 834/1000 [00:03<00:00, 284.76it/s]

Epoch: 0800, Loss: 0.000086


 96%|█████████▌| 956/1000 [00:04<00:00, 290.00it/s]

Epoch: 0900, Loss: 0.000085


100%|██████████| 1000/1000 [00:04<00:00, 227.33it/s]


In [23]:
# Predict for a given sentence
input_sentence = "I like to"
prediction_message = predict(
    model,
    input_sentence,
    word_dict,
    number_dict,
    window_size,
)
print(prediction_message)

Input Sentence: I like to -> Predicted Word: sing
