In [1]:
from dataset_creation import create_dataset

# Create dataset from all token files in the directory
sequence_length = 100
inputs, targets, idx_to_char, char_to_idx = create_dataset('data/SPGC-tokens-2018-07-18/', sequence_length)

# Print statement to verify dataset creation
if len(inputs) > 0:
    first_seq = ''.join([idx_to_char[idx.item()] for idx in inputs[0]])
    target_char = idx_to_char[targets[0].item()]
    print(f"Dataset created successfully with {len(inputs)} sequences.")
    print(f"First sequence: '{first_seq}'")
    print(f"Target character for the first sequence: '{target_char}'")
else:
    print("No sequences were created. Check the dataset.")

In [5]:
import torch

# def create_dataset(processed_text, sequence_length=100):
#     characters = list(set(processed_text))
#     char_to_idx = {char: idx for idx, char in enumerate(characters)}
#     idx_to_char = {idx: char for idx, char in enumerate(characters)}

#     inputs = []
#     targets = []
#     for i in range(len(processed_text) - sequence_length):
#         input_seq = processed_text[i:i + sequence_length]
#         target_char = processed_text[i + sequence_length]
#         inputs.append([char_to_idx[char] for char in input_seq])
#         targets.append(char_to_idx[target_char])

#     return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long), idx_to_char, char_to_idx

# # Example usage
# sequence_length = 100
# inputs, targets, idx_to_char, char_to_idx = create_dataset(processed_text, sequence_length)

# Define chars using keys of char_to_idx
chars = list(char_to_idx.keys())

n_characters = len(chars)  # Number of unique characters

In [6]:
class SimpleRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        # Adjust the input size as needed
        self.i2h = torch.nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = torch.nn.Linear(input_size + hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

# Ensure the input size matches the number of features for each input
input_size = n_characters
n_hidden = 128
rnn = SimpleRNN(input_size, n_hidden, len(chars))

# Define the loss function (criterion) and optimizer
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.005)


In [7]:
def train(input_line_tensor, target_char_tensor):
    hidden = rnn.initHidden()
    rnn.zero_grad()

    # Process the entire input sequence
    for i in range(input_line_tensor.size(0)):
        output, hidden = rnn(input_line_tensor[i].view(1, -1), hidden)

    # Compute the loss
    loss = criterion(output, target_char_tensor.view(-1))
    loss.backward()
    optimizer.step()

    return output, loss.item()


In [8]:
# Training loop
for epoch in range(1, 1001):
    index = epoch % len(inputs)
    input_line_tensor = inputs[index]
    target_char_tensor = targets[index]

    # Convert to one-hot encoding
    input_line_tensor = torch.nn.functional.one_hot(input_line_tensor, num_classes=n_characters).type(torch.float)

    # Train and calculate loss
    output, loss = train(input_line_tensor, target_char_tensor)
    if epoch % 100 == 0:
        print(f'Epoch {epoch} Loss: {loss}')

    if epoch % 500 == 0:
        # Use the output to generate a character prediction
        topv, topi = output.topk(1)
        predicted_char = idx_to_char[topi[0].item()]
        target_char = idx_to_char[target_char_tensor.item()]

        # Retrieve the input sequence as characters
        input_seq = ''.join([idx_to_char[idx.item()] for idx in inputs[index]])
        
        print(f'Epoch {epoch}: Input Sequence: "{input_seq}"')
        print(f'Predicted "{predicted_char}", Target "{target_char}"')



Epoch 100 Loss: 3.2974753379821777
Epoch 200 Loss: 2.916781187057495
Epoch 300 Loss: 3.62199068069458
Epoch 400 Loss: 1.910137414932251
Epoch 500 Loss: 3.6185874938964844
Epoch 500: Input Sequence: "e world is very different now for man holds in his mortal hands the power to abolish all forms of hu"
Predicted " ", Target "m"
Epoch 600 Loss: 3.1989240646362305
Epoch 700 Loss: 3.1318187713623047
Epoch 800 Loss: 3.9293618202209473
Epoch 900 Loss: 0.7651025652885437
Epoch 1000 Loss: 3.552541732788086
Epoch 1000: Input Sequence: " been passed to a new generation of americans born in this century tempered by war disciplined by a "
Predicted "d", Target "h"
