In [1]:
from dataset_creation import TextDataset
from torch.utils.data import DataLoader

# Instantiate the dataset
text_dataset = TextDataset(directory='data/SPGC-tokens-2018-07-18/', sequence_length=100)
print(f"Dataset created with {len(text_dataset)} sequences.")

# Create a DataLoader without a sampler
dataloader = DataLoader(text_dataset, batch_size=1)

# Iterate over a few batches and print their contents
for i, (inputs) in enumerate(dataloader):
    if i >= 2:  # Adjust this value to see more/less batches
        break

    print(f"\nBatch {i+1}")
    print(f"Inputs shape: {inputs.shape}")

    # Optionally print the actual sequences (comment out if too verbose)
    sequence = ''.join([text_dataset.idx_to_char[int(idx)] for idx in inputs[0]])
    # target = text_dataset.idx_to_char[int(targets[0])]
    print(f"Sequence: {sequence}")


Dataset created with 18422222637 sequences.

Batch 1
Inputs shape: torch.Size([1, 100])
Sequence: d or read of cremation i had had the misfortune to break my slate a few days before and the biggest 

Batch 2
Inputs shape: torch.Size([1, 100])
Sequence: ut of the in years answered cousin molly belle there is another road from her house to where everyda


In [2]:
import torch

# def create_dataset(processed_text, sequence_length=100):
#     characters = list(set(processed_text))
#     char_to_idx = {char: idx for idx, char in enumerate(characters)}
#     idx_to_char = {idx: char for idx, char in enumerate(characters)}

#     inputs = []
#     targets = []
#     for i in range(len(processed_text) - sequence_length):
#         input_seq = processed_text[i:i + sequence_length]
#         target_char = processed_text[i + sequence_length]
#         inputs.append([char_to_idx[char] for char in input_seq])
#         targets.append(char_to_idx[target_char])

#     return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long), idx_to_char, char_to_idx

# # Example usage
# sequence_length = 100
# inputs, targets, idx_to_char, char_to_idx = create_dataset(processed_text, sequence_length)

# Define chars using keys of char_to_idx
chars = list(text_dataset.char_to_idx.keys())

n_characters = len(chars)  # Number of unique characters
print(f"Number of unique characters: {n_characters}")
print(f"Characters: {chars}")

Number of unique characters: 70
Characters: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ',', '.', ';', "'", '"', '?', '!', ' ']


In [3]:
class SimpleRNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = torch.nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = torch.nn.Linear(input_size + hidden_size, output_size)
        self.softmax = torch.nn.LogSoftmax(dim=1)  # Update dim to 1 for batch processing

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), dim=1)  # Change dimension to 1

        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

# Ensure the input size matches the number of features for each input
input_size = n_characters
n_hidden = 128
rnn = SimpleRNN(input_size, n_hidden, len(chars))

# Define the loss function (criterion) and optimizer
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.005)



# Apply Gradient Clipping
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.005)
torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1.0)  # Clip gradients during training


tensor(0.)

In [4]:
def train(input_line_tensor, target_char_tensor):
    hidden = rnn.initHidden()  # Pass batch_size to initHidden
    rnn.zero_grad()

    # Process the entire input sequence
    output, hidden = rnn(input_line_tensor, hidden)  # No need for loop here

    # Modify the target tensor shape
    target_char_tensor = target_char_tensor.view(-1)

    # Compute the loss
    loss = criterion(output.view(-1, len(chars)), target_char_tensor)
    loss.backward()
    optimizer.step()

    return output, loss.item()

In [6]:
# Training loop
for epoch in range(1, 101):
    for batch_idx, (inputs) in enumerate(dataloader):
        # Initialize variables to store the history and predicted characters for each batch
        history = []
        predicted_chars = []

        input_line_tensor = inputs[0]  # Get the first character
        print(f'input_line_tensor.shape: {input_line_tensor.shape}')
        for char_idx in range(input_line_tensor.shape[0]):
            # Convert to one-hot encoding for each character
            hot_input_char_tensor = torch.nn.functional.one_hot(input_line_tensor[char_idx], num_classes=n_characters).type(torch.float)
            
            # Train and calculate loss for each character
            target_char = input_line_tensor[char_idx].unsqueeze(0)
            output, loss = train(hot_input_char_tensor.unsqueeze(0), target_char)
           
            if batch_idx % 100 == 0:
                print(f'Epoch {epoch}, Batch {batch_idx}, Char {char_idx} Loss: {loss}')

            if batch_idx % 100 == 0:
                # Use the output to generate a character prediction
                topv, topi = output.topk(1, dim=1)  # Change dim to 1
                predicted_char = text_dataset.idx_to_char[topi[0, 0].item()]
                target_char = text_dataset.idx_to_char[target_char.item()]

                # Append the current character and prediction to their respective lists
                history.append(target_char)
                predicted_chars.append(predicted_char)

                # Display the summarized history
                history_str = ''.join(history)
                predicted_str = ''.join(predicted_chars)
                print(f'Epoch {epoch}, Batch {batch_idx}, Char {char_idx}: History: "{history_str}", Predicted: "{predicted_str}"')


input_line_tensor.shape: torch.Size([100])
Epoch 1, Batch 0, Char 0 Loss: 0.0
Epoch 1, Batch 0, Char 0: History: " ", Predicted: " "
Epoch 1, Batch 0, Char 1 Loss: 0.0
Epoch 1, Batch 0, Char 1: History: " l", Predicted: " l"
Epoch 1, Batch 0, Char 2 Loss: 0.0
Epoch 1, Batch 0, Char 2: History: " la", Predicted: " la"
Epoch 1, Batch 0, Char 3 Loss: 0.0
Epoch 1, Batch 0, Char 3: History: " lan", Predicted: " lan"
Epoch 1, Batch 0, Char 4 Loss: 0.0
Epoch 1, Batch 0, Char 4: History: " lang", Predicted: " lang"
Epoch 1, Batch 0, Char 5 Loss: 0.0
Epoch 1, Batch 0, Char 5: History: " langu", Predicted: " langu"
Epoch 1, Batch 0, Char 6 Loss: 0.0
Epoch 1, Batch 0, Char 6: History: " langui", Predicted: " langui"
Epoch 1, Batch 0, Char 7 Loss: 0.0
Epoch 1, Batch 0, Char 7: History: " languid", Predicted: " languid"
Epoch 1, Batch 0, Char 8 Loss: 0.0
Epoch 1, Batch 0, Char 8: History: " languidl", Predicted: " languidl"
Epoch 1, Batch 0, Char 9 Loss: 0.0
Epoch 1, Batch 0, Char 9: History: " lan

KeyboardInterrupt: 