In [1]:
import requests

# Load the text from a local file
def load_text_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Load the text from a URL
def load_text_from_url(url):
    response = requests.get(url)
    text = response.text.replace('\r\n', '\n')  # Normalize line endings
    return text

# Count unique characters in the text
def count_unique_chars(text):
    unique_chars = set(text)
    # Num of unique characters
    vocab_size = len(unique_chars)
    return vocab_size, unique_chars

# List of file paths or URLs
local_file_1 = 'pg100.txt'
local_file_2 = 'pg2600.txt'
local_file_3 = 'pg766.txt'
url_file_1 = 'https://www.gutenberg.org/cache/epub/100/pg100.txt'
url_file_2 = 'https://www.gutenberg.org/cache/epub/2600/pg2600.txt'
url_file_3 = 'https://www.gutenberg.org/cache/epub/766/pg766.txt'

if 'google.colab' in str(get_ipython()):
    file_path_list = [url_file_1, url_file_2, url_file_3]
else:
    file_path_list = [local_file_1, local_file_2, local_file_3]
text_list = []
vocab_size_list = []
unique_chars_list = []

for file_path in file_path_list:
    if file_path.startswith('http'):
        print(f'Loading text from URL: {file_path}')
        text = load_text_from_url(file_path)
    else:
        print(f'Loading text from file: {file_path}')
        text = load_text_from_file(file_path)
    vocab_size, unique_chars = count_unique_chars(text)
    text_list.append(text)
    vocab_size_list.append(vocab_size)
    unique_chars_list.append(unique_chars)

print(f'Vocabulary size for each text: {vocab_size_list}')
print(f'Unique characters for each text: {unique_chars_list}')
print(f'Example unique characters: {unique_chars_list[0]}')

Loading text from file: pg100.txt
Loading text from file: pg2600.txt
Loading text from file: pg766.txt
Vocabulary size for each text: [107, 112, 90]
Unique characters for each text: [{'K', 'J', '3', '—', '1', 'w', 'S', "'", 'À', '7', 'Q', 'k', '/', 'h', '’', 'F', 'L', '8', 'R', 'a', '%', 'ë', 'o', 'à', 'Æ', '\n', '“', 'r', 'O', 'G', 'æ', '\t', 'H', 'â', 'U', 'X', 'b', 'l', 'œ', '$', 'g', 'y', '6', '0', '‘', '™', 'N', 'É', 'A', '”', '…', 'E', 's', '_', '?', 'V', '.', 'm', '!', 'M', 'é', '[', '2', 'z', 'i', 'Y', 'D', 'Ç', '&', 'Z', 'j', 't', 'd', 'f', 'C', ')', '\ufeff', ' ', ',', '9', 'ê', 'n', '(', 'î', 'P', 'W', '-', '4', 'c', 'v', '5', '•', 'q', 'ç', 'I', 'p', ';', ':', 'x', 'T', 'B', 'è', 'u', 'e', '#', '*', ']'}, {'K', 'Á', 'J', '3', '—', '1', 'á', 'w', 'S', 'À', '7', 'ô', 'ï', 'Q', 'k', '/', 'h', '’', 'F', 'L', '8', 'ú', 'R', 'a', '%', 'ë', 'o', 'à', '\n', 'ý', '“', 'r', 'O', 'G', 'æ', 'H', 'â', 'U', 'X', 'b', 'l', 'ö', 'ü', 'í', 'œ', '$', 'g', 'y', '6', '0', '‘', '™', 'N', 'É', '

In [2]:
# # Compare unique characters from local and URL text files
# local_file_path = 'pg2600.txt'
# url_file_path = 'https://www.gutenberg.org/cache/epub/2600/pg2600.txt'
#
# local_text = load_text_from_file(local_file_path)
# local_vocab_size, local_unique_chars = count_unique_chars(local_text)
# url_text = load_text_from_url(url_file_path)
# url_vocab_size, url_unique_chars = count_unique_chars(url_text)
#
# # Find the extra character(s)
# extra_chars_in_url = url_unique_chars - local_unique_chars
# extra_chars_in_local = local_unique_chars - url_unique_chars
# print(f'Extra characters in URL text: {extra_chars_in_url}')
# print(f'Extra characters in local text: {extra_chars_in_local}')

In [3]:
# Create a dictionary to map characters to indices and vice-versa
def create_char_mappings(unique_chars):
    char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
    index_to_char = {idx: char for idx, char in enumerate(unique_chars)}
    return char_to_index, index_to_char

vocab_size, unique_chars = vocab_size_list[0], unique_chars_list[0]
char_to_index, index_to_char = create_char_mappings(unique_chars)

print(f"Character to index mapping for first text: {char_to_index}")
print(f"Index to character mapping for first text: {index_to_char}")

Character to index mapping for first text: {'K': 0, 'J': 1, '3': 2, '—': 3, '1': 4, 'w': 5, 'S': 6, "'": 7, 'À': 8, '7': 9, 'Q': 10, 'k': 11, '/': 12, 'h': 13, '’': 14, 'F': 15, 'L': 16, '8': 17, 'R': 18, 'a': 19, '%': 20, 'ë': 21, 'o': 22, 'à': 23, 'Æ': 24, '\n': 25, '“': 26, 'r': 27, 'O': 28, 'G': 29, 'æ': 30, '\t': 31, 'H': 32, 'â': 33, 'U': 34, 'X': 35, 'b': 36, 'l': 37, 'œ': 38, '$': 39, 'g': 40, 'y': 41, '6': 42, '0': 43, '‘': 44, '™': 45, 'N': 46, 'É': 47, 'A': 48, '”': 49, '…': 50, 'E': 51, 's': 52, '_': 53, '?': 54, 'V': 55, '.': 56, 'm': 57, '!': 58, 'M': 59, 'é': 60, '[': 61, '2': 62, 'z': 63, 'i': 64, 'Y': 65, 'D': 66, 'Ç': 67, '&': 68, 'Z': 69, 'j': 70, 't': 71, 'd': 72, 'f': 73, 'C': 74, ')': 75, '\ufeff': 76, ' ': 77, ',': 78, '9': 79, 'ê': 80, 'n': 81, '(': 82, 'î': 83, 'P': 84, 'W': 85, '-': 86, '4': 87, 'c': 88, 'v': 89, '5': 90, '•': 91, 'q': 92, 'ç': 93, 'I': 94, 'p': 95, ';': 96, ':': 97, 'x': 98, 'T': 99, 'B': 100, 'è': 101, 'u': 102, 'e': 103, '#': 104, '*': 105,

In [4]:
import numpy as np

# One-hot encode a character based on the character index
def one_hot_encode(char, char_to_index, vocab_size):
    one_hot_vector = np.zeros(vocab_size)
    one_hot_vector[char_to_index[char]] = 1
    return one_hot_vector

test_char_a = 'a'
one_hot_vector = one_hot_encode(test_char_a, char_to_index, vocab_size)
print(f"One-hot encoding for '{test_char_a}': {one_hot_vector}")

One-hot encoding for 'a': [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [5]:
# Sample a short portion from the first book
text = text_list[0][60000:61000]
print(text)

 ill:
Some in their hawks and hounds, some in their horse.
And every humour hath his adjunct pleasure,
Wherein it finds a joy above the rest,
But these particulars are not my measure,
All these I better in one general best.
Thy love is better than high birth to me,
Richer than wealth, prouder than garments’ costs,
Of more delight than hawks and horses be:
And having thee, of all men’s pride I boast.
  Wretched in this alone, that thou mayst take,
  All this away, and me most wretched make.


                    92

But do thy worst to steal thyself away,
For term of life thou art assured mine,
And life no longer than thy love will stay,
For it depends upon that love of thine.
Then need I not to fear the worst of wrongs,
When in the least of them my life hath end,
I see, a better state to me belongs
Than that, which on thy humour doth depend.
Thou canst not vex me with inconstant mind,
Since that my life on thy revolt doth lie,
O what a happy title do I find,
Happy to have thy love, hap

In [None]:
from tqdm import tqdm

# Generate sequences of 32 characters and the next character as the target
def generate_sequences(text, char_to_index, sequence_length=32, stride=1):
    input_sequences = []
    target_characters = []

    # for i in range(0, len(text) - sequence_length, stride):
    for i in tqdm(range(0, len(text) - sequence_length - 1, stride), desc="Generating sequences"):
        input_seq = text[i:i+sequence_length]
        # Target/next character
        target_char = text[i+sequence_length]

        # Convert input sequence to one-hot encoded vectors
        input_seq_encoded = [one_hot_encode(c, char_to_index, len(char_to_index)) for c in input_seq]
        input_sequences.append(input_seq_encoded)

        # One-hot encoding for the target character
        target_char_encoded = one_hot_encode(target_char, char_to_index, len(char_to_index))
        target_characters.append(target_char_encoded)

    return np.array(input_sequences), np.array(target_characters)


In [7]:
input_sequences, target_characters = generate_sequences(text, char_to_index)
print(f"Input sequences shape: {input_sequences.shape}")
print(f"Target characters shape: {target_characters.shape}")
print(input_sequences[2])
print(target_characters[35])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Device: {device}")

class CharRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(CharRNN, self).__init__()
        self.rnn = nn.RNN(vocab_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        out, hidden = self.rnn(x, hidden)
        out = self.dropout(out)
        out = self.fc(out)
        return F.log_softmax(out, dim=1), hidden

    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, hidden_size)


Device: cuda


In [9]:
# Load text and prepare data
text = text_list[0]
vocab_size, unique_chars = vocab_size_list[0], unique_chars_list[0]
char_to_index, index_to_char = create_char_mappings(unique_chars)

# Generate sequences
input_sequences, target_characters = generate_sequences(text, char_to_index, stride=64)
input_sequences = torch.tensor(input_sequences, dtype=torch.float32)  # Presuming these are one-hot encoded or normalized sequences
target_characters = torch.tensor(target_characters, dtype=torch.long)

# Create TensorDataset and DataLoader
dataset = TensorDataset(input_sequences, target_characters)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print("Input sequences shape:", input_sequences.shape)
print("Target characters shape:", target_characters.shape)

Generating sequences: 100%|██████████| 84042/84042 [00:02<00:00, 33853.33it/s]


Input sequences shape: torch.Size([84042, 32, 107])
Target characters shape: torch.Size([84042, 107])


In [13]:
# Hyperparameters
hidden_size = 256
learning_rate = 0.001
num_epochs = 10
batch_size = 64
dropout = 0.5

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize model, loss function, and optimizer
model = CharRNN(vocab_size, hidden_size, dropout).to(device)
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for input_seqs, target_seqs in train_loader:
        input_seqs, target_seqs = input_seqs.to(device), target_seqs.to(device)  # Move tensors to the correct device
        hidden = model.init_hidden(batch_size).to(device)  # Move hidden state to the correct device
        optimizer.zero_grad()
        output, hidden = model(input_seqs, hidden)
        print("Output shape:", output.shape)
        print("Target shape:", target_seqs.shape)
        loss = criterion(output.transpose(1, 2), target_seqs)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}, Training Loss: {total_loss / len(train_loader)}')

    # Validation
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for input_seqs, target_seqs in val_loader:
            input_seqs, target_seqs = input_seqs.to(device), target_seqs.to(device)  # Move tensors to the correct device
            hidden = model.init_hidden(batch_size).to(device)  # Move hidden state to the correct device
            output, hidden = model(input_seqs, hidden)
            val_loss += criterion(output.transpose(1, 2), target_seqs).item()
        print(f'Validation Loss: {val_loss / len(val_loader)}')


Output shape: torch.Size([64, 32, 107])
Target shape: torch.Size([64, 107])



KeyboardInterrupt

