In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import re
import random

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/poems-100/poems-100.csv


In [5]:
# Load the dataset
poems_df = pd.read_csv(r'/kaggle/input/poems-100/poems-100.csv')
text_data = ' '.join(poems_df['text'].tolist())

# Basic preprocessing
def preprocess_text(text):
    # Remove special characters and extra whitespace
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

processed_text = preprocess_text(text_data)

# Tokenize the text into words
words = processed_text.split()
print(f"Total words in the dataset: {len(words)}")
print(f"Unique words: {len(set(words))}")

# Create vocabulary
vocab = sorted(set(words))
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}
vocab_size = len(vocab)
# print(f"word_to_idx: {word_to_idx}")
print(f"Vocabulary size: {vocab_size}")


Total words in the dataset: 25508
Unique words: 5158
Vocabulary size: 5158


In [7]:
# Create sequences for training
def create_sequences(words, seq_length):
    sequences = []
    targets = []
    for i in range(0, len(words) - seq_length):
        seq = words[i:i+seq_length]
        target = words[i+seq_length]
        sequences.append([word_to_idx[word] for word in seq])
        targets.append(word_to_idx[target])
    return np.array(sequences), np.array(targets)

seq_length = 10  # Length of input sequences
sequences, targets = create_sequences(words, seq_length)

# Convert to one-hot encoding
def to_one_hot(idx, vocab_size):
    one_hot = np.zeros((len(idx), vocab_size))
    for i, idx_val in enumerate(idx):
        one_hot[i, idx_val] = 1
    return one_hot

# Create PyTorch dataset
class PoemDataset(Dataset):
    def __init__(self, sequences, targets, vocab_size, one_hot=True):
        self.sequences = sequences
        self.targets = targets
        self.vocab_size = vocab_size
        self.one_hot = one_hot
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        if self.one_hot:
            sequence = to_one_hot(self.sequences[idx], self.vocab_size)
            sequence = torch.FloatTensor(sequence)
        else:
            sequence = torch.LongTensor(self.sequences[idx])
        
        target = torch.LongTensor([self.targets[idx]])
        return sequence, target

# Create dataset and dataloader
one_hot_dataset = PoemDataset(sequences, targets, vocab_size, one_hot=True)
one_hot_dataloader = DataLoader(one_hot_dataset, batch_size=64, shuffle=True)


In [8]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden=None):
        if hidden is None:
            batch_size = x.size(0)
            hidden = torch.zeros(1, batch_size, self.hidden_size).to(x.device)
        
        output, hidden = self.rnn(x, hidden)
        output = self.fc(output[:, -1, :])  # Take only the last output
        return output, hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size)

In [9]:
class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden=None):
        if hidden is None:
            batch_size = x.size(0)
            h0 = torch.zeros(1, batch_size, self.hidden_size).to(x.device)
            c0 = torch.zeros(1, batch_size, self.hidden_size).to(x.device)
            hidden = (h0, c0)
        
        output, hidden = self.lstm(x, hidden)
        output = self.fc(output[:, -1, :])  # Take only the last output
        return output, hidden
    
    def init_hidden(self, batch_size):
        h0 = torch.zeros(1, batch_size, self.hidden_size)
        c0 = torch.zeros(1, batch_size, self.hidden_size)
        return (h0, c0)


In [10]:
def train_model(model, dataloader, epochs=10, learning_rate=0.001):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, targets in dataloader:
            sequences = sequences.to(device)
            targets = targets.squeeze().to(device)
            
            # Forward pass
            outputs, _ = model(sequences)
            loss = criterion(outputs, targets)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}')
    
    return model

# Initialize and train the RNN model
input_size = vocab_size  # One-hot encoding size
hidden_size = 128
output_size = vocab_size  # Predict the next word

rnn_model = SimpleRNN(input_size, hidden_size, output_size)
rnn_model = train_model(rnn_model, one_hot_dataloader, epochs=20)

# Initialize and train the LSTM model
lstm_model = SimpleLSTM(input_size, hidden_size, output_size)
lstm_model = train_model(lstm_model, one_hot_dataloader, epochs=20)


Epoch 1/20, Loss: 6.9503
Epoch 2/20, Loss: 6.4194
Epoch 3/20, Loss: 6.1346
Epoch 4/20, Loss: 5.7807
Epoch 5/20, Loss: 5.4190
Epoch 6/20, Loss: 5.0581
Epoch 7/20, Loss: 4.6954
Epoch 8/20, Loss: 4.3294
Epoch 9/20, Loss: 3.9682
Epoch 10/20, Loss: 3.6035
Epoch 11/20, Loss: 3.2522
Epoch 12/20, Loss: 2.8995
Epoch 13/20, Loss: 2.5615
Epoch 14/20, Loss: 2.2402
Epoch 15/20, Loss: 1.9360
Epoch 16/20, Loss: 1.6645
Epoch 17/20, Loss: 1.4184
Epoch 18/20, Loss: 1.1947
Epoch 19/20, Loss: 1.0092
Epoch 20/20, Loss: 0.8424
Epoch 1/20, Loss: 6.9700
Epoch 2/20, Loss: 6.5289
Epoch 3/20, Loss: 6.3518
Epoch 4/20, Loss: 6.0960
Epoch 5/20, Loss: 5.7648
Epoch 6/20, Loss: 5.3879
Epoch 7/20, Loss: 4.9665
Epoch 8/20, Loss: 4.4995
Epoch 9/20, Loss: 4.0220
Epoch 10/20, Loss: 3.5475
Epoch 11/20, Loss: 3.0834
Epoch 12/20, Loss: 2.6302
Epoch 13/20, Loss: 2.2041
Epoch 14/20, Loss: 1.8147
Epoch 15/20, Loss: 1.4671
Epoch 16/20, Loss: 1.1623
Epoch 17/20, Loss: 0.9135
Epoch 18/20, Loss: 0.7041
Epoch 19/20, Loss: 0.5429
Epoc

In [11]:
def generate_text(model, seed_text, max_length=100, temperature=1.0):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()
    
    words = seed_text.lower().split()
    current_words = words[-seq_length:] if len(words) > seq_length else words
    
    # Pad if necessary
    if len(current_words) < seq_length:
        current_words = ['the'] * (seq_length - len(current_words)) + current_words
    
    # Convert to indices
    current_indices = [word_to_idx.get(word, 0) for word in current_words]
    
    generated_words = list(current_words)
    
    with torch.no_grad():
        for _ in range(max_length):
            # Convert to one-hot
            x = to_one_hot(current_indices, vocab_size)
            x = torch.FloatTensor(x).unsqueeze(0).to(device)
            
            # Forward pass
            output, _ = model(x)
            
            # Apply temperature
            output = output.div(temperature)
            
            # Sample from the output distribution
            probs = torch.softmax(output, dim=1).cpu().data.numpy().ravel()
            idx = np.random.choice(len(probs), p=probs)
            
            # Get the predicted word
            word = idx_to_word[idx]
            generated_words.append(word)
            
            # Update current indices
            current_indices = current_indices[1:] + [idx]
    
    return ' '.join(generated_words)

# Generate text with RNN
seed_text = "the rose is red"
generated_text_rnn = generate_text(rnn_model, seed_text, max_length=50)
print("RNN Generated Text:")
print(generated_text_rnn)

# Generate text with LSTM
generated_text_lstm = generate_text(lstm_model, seed_text, max_length=50)
print("\nLSTM Generated Text:")
print(generated_text_lstm)


RNN Generated Text:
the the the the the the the rose is red man voice and home they said air some bring you because they fair not nothing nothing cents coarse it flight something you ll here give thou separate be bear are in bosom and dead one s skeptic ten sweet and yet hour the morning and resign them and still my

LSTM Generated Text:
the the the the the the the rose is red roar the lo the rest under the fish in the crowd from the lives long still children thee my thee o just no flag ever loving way clothes my blow my yonder walls on the ears properties my bare scares my tread around in beat hush a green d upon


In [None]:
# Create dataset without one-hot encoding
embedding_dataset = PoemDataset(sequences, targets, vocab_size, one_hot=False)
embedding_dataloader = DataLoader(embedding_dataset, batch_size=64, shuffle=True)


In [None]:
class RNNWithEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super(RNNWithEmbedding, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden=None):
        embedded = self.embedding(x)
        
        if hidden is None:
            batch_size = x.size(0)
            hidden = torch.zeros(1, batch_size, self.hidden_size).to(x.device)
        
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output[:, -1, :])  # Take only the last output
        return output, hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size)


In [None]:
class LSTMWithEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super(LSTMWithEmbedding, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden=None):
        embedded = self.embedding(x)
        
        if hidden is None:
            batch_size = x.size(0)
            h0 = torch.zeros(1, batch_size, self.hidden_size).to(x.device)
            c0 = torch.zeros(1, batch_size, self.hidden_size).to(x.device)
            hidden = (h0, c0)
        
        output, hidden = self.lstm(embedded, hidden)
        output = self.fc(output[:, -1, :])  # Take only the last output
        return output, hidden
    
    def init_hidden(self, batch_size):
        h0 = torch.zeros(1, batch_size, self.hidden_size)
        c0 = torch.zeros(1, batch_size, self.hidden_size)
        return (h0, c0)


In [None]:
# Initialize and train the RNN model with embeddings
embedding_dim = 100
hidden_size = 128
output_size = vocab_size

rnn_embedding_model = RNNWithEmbedding(vocab_size, embedding_dim, hidden_size, output_size)
rnn_embedding_model = train_model(rnn_embedding_model, embedding_dataloader, epochs=20)

# Initialize and train the LSTM model with embeddings
lstm_embedding_model = LSTMWithEmbedding(vocab_size, embedding_dim, hidden_size, output_size)
lstm_embedding_model = train_model(lstm_embedding_model, embedding_dataloader, epochs=20)


In [None]:
def generate_text_with_embedding(model, seed_text, max_length=100, temperature=1.0):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()
    
    words = seed_text.lower().split()
    current_words = words[-seq_length:] if len(words) > seq_length else words
    
    # Pad if necessary
    if len(current_words) < seq_length:
        current_words = ['the'] * (seq_length - len(current_words)) + current_words
    
    # Convert to indices
    current_indices = [word_to_idx.get(word, 0) for word in current_words]
    
    generated_words = list(current_words)
    
    with torch.no_grad():
        for _ in range(max_length):
            # Convert to tensor
            x = torch.LongTensor(current_indices).unsqueeze(0).to(device)
            
            # Forward pass
            output, _ = model(x)
            
            # Apply temperature
            output = output.div(temperature)
            
            # Sample from the output distribution
            probs = torch.softmax(output, dim=1).cpu().data.numpy().ravel()
            idx = np.random.choice(len(probs), p=probs)
            
            # Get the predicted word
            word = idx_to_word[idx]
            generated_words.append(word)
            
            # Update current indices
            current_indices = current_indices[1:] + [idx]
    
    return ' '.join(generated_words)

# Generate text with RNN + Embedding
seed_text = "the rose is red"
generated_text_rnn_emb = generate_text_with_embedding(rnn_embedding_model, seed_text, max_length=50)
print("RNN + Embedding Generated Text:")
print(generated_text_rnn_emb)

# Generate text with LSTM + Embedding
generated_text_lstm_emb = generate_text_with_embedding(lstm_embedding_model, seed_text, max_length=50)
print("\nLSTM + Embedding Generated Text:")
print(generated_text_lstm_emb)


In [None]:
import matplotlib.pyplot as plt
import time

# Function to measure training time and loss
def measure_performance(model_class, dataloader, epochs=10):
    start_time = time.time()
    losses = []
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model_class.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, targets in dataloader:
            sequences = sequences.to(device)
            targets = targets.squeeze().to(device)
            
            # Forward pass
            outputs, _ = model(sequences)
            loss = criterion(outputs, targets)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        avg_loss = total_loss / len(dataloader)
        losses.append(avg_loss)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')
    
    training_time = time.time() - start_time
    return model, losses, training_time

# Compare training time and loss
models = {
    'RNN (One-Hot)': rnn_model,
    'LSTM (One-Hot)': lstm_model,
    'RNN (Embedding)': rnn_embedding_model,
    'LSTM (Embedding)': lstm_embedding_model
}

# Plot loss comparison
plt.figure(figsize=(10, 6))
for model_name, model in models.items():
    plt.plot(range(1, 21), model.losses, label=model_name)

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Comparison')
plt.legend()
plt.grid(True)
plt.show()

# Compare generated text quality
seed_texts = ["the rose is red", "i love the way", "she walks in beauty"]
for seed in seed_texts:
    print(f"\nSeed: '{seed}'")
    for model_name, model in models.items():
        if 'Embedding' in model_name:
            generated = generate_text_with_embedding(model, seed, max_length=30)
        else:
            generated = generate_text(model, seed, max_length=30)
        print(f"{model_name}: {generated}")
