In [1]:
import nltk
import matplotlib.pyplot as plt
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
import numpy as np

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [2]:
def read_file(path):
  # Read file into an array of strings
  lines = []
  with open(path, encoding="utf8") as file:
      lines = file.readlines()
  # Remove newline characters from each line
  lines = [line.strip() for line in lines]
  return lines

In [3]:
from google.colab import drive
drive.mount('/content/drive')
data_dir = "/content/drive/My Drive/The Tale of Genji (Royall Tyler Translation)(Cleaned - With Titles).txt"

lines = read_file(data_dir)
lines[:5]

Mounted at /content/drive


['1',
 '',
 'KIRITSUBO',
 '',
 "In a certain reign (whose can it have been?) someone of no very great rank, among all His Majesty's Consorts and Intimates, enjoyed exceptional favor. Those others who had always assumed that pride of place was properly theirs despised her as a dreadful woman, while the lesser Intimates were unhappier still. The way she waited on him day after day only stirred up feeling against her, and perhaps this growing burden of resentment was what affected her health and obliged her often to withdraw in misery to her home; but His Majesty, who could less and less do without her, ignored his critics until his behavior seemed bound to be the talk of all."]

In [4]:
def split_into_chapters(lines):
    chapters = []
    current_chapter = []
    skip_next_line = False

    for line in lines:
        if line.strip().isdigit():
            # Start of a new chapter
            if current_chapter:
                chapters.append(current_chapter)
                current_chapter = []
            skip_next_line = True  # Skip the next line (chapter title)
        elif skip_next_line:
            # Skip this line (chapter title) and continue to the next line
            skip_next_line = False
            continue
        else:
            current_chapter.append(line)

    if current_chapter:
        chapters.append(current_chapter)

    return chapters

# Read file and split into chapters
lines = read_file(data_dir)
chapters = split_into_chapters(lines)

# Example: print the first few lines of the first chapter's content
print(chapters[0][:5])


['KIRITSUBO', '', "In a certain reign (whose can it have been?) someone of no very great rank, among all His Majesty's Consorts and Intimates, enjoyed exceptional favor. Those others who had always assumed that pride of place was properly theirs despised her as a dreadful woman, while the lesser Intimates were unhappier still. The way she waited on him day after day only stirred up feeling against her, and perhaps this growing burden of resentment was what affected her health and obliged her often to withdraw in misery to her home; but His Majesty, who could less and less do without her, ignored his critics until his behavior seemed bound to be the talk of all.", '', 'From this sad spectacle the senior nobles and privy gentlemen could only avert their eyes. Such things had led to disorder and ruin even in China, they said, and as discontent spread through the realm, the example of Yōkihi came more and more to mind, with many a painful consequence for the lady herself; yet she trusted i

In [5]:
!pip install keras_preprocessing


Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
import numpy as np
from tqdm import tqdm  # Progress bar

# Assuming 'chapters' is a list of text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(chapters)
sequences = []

for chapter in chapters:
    encoded = tokenizer.texts_to_sequences([chapter])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)

max_sequence_len = max([len(x) for x in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')

# Preparing data
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
y = torch.tensor(y, dtype=torch.long)  # Ensure y is of type Long
vocab_size = len(tokenizer.word_index) + 1

# PyTorch dataset
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = TextDataset(X, y)
train_loader = DataLoader(dataset, batch_size=256, shuffle=True)

# PyTorch LSTM model with dropout
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        output, (hidden, cell) = self.lstm(x)
        out = self.fc(output[:, -1, :])
        return out

model = LSTMModel(vocab_size, embedding_dim=50, hidden_dim=100)
# model = model.to('cuda')  # Move model to GPU

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Training loop with progress bar and detailed logging
# for epoch in range(100): # would be 100
#     total_loss = 0
#     progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{100}', leave=False)

#     for inputs, targets in progress_bar:
#         # inputs, targets = inputs.to('cuda'), targets.to('cuda')  # Move data to GPU

#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()
#         progress_bar.set_postfix(loss=total_loss / len(train_loader))

#     print(f'Epoch {epoch + 1}, Average Loss: {total_loss / len(train_loader)}')
# Reduced number of epochs and dataset size
num_epochs = 10
subset_size = int(0.1 * len(dataset))  # 10% of the dataset
subset_indices = np.random.choice(range(len(dataset)), subset_size, replace=False)
subset = torch.utils.data.Subset(dataset, subset_indices)
train_loader = DataLoader(subset, batch_size=256, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f'Epoch {epoch + 1}, Average Loss: {total_loss / len(train_loader)}')




Epoch 1, Average Loss: 8.832297597612653




Epoch 2, Average Loss: 8.26039184842791




Epoch 3, Average Loss: 7.047681263514927




Epoch 4, Average Loss: 5.759328842163086




Epoch 5, Average Loss: 4.741016796657017




Epoch 6, Average Loss: 4.298915454319546




Epoch 7, Average Loss: 4.190052645547049




Epoch 8, Average Loss: 4.024015154157366




Epoch 9, Average Loss: 3.946128708975656


                                                          

Epoch 10, Average Loss: 3.8000244072505405




In [7]:
import torch.nn.functional as F

def generate_text(input_text, num_generated_words, model, tokenizer, max_sequence_len, temperature=1.0):
    model.eval()
    for _ in range(num_generated_words):
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        input_tensor = torch.tensor(token_list, dtype=torch.long)

        with torch.no_grad():
            predictions = model(input_tensor)
            predictions = F.softmax(predictions / temperature, dim=-1)  # Apply temperature scaling
            predicted_index = torch.multinomial(predictions, 1)[-1, 0].item()  # Sample from the distribution

        output_word = tokenizer.index_word[predicted_index] if predicted_index in tokenizer.index_word else ''
        input_text += " " + output_word if output_word else ''

    return input_text.strip()


In [28]:
input_text = "In a certain reign (whose can it have been?) someone of no very great rank, among all His Majesty's Consorts and Intimates, enjoyed exceptional favor."
num_generated_words = 5
temperature = 15.0  # Adjust this to see different outputs
generated_text = generate_text(input_text, num_generated_words, model, tokenizer, max_sequence_len, temperature)
print(generated_text)


In a certain reign (whose can it have been?) someone of no very great rank, among all His Majesty's Consorts and Intimates, enjoyed exceptional favor. “for some years now i have inhabited the same world as you and yet felt somehow quite different from before, which is why i have neither written, except as necessary, nor sought to learn your news. letters in kana take me time to read, and moments spent otherwise than calling the name are moments lost. that is why i have sent you nothing. i gather that your daughter is now with the heir apparent and that she has borne him a son. that is a very great joy. i say that because although i am only a mountain ascetic and desire no worldly glory, i must confess that i have for many years thought of nothing but you, even during my day and night devotions, and that my prayers have been for you, to the neglect of any longing of mine for the dew on the lotus. these autumn leaves from my home, carried to you on the wind.” “o seer who roams the vastne

In [29]:
import string

def preprocess_text(text):
    """
    Preprocess the text by lowering the case and removing punctuation.
    """
    text = text.lower()
    return text.translate(str.maketrans('', '', string.punctuation))

def check_repetition(generated_text, original_texts):
    """
    Check if any sentence in the generated text exists in the original texts.

    :param generated_text: The text generated by the model.
    :param original_texts: List of original texts (each text can be a string or a list of strings).
    :return: A repeated sentence if found, else None.
    """
    # Preprocess the generated text
    generated_sentences = [preprocess_text(sentence) for sentence in generated_text.split('. ')]

    # Preprocess and split the original texts into sentences
    original_sentences = set()
    for text in original_texts:
        # Join the text if it's a list of words/sentences
        if isinstance(text, list):
            text = ' '.join(text)

        for sentence in text.split('. '):
            original_sentences.add(preprocess_text(sentence))

    # Check for repetition
    for sentence in generated_sentences:
        if sentence in original_sentences:
            return sentence  # Return the first repeated sentence found

    return None  # No repetition found

# Example usage
repeated_sentence = check_repetition(generated_text, chapters)
if repeated_sentence:
    print(f"Repeated sentence found: {repeated_sentence}")
else:
    print("No repetition found in the generated text.")


Repeated sentence found: letters in kana take me time to read and moments spent otherwise than calling the name are moments lost


In [30]:
print(f"repeated_text_overall_percent: {len(repeated_sentence) / len(generated_text)} %")

repeated_text_over_generated_text: 0.0682571239231279 %
