In [1]:
import os
import re
import requests

# Define authors and their corresponding Gutenberg URLs
authors_and_books = {
    "Walpole": [
        ("The Castle of Otranto", "https://www.gutenberg.org/files/696/696-0.txt")
    ],
    "Radcliffe": [
        ("The Mysteries of Udolpho", "https://www.gutenberg.org/files/3268/3268-0.txt"),
        ("The Italian", "https://www.gutenberg.org/files/7881/7881-0.txt")
    ],
    "Lewis": [
        ("The Monk", "https://www.gutenberg.org/files/601/601-0.txt")
    ],
    "Shelley": [
        ("Frankenstein", "https://www.gutenberg.org/files/84/84-0.txt")
    ],
    "Stoker": [
        ("Dracula", "https://www.gutenberg.org/files/345/345-0.txt"),
        ("Dracula's Guest", "https://www.gutenberg.org/files/564/564-0.txt")
    ],
    "E_Bronte": [
        ("Wuthering Heights", "https://www.gutenberg.org/files/768/768-0.txt")
    ],
    "C_Bronte": [
        ("Jane Eyre", "https://www.gutenberg.org/files/1260/1260-0.txt")
    ],
    "Maturin": [
        ("Melmoth the Wanderer", "https://www.gutenberg.org/files/15859/15859-0.txt")
    ],
    "Poe": [
        ("The Works of Edgar Allan Poe", "https://www.gutenberg.org/files/2147/2147-0.txt")
    ],
    "Hawthorne": [
        ("The House of the Seven Gables", "https://www.gutenberg.org/files/77/77-0.txt")
    ],
    "Le_Fanu": [
        ("Uncle Silas", "https://www.gutenberg.org/files/14851/14851-0.txt"),
        ("Carmilla", "https://www.gutenberg.org/files/10007/10007-0.txt")
    ]
}

base_path = "./gothic_novels"

# Create directories for each author
for author in authors_and_books.keys():
    author_path = os.path.join(base_path, author)
    os.makedirs(author_path, exist_ok=True)

# Download text files using requests
for author, books in authors_and_books.items():
    for book_title, book_url in books:
        book_path = os.path.join(base_path, author, f"{book_title}.txt")
        if not os.path.exists(book_path):
            response = requests.get(book_url)
            if response.status_code == 200:
                with open(book_path, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print(f"Downloaded {book_title} for author {author}")
            else:
                print(f"Failed to download {book_title} for author {author}")
        else:
            print(f"{book_title} already downloaded for author {author}")

print("Download complete!")

# Verify that directories and files exist
for author in authors_and_books.keys():
    author_path = os.path.join(base_path, author)
    if not os.path.exists(author_path):
        print(f"Directory does not exist: {author_path}")
    else:
        files = os.listdir(author_path)
        if not files:
            print(f"No files found in directory: {author_path}")
        else:
            print(f"Found {len(files)} files in directory: {author_path}")

# Function to clean text
def clean_text(text):
    # Remove Gutenberg headers/footers
    start_pattern = r'\*\*\* START OF [^\*]* \*\*\*'
    end_pattern = r'\*\*\* END OF [^\*]* \*\*\*'
    start_match = re.search(start_pattern, text)
    end_match = re.search(end_pattern, text)
    if start_match and end_match:
        text = text[start_match.end():end_match.start()]
    # Additional cleaning if necessary
    text = re.sub(r'\s+', ' ', text)  # Remove excessive whitespace
    text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ascii characters
    text = text.strip()
    return text

# Aggregate texts
all_texts = []
for author in authors_and_books.keys():
    author_path = os.path.join(base_path, author)
    for filename in os.listdir(author_path):
        if filename.endswith(".txt"):
            with open(os.path.join(author_path, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                cleaned_text = clean_text(text)
                all_texts.append(cleaned_text)

# Combine texts into a single file
combined_texts = "\n\n".join(all_texts)
with open("gothic_novels_combined.txt", 'w', encoding='utf-8') as outfile:
    outfile.write(combined_texts)

print("Gothic novels dataset created successfully!")


The Castle of Otranto already downloaded for author Walpole
The Mysteries of Udolpho already downloaded for author Radcliffe
Failed to download The Italian for author Radcliffe
The Monk already downloaded for author Lewis
Frankenstein already downloaded for author Shelley
Dracula already downloaded for author Stoker
Dracula's Guest already downloaded for author Stoker
Wuthering Heights already downloaded for author E_Bronte
Jane Eyre already downloaded for author C_Bronte
Melmoth the Wanderer already downloaded for author Maturin
The Works of Edgar Allan Poe already downloaded for author Poe
The House of the Seven Gables already downloaded for author Hawthorne
Uncle Silas already downloaded for author Le_Fanu
Carmilla already downloaded for author Le_Fanu
Download complete!
Found 1 files in directory: ./gothic_novels/Walpole
Found 1 files in directory: ./gothic_novels/Radcliffe
Found 1 files in directory: ./gothic_novels/Lewis
Found 1 files in directory: ./gothic_novels/Shelley
Found 2

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig
import gc

# Function to free GPU memory
def free_gpu_memory():
    torch.cuda.empty_cache()
    gc.collect()

# Free GPU memory before starting
free_gpu_memory()

# Check if CUDA is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load model and tokenizer
model_name = 'gpt2-medium'  # or your chosen model
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Load and prepare the dataset
with open('gothic_novels_combined.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Split the text into smaller chunks if needed
texts = [data[i:i + 2048] for i in range(0, len(data), 2048)]

# Create a dataset from the text chunks
dataset = Dataset.from_dict({'text': texts})

def tokenize_function(examples):
    tokens = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Split the dataset into train and validation sets
split_datasets = tokenized_datasets.train_test_split(test_size=0.1)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Prepare LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.1,
)
model = get_peft_model(model, lora_config).to(device)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Corrected to evaluation_strategy
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    no_cuda=not torch.cuda.is_available(),  # Ensure correct device usage
)

# Data Collator to handle padding
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Fine-tuning
trainer.train()

# Save the model
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Free GPU memory after training
free_gpu_memory()


Using device: cuda


Map:   0%|          | 0/4255 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model_path = './fine-tuned-model'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Ensure the model is in evaluation mode
model.eval()

# Function to generate text
def generate_text(prompt, max_length=100, num_return_sequences=1):
    # Tokenize the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids, 
            max_length=max_length, 
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode the generated text
    generated_texts = [tokenizer.decode(output_seq, skip_special_tokens=True) for output_seq in output]
    
    return generated_texts

# Example usage
prompt = "Once upon a midnight dreary"
generated_texts = generate_text(prompt, max_length=200, num_return_sequences=3)

# Print the generated texts
for i, text in enumerate(generated_texts):
    print(f"Generated Text {i+1}:\n{text}\n")
