In [None]:
!pip install wikipedia spacy --quiet

In [None]:
import copy
import gc
import logging
import os
import random
import re
import unicodedata
import zipfile
from collections import Counter
from collections import defaultdict
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import torch
import torch.nn as nn
import wikipedia
from datasets import load_dataset, Dataset as HFDataset
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from tqdm.notebook import tqdm
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    AutoModelForSequenceClassification,
    DataCollatorForSeq2Seq,
    GenerationConfig
)

In [None]:
# Configure the root logger at INFO level with a timestamped format, then
# explicitly install the same format on the first root handler.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger()
_first_handler = logger.handlers[0]
_first_handler.setFormatter(
    logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)

In [None]:
# Fetch the NLTK resources used later by sent_tokenize / word_tokenize and
# stopwords.words('english').
nltk.download('punkt', quiet=True)
# Recent NLTK releases read the sentence-tokenizer data from 'punkt_tab'
# rather than 'punkt'; request both so the tokenizers work on either version.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# DATASET COLLECTION AND CURATION

In [None]:
def clean_text(text):
    """Normalize a raw document into a single line of ASCII text.

    Content is removed first (markup tags, URL-like tokens, non-ASCII runs,
    ``@`` characters) and whitespace/punctuation spacing is normalized last,
    so that removals cannot leave double, leading, or trailing spaces behind.

    Args:
        text: Raw text. Non-string or blank input yields an empty string.

    Returns:
        str: The cleaned text, or ``""`` when there is nothing to clean.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    text = unicodedata.normalize("NFKC", text)

    # --- removals ---------------------------------------------------------
    text = re.sub(r'\\"', '"', text)                       # backslash-escaped quotes
    text = re.sub(r"<.*?>", "", text)                      # <...> tags
    text = re.sub(r"http\S+|www\S+|ftp\S+", "", text)      # URL-like tokens
    text = re.sub(r"[^\x00-\x7F]+", " ", text)             # non-ASCII runs -> space
    text = re.sub(r"@+", "", text)                         # e.g. the '@' in "@-@"

    # --- spacing (must run after the removals above) -----------------------
    text = re.sub(r"\s*\n+\s*", " ", text)                 # newlines -> single space
    text = re.sub(r"\s{2,}", " ", text).strip()            # collapse whitespace runs
    text = re.sub(r"\s+([?.!,])", r"\1", text)             # no space before punctuation
    text = re.sub(r"([?.!,])\s+", r"\1 ", text)            # one space after punctuation
    text = re.sub(r"(\w+) '(\w+)", r"\1'\2", text)         # "don 't" -> "don't"

    return text.strip()

def filter_by_length(text, min_length=50, max_length=5000):
    """Return True when ``text`` has between ``min_length`` and ``max_length`` characters.

    Both bounds are inclusive. Non-string values (for example NaN produced by
    pandas for missing cells) are rejected instead of raising.

    Args:
        text: Candidate text.
        min_length: Smallest accepted character count.
        max_length: Largest accepted character count.

    Returns:
        bool: Whether the text should be kept.
    """
    if not isinstance(text, str):
        return False
    return min_length <= len(text) <= max_length

In [None]:
def collect_diverse_datasets(book_sample_size=500000, seed=42):
    """Download five Hugging Face corpora and wrap each as a text/source DataFrame.

    Args:
        book_sample_size: Maximum number of BookCorpus rows to keep. When the
            corpus is larger it is shuffled and truncated to this size;
            otherwise every row is used.
        seed: Shuffle seed used when BookCorpus is sub-sampled.

    Returns:
        dict: Maps 'news', 'blogs', 'products', 'academic' and 'books' to a
        DataFrame with columns ``text`` and ``source``.
    """
    def _as_frame(texts, source):
        # One row per text, all tagged with the same source label.
        return pd.DataFrame({'text': texts, 'source': [source] * len(texts)})

    datasets = {}

    # 1. News Articles (CNN/DailyMail)
    print("Loading news articles dataset (CNN/DailyMail)...")
    news = load_dataset("cnn_dailymail", "3.0.0", split="train")
    news_df = _as_frame(news['article'], 'news')
    datasets['news'] = news_df
    print(f"Loaded {len(news_df)} news articles")

    # 2. Blog Posts (WikiText)
    print("Loading blog-like content (WikiText)...")
    wiki = load_dataset("wikitext", "wikitext-103-v1", split="train")
    # Keep only longer body lines; lines starting with '=' are skipped.
    wiki_texts = [text for text in wiki['text'] if len(text) > 200 and not text.startswith('=')]
    wiki_df = _as_frame(wiki_texts, 'blog')
    datasets['blogs'] = wiki_df
    print(f"Loaded {len(wiki_df)} blog-like entries from WikiText")

    # 3. Product Descriptions (Yelp Reviews)
    print("Loading product-related content (Yelp Reviews)...")
    yelp = load_dataset("yelp_review_full", split="train")
    yelp_df = _as_frame(yelp['text'], 'product')
    datasets['products'] = yelp_df
    print(f"Loaded {len(yelp_df)} product/business reviews")

    # 4. Academic Content (GLUE QNLI)
    print("Loading academic dataset (GLUE QNLI)...")
    qnli = load_dataset("glue", "qnli", split="train")
    qnli_df = _as_frame(qnli['sentence'], 'academic')
    datasets['academic'] = qnli_df
    print(f"Loaded {len(qnli_df)} academic sentences from QNLI")

    # 5. Books (BookCorpus)
    print("Loading book dataset (BookCorpus)...")
    books = load_dataset("bookcorpus", split="train", trust_remote_code=True)
    if len(books) > book_sample_size:
        books = books.shuffle(seed=seed).select(range(book_sample_size))
    # Defined on both paths: previously this name only existed when the corpus
    # was larger than the sample size, so a smaller corpus raised NameError.
    book_texts = books['text']

    book_df = _as_frame(book_texts, 'books')
    datasets['books'] = book_df
    print(f"Loaded {len(book_df)} book passages from BookCorpus (Sampled)")

    return datasets

In [None]:
def curate_dataset(datasets):
    """Merge the per-source frames, clean the text and drop out-of-range rows.

    Args:
        datasets: Mapping whose values are DataFrames with ``text`` and
            ``source`` columns.

    Returns:
        DataFrame with columns ``source`` and ``clean_text`` containing only
        rows accepted by ``filter_by_length``.
    """
    print("Combining datasets...")
    frames = datasets.values()
    merged = pd.concat(frames, ignore_index=True)
    print(f"Raw combined dataset size: {len(merged)} documents")

    print("Cleaning text...")
    tqdm.pandas(desc="Cleaning texts")  # enables Series.progress_apply
    merged['clean_text'] = merged['text'].progress_apply(clean_text)

    print("Filtering by length...")
    keep_mask = merged['clean_text'].apply(filter_by_length)
    merged = merged[keep_mask]

    print("Keeping only essential columns (source and clean_text)...")
    merged = merged[['source', 'clean_text']]

    print(f"Final dataset size: {len(merged)} documents")
    print("Distribution by source:")
    per_source = merged['source'].value_counts()
    for source, count in per_source.items():
        print(f"  {source}: {count} documents")

    return merged

In [None]:
def save_dataset(df, output_path="dataset.csv"):
    """Write ``df`` to ``output_path`` as CSV (index column omitted) and report the path."""
    df.to_csv(path_or_buf=output_path, index=False)
    print(f"Dataset saved to {output_path}")

In [None]:
def collect_and_process_data():
    """Run collection, curation and saving end to end.

    The curated frame is written with ``save_dataset``'s default path.

    Returns:
        The curated DataFrame produced by ``curate_dataset``.
    """
    print("Starting dataset collection...")
    raw_by_source = collect_diverse_datasets()

    print("Curating datasets...")
    curated = curate_dataset(raw_by_source)
    save_dataset(curated)

    print("Dataset collection and curation complete!")
    return curated

In [None]:
# Download every source corpus; the result maps a source key to its DataFrame.
datasets = collect_diverse_datasets()

In [None]:
# Combine, clean and length-filter the collected frames.
curated_dataset = curate_dataset(datasets)

In [None]:
# Save under the file name that the training cells below read back
# ("my_dataset.csv"); the previous name was never loaded anywhere.
save_dataset(curated_dataset, "my_dataset.csv")

# MODEL FINE-TUNING AND TRAINING (GPT2)

In [None]:
# Reload the curated CSV and preview the first rows; ending the cell with the
# expression gives the notebook's rich table rendering instead of plain text.
df = pd.read_csv("my_dataset.csv")
df.head()

In [None]:
SEED = 42
# Seed every random number generator this notebook imports, not only torch,
# so code relying on `random` or numpy is repeatable as well.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Configuration

# -- model / output --
MODEL_NAME = "gpt2"
SAVE_DIRECTORY = "./creative_content_model"

# -- data --
MAX_LENGTH = 512          # token limit passed to the tokenizer
TRAIN_SIZE = 0.9          # fraction of texts used for training

# -- optimisation --
EPOCHS = 3
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 2
LEARNING_RATE = 3e-5
FP16_TRAINING = torch.cuda.is_available()   # mixed precision only when CUDA is present

In [None]:
class CreativeContentDataset(Dataset):
    """Eagerly tokenized causal-LM dataset.

    Every text is tokenized once in ``__init__`` (padded/truncated to
    ``max_length``); ``__getitem__`` then serves 1-D tensors.

    Args:
        texts: Iterable of strings.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors="pt"``.
        max_length: Sequence length used for padding and truncation.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attention_masks = []
        for text in tqdm(texts, desc="Tokenizing texts"):
            encodings = tokenizer(
                text,
                add_special_tokens=True,
                max_length=max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            self.input_ids.append(encodings["input_ids"])
            self.attention_masks.append(encodings["attention_mask"])

    def __len__(self):
        """Number of tokenized texts."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one text.

        ``labels`` is a copy of ``input_ids`` in which padded positions
        (attention mask 0) are set to -100 so they are ignored by the loss.
        """
        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also drop the sequence axis when max_length == 1.
        input_ids = self.input_ids[idx].squeeze(0)
        attention_mask = self.attention_masks[idx].squeeze(0)

        # Clone so that labels never alias input_ids, then mask the padding.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [None]:
def load_and_prepare_data(file_path, sample_size=None):
    """Load the curated CSV and split its texts into train and validation lists.

    Args:
        file_path: CSV with at least ``source`` and ``clean_text`` columns.
        sample_size: Optional cap on the number of rows; when the file has
            more rows, a random sample of this size (seeded with ``SEED``)
            is used.

    Returns:
        tuple: ``(train_texts, val_texts)`` lists of strings, split with
        ``TRAIN_SIZE`` and ``SEED``.
    """
    logger.info(f"Loading data from {file_path}")
    df = pd.read_csv(file_path)

    # Empty cells come back from CSV as NaN (a float); drop them so that only
    # strings are handed to the tokenizer.
    missing_text = df['clean_text'].isna()
    if missing_text.any():
        logger.warning(f"Dropping {int(missing_text.sum())} rows with missing clean_text")
        df = df[~missing_text]

    if sample_size and len(df) > sample_size:
        df = df.sample(sample_size, random_state=SEED)

    logger.info(f"Dataset size: {len(df)} documents")

    source_types = df['source'].unique()
    logger.info(f"Source types: {source_types}")

    texts = df['clean_text'].tolist()

    train_texts, val_texts = train_test_split(texts, train_size=TRAIN_SIZE, random_state=SEED)

    logger.info(f"Training samples: {len(train_texts)}")
    logger.info(f"Validation samples: {len(val_texts)}")

    return train_texts, val_texts

In [None]:
# Release Python garbage and, when CUDA is present, cached GPU memory
# before training starts.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
def train_model(data_path="my_dataset.csv", sample_size=100000):
    """Fine-tune ``MODEL_NAME`` as a causal language model on the curated texts.

    Args:
        data_path: CSV passed to ``load_and_prepare_data``.
        sample_size: Maximum number of rows used from that CSV.

    Returns:
        tuple: ``(model, tokenizer)`` after training. The model and tokenizer
        are also saved to ``SAVE_DIRECTORY``.
    """
    logger.info(f"Using model: {MODEL_NAME}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token  # a pad token is required for batching
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    train_texts, val_texts = load_and_prepare_data(data_path, sample_size=sample_size)

    train_dataset = HFDataset.from_dict({"text": train_texts})
    val_dataset = HFDataset.from_dict({"text": val_texts})

    def tokenize_function(examples):
        # Truncate only. Padding is left to the data collator, which pads each
        # batch to its own longest sequence; padding everything to MAX_LENGTH
        # here made every example the same length and so made
        # group_by_length below pointless.
        return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)

    train_dataset = train_dataset.map(
        tokenize_function,
        batched=True,
        desc="Tokenizing training data",
        remove_columns=["text"],
        num_proc=4
    )
    val_dataset = val_dataset.map(
        tokenize_function,
        batched=True,
        desc="Tokenizing validation data",
        remove_columns=["text"],
        num_proc=4
    )

    # mlm=False selects causal-LM collation.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    training_args = TrainingArguments(
        output_dir=SAVE_DIRECTORY,
        overwrite_output_dir=True,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=EPOCHS,
        learning_rate=LEARNING_RATE,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        logging_dir="./logs",
        logging_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        fp16=FP16_TRAINING,
        dataloader_num_workers=4,
        group_by_length=True,
        report_to="tensorboard"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Attach the tokenizer to the trainer after construction.
    trainer.processing_class = tokenizer

    logger.info("Starting training...")
    trainer.train()

    logger.info(f"Saving model to {SAVE_DIRECTORY}")
    trainer.save_model(SAVE_DIRECTORY)
    tokenizer.save_pretrained(SAVE_DIRECTORY)
    return model, tokenizer

In [None]:
# Pick the compute device once and report the choice.
_cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_ready else "cpu")
if _cuda_ready:
    logger.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(device)
else:
    logger.info("No GPU available, using CPU")
    print("cpu")

In [None]:
# Run fine-tuning; keeps the trained model and its tokenizer for later cells.
model, tokenizer = train_model()

In [None]:
def download_model_to_local(model_directory="./creative_content_model", 
                            output_zip="model_download.zip"):
    """
    Prepare model files for download by zipping them

    Archive entries are stored relative to the parent of ``model_directory``,
    so they all start with the model folder's own name.

    Parameters:
    model_directory (str): Path to the saved model directory
    output_zip (str): Filename for the zip file to create

    Returns:
    str: ``output_zip``, unchanged.

    Raises:
    FileNotFoundError: if ``model_directory`` does not exist.
    """
    if not os.path.exists(model_directory):
        raise FileNotFoundError(f"Model directory {model_directory} not found")

    print(f"Compressing model files from {model_directory}...")

    # Normalise first: with a trailing separator, dirname() would return the
    # model directory itself and the top-level folder name would be lost from
    # every archive entry.
    source_dir = os.path.normpath(model_directory)
    archive_root = os.path.dirname(source_dir) or os.curdir
    archive_abspath = os.path.abspath(output_zip)

    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                # The archive being written may live inside the model
                # directory; never add it to itself.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                arcname = os.path.relpath(file_path, archive_root)
                zipf.write(file_path, arcname)

    print(f"Model compressed successfully to {output_zip}")
    print("You can now download this file from the 'Output' tab in Kaggle")

    return output_zip

In [None]:
# Zip the saved model directory using the function's default paths.
zip_path = download_model_to_local()

# GENERATE CREATIVE CONTENT

In [None]:
# Import dataset and model from saved kaggle inputs
model_path = "/kaggle/input/gpt2-model/pytorch/default/1/creative_content_model"

df = pd.read_csv("/kaggle/input/curated-dataset/my_dataset.csv")
print("Dataset loaded.")

# Restore the fine-tuned weights and the matching tokenizer from the same folder.
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Run on the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

print(f"Model loaded and moved to {device}")

In [None]:
def generate_creative_content(model, tokenizer, prompt, max_length=200, num_return_sequences=3, temperature=0.9):
    """Sample continuations of ``prompt`` and return them as cleaned strings.

    Args:
        model: Causal LM exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text to continue.
        max_length: ``max_length`` forwarded to ``generate``.
        num_return_sequences: Number of samples drawn.
        temperature: Sampling temperature.

    Returns:
        list[str]: One post-processed text per sampled sequence. Each text has
        "@-@" removed, characters outside a small allow-list stripped,
        whitespace collapsed, and is trimmed by ``ensure_complete_sentences``.
    """
    model.eval()

    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    prompt_ids = encoded.input_ids.to(model.device)
    prompt_mask = encoded.attention_mask.to(model.device)

    # Fall back to the EOS id when the tokenizer defines no pad token.
    if tokenizer.pad_token_id is not None:
        pad_id = tokenizer.pad_token_id
    else:
        pad_id = tokenizer.eos_token_id

    sampling_options = dict(
        attention_mask=prompt_mask,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        temperature=temperature,
        top_k=50,
        top_p=0.93,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_id,
        early_stopping=False,
        no_repeat_ngram_size=3,
        repetition_penalty=1.3,
    )
    sequences = model.generate(prompt_ids, **sampling_options)

    def _tidy(token_ids):
        # Decode one sequence and apply the textual clean-up steps in order.
        decoded = tokenizer.decode(token_ids, skip_special_tokens=True)
        decoded = re.sub(r'@-@', '', decoded)
        decoded = re.sub(r'[^A-Za-z0-9\s\.\,\!\?\'\"\;\:\-]', '', decoded)
        decoded = re.sub(r'\s+', ' ', decoded).strip()
        return ensure_complete_sentences(decoded)

    return [_tidy(token_ids) for token_ids in sequences]

In [None]:
def ensure_complete_sentences(text):
    """Trim ``text`` so that it ends at its last '.', '!' or '?'.

    Text already ending in one of those characters, and text containing none
    of them, is returned unchanged.
    """
    terminators = ('.', '!', '?')
    if text.endswith(terminators):
        return text

    cut = max(text.rfind(mark) for mark in terminators)
    return text if cut == -1 else text[:cut + 1]

In [None]:
def sample_generations(model, tokenizer, prompts=None):
    """Generate and print samples for each prompt.

    Args:
        model: Model passed through to ``generate_creative_content``.
        tokenizer: Tokenizer passed through to ``generate_creative_content``.
        prompts: Optional list of prompt strings; three built-in prompts are
            used when omitted.

    Returns:
        list[tuple[str, list[str]]]: ``(prompt, generated_texts)`` pairs in
        prompt order. (These were previously collected and then discarded.)
    """
    if prompts is None:
        prompts = [
            "The future of artificial intelligence looks",
            "The most interesting aspect of creative writing is",
            "The latest research on climate change suggests"
        ]

    all_generated_texts = []

    for prompt in prompts:
        print(f"\nPrompt: {prompt}")
        generated_texts = generate_creative_content(model, tokenizer, prompt)
        all_generated_texts.append((prompt, generated_texts))

        for i, text in enumerate(generated_texts):
            print(f"\nGeneration {i+1}:\n{text}")
        print("\n" + "="*80)

    return all_generated_texts

In [None]:
# Print sample generations for the built-in default prompts.
sample_generations(model, tokenizer)

# CONTENT EVALUATION 

In [None]:
# English stop words as a set, used for membership tests in the evaluators below.
stop_words = set(stopwords.words('english'))

In [None]:
# Load the spaCy pipeline, downloading it first if loading raises OSError.
def _load_spacy_pipeline():
    return spacy.load("en_core_web_md")

try:
    nlp = _load_spacy_pipeline()
except OSError:
    print("Downloading spaCy model...")
    spacy.cli.download("en_core_web_md")
    nlp = _load_spacy_pipeline()

In [None]:
# Rebind `device` as a plain string ('cuda' or 'cpu') for the evaluation section.
device='cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Prompts used by the evaluation loop below; each one is both the generation
# prompt and the Wikipedia search query for reference texts.
prompts = [
    "The future of artificial intelligence looks",
    "The latest research on climate change suggests",
    "If time travel were possible, humanity would",
    "Deep beneath the ocean's surface, the greatest mystery awaiting explorers is",
    "The beauty of creative writing lies in"
]

In [None]:
def evaluate_fluency(text):
    """
    Evaluate fluency by measuring:
    1. Average sentence length
    2. Lexical diversity (unique words / total words)
    3. Grammatical correctness using a heuristic approach

    The grammatical heuristic counts a sentence as well-formed when it has a
    nominal subject and a verb. Auxiliaries count as verbs so that copular
    sentences ("She is a doctor.") are not scored as ungrammatical.

    Args:
        text (str): Text to evaluate

    Returns:
        dict: Dictionary of fluency metrics with keys 'avg_sent_length',
        'lexical_diversity', 'grammatical_score' and 'fluency_score'.
        All values are 0 for blank text.
    """
    if not text.strip():
        return {
            'avg_sent_length': 0,
            'lexical_diversity': 0,
            'grammatical_score': 0,
            'fluency_score': 0
        }

    sentences = sent_tokenize(text)
    all_words = word_tokenize(text.lower())
    words = [w for w in all_words if w.isalnum()]  # drop punctuation tokens

    avg_sent_length = len(words) / len(sentences) if sentences else 0
    lexical_diversity = len(set(words)) / len(words) if words else 0

    doc = nlp(text)
    # Materialise the sentence spans once instead of re-iterating doc.sents.
    doc_sentences = list(doc.sents)

    well_formed = 0
    for sent in doc_sentences:
        has_subj = any(token.dep_ in ('nsubj', 'nsubjpass') for token in sent)
        has_verb = any(token.pos_ in ('VERB', 'AUX') for token in sent)
        if has_subj and has_verb:
            well_formed += 1

    grammatical_score = well_formed / len(doc_sentences) if doc_sentences else 0

    # Sentence length contributes fully from 15 words per sentence upwards.
    fluency_score = (
        min(avg_sent_length / 15, 1) * 0.3 +
        lexical_diversity * 0.3 +
        grammatical_score * 0.4
    )

    return {
        'avg_sent_length': avg_sent_length,
        'lexical_diversity': lexical_diversity,
        'grammatical_score': grammatical_score,
        'fluency_score': fluency_score
    }

In [None]:
def evaluate_flexibility(text):
    """
    Evaluate flexibility by analyzing:
    1. Topic diversity
    2. Semantic range
    3. Concept switching

    Returns:
        dict: 'topic_diversity' (unique noun lemmas / noun lemmas),
        'semantic_range' (1 minus the mean pairwise cosine similarity between
        sentence vectors), 'concept_transitions' (share of consecutive
        sentence pairs sharing less than 30% of the previous sentence's
        noun/proper-noun lemmas) and the weighted 'flexibility_score'.
        All values are 0 for blank text.
    """
    if not text.strip():
        return {
            'topic_diversity': 0,
            'semantic_range': 0,
            'concept_transitions': 0,
            'flexibility_score': 0
        }

    doc = nlp(text)
    key_nouns = [token.lemma_ for token in doc if token.pos_ == 'NOUN' and token.text.lower() not in stop_words]
    topic_diversity = len(set(key_nouns)) / len(key_nouns) if key_nouns else 0

    sentences = list(doc.sents)
    sentence_count = len(sentences)
    if sentence_count >= 2:
        sent_embeddings = np.array([sent.vector for sent in sentences])
        similarities = cosine_similarity(sent_embeddings)
        # Average the off-diagonal entries directly. Subtracting the sentence
        # count from the total assumed every self-similarity is 1, which does
        # not hold for an all-zero sentence vector.
        off_diagonal = similarities[~np.eye(sentence_count, dtype=bool)]
        semantic_range = 1 - off_diagonal.mean()
    else:
        semantic_range = 0

    concept_transitions = 0
    prev_key_entities = set()
    for sent in sentences:
        sent_entities = set([token.lemma_ for token in sent 
                             if token.pos_ in ('NOUN', 'PROPN') and token.text.lower() not in stop_words])
        # A transition is counted when under 30% of the previous sentence's
        # entities reappear in this one.
        if prev_key_entities and (len(sent_entities.intersection(prev_key_entities)) / max(1, len(prev_key_entities)) < 0.3):
            concept_transitions += 1
        prev_key_entities = sent_entities
    concept_transitions = concept_transitions / (sentence_count - 1) if sentence_count > 1 else 0

    flexibility_score = (
        topic_diversity * 0.4 +
        semantic_range * 0.4 +
        concept_transitions * 0.2
    )

    return {
        'topic_diversity': topic_diversity,
        'semantic_range': semantic_range,
        'concept_transitions': concept_transitions,
        'flexibility_score': flexibility_score
    }

In [None]:
@lru_cache(maxsize=32)
def _reference_features(ref_text):
    """Parse one reference text and return ``(trigram_set, doc_vector)``.

    Results are cached per reference string so that the same reference is
    not re-parsed for every generated text scored against it.
    """
    ref_doc = nlp(ref_text)
    ref_tokens = [token.text.lower() for token in ref_doc if token.is_alpha]
    ref_trigrams = frozenset(
        ' '.join(ref_tokens[i:i+3]) for i in range(len(ref_tokens) - 2)
    )
    return ref_trigrams, ref_doc.vector


def evaluate_originality(text, reference_texts=None):
    """
    Alternative evaluation of originality by:
    1. Lexical novelty: proportion of unique trigrams in the text
    2. Phrase novelty: proportion of trigrams not found in the reference corpus
    3. Comparison to reference corpus via document vector similarity

    This function avoids using a rare word frequency measure.

    Args:
        text (str): Text to evaluate.
        reference_texts (list[str] | None): Reference documents; treated as
            empty when None.

    Returns:
        dict: 'lexical_novelty', 'phrase_novelty', 'reference_similarity'
        (highest cosine similarity to any reference, 0 without references)
        and the weighted 'originality_score'. Blank text scores 0 with
        'reference_similarity' set to 1.
    """
    if reference_texts is None:
        reference_texts = []

    if not text.strip():
        return {
            'lexical_novelty': 0,
            'phrase_novelty': 0,
            'reference_similarity': 1,
            'originality_score': 0
        }

    doc = nlp(text)
    tokens = [token.text.lower() for token in doc if token.is_alpha]

    trigrams = [' '.join(tokens[i:i+3]) for i in range(len(tokens) - 2)]
    lexical_novelty = len(set(trigrams)) / len(trigrams) if trigrams else 0

    # Each reference is parsed once (and cached); its trigrams go into a set
    # so the membership test below is constant-time rather than a list scan.
    reference_trigrams = set()
    reference_similarities = []
    doc_vector = doc.vector
    for ref_text in reference_texts:
        ref_trigrams, ref_vector = _reference_features(ref_text)
        reference_trigrams.update(ref_trigrams)
        similarity = cosine_similarity(
            doc_vector.reshape(1, -1),
            ref_vector.reshape(1, -1)
        )[0][0]
        reference_similarities.append(similarity)

    phrase_novelty = (sum(1 for tg in trigrams if tg not in reference_trigrams) / len(trigrams)) if trigrams else 0
    reference_similarity = max(reference_similarities) if reference_similarities else 0

    originality_score = (
        lexical_novelty * 0.4 +
        phrase_novelty * 0.3 +
        (1 - reference_similarity) * 0.3
    )

    return {
        'lexical_novelty': lexical_novelty,
        'phrase_novelty': phrase_novelty,
        'reference_similarity': reference_similarity,
        'originality_score': originality_score
    }

In [None]:
def get_reference_texts(query, max_pages=3):
    """Fetch the content of the top Wikipedia search results for ``query``.

    Retrieval is best-effort: a failed search yields an empty list and a page
    that cannot be fetched is reported and skipped.

    Args:
        query (str): Search string.
        max_pages (int): Number of leading search results to try.

    Returns:
        list[str]: Page contents for the results that could be retrieved.
    """
    try:
        search_results = wikipedia.search(query)
    except Exception as e:
        print(f"Wikipedia search failed for {query}: {e}")
        return []

    reference_texts = []
    for title in search_results[:max_pages]:
        try:
            page = wikipedia.page(title)
            reference_texts.append(page.content)
            print(f"Retrieved content for page: {title}")
        except Exception as e:
            print(f"Could not retrieve page for {title}: {e}")
    return reference_texts

In [None]:
def evaluate_elaboration(text):
    """
    Score how elaborated a text is from three signals:
    1. Detail density: share of tokens that are adjectives, adverbs or
       have the 'prep' dependency
    2. Descriptive richness: adjectives per noun, scaled so 0.5 maps to 1
    3. Explanation depth: explanation keywords plus 'advcl' dependencies
       per sentence, capped at 1

    Returns a dict with 'detail_density', 'descriptive_richness',
    'explanation_depth' and the weighted 'elaboration_score'; all values
    are 0 for blank text.
    """
    if not text.strip():
        return {
            'detail_density': 0,
            'descriptive_richness': 0,
            'explanation_depth': 0,
            'elaboration_score': 0
        }

    doc = nlp(text)

    # 1. Detail density
    token_total = len(doc)
    detail_total = sum(
        1 for token in doc
        if token.pos_ in ('ADJ', 'ADV') or token.dep_ == 'prep'
    )
    detail_density = detail_total / token_total if token_total else 0

    # 2. Descriptive richness
    noun_total = sum(1 for token in doc if token.pos_ in ('NOUN', 'PROPN'))
    adjective_total = sum(1 for token in doc if token.pos_ == 'ADJ')
    adjectives_per_noun = adjective_total / noun_total if noun_total else 0
    scaled_richness = min(adjectives_per_noun / 0.5, 1)

    # 3. Explanation depth: a token may count once as a keyword and once
    #    as an adverbial clause head.
    explanation_keywords = {'because', 'since', 'therefore', 'thus', 'consequently', 'due', 'hence'}
    explanation_total = sum(
        (token.text.lower() in explanation_keywords) + (token.dep_ == 'advcl')
        for token in doc
    )
    sentence_total = len(list(doc.sents))
    explanation_depth = explanation_total / sentence_total if sentence_total else 0
    scaled_explanation = min(explanation_depth, 1)

    elaboration_score = (
        detail_density * 0.4 +
        scaled_richness * 0.3 +
        scaled_explanation * 0.3
    )

    return {
        'detail_density': detail_density,
        'descriptive_richness': scaled_richness,
        'explanation_depth': scaled_explanation,
        'elaboration_score': elaboration_score
    }


In [None]:
def evaluate_all_dimensions(text, reference_texts=None):
    """Score ``text`` on all four dimensions and combine them.

    Args:
        text (str): Text to evaluate.
        reference_texts (list[str] | None): Passed to ``evaluate_originality``.

    Returns:
        dict: 'fluency', 'flexibility', 'originality', 'elaboration' and
        'creativity', the equally weighted (0.25 each) sum of the four.
    """
    fluency = evaluate_fluency(text)
    flexibility = evaluate_flexibility(text)
    originality = evaluate_originality(text, reference_texts)
    elaboration = evaluate_elaboration(text)

    creativity_score = (
        fluency['fluency_score'] * 0.25 +
        flexibility['flexibility_score'] * 0.25 +
        originality['originality_score'] * 0.25 +
        elaboration['elaboration_score'] * 0.25
    )

    return {
        'fluency': fluency['fluency_score'],
        'flexibility': flexibility['flexibility_score'],
        'originality': originality['originality_score'],
        'elaboration': elaboration['elaboration_score'],
        'creativity': creativity_score
    }

In [None]:
# Accumulates every score across all prompts, keyed by dimension name.
all_scores = defaultdict(list)

In [None]:
# Start from an empty accumulator so that re-running this cell does not
# append a second copy of every score to `all_scores`.
all_scores.clear()

for prompt in prompts:
    print(f"\nProcessing prompt: {prompt}")
    # The prompt doubles as the Wikipedia search query for reference texts.
    reference_texts = get_reference_texts(prompt)

    generated_texts = generate_creative_content(model, tokenizer, prompt, num_return_sequences=100)

    prompt_scores = defaultdict(list)
    for i, generated_text in enumerate(generated_texts):
        print(f"\nGenerated Text {i+1}:\n{generated_text}")
        scores = evaluate_all_dimensions(generated_text, reference_texts)

        print(f"\nScores for sequence {i+1}:")
        for key, value in scores.items():
            print(f"{key.capitalize()} Score: {value:.3f}")
            all_scores[key].append(value)
            prompt_scores[key].append(value)

    print(f"\nAverages for prompt: {prompt}")
    for key, values in prompt_scores.items():
        print(f"Average {key.capitalize()} Score: {np.mean(values):.3f}")

print("\nOverall Averages across all prompts:")
for key, values in all_scores.items():
    print(f"Average {key.capitalize()} Score: {np.mean(values):.3f}")

In [None]:
# One box per scoring dimension, in the order the dimensions were recorded.
dimensions = list(all_scores.keys())
data_for_boxplot = [all_scores[dim] for dim in dimensions]

fig, ax = plt.subplots()
ax.boxplot(data_for_boxplot)
# Label the boxes through the axis rather than boxplot's `labels=` keyword,
# which newer Matplotlib releases deprecate.
ax.set_xticklabels(dimensions)
ax.set_title("Distribution of Scores by Dimension")
ax.set_xlabel("Dimension")
ax.set_ylabel("Score")
plt.show()

In [None]:
from scipy.stats import gaussian_kde

# One histogram per dimension, with a KDE curve overlaid when it can be fitted.
for dimension, scores in all_scores.items():
    data = np.asarray(scores, dtype=float)
    if data.size == 0:
        continue

    fig, ax = plt.subplots()
    ax.hist(data, bins=20, density=True, alpha=0.5)

    # gaussian_kde cannot be fitted to a single point or to constant data,
    # so the curve is skipped in those cases and only the histogram is shown.
    if data.size > 1 and np.ptp(data) > 0:
        density = gaussian_kde(data)
        xs = np.linspace(data.min(), data.max(), 200)
        ax.plot(xs, density(xs))

    ax.set_title(f"Density Plot for {dimension.capitalize()} Scores")
    ax.set_xlabel("Score")
    ax.set_ylabel("Density")
    plt.show()

# ADVANCED FINE-TUNING

In [None]:
# Rebind `device` as a torch.device for the fine-tuning classes below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class LoRALayer(nn.Module):
    """Low-Rank Adaptation layer wrapper

    Wraps ``base_layer``, freezes its weight and bias, and adds a trainable
    low-rank update: ``base_layer(x) + (alpha / rank) * (x @ A.T) @ B.T``.
    ``B`` starts at zero, so the wrapper initially reproduces the base layer.

    Args:
        base_layer: Module with a 2-D ``weight``. ``nn.Linear`` stores it as
            (out, in). Modules that are not ``nn.Linear`` and expose an ``nf``
            attribute are treated as storing it as (in, out).
            NOTE(review): this is meant to cover the Conv1D module used by
            Hugging Face GPT-2 — confirm against the installed version.
        rank: Rank of the update matrices.
        alpha: Scaling numerator; the update is scaled by ``alpha / rank``.
    """
    def __init__(self, base_layer, rank=8, alpha=16):
        super().__init__()
        self.base_layer = base_layer

        weight = base_layer.weight
        if not isinstance(base_layer, nn.Linear) and hasattr(base_layer, 'nf'):
            # (in, out) weight layout
            self.in_features = weight.shape[0]
            self.out_features = weight.shape[1]
        else:
            # nn.Linear layout: (out, in)
            self.in_features = weight.shape[1]
            self.out_features = weight.shape[0]

        # Create the adapters on the base weight's device and dtype; creating
        # them with defaults leaves them on the CPU in float32 even when the
        # wrapped layer has already been moved to a GPU or cast.
        factory = {'device': weight.device, 'dtype': weight.dtype}
        self.lora_A = nn.Parameter(torch.zeros((rank, self.in_features), **factory))
        self.lora_B = nn.Parameter(torch.zeros((self.out_features, rank), **factory))

        nn.init.normal_(self.lora_A, std=0.02)
        nn.init.zeros_(self.lora_B)

        self.alpha = alpha
        self.rank = rank

        # Only the LoRA matrices are trained.
        self.base_layer.weight.requires_grad = False
        if hasattr(self.base_layer, 'bias') and self.base_layer.bias is not None:
            self.base_layer.bias.requires_grad = False

    def forward(self, x):
        """Return the base layer's output plus the scaled low-rank update."""
        base_output = self.base_layer(x)
        lora_output = (x @ self.lora_A.T) @ self.lora_B.T
        scaling = self.alpha / self.rank

        return base_output + scaling * lora_output

In [None]:
class GPT2Dataset(Dataset):
    """Dataset for GPT-2 fine-tuning

    Texts are tokenized lazily, one item per ``__getitem__`` call, padded and
    truncated to ``max_length``.

    Args:
        texts: Sequence of strings, or of dicts holding the string under
            'text' or 'content'.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors="pt"``.
        max_length: Sequence length used for padding and truncation.
    """
    def __init__(self, texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return 1-D ``input_ids``, ``attention_mask`` and ``labels``.

        ``labels`` is a copy of ``input_ids`` with padded positions
        (attention mask 0) set to -100 so that padding is ignored by the loss.
        """
        text = self.texts[idx]
        if isinstance(text, dict):
            # Dict rows: prefer 'text', then 'content', else an empty string.
            text = text.get('text', text.get('content', ''))

        encodings = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # drop the sequence axis when max_length == 1.
        input_ids = encodings['input_ids'].squeeze(0)
        attention_mask = encodings['attention_mask'].squeeze(0)

        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [None]:
class AdvancedFineTuner:
    """Fine-tunes GPT-2 with curriculum learning, dynamic evaluation and LoRA.

    The model is loaded once and a deep copy of the untouched weights is kept
    in ``base_model`` so that drifting parameters can periodically be pulled
    back toward it. Training proceeds through complexity-ordered curriculum
    stages; only the LoRA adapter parameters are optimized.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory (created if missing) that receives per-stage
            checkpoints, the final model and ``training_metrics.json``.
    """
    def __init__(self, model_name="gpt2", output_dir="./model_output"):
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            # No dedicated pad token is set; reuse EOS so padding works.
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
        self.base_model = copy.deepcopy(self.model)  # Keep a copy of base model

        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

        # Maps the dotted module name of every wrapped layer to its LoRALayer.
        self.lora_layers = {}
        self.is_lora_applied = False

        self.adaptation_steps = 0
        self.max_adaptation_steps = 500
        self.dynamic_optimizer = None

        self.current_curriculum_stage = 0

        print(f"Model initialized on {device}")

    @staticmethod
    def _as_linear(module):
        """Return an ``nn.Linear`` equivalent of ``module``, or ``None``.

        ``nn.Linear`` instances are returned unchanged. Hugging Face ``Conv1D``
        layers (used by GPT-2 for ``c_attn``/``c_proj``/``c_fc``) compute
        ``x @ W + b`` with ``W`` stored as ``(in, out)``; they are converted to
        an ``nn.Linear`` holding the transposed weight, which computes the same
        function. Anything else is not adaptable and yields ``None``.
        """
        if isinstance(module, nn.Linear):
            return module

        weight = getattr(module, 'weight', None)
        if type(module).__name__ == 'Conv1D' and weight is not None and weight.dim() == 2:
            in_features, out_features = weight.shape
            bias = getattr(module, 'bias', None)
            linear = nn.Linear(in_features, out_features, bias=bias is not None)
            linear = linear.to(device=weight.device, dtype=weight.dtype)
            with torch.no_grad():
                linear.weight.copy_(weight.t())
                if bias is not None:
                    linear.bias.copy_(bias)
            return linear

        return None

    def apply_lora(self, rank=8, alpha=16, target_modules=None):
        """Wrap the targeted projection layers with LoRA adapters.

        Args:
            rank: Rank of each low-rank update.
            alpha: LoRA scaling numerator.
            target_modules: Substrings matched against dotted module names;
                defaults to GPT-2's ``c_attn``, ``c_proj`` and ``c_fc``.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: If no adaptable layer matched ``target_modules``.
        """
        print("Applying LoRA to model...")

        if self.is_lora_applied:
            # Wrapping twice would stack adapters on top of adapters.
            print("LoRA is already applied; skipping.")
            return self

        if target_modules is None:
            target_modules = ['c_attn', 'c_proj', 'c_fc']

        # Snapshot the module list first so the tree is not mutated while it
        # is being traversed.
        candidates = [
            (name, module)
            for name, module in self.model.named_modules()
            if name and any(target_name in name for target_name in target_modules)
        ]

        for name, module in candidates:
            base_layer = self._as_linear(module)
            if base_layer is None:
                continue

            # rpartition yields an empty parent for top-level modules.
            parent_name, _, layer_name = name.rpartition('.')
            parent_module = self.model
            if parent_name:
                for part in parent_name.split('.'):
                    parent_module = getattr(parent_module, part)

            lora_layer = LoRALayer(base_layer, rank=rank, alpha=alpha)
            # Keep the adapters on the same device/dtype as the wrapped weight.
            lora_layer.to(device=base_layer.weight.device, dtype=base_layer.weight.dtype)

            setattr(parent_module, layer_name, lora_layer)
            self.lora_layers[name] = lora_layer

        if not self.lora_layers:
            raise ValueError(
                f"No adaptable layers matched target_modules={target_modules}"
            )

        self.is_lora_applied = True

        # Freeze everything except the adapters so no gradients are computed
        # (or accumulated) for parameters the optimizers never update.
        adapter_ids = {id(param) for param in self.get_lora_params()}
        for param in self.model.parameters():
            if id(param) not in adapter_ids:
                param.requires_grad = False

        return self

    def get_lora_params(self):
        """Return the LoRA ``A``/``B`` parameters of every wrapped layer.

        Raises:
            ValueError: If LoRA has not been applied yet.
        """
        if not self.is_lora_applied:
            raise ValueError("LoRA has not been applied to the model yet")

        params = []
        for layer in self.lora_layers.values():
            params.extend([layer.lora_A, layer.lora_B])
        return params

    @staticmethod
    def _ensure_nltk_resources():
        """Download the NLTK tokenizer/tagger data if it is not installed."""
        required = (
            ('tokenizers/punkt', 'punkt'),
            ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
        )
        for resource_path, package in required:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package)

    @staticmethod
    def _build_stages(complexity_scores, num_stages=5):
        """Split scored entries into complexity-ordered curriculum stages.

        Entries are sorted by ascending ``'complexity'`` and cut into
        ``num_stages`` equal slices; the last slice absorbs the remainder.
        Slices that end up empty (fewer entries than stages) are dropped, so
        every returned stage has at least one entry.
        """
        ordered = sorted(complexity_scores, key=lambda item: item['complexity'])
        num_entries = len(ordered)
        stage_size = num_entries // num_stages

        curriculum = []
        for stage in range(num_stages):
            start_idx = stage * stage_size
            end_idx = (stage + 1) * stage_size if stage < num_stages - 1 else num_entries

            chunk = ordered[start_idx:end_idx]
            if not chunk:
                continue

            avg_complexity = sum(item['complexity'] for item in chunk) / len(chunk)
            curriculum.append({
                'stage': stage + 1,
                'entries': [item['entry'] for item in chunk],
                'avg_complexity': avg_complexity
            })

            print(f"Stage {stage+1}: {len(chunk)} entries, avg complexity: {avg_complexity:.3f}")

        return curriculum

    def prepare_curriculum(self, texts):
        """Score every text for complexity and group them into stages.

        The score is an equally weighted mix of five features: average
        sentence length, unique-word ratio, long-word (> 6 chars) ratio, count
        of ``IN``-tagged tokens and count of adverbs (``RB*``), with the
        length/count features capped at 1.0.

        Args:
            texts: Iterable of strings or dicts with a ``'text'``/``'content'`` key.

        Returns:
            List of stage dicts (``'stage'``, ``'entries'``, ``'avg_complexity'``)
            ordered from simplest to most complex. Entries without any sentence
            are skipped; stages without entries are omitted.
        """
        print("Preparing curriculum...")

        self._ensure_nltk_resources()

        complexity_scores = []
        for entry in texts:
            if isinstance(entry, dict):
                text = entry.get('text', entry.get('content', ''))
            else:
                text = entry

            words = nltk.word_tokenize(text)
            sentences = nltk.sent_tokenize(text)

            if not sentences:
                continue

            avg_sentence_length = len(words) / len(sentences)
            unique_words_ratio = len(set(words)) / len(words) if words else 0
            long_words_ratio = sum(1 for w in words if len(w) > 6) / len(words) if words else 0

            pos_tags = nltk.pos_tag(words)

            subordinate_conj = sum(1 for _, tag in pos_tags if tag == 'IN')
            adverbs = sum(1 for _, tag in pos_tags if tag.startswith('RB'))

            complexity = (
                0.2 * min(1.0, avg_sentence_length / 25) +
                0.2 * unique_words_ratio +
                0.2 * long_words_ratio +
                0.2 * min(1.0, subordinate_conj / 15) +
                0.2 * min(1.0, adverbs / 10)
            )

            complexity_scores.append({
                'entry': entry,
                'complexity': complexity,
                'stats': {
                    'avg_sentence_length': avg_sentence_length,
                    'unique_words_ratio': unique_words_ratio
                }
            })

        return self._build_stages(complexity_scores)

    def setup_dynamic_evaluation(self, learning_rate=2e-5):
        """Create the optimizer used for dynamic-evaluation updates.

        Uses the LoRA parameters when LoRA is applied, otherwise all model
        parameters, and resets the adaptation step counter.

        Returns:
            ``self``, to allow chaining.
        """
        self.adaptation_steps = 0

        if self.is_lora_applied:
            params = self.get_lora_params()
        else:
            params = self.model.parameters()

        self.dynamic_optimizer = AdamW(params, lr=learning_rate)
        return self

    def _dynamic_update(self, stage_texts, max_length, weight):
        """Take one dynamic-evaluation step on a randomly drawn stage text.

        The sampled text is tokenized (truncated to ``max_length``) and used as
        both input and labels, so logits and labels always have matching
        lengths. Returns ``True`` when an update was applied and ``False`` when
        the sample was too short to define a next-token loss.
        """
        sample_text = random.choice(stage_texts)
        if isinstance(sample_text, dict):
            sample_text = sample_text.get('text', sample_text.get('content', ''))

        sample_ids = self.tokenizer.encode(
            sample_text,
            return_tensors='pt',
            truncation=True,
            max_length=max_length
        ).to(device)

        # A next-token loss needs at least two tokens.
        if sample_ids.size(1) < 2:
            return False

        self.model.train()
        outputs = self.model(sample_ids, labels=sample_ids)

        # Gradients are required here, so this must not run under no_grad().
        (outputs.loss * weight).backward()
        self.dynamic_optimizer.step()
        self.dynamic_optimizer.zero_grad()
        return True

    def _pull_toward_base(self, strength=0.1):
        """Move non-LoRA parameters ``strength`` of the way back to the base model.

        Parameters are matched to ``base_model`` by name rather than by
        position, because wrapping layers with LoRA changes both the number and
        the order of parameters. Everything that lives under a wrapped layer
        (the adapters and the frozen wrapped weights) is left untouched.
        """
        base_params = dict(self.base_model.named_parameters())
        if self.is_lora_applied:
            lora_prefixes = tuple(name + '.' for name in self.lora_layers)
        else:
            lora_prefixes = ()

        with torch.no_grad():
            for name, param in self.model.named_parameters():
                if lora_prefixes and name.startswith(lora_prefixes):
                    continue

                base_param = base_params.get(name)
                if base_param is None or base_param.shape != param.shape:
                    continue

                adjustment = strength * (base_param.to(param.device) - param)
                param.add_(adjustment)

    def _save_lora_weights(self, stage_dir):
        """Write every adapter's ``A``/``B`` matrices to ``lora_weights.pt``."""
        lora_state_dict = {}
        for name, layer in self.lora_layers.items():
            lora_state_dict[name + '.lora_A'] = layer.lora_A.detach().cpu()
            lora_state_dict[name + '.lora_B'] = layer.lora_B.detach().cpu()
        torch.save(lora_state_dict, os.path.join(stage_dir, "lora_weights.pt"))

    def train(self, texts, training_args):
        """Run curriculum training with interleaved dynamic-evaluation updates.

        Args:
            texts: Training texts (strings or dicts with ``'text'``/``'content'``).
            training_args: Dict of options. Recognized keys and defaults:
                ``lora_rank`` (8), ``lora_alpha`` (16), ``dynamic_lr`` (2e-5),
                ``batch_size`` (4), ``curriculum_epochs`` ([4, 3, 3, 2, 2]),
                ``max_length`` (512), ``learning_rate`` (5e-5),
                ``dynamic_update_interval`` (10), ``dynamic_weight`` (0.3).

        Returns:
            The fine-tuned model.

        Side effects:
            Saves a checkpoint and the LoRA weights after every stage, the
            final model under ``final_model`` and per-epoch metrics to
            ``training_metrics.json``, all inside ``output_dir``.
        """
        print("Starting advanced fine-tuning with Curriculum Learning, Dynamic Evaluation, and LoRA...")

        if not self.is_lora_applied:
            self.apply_lora(
                rank=training_args.get('lora_rank', 8),
                alpha=training_args.get('lora_alpha', 16)
            )

        self.setup_dynamic_evaluation(
            learning_rate=training_args.get('dynamic_lr', 2e-5)
        )

        curriculum = self.prepare_curriculum(texts)

        batch_size = training_args.get('batch_size', 4)
        curriculum_epochs = training_args.get('curriculum_epochs', [4, 3, 3, 2, 2])
        max_length = training_args.get('max_length', 512)
        # Guard against a zero interval, which would break the modulo below.
        dynamic_interval = max(1, int(training_args.get('dynamic_update_interval', 10)))
        dynamic_weight = training_args.get('dynamic_weight', 0.3)

        optimizer = AdamW(
            self.get_lora_params(),
            lr=training_args.get('learning_rate', 5e-5)
        )

        metrics_by_stage = []

        for stage in curriculum:
            print(f"\n===== Training on curriculum stage {stage['stage']} (complexity: {stage['avg_complexity']:.3f}) =====")
            self.current_curriculum_stage = stage['stage']

            stage_texts = stage['entries']
            stage_dataset = GPT2Dataset(stage_texts, self.tokenizer, max_length=max_length)

            stage_dataloader = DataLoader(
                stage_dataset,
                batch_size=batch_size,
                shuffle=True
            )

            # Index epochs by stage number so the mapping stays correct even
            # when empty stages were dropped from the curriculum.
            epochs_idx = stage['stage'] - 1
            stage_epochs = curriculum_epochs[epochs_idx] if epochs_idx < len(curriculum_epochs) else 1
            stage_metrics = []

            for epoch in range(stage_epochs):
                self.model.train()
                epoch_loss = 0
                dynamic_updates = 0

                progress_bar = tqdm(stage_dataloader, desc=f"Stage {stage['stage']}, Epoch {epoch+1}")
                for step, batch in enumerate(progress_bar):
                    batch = {k: v.to(device) for k, v in batch.items()}

                    outputs = self.model(**batch)
                    loss = outputs.loss

                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()

                    if step % dynamic_interval == 0:
                        if self._dynamic_update(stage_texts, max_length, dynamic_weight):
                            dynamic_updates += 1

                    epoch_loss += loss.item()
                    progress_bar.set_postfix({
                        'loss': epoch_loss / (step + 1),
                        'dynamic_updates': dynamic_updates
                    })

                    self.adaptation_steps += 1
                    if self.adaptation_steps >= self.max_adaptation_steps:
                        print("Resetting model to prevent drift...")
                        # Pull 10% back toward base model
                        self._pull_toward_base(0.1)
                        self.adaptation_steps = 0

                avg_loss = epoch_loss / len(stage_dataloader)
                print(f"Stage {stage['stage']}, Epoch {epoch+1}: Average loss = {avg_loss:.4f}, Dynamic updates: {dynamic_updates}")

                stage_metrics.append({
                    'stage': stage['stage'],
                    'epoch': epoch + 1,
                    'avg_loss': avg_loss,
                    'dynamic_updates': dynamic_updates
                })

            metrics_by_stage.append({
                'stage': stage['stage'],
                'avg_complexity': stage['avg_complexity'],
                'epochs': stage_epochs,
                'metrics': stage_metrics
            })

            stage_dir = os.path.join(self.output_dir, f"stage_{stage['stage']}")
            os.makedirs(stage_dir, exist_ok=True)

            # NOTE(review): wrapped layers appear under `<layer>.base_layer.*`
            # in this checkpoint, so it is presumably not loadable with a plain
            # from_pretrained() without re-wrapping — confirm before relying on it.
            self.model.save_pretrained(stage_dir)

            if self.is_lora_applied:
                self._save_lora_weights(stage_dir)

        final_dir = os.path.join(self.output_dir, "final_model")
        os.makedirs(final_dir, exist_ok=True)
        self.model.save_pretrained(final_dir)

        import json
        with open(os.path.join(self.output_dir, "training_metrics.json"), 'w') as f:
            json.dump(metrics_by_stage, f, indent=2)

        print("Advanced fine-tuning completed successfully!")
        return self.model

In [None]:
def generate(self, prompt, max_length=100, temperature=0.8, use_dynamic_eval=True, **kwargs):
    """Generate a continuation of ``prompt``, optionally adapting on the result.

    Args:
        self: The ``AdvancedFineTuner`` whose model/tokenizer are used.
        prompt: Text to continue.
        max_length: Maximum total length (prompt + continuation) in tokens.
        temperature: Sampling temperature.
        use_dynamic_eval: When true and a dynamic optimizer has been set up,
            take one gradient step on the generated text after sampling.
        **kwargs: Optional ``top_k`` (50), ``top_p`` (0.95), ``do_sample``
            (True) and ``num_return_sequences`` (1).

    Returns:
        The decoded text of the first returned sequence.
    """
    inputs = self.tokenizer.encode(prompt, return_tensors='pt').to(device)

    # Sample in eval mode so dropout does not perturb generation, then restore
    # whatever mode the caller had the model in.
    was_training = self.model.training
    self.model.eval()

    with torch.no_grad():
        output_sequences = self.model.generate(
            inputs,
            max_length=max_length,
            temperature=temperature,
            top_k=kwargs.get('top_k', 50),
            top_p=kwargs.get('top_p', 0.95),
            do_sample=kwargs.get('do_sample', True),
            num_return_sequences=kwargs.get('num_return_sequences', 1),
            pad_token_id=self.tokenizer.eos_token_id
        )

    generated_text = self.tokenizer.decode(output_sequences[0], skip_special_tokens=True)

    if use_dynamic_eval and self.dynamic_optimizer is not None:
        target_ids = self.tokenizer.encode(generated_text, return_tensors='pt').to(device)

        # A next-token loss needs at least two tokens.
        if target_ids.size(1) >= 2:
            self.model.train()

            # Feed the generated sequence as both input and labels so logits
            # and labels have the same length.
            outputs = self.model(target_ids, labels=target_ids)
            scaled_loss = outputs.loss * 0.2

            scaled_loss.backward()
            self.dynamic_optimizer.step()
            self.dynamic_optimizer.zero_grad()

            self.adaptation_steps += 1

            if self.adaptation_steps >= self.max_adaptation_steps:
                print("Resetting model to prevent drift...")

                # Match parameters by name: once LoRA wraps layers, the model
                # and the base copy no longer line up positionally.
                base_params = dict(self.base_model.named_parameters())
                if self.is_lora_applied:
                    lora_prefixes = tuple(name + '.' for name in self.lora_layers)
                else:
                    lora_prefixes = ()

                with torch.no_grad():
                    for name, param in self.model.named_parameters():
                        if lora_prefixes and name.startswith(lora_prefixes):
                            continue
                        base_param = base_params.get(name)
                        if base_param is None or base_param.shape != param.shape:
                            continue
                        adjustment = 0.1 * (base_param.to(param.device) - param)
                        param.add_(adjustment)

                self.adaptation_steps = 0

    self.model.train(was_training)
    return generated_text


# This function is written as a method (it takes `self`) and is called as
# `fine_tuner.generate(...)`, so attach it to the class when it is defined.
if 'AdvancedFineTuner' in globals():
    AdvancedFineTuner.generate = generate

In [None]:
def main():
    """Demo driver: fine-tune GPT-2 on a tiny repeated corpus, then sample.

    Builds an ``AdvancedFineTuner`` writing to ``./advanced_gpt2_model``,
    trains it on five seed sentences repeated 100 times, and prints one
    generated continuation for each of five prompts.
    """
    fine_tuner = AdvancedFineTuner(model_name="gpt2", output_dir="./advanced_gpt2_model")

    seed_sentences = [
        "The future of artificial intelligence looks promising as researchers develop new algorithms for machine learning.",
        "Recent scientific studies have shown that climate change is accelerating at an alarming rate due to human activity.",
        "If time travel were possible, humanity would need to establish strict protocols to prevent paradoxes.",
        "Deep beneath the ocean's surface, the greatest mystery awaiting explorers is the undiscovered ecosystem.",
        "The beauty of creative writing lies in its ability to transport readers to different worlds and perspectives."
    ]
    # Repeat the seed sentences so every curriculum stage has enough examples.
    texts = seed_sentences * 100

    training_args = dict(
        batch_size=4,
        learning_rate=5e-5,
        lora_rank=8,
        lora_alpha=16,
        curriculum_epochs=[4, 3, 3, 2, 2],
        dynamic_lr=2e-5,
        dynamic_update_interval=10,
        dynamic_weight=0.3,
        max_length=512,
    )

    fine_tuner.train(texts, training_args)

    prompts = (
        "The future of artificial intelligence looks",
        "The latest research on climate change suggests",
        "If time travel were possible, humanity would",
        "Deep beneath the ocean's surface, the greatest mystery awaiting explorers is",
        "The beauty of creative writing lies in",
    )

    separator = "-" * 80
    for prompt in prompts:
        print(f"\nPrompt: {prompt}")

        generated_text = fine_tuner.generate(
            prompt, max_length=200, temperature=0.8, use_dynamic_eval=True
        )

        print(f"Generated text: {generated_text}")
        print(separator)

In [None]:
 # Run the end-to-end demo: fine-tune the model, then generate from each prompt.
 main()