In [None]:
!pip install transformers datasets accelerate torch wikipedia --quiet

In [None]:
import torch
import pandas as pd
import transformers
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Read the raw corpus into a DataFrame.
# NOTE(review): the absolute path looks specific to a Kaggle input mount — confirm before running elsewhere.
df = pd.read_csv("/kaggle/input/my-dataset/my_dataset.csv")

In [None]:
# Keep only the text column, drop missing rows, then take a reproducible
# random sample of at most 100000 rows with a fresh 0..n-1 index.
df = (
    df[["clean_text"]]
    .dropna()
    .pipe(lambda frame: frame.sample(n=min(100000, len(frame)), random_state=42))
    .reset_index(drop=True)
)

# Wrap the sampled frame in a Hugging Face Dataset for tokenization.
dataset = Dataset.from_pandas(df)

In [None]:
# Checkpoint identifier passed to both AutoTokenizer and AutoModelForCausalLM below.
MODEL_NAME = "EleutherAI/gpt-neo-125m"

In [None]:
# Load the tokenizer that belongs to MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token so that the
# padding="max_length" call in tokenize_function has a pad token to use
# (presumably this checkpoint ships without one — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(example):
    """Tokenize the ``clean_text`` field of a batch into fixed-length sequences.

    Each text is truncated to at most 216 tokens and padded up to exactly
    216 tokens using the module-level ``tokenizer``.

    Args:
        example: Mapping with a ``"clean_text"`` entry (a batch when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that input.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 216,
    }
    return tokenizer(example["clean_text"], **encoding_options)

In [None]:
print("Tokenizing dataset...")
# Tokenize in batches and drop every original column so that only the
# tokenizer's output fields remain in the dataset.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

In [None]:
# Load the pretrained causal language model and place it on the selected device.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)

# Enable gradient checkpointing on the model (also requested in TrainingArguments).
model.gradient_checkpointing_enable()

# Hold out rows for evaluation instead of re-using the first rows of the
# training set: evaluating on data the model was trained on gives a
# misleadingly low loss. The held-out set has at most 1000 rows and also
# works for datasets that have fewer than 1000 rows in total.
eval_size = min(1000, max(1, len(tokenized_dataset) // 10))
dataset_splits = tokenized_dataset.train_test_split(test_size=eval_size, seed=42)

training_args = TrainingArguments(
    output_dir="./llm_output",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    learning_rate=3e-5,
    # Only request fp16 when a GPU is present, so the CPU fallback chosen
    # in the device cell can still run this cell.
    fp16=torch.cuda.is_available(),
    logging_steps=2,
    save_total_limit=2,
    push_to_hub=False,
    report_to="none",
    gradient_checkpointing=True,
)

# mlm=False selects causal (next-token) language-modeling batches.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
    data_collator=data_collator
)

In [None]:
# Run a garbage-collection pass and ask PyTorch to release its unused
# cached GPU memory before training starts.
gc.collect()
torch.cuda.empty_cache()

In [None]:
print("Starting training...")
trainer.train()

print("Saving model...")
# Write the model first and the tokenizer second, both into the same folder,
# so the directory can be reloaded with from_pretrained.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./llm_output")

print("Training complete! Model saved to './llm_output'")

In [None]:
import os
import zipfile

def download_model_to_local(model_directory="./llm_output", 
                            output_zip="gptneo_model_download.zip"):
    """Compress a model directory into a zip archive.

    Every file below ``model_directory`` is stored under a top-level folder
    named after the directory itself (for example ``llm_output/config.json``).
    This holds whether or not ``model_directory`` ends with a path separator.

    Args:
        model_directory (str): Directory whose files are archived.
        output_zip (str): Path of the zip file to create (overwritten if present).

    Returns:
        str: ``output_zip``, unchanged.

    Raises:
        FileNotFoundError: If ``model_directory`` does not exist.
    """
    if not os.path.exists(model_directory):
        raise FileNotFoundError(f"Model directory {model_directory} not found")

    print(f"Compressing model files from {model_directory}...")

    # normpath removes a trailing separator; without it dirname() would
    # return the directory itself and the folder prefix would be lost.
    model_root = os.path.normpath(model_directory)
    archive_base = os.path.dirname(model_root) or os.curdir
    # The archive is created before the walk starts, so it must be skipped
    # explicitly when it lives inside the directory being archived.
    zip_abspath = os.path.abspath(output_zip)

    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(model_root):
            for file in files:
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == zip_abspath:
                    continue
                arcname = os.path.relpath(file_path, archive_base)
                zipf.write(file_path, arcname)

    print(f"Model compressed successfully to {output_zip}")
    print("You can now download this file from the 'Output' tab in Kaggle")

    return output_zip

In [None]:
# Archive the default "./llm_output" directory; zip_path is the returned archive path.
zip_path = download_model_to_local()

# CONTENT GENERATION

In [None]:
# Directory holding the fine-tuned model and tokenizer files.
model_path = "/kaggle/input/gpt-neo/llm_output"

# Both loads are independent of each other; each reads from model_path.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)

print(f"Model loaded and moved to {device}")

In [None]:
def generate_creative_content(model, tokenizer, prompt, max_length=200, num_return_sequences=3, temperature=0.9):
    """Sample several continuations of ``prompt`` and return them cleaned up.

    Args:
        model: Causal language model exposing ``eval``, ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        prompt (str): Text the generation starts from.
        max_length (int): Passed to ``generate`` as ``max_length``.
        num_return_sequences (int): Number of samples to draw.
        temperature (float): Sampling temperature.

    Returns:
        list[str]: One cleaned string per generated sequence. Cleaning removes
        ``@-@`` markers and characters outside a small allow-list, collapses
        whitespace and trims the text with ``ensure_complete_sentences``.
    """
    import re

    model.eval()

    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    prompt_ids = encoded.input_ids.to(model.device)
    prompt_mask = encoded.attention_mask.to(model.device)

    # Fall back to the EOS id when the tokenizer defines no padding id.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    sampling_options = dict(
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        temperature=temperature,
        top_k=50,
        top_p=0.93,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_id,
        early_stopping=False,
        no_repeat_ngram_size=3,
        repetition_penalty=1.3,
    )
    sequences = model.generate(prompt_ids, attention_mask=prompt_mask, **sampling_options)

    def _tidy(raw_text):
        # Same clean-up steps, in the same order, for every decoded sample.
        cleaned = re.sub(r'@-@', '', raw_text)
        cleaned = re.sub(r'[^A-Za-z0-9\s\.\,\!\?\'\"\;\:\-]', '', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        return ensure_complete_sentences(cleaned)

    return [_tidy(tokenizer.decode(seq, skip_special_tokens=True)) for seq in sequences]

In [None]:
def ensure_complete_sentences(text):
    """Cut ``text`` back to its last sentence terminator.

    Text that already ends in ``.``, ``!`` or ``?`` is returned unchanged.
    Otherwise everything after the last terminator is dropped. Text without
    any terminator (including the empty string) is returned as is.
    """
    terminators = ('.', '!', '?')
    if text.endswith(terminators):
        return text
    cut = max(text.rfind(mark) for mark in terminators)
    return text if cut == -1 else text[:cut + 1]

# CONTENT EVALUATION

In [None]:
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import wikipedia
from collections import defaultdict

In [None]:
# English stop words as a set; evaluate_flexibility uses it to filter nouns.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be installed already — confirm on a fresh kernel.
stop_words = set(stopwords.words('english'))

In [None]:
# Load the spaCy pipeline that the evaluation functions below read as `nlp`.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    # An OSError from spacy.load is treated here as "model not installed":
    # download it once, then load again.
    print("Downloading spaCy model...")
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")

In [None]:
# Sentence openers consumed by the evaluation loop: each one is passed to the
# generator and is also used as the Wikipedia search query for reference texts.
prompts = [
    "The future of artificial intelligence looks",
    "The latest research on climate change suggests",
    "If time travel were possible, humanity would",
    "Deep beneath the ocean's surface, the greatest mystery awaiting explorers is",
    "The beauty of creative writing lies in"
]

In [None]:
def evaluate_fluency(text):
    """
    Evaluate fluency by measuring:
    1. Average sentence length (alphanumeric words per NLTK sentence)
    2. Lexical diversity (unique words / total words)
    3. Grammatical correctness using a heuristic: the share of spaCy
       sentences that contain both a subject and a verb

    The combined score weights the three parts 0.3 / 0.3 / 0.4. The sentence
    length part saturates at an average of 15 words per sentence.

    Args:
        text (str): Text to evaluate

    Returns:
        dict: Dictionary of fluency metrics with the keys
        'avg_sent_length', 'lexical_diversity', 'grammatical_score'
        and 'fluency_score'. All values are 0 for blank text.
    """
    if not text.strip():
        return {
            'avg_sent_length': 0,
            'lexical_diversity': 0,
            'grammatical_score': 0,
            'fluency_score': 0
        }

    sentences = sent_tokenize(text)
    # Keep alphanumeric tokens only, so punctuation does not count as a word.
    words = [w for w in word_tokenize(text.lower()) if w.isalnum()]

    avg_sent_length = len(words) / len(sentences) if sentences else 0
    lexical_diversity = len(set(words)) / len(words) if words else 0

    doc = nlp(text)
    # Materialize the sentence spans once; they are needed for the count
    # and for the per-sentence check.
    parsed_sentences = list(doc.sents)

    grammatical_score = 0
    for sent in parsed_sentences:
        has_subj = any(token.dep_ in ('nsubj', 'nsubjpass') for token in sent)
        # Copulas and auxiliaries are tagged AUX rather than VERB, so
        # "The sky is blue." has no VERB token; accept AUX as well.
        has_verb = any(token.pos_ in ('VERB', 'AUX') for token in sent)
        if has_subj and has_verb:
            grammatical_score += 1

    if parsed_sentences:
        grammatical_score /= len(parsed_sentences)

    fluency_score = (
        min(avg_sent_length / 15, 1) * 0.3 +
        lexical_diversity * 0.3 +
        grammatical_score * 0.4
    )

    return {
        'avg_sent_length': avg_sent_length,
        'lexical_diversity': lexical_diversity,
        'grammatical_score': grammatical_score,
        'fluency_score': fluency_score
    }

In [None]:
def evaluate_flexibility(text):
    """
    Evaluate flexibility by analyzing:
    1. Topic diversity: unique noun lemmas / all noun lemmas (stop words excluded)
    2. Semantic range: 1 minus the mean pairwise cosine similarity between
       sentence vectors (0 when there are fewer than two sentences)
    3. Concept switching: share of sentence transitions where fewer than 30%
       of the previous sentence's noun/proper-noun lemmas reappear

    The combined score weights the three parts 0.4 / 0.4 / 0.2.

    Args:
        text (str): Text to evaluate

    Returns:
        dict: 'topic_diversity', 'semantic_range', 'concept_transitions'
        and 'flexibility_score'. All values are 0 for blank text.
    """
    if not text.strip():
        return {
            'topic_diversity': 0,
            'semantic_range': 0,
            'concept_transitions': 0,
            'flexibility_score': 0
        }

    doc = nlp(text)
    key_nouns = [token.lemma_ for token in doc if token.pos_ == 'NOUN' and token.text.lower() not in stop_words]
    topic_diversity = len(set(key_nouns)) / len(key_nouns) if key_nouns else 0

    sentences = list(doc.sents)
    sentence_count = len(sentences)
    if sentence_count >= 2:
        sent_embeddings = np.array([sent.vector for sent in sentences])
        similarities = cosine_similarity(sent_embeddings)
        # Subtract the actual diagonal instead of assuming it sums to the
        # sentence count: a sentence whose vector is all zeros has a
        # self-similarity of 0, not 1.
        diagonal_sum = sum(similarities[i][i] for i in range(sentence_count))
        off_diagonal_sum = np.sum(similarities) - diagonal_sum
        semantic_range = 1 - off_diagonal_sum / (sentence_count * (sentence_count - 1))
    else:
        semantic_range = 0

    concept_transitions = 0
    prev_key_entities = set()
    for sent in sentences:
        sent_entities = set([token.lemma_ for token in sent
                             if token.pos_ in ('NOUN', 'PROPN') and token.text.lower() not in stop_words])
        # A transition counts only when the previous sentence had entities
        # and less than 30% of them carry over into this sentence.
        if prev_key_entities and (len(sent_entities.intersection(prev_key_entities)) / max(1, len(prev_key_entities)) < 0.3):
            concept_transitions += 1
        prev_key_entities = sent_entities
    concept_transitions = concept_transitions / (sentence_count - 1) if sentence_count > 1 else 0

    flexibility_score = (
        topic_diversity * 0.4 +
        semantic_range * 0.4 +
        concept_transitions * 0.2
    )

    return {
        'topic_diversity': topic_diversity,
        'semantic_range': semantic_range,
        'concept_transitions': concept_transitions,
        'flexibility_score': flexibility_score
    }

In [None]:
def evaluate_originality(text, reference_texts=None):
    """
    Evaluate originality by:
    1. Lexical novelty: proportion of unique trigrams in the text
    2. Phrase novelty: proportion of trigrams not found in the reference corpus
    3. Comparison to reference corpus via document vector similarity
       (the highest cosine similarity to any reference text)

    This function avoids using a rare word frequency measure. The combined
    score weights the parts 0.4 / 0.3 / 0.3, using ``1 - reference_similarity``
    for the third part.

    Note: with no reference texts, phrase novelty is 1 (when the text has
    trigrams) and reference similarity is 0.

    Args:
        text (str): Text to evaluate
        reference_texts (list[str] | None): Texts to compare against;
            ``None`` is treated as an empty list.

    Returns:
        dict: 'lexical_novelty', 'phrase_novelty', 'reference_similarity'
        and 'originality_score'.
    """
    if reference_texts is None:
        reference_texts = []

    if not text.strip():
        return {
            'lexical_novelty': 0,
            'phrase_novelty': 0,
            'reference_similarity': 1,
            'originality_score': 0
        }

    doc = nlp(text)
    tokens = [token.text.lower() for token in doc if token.is_alpha]

    trigrams = [' '.join(tokens[i:i+3]) for i in range(len(tokens) - 2)]
    lexical_novelty = len(set(trigrams)) / len(trigrams) if trigrams else 0

    doc_vector = doc.vector.reshape(1, -1)

    # Parse each reference text once and collect both its trigrams and its
    # similarity to the evaluated text. A set gives constant-time membership
    # checks below; a list would rescan every reference trigram per lookup.
    reference_trigrams = set()
    reference_similarities = []
    for ref_text in reference_texts:
        ref_doc = nlp(ref_text)
        ref_tokens = [token.text.lower() for token in ref_doc if token.is_alpha]
        reference_trigrams.update(' '.join(ref_tokens[i:i+3]) for i in range(len(ref_tokens) - 2))
        similarity = cosine_similarity(doc_vector, ref_doc.vector.reshape(1, -1))[0][0]
        reference_similarities.append(similarity)

    phrase_novelty = (sum(1 for tg in trigrams if tg not in reference_trigrams) / len(trigrams)) if trigrams else 0
    reference_similarity = max(reference_similarities) if reference_similarities else 0

    originality_score = (
        lexical_novelty * 0.4 +
        phrase_novelty * 0.3 +
        (1 - reference_similarity) * 0.3
    )

    return {
        'lexical_novelty': lexical_novelty,
        'phrase_novelty': phrase_novelty,
        'reference_similarity': reference_similarity,
        'originality_score': originality_score
    }

In [None]:
def get_reference_texts(query):
    """Fetch the content of up to three Wikipedia pages matching ``query``.

    Pages that cannot be retrieved are reported with a printed message and
    skipped, so the result may hold fewer than three texts or be empty.

    Args:
        query (str): Search string sent to Wikipedia.

    Returns:
        list[str]: Page contents, in search-result order.
    """
    search_results = wikipedia.search(query)
    reference_texts = []
    for title in search_results[:3]:
        try:
            # The titles come straight from the search results, so look them
            # up verbatim. With auto_suggest left on, the library may replace
            # the title with a suggestion and return a different page.
            page = wikipedia.page(title, auto_suggest=False)
            reference_texts.append(page.content)
            print(f"Retrieved content for page: {title}")
        except Exception as e:
            # Best effort: one failing page must not stop the others.
            print(f"Could not retrieve page for {title}: {e}")
    return reference_texts

In [None]:
def evaluate_elaboration(text):
    """
    Score how elaborated a text is from three parts:
    1. Detail density: share of tokens that are adjectives, adverbs or
       have the 'prep' dependency label
    2. Descriptive richness: adjectives per noun/proper noun, scaled so
       that 0.5 or more maps to 1
    3. Explanation depth: explanation keywords plus 'advcl' dependents
       per sentence, capped at 1

    The combined score weights the parts 0.4 / 0.3 / 0.3. All values are 0
    for blank text.
    """
    if not text.strip():
        return {
            'detail_density': 0,
            'descriptive_richness': 0,
            'explanation_depth': 0,
            'elaboration_score': 0
        }

    explanation_keywords = {'because', 'since', 'therefore', 'thus', 'consequently', 'due', 'hence'}

    doc = nlp(text)

    # Gather every per-token count in one pass over the document.
    token_total = 0
    detail_count = 0
    noun_count = 0
    adjective_count = 0
    explanation_hits = 0
    for tok in doc:
        token_total += 1
        is_adjective = tok.pos_ == 'ADJ'
        if is_adjective or tok.pos_ == 'ADV' or tok.dep_ == 'prep':
            detail_count += 1
        if tok.pos_ in ('NOUN', 'PROPN'):
            noun_count += 1
        if is_adjective:
            adjective_count += 1
        if tok.text.lower() in explanation_keywords:
            explanation_hits += 1
        if tok.dep_ == 'advcl':
            explanation_hits += 1

    detail_density = detail_count / token_total if token_total else 0

    avg_adj_per_noun = adjective_count / noun_count if noun_count else 0
    scaled_richness = min(avg_adj_per_noun / 0.5, 1)

    sentence_total = sum(1 for _ in doc.sents)
    explanation_depth = explanation_hits / sentence_total if sentence_total else 0
    scaled_explanation = min(explanation_depth, 1)

    elaboration_score = (
        detail_density * 0.4 +
        scaled_richness * 0.3 +
        scaled_explanation * 0.3
    )

    return {
        'detail_density': detail_density,
        'descriptive_richness': scaled_richness,
        'explanation_depth': scaled_explanation,
        'elaboration_score': elaboration_score
    }


In [None]:
def evaluate_all_dimensions(text, reference_texts=None):
    """Run all four evaluators on ``text`` and combine their headline scores.

    Args:
        text (str): Text to evaluate.
        reference_texts: Forwarded to ``evaluate_originality``.

    Returns:
        dict: 'fluency', 'flexibility', 'originality', 'elaboration' and
        'creativity', where creativity weights each of the four scores by 0.25.
    """
    fluency_score = evaluate_fluency(text)['fluency_score']
    flexibility_score = evaluate_flexibility(text)['flexibility_score']
    originality_score = evaluate_originality(text, reference_texts)['originality_score']
    elaboration_score = evaluate_elaboration(text)['elaboration_score']

    summary = {
        'fluency': fluency_score,
        'flexibility': flexibility_score,
        'originality': originality_score,
        'elaboration': elaboration_score,
    }
    # Equal weighting, added in the same order as the keys above.
    summary['creativity'] = (
        fluency_score * 0.25 +
        flexibility_score * 0.25 +
        originality_score * 0.25 +
        elaboration_score * 0.25
    )
    return summary

In [None]:
# Score name -> list of values over every prompt and generated text;
# filled by the evaluation loop and read by the plotting cells.
all_scores = defaultdict(list)

In [None]:
# For each prompt: fetch Wikipedia reference texts, sample 100 continuations,
# score every continuation and print per-text scores plus per-prompt averages.
for prompt in prompts:
    print(f"\nProcessing prompt: {prompt}")
    # The prompt itself is used as the Wikipedia search query.
    reference_texts = get_reference_texts(prompt)
    
    generated_texts = generate_creative_content(model, tokenizer, prompt, num_return_sequences=100)
    
    # Scores for this prompt only; all_scores keeps accumulating across prompts.
    prompt_scores = defaultdict(list)
    for i, generated_text in enumerate(generated_texts):
        print(f"\nGenerated Text {i+1}:\n{generated_text}")
        scores = evaluate_all_dimensions(generated_text, reference_texts)
        
        print(f"\nScores for sequence {i+1}:")
        for key, value in scores.items():
            print(f"{key.capitalize()} Score: {value:.3f}")
            all_scores[key].append(value)
            prompt_scores[key].append(value)
    
    print(f"\nAverages for prompt: {prompt}")
    for key, values in prompt_scores.items():
        print(f"Average {key.capitalize()} Score: {np.mean(values):.3f}")

# Averages over every generated text of every prompt.
print("\nOverall Averages across all prompts:")
for key, values in all_scores.items():
    print(f"Average {key.capitalize()} Score: {np.mean(values):.3f}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One box per score dimension, in the insertion order of all_scores.
dimensions = list(all_scores.keys())
data_for_boxplot = [all_scores[dim] for dim in dimensions]

fig, ax = plt.subplots()
ax.boxplot(data_for_boxplot, labels=dimensions)
ax.set_title("Distribution of Scores by Dimension")
ax.set_xlabel("Dimension")
ax.set_ylabel("Score")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

# One figure per score dimension: a normalized histogram with a Gaussian
# kernel density estimate drawn over the observed score range.
for dimension, scores in all_scores.items():
    values = np.array(scores)

    fig, ax = plt.subplots()
    ax.hist(values, bins=20, density=True, alpha=0.5)

    kde = gaussian_kde(values)
    grid = np.linspace(min(values), max(values), 200)
    ax.plot(grid, kde(grid))

    ax.set_title(f"Density Plot for {dimension.capitalize()} Scores")
    ax.set_xlabel("Score")
    ax.set_ylabel("Density")
    plt.show()