<a href="https://colab.research.google.com/github/kumar045/Assignment/blob/main/Readability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import logging
import os
import re

import backoff
import openai
import pandas as pd
import textstat
import torch
from datasets import Dataset
from openai import OpenAI
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import TrainingArguments
from unsloth import FastLanguageModel  # kept ahead of trl, as in the original import order
from trl import SFTTrainer

# Setup logging
# Root logger at INFO level; every record is prefixed with a timestamp and its level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize OpenAI client
# Module-level client shared by the GPT-4 helper functions defined below.
# NOTE(review): no credentials are passed here, so the API key presumably comes
# from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

# Load and combine datasets
def load_and_combine_datasets(clear_path="path_to_your_CLEAR_dataset.csv", ratings_path="ratings.csv"):
    """Load the English CLEAR corpus and the German ratings corpus into one frame.

    Args:
        clear_path: CSV file holding the CLEAR data. It is tagged with
            ``dataset='CLEAR'`` and ``language='en'``.
        ratings_path: ISO-8859-1 encoded CSV file holding the German ratings.
            Its column names are lower-cased, it is tagged with
            ``dataset='ratings'`` and ``language='de'``, and its ``sentence``
            column is renamed to ``Excerpt`` so both corpora share one text
            column.

    Returns:
        A single ``pandas.DataFrame`` with the CLEAR rows followed by the
        ratings rows and a fresh 0..n-1 index.
    """
    clear_df = pd.read_csv(clear_path)
    clear_df['dataset'] = 'CLEAR'
    clear_df['language'] = 'en'

    ratings_df = pd.read_csv(ratings_path, encoding="iso-8859-1")
    # Lower-case first so the 'sentence' -> 'Excerpt' rename below matches
    # regardless of how the header was capitalised in the file.
    ratings_df = ratings_df.rename(columns=lambda name: str(name).lower())
    ratings_df['dataset'] = 'ratings'
    ratings_df['language'] = 'de'
    ratings_df = ratings_df.rename(columns={'sentence': 'Excerpt'})

    return pd.concat([clear_df, ratings_df], ignore_index=True)

# The v1 OpenAI client exposes its exception classes on the `openai` module;
# `OpenAI.error` does not exist and would fail as soon as this def is evaluated.
@backoff.on_exception(backoff.expo, openai.RateLimitError)
def estimate_bt_easiness(original_text):
    """Ask GPT-4 for an estimated BT_easiness score of ``original_text``.

    The call is retried with exponential backoff on rate-limit errors.

    Args:
        original_text: The text whose readability should be estimated.

    Returns:
        The score as a ``float``.

    Raises:
        ValueError: If the model reply contains no number at all.
    """
    prompt = f"""Estimate the BT_easiness score for the following text. BT_easiness is a measure of text readability,
    where higher scores indicate easier-to-read text. The score typically ranges from 0 to 100.

    Text:
    {original_text}

    Based on the complexity, vocabulary, and structure of the text, estimate the BT_easiness score.
    Provide only the numeric score without any explanation."""

    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
    )

    reply = response.choices[0].message.content.strip()
    try:
        return float(reply)
    except ValueError:
        # The model sometimes wraps the number in words ("Score: 72.5");
        # fall back to the first number found in the reply.
        found = re.search(r"[-+]?\d+(?:\.\d+)?", reply)
        if found is None:
            raise ValueError(f"No numeric BT_easiness score in model reply: {reply!r}") from None
        return float(found.group())

def calculate_readability_scores(text, language, row=None):
    """Return the readability metrics for ``text`` as a dict.

    For ``language == 'en'`` the metrics are computed: BT_easiness comes from
    ``estimate_bt_easiness`` and the other three from ``textstat``.
    For ``language == 'de'`` the four rating values are read from ``row``
    (each one is 0 when ``row`` is None); ``text`` is not inspected.
    Any other language yields ``None``.
    """
    if language == 'de':
        rating_columns = (
            "mos_complexity",
            "votes_complexity",
            "votes_understandability",
            "vote_lexical_difficulty",
        )
        return {column: (0 if row is None else row[column]) for column in rating_columns}

    if language != 'en':
        return None

    # English: the GPT-4 estimate is requested first, then the formula-based metrics.
    scores = {"BT_easiness": estimate_bt_easiness(text)}
    scores["Flesch-Reading-Ease"] = textstat.flesch_reading_ease(text)
    scores["SMOG Readability"] = textstat.smog_index(text)
    scores["Automated Readability Index"] = textstat.automated_readability_index(text)
    return scores

# The v1 OpenAI client exposes its exception classes on the `openai` module;
# `OpenAI.error` does not exist and would fail as soon as this def is evaluated.
@backoff.on_exception(backoff.expo, openai.RateLimitError)
def modify_text_gpt4(text, current_scores, language):
    """Ask GPT-4 to rewrite ``text`` so that it becomes easier to read.

    The call is retried with exponential backoff on rate-limit errors.

    Args:
        text: The passage to rewrite.
        current_scores: Current metric values for ``text``; the English prompt
            reads the four English metric keys, the German prompt the four
            German rating keys.
        language: ``'en'`` or ``'de'``; selects the prompt language.

    Returns:
        The raw content of the model's reply.

    Raises:
        ValueError: If ``language`` is neither ``'en'`` nor ``'de'``.
    """
    if language == 'en':
        prompt = f"""Improve the readability of the following text. Current metrics:
        - BT_easiness: {current_scores['BT_easiness']:.2f}
        - Flesch-Reading-Ease: {current_scores['Flesch-Reading-Ease']:.2f}
        - SMOG Readability: {current_scores['SMOG Readability']:.2f}
        - Automated Readability Index: {current_scores['Automated Readability Index']:.2f}

    Original text:
    {text}

    Rewrite the text to improve all readability metrics. Aim for:
        - Higher BT_easiness
        - Higher Flesch-Reading-Ease
        - Lower SMOG Readability
        - Lower Automated Readability Index

    Improved text:"""
    elif language == 'de':
        prompt = f"""Verbessere die Lesbarkeit des folgenden Textes. Aktuelle Metriken:
        - MOS Komplexität: {current_scores['mos_complexity']:.2f}
        - Komplexitätsbewertungen: {current_scores['votes_complexity']}
        - Verständlichkeitsbewertungen: {current_scores['votes_understandability']}
        - Lexikalische Schwierigkeit: {current_scores['vote_lexical_difficulty']}

    Originaltext:
    {text}

    Schreibe den Text um, um alle Lesbarkeitsmetriken zu verbessern. Ziele:
        - Niedrigere MOS Komplexität
        - Weniger Komplexitätsbewertungen
        - Mehr Verständlichkeitsbewertungen
        - Niedrigere lexikalische Schwierigkeit

    Verbesserter Text:"""
    else:
        # Without this branch `prompt` would be unbound and the API call below
        # would fail with a confusing UnboundLocalError.
        raise ValueError(f"Unsupported language: {language!r}")

    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )
    return response.choices[0].message.content

# The v1 OpenAI client exposes its exception classes on the `openai` module;
# `OpenAI.error` does not exist and would fail as soon as this def is evaluated.
@backoff.on_exception(backoff.expo, openai.RateLimitError)
def estimate_german_metrics(original_text, modified_text, original_scores):
    """Ask GPT-4 to estimate the German rating metrics of a rewritten text.

    The call is retried with exponential backoff on rate-limit errors.

    Args:
        original_text: The German source passage.
        modified_text: The rewritten passage to be rated.
        original_scores: Rating values of ``original_text``; must provide
            ``mos_complexity``, ``votes_complexity``,
            ``votes_understandability`` and ``vote_lexical_difficulty``.

    Returns:
        A dict with the same four keys; ``mos_complexity`` is a float, the
        three vote values are truncated to int.

    Raises:
        KeyError: If the reply lacks one of the four expected metric lines.
    """
    prompt = f"""Given the original German text and its improved version, estimate the new readability metrics.
    The original metrics were:
    - MOS Komplexität: {original_scores['mos_complexity']:.2f}
    - Komplexitätsbewertungen: {original_scores['votes_complexity']}
    - Verständlichkeitsbewertungen: {original_scores['votes_understandability']}
    - Lexikalische Schwierigkeit: {original_scores['vote_lexical_difficulty']}

    Original text:
    {original_text}

    Improved text:
    {modified_text}

    Estimate the new metrics. They should show improvement over the original scores.
    Provide only the numeric scores in the following format:
    MOS Komplexität: [value]
    Komplexitätsbewertungen: [value]
    Verständlichkeitsbewertungen: [value]
    Lexikalische Schwierigkeit: [value]"""

    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
    )

    new_scores = {}
    for line in response.choices[0].message.content.strip().splitlines():
        # partition() tolerates blank lines and values that contain a further
        # colon, both of which made the old two-way split(':') unpack fail.
        label, separator, raw_value = line.partition(':')
        if not separator:
            continue
        number = re.search(r"[-+]?\d+(?:[.,]\d+)?", raw_value)
        if number is None:
            continue
        # Strip list bullets from the label and accept a German decimal comma.
        new_scores[label.strip(" \t-*")] = float(number.group().replace(',', '.'))

    expected_labels = (
        "MOS Komplexität",
        "Komplexitätsbewertungen",
        "Verständlichkeitsbewertungen",
        "Lexikalische Schwierigkeit",
    )
    missing = [label for label in expected_labels if label not in new_scores]
    if missing:
        raise KeyError(f"Metrics missing from model reply: {missing}")

    return {
        "mos_complexity": new_scores["MOS Komplexität"],
        "votes_complexity": int(new_scores["Komplexitätsbewertungen"]),
        "votes_understandability": int(new_scores["Verständlichkeitsbewertungen"]),
        "vote_lexical_difficulty": int(new_scores["Lexikalische Schwierigkeit"])
    }

def prepare_data(df, sample_size=1000):
    """Build instruction-tuning records from a random sample of ``df``.

    For every sampled row the original metrics are computed, GPT-4 rewrites
    the excerpt, and the rewrite is scored (computed for English, estimated
    by GPT-4 for every other language value).

    Args:
        df: Frame with at least ``Excerpt`` and ``language`` columns; the
            whole row is also handed to ``calculate_readability_scores``.
        sample_size: Maximum number of rows to sample. If ``df`` has fewer
            rows, all of them are used.

    Returns:
        A list of dicts with the keys ``instruction``, ``input``, ``output``,
        ``original_scores``, ``gpt4_scores`` and ``language``.
    """
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so clamp the request to the frame's size.
    rows_to_draw = min(sample_size, len(df))
    df_subset = df.sample(rows_to_draw, random_state=42).reset_index(drop=True)
    prepared_data = []

    for _, row in tqdm(df_subset.iterrows(), total=len(df_subset), desc="Preparing data"):
        original_scores = calculate_readability_scores(row['Excerpt'], row['language'], row)

        gpt4_modified_text = modify_text_gpt4(row['Excerpt'], original_scores, row['language'])
        if row['language'] == 'en':
            gpt4_scores = calculate_readability_scores(gpt4_modified_text, row['language'])
        else:
            gpt4_scores = estimate_german_metrics(row['Excerpt'], gpt4_modified_text, original_scores)

        prepared_data.append({
            "instruction": "Simplify the following text while preserving its meaning:",
            "input": row['Excerpt'],
            "output": gpt4_modified_text,
            "original_scores": original_scores,
            "gpt4_scores": gpt4_scores,
            "language": row['language']
        })

    return prepared_data

# Alpaca-style template with three placeholders: instruction, input and output.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}"""

def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-formatted training strings.

    ``examples`` maps ``"instruction"``, ``"input"`` and ``"output"`` to
    equally indexed sequences. The result is ``{"text": [...]}`` with one
    filled-in ``alpaca_prompt`` per example, in the same order.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        alpaca_prompt.format(instruction=task, input=source, output=target)
        for task, source, target in columns
    ]
    return {"text": rendered}

def _percent_change(delta, baseline):
    """Return ``delta`` as a percentage of ``|baseline|``, tolerating a zero baseline."""
    if baseline == 0:
        # A percentage of zero is undefined; keep only the direction of the
        # change so that comparisons between models still make sense.
        if delta == 0:
            return 0.0
        return float('inf') if delta > 0 else float('-inf')
    # abs(): a negative baseline (Flesch-Reading-Ease can go below zero) must
    # not flip the sign of the improvement.
    return delta / abs(baseline) * 100

def evaluate_improvement(original_text, gpt4_text, llama3_text, original_scores, gpt4_scores, language):
    """Score LLaMA 3's rewrite and compare it with GPT-4's against the original.

    Args:
        original_text: The source passage.
        gpt4_text: GPT-4's rewrite (not read here; kept for the interface).
        llama3_text: LLaMA 3's rewrite, scored inside this function.
        original_scores: Metric values of the source passage.
        gpt4_scores: Metric values of GPT-4's rewrite.
        language: ``'en'`` computes LLaMA 3's scores; any other value has
            them estimated by GPT-4 via ``estimate_german_metrics``.

    Returns:
        ``(llama3_scores, improvements)`` where ``improvements`` maps each
        metric to ``{'gpt4': pct, 'llama3': pct}``. Positive always means
        "more readable". Metrics whose value is missing or None in any of the
        three score dicts are left out.
    """
    if language == 'en':
        llama3_scores = calculate_readability_scores(llama3_text, language)
    else:
        llama3_scores = estimate_german_metrics(original_text, llama3_text, original_scores)

    # Metrics where a larger value means easier text; all others improve downwards.
    higher_is_better = {
        'en': {'BT_easiness', 'Flesch-Reading-Ease'},
        'de': {'votes_understandability'},
    }.get(language, set())

    improvements = {}
    for metric, original_val in original_scores.items():
        gpt4_val = gpt4_scores.get(metric)
        llama3_val = llama3_scores.get(metric)
        # Score dicts that went through a datasets.Dataset may carry None
        # placeholders for the other language's metrics; skip those.
        if original_val is None or gpt4_val is None or llama3_val is None:
            continue

        direction = 1 if metric in higher_is_better else -1
        improvements[metric] = {
            'gpt4': _percent_change(direction * (gpt4_val - original_val), original_val),
            'llama3': _percent_change(direction * (llama3_val - original_val), original_val)
        }

    return llama3_scores, improvements

def generate_feedback(improvements, language):
    """Turn per-metric improvement percentages into a short textual verdict.

    Args:
        improvements: Mapping ``metric -> {'gpt4': pct, 'llama3': pct}``.
        language: ``'en'`` for English feedback; any other value yields German.

    Returns:
        A header line followed by one ``"- metric: verdict"`` line per metric,
        each terminated by a newline.
    """
    # The messages live in plain variables because backslash escapes inside
    # f-string expressions are a SyntaxError before Python 3.12.
    if language == 'en':
        header = "Based on the improvements:\n"
        matched = "Great job! You've matched or exceeded GPT-4's improvement."
        behind = "There's room for improvement. Try to match GPT-4's performance."
    else:
        header = "Basierend auf den Verbesserungen:\n"
        matched = "Großartig! Sie haben die Verbesserung von GPT-4 erreicht oder übertroffen."
        behind = "Es gibt Raum für Verbesserungen. Versuchen Sie, die Leistung von GPT-4 zu erreichen."

    parts = [header]
    for metric, values in improvements.items():
        verdict = matched if values['llama3'] >= values['gpt4'] else behind
        parts.append(f"- {metric}: {verdict}\n")
    return "".join(parts)

# The v1 OpenAI client exposes its exception classes on the `openai` module;
# `OpenAI.error` does not exist and would fail as soon as this def is evaluated.
@backoff.on_exception(backoff.expo, openai.RateLimitError)
def get_gpt4_analysis(original_text, gpt4_text, llama3_text, improvements, language):
    """Ask GPT-4 for a written critique of LLaMA 3's rewrite versus its own.

    The call is retried with exponential backoff on rate-limit errors.

    Args:
        original_text: The source passage.
        gpt4_text: GPT-4's rewrite.
        llama3_text: LLaMA 3's rewrite.
        improvements: Per-metric improvement percentages; embedded in the
            prompt via their default string representation.
        language: ``'en'`` labels the text as English in the prompt, any
            other value as German.

    Returns:
        The raw content of the model's reply.
    """
    prompt = f"""Analyze the readability improvements made by LLaMA 3 compared to GPT-4 for the following {'English' if language == 'en' else 'German'} text.

Original text:
{original_text}

GPT-4 improved version:
{gpt4_text}

LLaMA 3 improved version:
{llama3_text}

Improvement percentages:
{improvements}

Provide a detailed analysis of LLaMA 3's performance:
1. What did LLaMA 3 do well in improving readability?
2. Where did LLaMA 3 fall short compared to GPT-4?
3. Suggest specific strategies for LLaMA 3 to improve its performance.
"""

    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )
    return response.choices[0].message.content

def feedback_loop_training(model, tokenizer, dataset, num_iterations=5):
    """Repeatedly generate, critique and fine-tune on feedback-augmented prompts.

    In every iteration the model rewrites each ``item['input']``; the rewrite
    is scored against GPT-4's, a feedback text and a GPT-4 analysis are
    produced, and all of it is assembled into one training text per item.
    The model is then fine-tuned on these texts with ``SFTTrainer``.

    Args:
        model: The (PEFT-wrapped) causal language model to train.
        tokenizer: Tokenizer belonging to ``model``.
        dataset: Iterable of records with ``input``, ``output``,
            ``original_scores``, ``gpt4_scores`` and ``language``.
        num_iterations: Number of generate/critique/train rounds.

    Returns:
        The model after the final round.
    """
    # NOTE(review): generation is prompted with the raw input text, not with
    # the Alpaca template used for the initial fine-tuning — confirm intent.
    # NOTE(review): each training text ends at "Improved text:" without a
    # target completion — confirm this is the intended training signal.

    # FastLanguageModel has no is_bfloat16_supported attribute; ask torch directly.
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    for iteration in range(num_iterations):
        logging.info(f"Starting iteration {iteration + 1}")

        new_training_data = []
        for item in tqdm(dataset, desc=f"Processing iteration {iteration + 1}"):
            # The tokenizer returns CPU tensors; generate() needs them on the
            # same device as the model.
            inputs = tokenizer(
                item['input'], return_tensors="pt", max_length=1024, truncation=True
            ).to(model.device)
            output_ids = model.generate(
                **inputs,
                max_new_tokens=100,
                do_sample=True,  # temperature/top_p only take effect when sampling
                temperature=0.7,
                top_p=0.9
            )
            # A decoder-only model echoes the prompt; keep only the newly
            # generated tokens so the scores describe the rewrite alone.
            prompt_length = inputs["input_ids"].shape[1]
            llama3_text = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

            llama3_scores, improvements = evaluate_improvement(
                item['input'], item['output'], llama3_text,
                item['original_scores'], item['gpt4_scores'], item['language']
            )

            feedback = generate_feedback(improvements, item['language'])
            gpt4_analysis = get_gpt4_analysis(item['input'], item['output'], llama3_text, improvements, item['language'])

            new_example = f"""{'Original text' if item['language'] == 'en' else 'Originaltext'}: {item['input']}

{'GPT-4 improved version' if item['language'] == 'en' else 'Von GPT-4 verbesserte Version'}: {item['output']}

{'Your previous improvement' if item['language'] == 'en' else 'Ihre vorherige Verbesserung'}: {llama3_text}

{'Feedback' if item['language'] == 'en' else 'Rückmeldung'}: {feedback}

{'Expert analysis' if item['language'] == 'en' else 'Expertenanalyse'}: {gpt4_analysis}

{'Now, provide an improved version addressing the feedback and analysis' if item['language'] == 'en' else 'Geben Sie nun eine verbesserte Version an, die auf das Feedback und die Analyse eingeht'}:

{'Improved text' if item['language'] == 'en' else 'Verbesserter Text'}:"""

            new_training_data.append({"text": new_example})

        new_dataset = Dataset.from_pandas(pd.DataFrame(new_training_data))

        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            train_dataset=new_dataset,
            dataset_text_field="text",
            max_seq_length=1024,
            dataset_num_proc=2,
            packing=False,
            args=TrainingArguments(
                per_device_train_batch_size=2,
                gradient_accumulation_steps=4,
                warmup_steps=5,
                max_steps=60,
                learning_rate=2e-4,
                fp16=not use_bf16,
                bf16=use_bf16,
                logging_steps=1,
                optim="adamw_8bit",
                weight_decay=0.01,
                lr_scheduler_type="linear",
                seed=3407,
                output_dir=f"outputs_iteration_{iteration+1}",
            ),
        )

        trainer.train()
        # Carry the freshly trained weights into the next round.
        model = trainer.model

        logging.info(f"Completed iteration {iteration + 1}")

    return model

def main():
    """Run the whole pipeline: data preparation, training, feedback loop, evaluation.

    Steps: load both corpora, split them 80/20 stratified by language, build
    GPT-4 reference rewrites for the training sample, fine-tune a 4-bit
    LLaMA 3.1 8B with LoRA, run the feedback-loop training, save model and
    tokenizer, and finally print a comparison on a small test sample.
    """
    # Load and combine datasets
    combined_df = load_and_combine_datasets()

    # Split the combined dataset into train and test sets
    train_df, test_df = train_test_split(combined_df, test_size=0.2, stratify=combined_df['language'], random_state=42)

    # Prepare initial data (never ask for more rows than the split contains)
    prepared_data = prepare_data(train_df, sample_size=min(1000, len(train_df)))

    # Create dataset
    dataset = Dataset.from_pandas(pd.DataFrame(prepared_data))
    dataset = dataset.map(formatting_prompts_func, batched=True)

    # Setup LLaMA 3.1 model
    max_seq_length = 1024
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
        max_seq_length=max_seq_length,
        load_in_4bit=True,
    )

    # Setup PEFT (Parameter-Efficient Fine-Tuning)
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
    )

    # FastLanguageModel has no is_bfloat16_supported attribute; ask torch directly.
    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    # Initial training
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_steps=5,
            max_steps=60,
            learning_rate=2e-4,
            fp16=not use_bf16,
            bf16=use_bf16,
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="outputs_initial",
        ),
    )

    trainer.train()

    # Feedback loop training
    final_model = feedback_loop_training(trainer.model, tokenizer, dataset, num_iterations=5)

    # Save the final model
    final_model.save_pretrained("llama3_readability_improvement_final_with_feedback")
    tokenizer.save_pretrained("llama3_readability_improvement_final_with_feedback")

    logging.info("Training completed. Final model saved.")

    # Evaluate on test set
    # Using a smaller sample for quick testing
    test_data = prepare_data(test_df, sample_size=min(100, len(test_df)))

    for item in test_data:
        # The tokenizer returns CPU tensors; generate() needs them on the
        # same device as the model.
        inputs = tokenizer(
            item['input'], return_tensors="pt", max_length=1024, truncation=True
        ).to(final_model.device)
        output_ids = final_model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,  # temperature/top_p only take effect when sampling
            temperature=0.7,
            top_p=0.9
        )
        # Drop the echoed prompt so only the model's own rewrite is scored and printed.
        prompt_length = inputs["input_ids"].shape[1]
        llama3_text = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

        llama3_scores, improvements = evaluate_improvement(
            item['input'], item['output'], llama3_text,
            item['original_scores'], item['gpt4_scores'], item['language']
        )

        print(f"Original ({item['language']}): {item['input']}")
        print(f"GPT-4 Simplified: {item['output']}")
        print(f"LLaMA 3 Simplified: {llama3_text}")
        print("Improvements:", improvements)
        print("---")

# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()