In [None]:
!pip install gradio transformers unsloth

Collecting gradio
  Downloading gradio-5.8.0-py3-none-any.whl.metadata (16 kB)
Collecting unsloth
  Downloading unsloth-2024.12.4-py3-none-any.whl.metadata (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.1 (from gradio)
  Downloading gradio_client-1.5.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18

In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=d4cedddf28f60b17223e0090f797a7431f7dfcc1dfe2ee511e72abc21fa64208
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from unsloth import FastLanguageModel
import torch
from rouge_score import rouge_scorer
import nltk
import numpy as np

# Best-effort fetch of the NLTK tokenizer data. Each resource is attempted
# independently so a failure on one does not skip the other, and failures are
# reported instead of being silently swallowed. The notebook keeps running
# either way, matching the original best-effort intent.
for _nltk_resource in ('punkt', 'punkt_tab'):
    try:
        # nltk.download signals a failed download with a falsy return value.
        if not nltk.download(_nltk_resource):
            print(f"warning: NLTK resource '{_nltk_resource}' could not be downloaded")
    except Exception as exc:
        print(f"warning: NLTK download of '{_nltk_resource}' raised {exc!r}")

def load_model(model_name, max_seq_length=2048, load_in_4bit=True):
    """Load a model and its tokenizer through Unsloth and prepare it for inference.

    Args:
        model_name: Identifier passed to ``FastLanguageModel.from_pretrained``.
        max_seq_length: Maximum sequence length forwarded to the loader.
        load_in_4bit: Whether to request 4-bit loading from the loader.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print(f"loading {model_name}...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=torch.float16,
        load_in_4bit=load_in_4bit
    )

    # for_inference is normally called for its side effect on `model`.
    # NOTE(review): whether it also returns the model appears to depend on the
    # Unsloth release - confirm. Only adopt the return value when there is one,
    # so this function can never hand back None as the model.
    inference_model = FastLanguageModel.for_inference(model)
    if inference_model is not None:
        model = inference_model

    print("finish")
    return model, tokenizer

# The three checkpoints under comparison. Names are declared together, then
# each model/tokenizer pair is loaded in order (1, 2, 3).
model1_name, model2_name, model3_name = (
    "lemongooooo/lora_data_0",
    "lemongooooo/lora_data_200k",
    "lemongooooo/lora_data_300k",
)

model1, tokenizer1 = load_model(model1_name)
model2, tokenizer2 = load_model(model2_name)
model3, tokenizer3 = load_model(model3_name)

def generate_response(model, tokenizer, prompt, max_new_tokens=256, temperature=0.7, do_sample=True):
    """Generate a single reply to `prompt` and return only the newly generated text.

    The prompt is wrapped as a one-message user conversation, rendered with the
    tokenizer's chat template, and passed to ``model.generate``.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        tokenizer: Object exposing ``apply_chat_template(...)`` and ``decode(...)``.
        prompt: User message text.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature; only forwarded when `do_sample` is true.
        do_sample: Whether to sample instead of decoding greedily.

    Returns:
        The decoded continuation (prompt tokens stripped, special tokens skipped).
    """
    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # A single, unpadded sequence: every position is attended. ones_like already
    # places the mask on the same device as input_ids.
    attention_mask = torch.ones_like(input_ids)

    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if do_sample:
        # Temperature is meaningless for greedy decoding, so it is only passed
        # along when sampling is enabled.
        generation_kwargs["temperature"] = temperature

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        **generation_kwargs
    )

    # The generated sequence starts with the prompt tokens; drop them before decoding.
    prompt_length = input_ids.shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

def evaluate_response(generated, reference):
    """Score `generated` against `reference` with ROUGE.

    Returns a dict mapping 'rouge1', 'rouge2' and 'rougeL' to the F-measure of
    the corresponding ROUGE score (stemming enabled).
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # The reference text is passed first, the generated text second.
    per_metric = rouge.score(reference, generated)

    return {name: per_metric[name].fmeasure for name in metric_names}

# Evaluation prompts. Each entry pairs a 'prompt' sent to the models with a
# hand-written 'reference' answer that generated responses are scored against.
test_cases = [
    dict(
        prompt='Summarize the following text: "Climate change is a global environmental crisis. Rising temperatures are causing polar ice caps to melt, leading to rising sea levels. This threatens coastal communities and marine ecosystems. Additionally, extreme weather events are becoming more frequent and intense, causing widespread damage and displacement. Scientists warn that immediate action is necessary to prevent catastrophic consequences."',
        reference='Climate change is causing rising temperatures and melting ice caps, resulting in rising sea levels and threats to coastal areas. Extreme weather events are increasing, and immediate action is needed to prevent severe consequences.',
    ),
    dict(
        prompt='Explain how social media has changed modern communication in 2-3 sentences.',
        reference='Social media has revolutionized modern communication by enabling instant global connectivity and information sharing. It has created new ways for people to connect, share experiences, and form communities regardless of geographical boundaries.',
    ),
    dict(
        prompt='Describe the process of photosynthesis in simple terms.',
        reference='Photosynthesis is the process where plants convert sunlight into energy. They use this energy, along with water and carbon dioxide, to create glucose and oxygen. This process is essential for plant growth and provides oxygen for other living things.',
    ),
    dict(
        prompt='Give a brief overview of the Industrial Revolution and its main impacts.',
        reference='The Industrial Revolution marked a major shift from manual production to machine manufacturing, primarily during the 18th and 19th centuries. It led to significant technological, economic, and social changes, including urbanization, improved manufacturing processes, and new working conditions. This period fundamentally transformed how people lived and worked.',
    ),
]

# Evaluation settings. Responses are sampled, so every run is seeded to make the
# reported averages and standard deviations repeatable across notebook re-runs.
num_runs = 3
base_seed = 42
metric_names = ['rouge1', 'rouge2', 'rougeL']

# One entry per evaluated model: (display index, hub name, model, tokenizer).
evaluated_models = [
    (1, model1_name, model1, tokenizer1),
    (2, model2_name, model2, tokenizer2),
    (3, model3_name, model3, tokenizer3),
]

# Per-run average scores: one list per model, one {metric: value} dict per run.
all_scores_1 = []
all_scores_2 = []
all_scores_3 = []
run_averages_by_model = {1: all_scores_1, 2: all_scores_2, 3: all_scores_3}


def _summarize_runs(run_averages, reducer):
    """Apply `reducer` (e.g. np.mean) to each metric across the per-run averages."""
    return {
        metric: reducer([run_avg[metric] for run_avg in run_averages])
        for metric in metric_names
    }


print(f"Starting evaluation with {num_runs} runs...")
for run in range(num_runs):
    print(f"\nRun {run + 1}/{num_runs}")
    # A distinct seed per run keeps the runs different from one another while
    # making each of them repeatable.
    torch.manual_seed(base_seed + run)
    run_results = []

    for test_case in test_cases:
        prompt = test_case['prompt']
        reference = test_case['reference']

        print(f"\nProcessing task: {prompt[:50]}...")

        # Generate with every model first (model 1, 2, 3 in order), then score.
        responses = {
            model_idx: generate_response(eval_model, eval_tokenizer, prompt)
            for model_idx, _name, eval_model, eval_tokenizer in evaluated_models
        }

        result = {'prompt': prompt, 'reference': reference}
        for model_idx, response in responses.items():
            result[f'model{model_idx}_response'] = response
        for model_idx, response in responses.items():
            result[f'model{model_idx}_scores'] = evaluate_response(response, reference)
        run_results.append(result)

    # Average each metric over the test cases of this run, per model.
    for model_idx, run_averages in run_averages_by_model.items():
        scores_key = f'model{model_idx}_scores'
        run_averages.append({
            metric: np.mean([r[scores_key][metric] for r in run_results])
            for metric in metric_names
        })

    # Print detailed results for this run
    print(f"\n=== Detailed Results for Run {run + 1} ===")
    for idx, result in enumerate(run_results):
        print(f"\nTest Case {idx + 1}:")
        print(f"Task: {result['prompt'][:100]}...")
        print(f"Reference: {result['reference']}")
        for model_idx in run_averages_by_model:
            response = result[f'model{model_idx}_response']
            scores = result[f'model{model_idx}_scores']
            print(f"\nModel {model_idx} response: {response}")
            print(f"Model {model_idx} scores: {scores}")

# Final average of the per-run averages, per model.
final_avg_1 = _summarize_runs(all_scores_1, np.mean)
final_avg_2 = _summarize_runs(all_scores_2, np.mean)
final_avg_3 = _summarize_runs(all_scores_3, np.mean)

print("\n=== Final Average Scores (Across All Runs) ===")
print(f"Model 1 ({model1_name}) final average scores: {final_avg_1}")
print(f"Model 2 ({model2_name}) final average scores: {final_avg_2}")
print(f"Model 3 ({model3_name}) final average scores: {final_avg_3}")

# Spread of the per-run averages. np.std is called with its defaults here.
std_1 = _summarize_runs(all_scores_1, np.std)
std_2 = _summarize_runs(all_scores_2, np.std)
std_3 = _summarize_runs(all_scores_3, np.std)

print("\n=== Standard Deviations ===")
print(f"Model 1 ({model1_name}) score standard deviations: {std_1}")
print(f"Model 2 ({model2_name}) score standard deviations: {std_2}")
print(f"Model 3 ({model3_name}) score standard deviations: {std_3}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


loading lemongooooo/lora_data_0...
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
finish
loading lemongooooo/lora_data_200k...
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
finish
loadi