In [None]:
# Install necessary libraries
!pip install transformers datasets torch accelerate evaluate wandb kaggle -q

# Import Libraries
import inspect
import os
import random

import evaluate
import numpy as np
import pandas as pd
import torch
import wandb
from datasets import load_dataset, DatasetDict, Dataset
# NOTE(review): `gemini_library` is not installed by the pip command above and
# `GeminiModel` is not referenced in this cell -- confirm this import is needed.
from gemini_library import GeminiModel
from kaggle.api.kaggle_api_extended import KaggleApi
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

# Set seed for reproducibility
def set_seed(seed: int = 42):
    """Seed every random number generator this script relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator,
    and PyTorch (``torch.manual_seed`` followed by
    ``torch.cuda.manual_seed_all``).

    Args:
        seed: Value handed to each generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once up front so the rest of the script starts from a fixed state.
set_seed(42)

# Kaggle API authentication
# SECURITY: credentials must never be hard-coded in a notebook or script.
# Provide them outside the code instead -- either export KAGGLE_USERNAME and
# KAGGLE_KEY in the environment before starting the kernel, or place a
# kaggle.json file in the Kaggle config directory (typically ~/.kaggle/).
# Any API key that was previously committed in this file should be treated as
# leaked and revoked from the Kaggle account settings.

# Step 1: Download Dataset
api = KaggleApi()
# authenticate() reads the credentials supplied as described above and fails
# if none are available.
api.authenticate()

dataset_path = "patzshane/football-commentary-data-set-college-and-nfl"
local_path = "./football_commentary_dataset"
os.makedirs(local_path, exist_ok=True)
# Download the dataset archive into local_path and extract it there.
api.dataset_download_files(dataset_path, path=local_path, unzip=True)

# Load Dataset from Local Path
# "data.csv" is the expected file name. If the extracted archive does not
# contain it, fall back to the CSV files that are actually present instead of
# failing with an opaque FileNotFoundError from pandas.
dataset_file = os.path.join(local_path, "data.csv")
if not os.path.isfile(dataset_file):
    csv_candidates = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(local_path)
        for name in names
        if name.lower().endswith(".csv")
    )
    if not csv_candidates:
        raise FileNotFoundError(f"No CSV file found under {local_path!r}")
    # Sorted order makes the choice deterministic when several CSVs exist;
    # the candidates are printed so a wrong pick is easy to spot.
    dataset_file = csv_candidates[0]
    print(f"'data.csv' not found; using {dataset_file} (candidates: {csv_candidates})")

data = pd.read_csv(dataset_file)
print(data.head())  # Preview the dataset

# Step 2: Convert Dataset to Hugging Face Format
# Parse the CSV with the `datasets` CSV loader, then place its "train" split
# into a fresh DatasetDict under the same "train" key.
csv_splits = load_dataset("csv", data_files={"train": dataset_file})
dataset = DatasetDict({"train": csv_splits["train"]})

# Show the resulting splits and columns.
print(dataset)

# Step 3: Initialize Pre-Trained Model and Tokenizer
# "gemini-1.5-flash" is a hosted API model, not a Hugging Face Hub checkpoint,
# so AutoTokenizer / AutoModelForCausalLM cannot load it. Use an openly
# available causal language model instead; swap in any other causal-LM
# checkpoint id here if a larger base model is wanted.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # The vocabulary grew by one token, so the embedding matrix must grow too.
    model.resize_token_embeddings(len(tokenizer))

# Keep the model config in sync with the tokenizer so padding is recognised
# consistently during training and generation.
model.config.pad_token_id = tokenizer.pad_token_id

# Step 4: Tokenize Dataset with Dynamic Padding
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with truncation enabled.

    No padding is requested here; padding is left to the data collator.

    Args:
        examples: Batch mapping passed in by ``Dataset.map``; its "text"
            entry is handed to the tokenizer.

    Returns:
        The tokenizer's output for the batch.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Apply the tokenizer batch-wise and drop the raw "text" column afterwards.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Collator for language modeling; mlm=False turns off masked-LM masking
# (set it to True for masked language modeling tasks).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 5: Set Up Training Arguments
# The evaluation-schedule option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the old name is no
# longer accepted by recent ones. The pip install above is unpinned, so pick
# whichever keyword the installed version actually exposes.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./autogen_results",
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Adjust for multiple agents
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=1000,
    push_to_hub=False,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if GPU is available
    gradient_accumulation_steps=2,  # For better utilization of GPU with longer sequences
    report_to="wandb",  # Enable WandB logging
    **{_eval_strategy_key: "epoch"},  # Evaluate once per epoch, matching save_strategy
)

# Step 6: Initialize Evaluation Metrics
# Load the perplexity, BLEU and ROUGE metrics from `evaluate`, one after the
# other, and bind each to its own module-level name.
perplexity, bleu, rouge = (
    evaluate.load(metric_name) for metric_name in ("perplexity", "bleu", "rouge")
)

def compute_advanced_metrics(eval_pred):
    """Compute perplexity, BLEU and ROUGE-1 for a causal-LM evaluation pass.

    The text metrics (BLEU, ROUGE) operate on strings, so token ids are
    decoded with the module-level ``tokenizer`` before scoring. Perplexity is
    derived directly from the logits as ``exp(mean token cross-entropy)``; the
    module-level ``perplexity`` metric object is not used here because it
    scores raw text with a separately loaded model rather than the logits
    produced by this evaluation.

    Predictions are the teacher-forced argmax tokens, so BLEU/ROUGE measure
    next-token agreement with the reference text, not free-running generation.

    Args:
        eval_pred: Pair ``(logits, labels)`` as supplied by ``Trainer``.
            ``logits`` may be a tuple, in which case its first element is
            used. Labels equal to -100 are treated as ignored positions.

    Returns:
        Dict with the keys ``"perplexity"``, ``"bleu"`` and ``"rouge"``
        (ROUGE-1).
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = torch.as_tensor(np.asarray(logits)).float()
    labels = torch.as_tensor(np.asarray(labels)).long()

    # Causal LM: the logits at position i predict the token at position i + 1,
    # so drop the last logit and the first label to align them.
    shift_logits = logits[:, :-1, :]
    shift_labels = labels[:, 1:]

    # Mean negative log-likelihood over the non-ignored tokens.
    mean_nll = torch.nn.functional.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
        ignore_index=-100,
    )
    perplexity_score = float(torch.exp(mean_nll))

    # -100 is not a valid token id; replace ignored positions with a token
    # that decoding will strip before turning ids back into text.
    keep = shift_labels != -100
    fill_id = tokenizer.pad_token_id
    if fill_id is None:
        fill_id = tokenizer.eos_token_id
    if fill_id is None:
        fill_id = 0
    predicted_ids = shift_logits.argmax(dim=-1).masked_fill(~keep, fill_id)
    reference_ids = shift_labels.masked_fill(~keep, fill_id)

    decoded_predictions = [
        text.strip()
        for text in tokenizer.batch_decode(predicted_ids.tolist(), skip_special_tokens=True)
    ]
    decoded_references = [
        text.strip()
        for text in tokenizer.batch_decode(reference_ids.tolist(), skip_special_tokens=True)
    ]

    bleu_score = bleu.compute(
        predictions=decoded_predictions,
        references=[[reference] for reference in decoded_references],
    )
    rouge_score = rouge.compute(
        predictions=decoded_predictions,
        references=decoded_references,
    )
    return {
        "perplexity": perplexity_score,
        "bleu": bleu_score["bleu"],
        "rouge": float(rouge_score["rouge1"]),
    }

# Step 7: Initialize Trainer (Handling Agent Interaction in the Model)
# Evaluating on the training data reports over-optimistic metrics, so hold out
# 10% of the tokenized examples for evaluation. The fixed seed keeps the split
# reproducible across runs.
_train_eval_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; use whichever name is supported.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=_train_eval_split["train"],
    eval_dataset=_train_eval_split["test"],
    data_collator=data_collator,
    compute_metrics=compute_advanced_metrics,
    **{_tokenizer_key: tokenizer},
)

# Step 8: Fine-Tune Model (with Autogen-specific setup)
# Run the training loop configured on the Trainer.
trainer.train()

# Step 9: Evaluate Model
# Run evaluation on the Trainer's eval dataset and show the metrics it returns.
results = trainer.evaluate()
print("Evaluation Results:", results)

# Step 10: Save Fine-Tuned Model
# Write the model first, then the tokenizer, into the same directory.
output_dir = "./finetuned_model"
os.makedirs(output_dir, exist_ok=True)
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

# Step 11: Generate Text with Fine-Tuned Model (for Dialogue Simulation)
prompt = "In the field of artificial intelligence, one of the most exciting developments is"

# Switch off dropout for inference and put the inputs on the device the model
# actually lives on, rather than guessing from CUDA availability.
model.eval()
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly so padding is never mistaken for content.
        attention_mask=inputs["attention_mask"],
        max_length=50,  # Total length in tokens, prompt included
        num_return_sequences=1,
        # temperature / top_k / top_p only take effect when sampling is on;
        # without do_sample=True decoding is greedy and they are ignored.
        do_sample=True,
        temperature=0.8,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.pad_token_id,
    )

print("Generated Text:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
