# In [None]:
import os
import random
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset, DatasetDict
import evaluate
import wandb
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

# Set seed for reproducibility
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed unchanged to each seeding call below.
    """
    # Order mirrors the libraries' use in this script: stdlib, NumPy,
    # PyTorch (default generator), then every CUDA device.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed every RNG once, before any data splitting or model work below.
set_seed(42)

# Kaggle API authentication (Use environment variables securely)
# No credentials are passed here, so authenticate() must find them on its own.
# NOTE(review): presumably via environment variables or a local kaggle.json --
# confirm against the Kaggle client documentation.
api = KaggleApi()
api.authenticate()

# Step 1: Download Dataset
# dataset_path is the Kaggle "<owner>/<dataset>" slug; local_path is where the
# archive is downloaded and unzipped.
dataset_path = "patzshane/football-commentary-data-set-college-and-nfl"
local_path = "./football_commentary_dataset"
os.makedirs(local_path, exist_ok=True)  # tolerate re-runs: no error if it already exists
api.dataset_download_files(dataset_path, path=local_path, unzip=True)

# Load Dataset from Local Path
# NOTE(review): "data.csv" is an assumed name for the extracted file --
# confirm it against what the download actually unpacks.
dataset_file = os.path.join(local_path, "data.csv")  # Adjust if needed
data = pd.read_csv(dataset_file)
# Sanity check: show the first rows of the raw table.
print(data.head())

# Step 2: Convert Dataset to Hugging Face Format
# The same CSV is read again through `datasets` as a single "train" split,
# and a held-out portion is then carved out of it.
csv_files = {"train": dataset_file}
raw_datasets = load_dataset("csv", data_files=csv_files)
full_split = raw_datasets["train"]
split_datasets = full_split.train_test_split(test_size=0.1)  # 90% train, 10% validation

# Step 3: Initialize Pre-Trained Model and Tokenizer
# The instruct checkpoints are published on the Hub with a version suffix
# (-v0.1 / -v0.2 / -v0.3); the bare "mistralai/Mistral-7B-Instruct" id does not
# resolve, so from_pretrained() would fail. Pin an explicit version instead.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add padding token if missing
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # The vocabulary just grew by one token, so the embedding matrix must grow too.
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id  # Ensure model recognizes pad token

# Step 4: Tokenize Dataset with Dynamic Padding
def tokenize_function(examples):
    """Tokenize one batch of rows from the ``text`` column.

    Sequences are truncated to 512 tokens and deliberately left unpadded.
    Padding is deferred to the data collator, which pads each batch only up
    to its own longest sequence (dynamic padding) instead of storing and
    training on 512 positions for every example.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            must contain a ``"text"`` entry.

    Returns:
        The tokenizer's encoding for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Tokenize both splits in batches; the raw "text" column is dropped afterwards.
tokenized_datasets = split_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# mlm=False selects the causal (non-masked) language-modeling setup.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 5: Set Up Training Arguments
training_args = TrainingArguments(
    # --- Checkpointing: one checkpoint per epoch, only the two newest kept ---
    output_dir="./autogen_results",
    save_strategy="epoch",
    save_total_limit=2,
    # --- Evaluation: same cadence as checkpointing ---
    evaluation_strategy="epoch",
    # --- Optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # Mixed precision is requested only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # --- Logging ---
    logging_dir='./logs',
    logging_steps=500,
    report_to="wandb",
)

# Step 6: Initialize Evaluation Metrics
# Load the three metric modules by name, in the same order as before.
perplexity, bleu, rouge = (
    evaluate.load(metric_name)
    for metric_name in ("perplexity", "bleu", "rouge")
)

def compute_advanced_metrics(eval_pred):
    """Compute perplexity, BLEU and ROUGE-1 for a causal-LM evaluation pass.

    Perplexity is derived directly from the logits as
    ``exp(mean token cross-entropy)``. The ``evaluate`` "perplexity" metric is
    not used because it scores raw text strings with a model it loads itself
    and does not accept logits/labels.

    BLEU and ROUGE compare the teacher-forced arg-max tokens with the label
    tokens; they are not computed on free-running generations.

    Args:
        eval_pred: ``(logits, labels)`` pair as passed by ``Trainer``.
            Assumes logits are ``(batch, seq, vocab)`` and labels are
            ``(batch, seq)`` with ``-100`` marking positions to ignore
            -- TODO confirm against the collator in use.

    Returns:
        dict with the keys ``"perplexity"``, ``"bleu"`` and ``"rouge"``.
    """
    logits, labels = eval_pred
    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    # float32 keeps the loss computation valid on CPU even for fp16 logits.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    # A causal LM predicts token t+1 at position t: drop the last logit and
    # the first label so predictions and targets line up.
    shifted_logits = logits[:, :-1, :]
    shifted_labels = labels[:, 1:]
    ignored = shifted_labels == -100

    if bool(ignored.all()):
        # Nothing to score (e.g. length-1 sequences or fully masked labels).
        perplexity_value = float("nan")
    else:
        token_loss = torch.nn.functional.cross_entropy(
            shifted_logits.reshape(-1, shifted_logits.size(-1)),
            shifted_labels.reshape(-1),
            ignore_index=-100,
        )
        perplexity_value = torch.exp(token_loss).item()

    # -100 is not a vocabulary id and cannot be decoded. Replace ignored
    # positions with the pad id in both sequences so that
    # skip_special_tokens drops them from the decoded text.
    pad_id = tokenizer.pad_token_id
    predictions = shifted_logits.argmax(dim=-1).masked_fill(ignored, pad_id).tolist()
    references = shifted_labels.masked_fill(ignored, pad_id).tolist()

    predictions_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    references_texts = tokenizer.batch_decode(references, skip_special_tokens=True)

    bleu_score = bleu.compute(predictions=predictions_texts, references=references_texts)
    rouge_score = rouge.compute(predictions=predictions_texts, references=references_texts)

    return {
        "perplexity": perplexity_value,
        "bleu": bleu_score["bleu"],
        "rouge": rouge_score["rouge1"],
    }

# Step 7: Initialize Trainer
trainer = Trainer(
    # What to train and with which hyper-parameters
    model=model,
    args=training_args,
    # Data: the 90% split for training, the held-out 10% for evaluation
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # Proper validation set
    # Batch assembly
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Metrics reported at each evaluation
    compute_metrics=compute_advanced_metrics,
)

# Step 8: Fine-Tune Model
trainer.train()

# Step 9: Evaluate Model
results = trainer.evaluate()
print("Evaluation Results:", results)

# Step 10: Save Fine-Tuned Model
output_dir = "./finetuned_model"
os.makedirs(output_dir, exist_ok=True)
# Model weights first, then the tokenizer files, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

# Step 11: Generate Text with Fine-Tuned Model
prompt = "In the field of artificial intelligence, one of the most exciting developments is"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# temperature / top_k / top_p only take effect when sampling is enabled;
# without do_sample=True, generate() decodes greedily and ignores them.
# The attention mask and pad id are passed explicitly so generate() does not
# have to guess them.
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=50,
    do_sample=True,
    num_return_sequences=1,
    temperature=0.8,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.pad_token_id,
)

print("Generated Text:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
