In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/requirements-cleaned-csv/formatted_dataset.json
/kaggle/input/requirements-cleaned-csv/cleaned.csv


In [2]:
!pip install --upgrade torch torchvision transformers



Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.21.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting transformers
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x

In [13]:
# ----------------------
# Configuration settings
# ----------------------
# Update the following paths and parameters as needed.
# In Kaggle, if your JSONL file is part of a dataset, adjust the path accordingly.
data_file = "/kaggle/input/requirements-cleaned-csv/cleaned.csv"  # <-- update this path
model_name_or_path = "t5-small"  # Using a small model for faster experiments; change if needed.
output_dir = "/kaggle/working/fine_tuned_model_t5"



os.makedirs(output_dir, exist_ok=True)

In [12]:
import pandas as pd

# Read the cleaned CSV file
df = pd.read_csv(data_file)

# Assuming columns 'raw_requirement' and 'standard_syntax'
transformed_data = []
for index, row in df.iterrows():
    input_text = str(row['Raw Requirements'])
    output_text = str(row['Requirement with Standard Syntax'])
    
    # Create a dictionary with "input" and "output"
    entry = {
        "input": input_text,
        "output": output_text
    }
    
    transformed_data.append(entry)

# Write the transformed data to a JSON file
import json

with open(os.path.join(output_dir, 'formatted_dataset.json'), 'w') as f:
    json.dump(transformed_data, indent=2, ensure_ascii=False, fp=f)

In [3]:
data_file = "/kaggle/input/requirements-cleaned-csv/formatted_dataset.json"  # <-- change this!

model_name_or_path = "t5-small"  # Using a small model for experimentation.
output_dir = "/kaggle/working/fine_tuned_model_t5"

In [11]:
# Add these imports at the top of your file
import os
import torch
from datasets import load_dataset

# Then proceed with other imports
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq  # Import the data collator for Seq2Seq tasks
)

# ---------------------------------
# Configuration & Paths
# ---------------------------------
# Update this path to point to your JSON file in Kaggle.
# For example:
# data_file = "../input/your-json-dataset/your_data.json"
data_file = "/kaggle/input/requirements-cleaned-csv/formatted_dataset.json"  # <-- change this!

model_name_or_path = "t5-small"  # Using a small model for experimentation.
output_dir = "/kaggle/working/fine_tuned_model_t5"

# Training and tokenization parameters
max_source_length = 512
max_target_length = 128
num_train_epochs = 3
per_device_train_batch_size = 8
logging_steps = 500
save_steps = 500

# Enable FP16 if a GPU is available.
use_fp16 = True if os.environ.get("CUDA_VISIBLE_DEVICES", "") else False

# ---------------------------------
# Tokenization Function
# ---------------------------------
def tokenize_function(example):
    """
    Tokenizes a single example.
    
    Assumes each example is a dictionary with:
      - "input": raw requirement text (source)
      - "output": standardized requirement text (target)
    
    Note: We remove padding here so that dynamic padding (in the data collator) works properly.
    """
    source = example["input"]
    target = example["output"]

    # Tokenize the source text without padding.
    model_inputs = tokenizer(source, max_length=max_source_length, truncation=True)
    
    # Tokenize the target text (labels) within the target tokenizer context, without padding.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target, max_length=max_target_length, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# ---------------------------------
# Load the Dataset
# ---------------------------------
# This will load your JSON file. If it contains a list of dictionaries, it will be parsed correctly.
dataset = load_dataset("json", data_files={"train": data_file})

# ---------------------------------
# Load Model and Tokenizer
# ---------------------------------
tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)
model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)

# ---------------------------------
# Tokenize the Dataset
# ---------------------------------
print("Tokenizing dataset...")
# We use batched=False since each example is processed individually.
tokenized_dataset = dataset["train"].map(tokenize_function, batched=False)

# ---------------------------------
# Create a Data Collator for Dynamic Padding
# ---------------------------------
# This collator will dynamically pad your input_ids and labels to the max length in each batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# ---------------------------------
# Setup Training Arguments
# ---------------------------------
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    logging_steps=logging_steps,
    save_steps=save_steps,
    save_total_limit=2,
    evaluation_strategy="no",  # Change if you have an evaluation set.
    fp16=use_fp16,
    report_to="none"  # Disables reporting to third-party integrations (e.g., WandB)
)

# ---------------------------------
# Initialize Trainer and Train
# ---------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator  # Use our custom data collator for dynamic padding
)

print("Starting training...")
trainer.train()

# ---------------------------------
# Save the Fine-Tuned Model
# ---------------------------------
trainer.save_model(output_dir)
print(f"Model saved to {output_dir}")


Tokenizing dataset...


Map:   0%|          | 0/9140 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.7741
1000,0.5823
1500,0.5341
2000,0.504
2500,0.4826
3000,0.4636


Model saved to /kaggle/working/fine_tuned_model_t5


In [None]:
# Install required packages (if needed)
!pip install transformers datasets evaluate nltk --quiet

import os
import torch
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)

# ---------------------------------
# Configuration & Paths
# ---------------------------------
# Update these paths:
# - eval_data_file: path to your evaluation JSON file.
# - model_dir: directory of your fine-tuned model (saved previously).
eval_data_file = "../input/your-json-dataset/eval_data.json"  # <-- update this path!
model_dir = "./fine_tuned_model"  # directory where your fine-tuned model is saved

# Training/tokenization parameters (should match those used during training)
max_source_length = 512  # maximum length for input text

# ---------------------------------
# Load Model and Tokenizer
# ---------------------------------
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

# ---------------------------------
# Load the Evaluation Dataset
# ---------------------------------
# This assumes your evaluation JSON file is structured as a list of dictionaries,
# where each dictionary has "input" and "output" keys.
dataset_eval = load_dataset("json", data_files={"eval": eval_data_file})["eval"]

# ---------------------------------
# Tokenization Function for Evaluation
# ---------------------------------
# For evaluation, we only tokenize the "input" field because we'll use generate()
def tokenize_eval(example):
    # Tokenize the input text without padding (dynamic padding will be handled later)
    model_inputs = tokenizer(example["input"], max_length=max_source_length, truncation=True)
    return model_inputs

tokenized_eval = dataset_eval.map(tokenize_eval, batched=False)

# ---------------------------------
# Create a Data Collator for Dynamic Padding
# ---------------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# ---------------------------------
# Setup Evaluation TrainingArguments (for the Trainer)
# ---------------------------------
eval_args = TrainingArguments(
    output_dir="./eval_output",  # temporary output directory for evaluation artifacts
    per_device_eval_batch_size=8,
    report_to="none"  # disable reporting to third-party integrations
)

# ---------------------------------
# Initialize Trainer for Evaluation
# ---------------------------------
trainer = Trainer(
    model=model,
    args=eval_args,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ---------------------------------
# Evaluate the Model (Loss Calculation)
# ---------------------------------
# This will compute the evaluation loss (if your dataset contains labels).
eval_results = trainer.evaluate(tokenized_eval)
print("Evaluation Loss:", eval_results.get("eval_loss"))

# ---------------------------------
# Generate Predictions
# ---------------------------------
# Generate predictions for each example in the evaluation dataset.
predictions_output = trainer.predict(tokenized_eval)
# The predictions are token ids; decode them into strings.
decoded_preds = tokenizer.batch_decode(predictions_output.predictions, skip_special_tokens=True)

# Get the reference outputs from your evaluation dataset.
references = dataset_eval["output"]

# ---------------------------------
# Compute Evaluation Metrics
# ---------------------------------
# We'll compute ROUGE (using the `evaluate` package) and BLEU (using nltk).

# -- ROUGE --
import evaluate
rouge_metric = evaluate.load("rouge")
rouge_results = rouge_metric.compute(predictions=decoded_preds, references=references)
print("\nROUGE scores:")
for key, value in rouge_results.items():
    print(f"{key}: {value}")

# -- BLEU --
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu

# Tokenize predictions and references for BLEU calculation.
decoded_preds_tokens = [word_tokenize(pred) for pred in decoded_preds]
# BLEU expects a list of reference token lists for each prediction.
references_tokens = [[word_tokenize(ref)] for ref in references]

bleu_score = corpus_bleu(references_tokens, decoded_preds_tokens)
print("\nBLEU score:", bleu_score)


In [1]:
!pip install transformers datasets evaluate nltk

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [7]:
from datasets import load_dataset
json_file = "/kaggle/input/requirements-cleaned-csv/formatted_dataset.json"  # update with the correct path to your JSON file
dataset = load_dataset("json", data_files={"train": json_file})
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [2]:
import os
import math
import torch
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu


from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
import evaluate

def train_gpt2():
    # ---------------- Load the JSON Dataset ---------------- #
    # Your JSON file should contain one example per line with keys "input" and "output".
    
    
    # Split the dataset into train (80%) and test (20%) splits.
    
    

    # ---------------- Load Pretrained GPT-2 ---------------- #
    model_name = "gpt2"  # You can switch to a different GPT-2 variant if needed.
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name).to("cuda")
    # GPT-2 does not have a pad token, so assign the EOS token as the pad token.
    tokenizer.pad_token = tokenizer.eos_token

    # ---------------- Preprocessing Function for Fine-Tuning ---------------- #
    def preprocess_function(examples):
        texts = []
        # For each example, we create a prompt that includes both the input and output.
        for inp, out in zip(examples["input"], examples["output"]):
            # Format: "Input: <input text>\nOutput: <output text>\n"
            prompt = f"Input: {inp}\nOutput: {out}\n"
            texts.append(prompt)
        return tokenizer(texts, truncation=True, max_length=512, padding="max_length")

    # Tokenize the training and test splits.
    tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
    tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)

    # ---------------- Setup Data Collator ---------------- #
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # ---------------- Setup Training Arguments ---------------- #
    training_args = TrainingArguments(
        output_dir="./fine_tuned_model",
        num_train_epochs=3,
        per_device_train_batch_size=16,   # Adjust based on available GPU memory.
        gradient_accumulation_steps=2,
        evaluation_strategy="epoch",
        logging_steps=500,
        save_steps=500,
        fp16=True,
        report_to=[]  # Disable external logging integrations.
    )

    # ---------------- Initialize Trainer ---------------- #
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # ---------------- Fine-Tune the Model ---------------- #
    print("Starting training...")
    trainer.train()

    # ---------------- Evaluate Perplexity ---------------- #
    print("Evaluating model on test set (perplexity)...")
    eval_results = trainer.evaluate()
    eval_loss = eval_results.get("eval_loss")
    if eval_loss is not None and eval_loss < float("inf"):
        perplexity = math.exp(eval_loss)
    else:
        perplexity = float("inf")
    eval_results["perplexity"] = perplexity
    print("Evaluation results (Trainer):")
    print(eval_results)

    # ---------------- BLEU Score Evaluation ---------------- #
    generated_texts = []
    references = []
    print("\nGenerating outputs for BLEU evaluation on the test set...")

    # Generate outputs sample-by-sample.
    for sample in test_dataset:
        inp = sample["input"]
        reference = sample["output"]
        # Create a prompt that contains only the input.
        prompt = f"Input: {inp}\nOutput:"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to("cuda")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=512,
                num_beams=2,
                early_stopping=True,
                pad_token_id=tokenizer.eos_token_id
            )
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract the generated text after "Output:".
        if "Output:" in generated:
            generated = generated.split("Output:")[-1].strip()
        generated_texts.append(generated)
        # For corpus_bleu, each reference should be wrapped in a list.
        references.append([reference.strip()])

    # Compute BLEU using nltk's corpus_bleu.
    # Tokenize generated texts and references.
    gen_tokens = [generated.split() for generated in generated_texts]
    ref_tokens = [[ref.split()] for ref in [r[0] for r in references]]
    bleu_score = corpus_bleu(ref_tokens, gen_tokens)
    print(f"\nBLEU score on test set: {bleu_score:.4f}")




[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
train_gpt2()

Generating train split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/7312 [00:00<?, ? examples/s]

Map:   0%|          | 0/1828 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss
1,No log,1.482942
2,1.581300,1.409898


Evaluating model on test set (perplexity)...


Evaluation results (Trainer):
{'eval_loss': 1.409897804260254, 'eval_runtime': 47.0534, 'eval_samples_per_second': 38.85, 'eval_steps_per_second': 4.867, 'epoch': 2.989059080962801, 'perplexity': 4.0955368362669775}

Generating outputs for BLEU evaluation on the test set...


KeyboardInterrupt: 

In [None]:
# ---------------- BLEU Score Evaluation ---------------- #
generated_texts = []
references = []
print("\nGenerating outputs for BLEU evaluation on the test set...")

# Generate outputs sample-by-sample.
for sample in test_dataset:
    inp = sample["input"]
    reference = sample["output"]
    # Create a prompt that contains only the input.
    prompt = f"Input: {inp}\nOutput:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=512,
            num_beams=2,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the generated text after "Output:".
    if "Output:" in generated:
        generated = generated.split("Output:")[-1].strip()
        bleu_score = corpus_blue([generated], [reference])
        
        print(generated)
        print(bleu_score)
    generated_texts.append(generated)
    # For corpus_bleu, each reference should be wrapped in a list.
    references.append([reference.strip()])

# Compute BLEU using nltk's corpus_bleu.
# Tokenize generated texts and references.
gen_tokens = [generated.split() for generated in generated_texts]
ref_tokens = [[ref.split()] for ref in [r[0] for r in references]]
bleu_score = corpus_bleu(ref_tokens, gen_tokens)
print(f"\nBLEU score on test set: {bleu_score:.4f}")

In [12]:
import os
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# ---------------------------------
# Configuration & Paths
# ---------------------------------
# Update this path if your fine-tuned model is saved somewhere else.
model_dir = "/kaggle/working/fine_tuned_model_t5/checkpoint-3429"  # Path where your fine-tuned model is saved

# ---------------------------------
# Load the Model and Tokenizer
# ---------------------------------
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

# ---------------------------------
# Define Sample Inputs for Evaluation
# ---------------------------------
# Replace these sample inputs with your own raw requirement texts.
sample_inputs = [
    "Raw requirement text example 1: Describe a system that can manage user accounts.",
    "Raw requirement text example 2: Explain how the login process should handle authentication failures.",
    "Raw requirement text example 3: Detail the steps for password recovery and security measures."
]

# ---------------------------------
# Tokenize the Sample Inputs
# ---------------------------------
# Tokenize the inputs with dynamic padding to create a batch.
inputs = tokenizer(
    sample_inputs,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding=True
)

# ---------------------------------
# Generate Predictions
# ---------------------------------
# Adjust generation parameters (e.g., max_length, num_beams) as needed.
with torch.no_grad():
    generated_ids = model.generate(
        inputs.input_ids,
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

# Decode the generated ids to text
generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# ---------------------------------
# Print the Results
# ---------------------------------
for i, (raw_input, prediction) in enumerate(zip(sample_inputs, generated_texts)):
    print(f"Input {i+1}: {raw_input}\n")
    print(f"Generated Output {i+1}: {prediction}\n")
    print("-" * 50)

Input 1: Raw requirement text example 1: Describe a system that can manage user accounts.

Generated Output 1: The system shall allow the user to manage user accounts.

--------------------------------------------------
Input 2: Raw requirement text example 2: Explain how the login process should handle authentication failures.

Generated Output 2: The system shall allow the login process to handle authentication failures.

--------------------------------------------------
Input 3: Raw requirement text example 3: Detail the steps for password recovery and security measures.

Generated Output 3: The system shall detail the steps for password recovery and security measures.

--------------------------------------------------


In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [5]:
model = None
tokenizer= None
dataset = None
trainer = None
torch.cuda.empty_cache()

In [6]:
torch.cuda.memory_allocated()

16277398528

In [None]:
# # Add these imports at the top of your file
# import os
# import torch
# import numpy as np
# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
# from nltk.translate.bleu_score import corpus_bleu

# from datasets import load_dataset
# from transformers import (
#     T5Tokenizer,
#     T5ForConditionalGeneration,
#     Trainer,
#     TrainingArguments,
#     DataCollatorForSeq2Seq
# )
# import evaluate

# # ---------------------------------
# # Clean up GPU memory and adjust allocation configuration
# # ---------------------------------
# # This setting may help avoid memory fragmentation.
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# # Empty any cached memory in PyTorch
# torch.cuda.empty_cache()

# # ---------------------------------
# # Configuration & Paths
# # ---------------------------------
# # Update this path to point to your JSON file in Kaggle.
# data_file = "/kaggle/input/requirements-cleaned-csv/formatted_dataset.json"  # <-- change this!

# # Use a more capable model instead of the basic T5.
# # In this example, we're using google/flan-t5-large.
# model_name_or_path = "google/flan-t5-large"
# output_dir = "/kaggle/working/fine_tuned_model_flan_t5_large"

# # Training and tokenization parameters
# max_source_length = 512
# max_target_length = 128
# num_train_epochs = 3

# # Consider reducing the batch size if you continue to run into memory issues.
# per_device_train_batch_size = 2  # try lowering to 4 if OOM persists

# logging_steps = 500
# save_steps = 500

# # Enable FP16 if a GPU is available.
# use_fp16 = True if os.environ.get("CUDA_VISIBLE_DEVICES", "") else False

# # ---------------------------------
# # Tokenization Function
# # ---------------------------------
# def tokenize_function(example):
#     """
#     Tokenizes a single example.
    
#     Assumes each example is a dictionary with:
#       - "input": raw requirement text (source)
#       - "output": standardized requirement text (target)
    
#     Note: We remove padding here so that dynamic padding (in the data collator) works properly.
#     """
#     source = example["input"]
#     target = example["output"]

#     # Tokenize the source text without padding.
#     model_inputs = tokenizer(source, max_length=max_source_length, truncation=True)
    
#     # Tokenize the target text (labels) within the target tokenizer context, without padding.
#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(target, max_length=max_target_length, truncation=True)
    
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

# # ---------------------------------
# # Load the Dataset
# # ---------------------------------
# # This will load your JSON file. If it contains a list of dictionaries, it will be parsed correctly.
# dataset = load_dataset("json", data_files={"train": data_file})

# # ---------------------------------
# # Load Model and Tokenizer
# # ---------------------------------
# tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)
# model = T5ForConditionalGeneration.from_pretrained(model_name_or_path)

# # ---------------------------------
# # Tokenize the Dataset
# # ---------------------------------
# print("Tokenizing dataset...")
# tokenized_dataset = dataset["train"].map(tokenize_function, batched=False)

# # ---------------------------------
# # Split the Dataset for Training and Evaluation
# # ---------------------------------
# # Since you don't have a separate evaluation file, we split the dataset into 90% training and 10% evaluation.
# split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
# train_dataset = split_dataset["train"]
# eval_dataset = split_dataset["test"]

# # ---------------------------------
# # Create a Data Collator for Dynamic Padding
# # ---------------------------------
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# # ---------------------------------
# # Define Compute Metrics Function
# # ---------------------------------
# def compute_metrics(eval_preds):
#     """
#     Computes evaluation metrics (ROUGE and BLEU) using model predictions and labels.
#     """
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]
    
#     # Decode predictions and labels.
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     # Replace -100 in the labels (if any) with the pad token id.
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
#     # Compute ROUGE scores.
#     rouge_metric = evaluate.load("rouge")
#     rouge_results = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
    
#     # Compute BLEU score using nltk.
#     decoded_preds_tokens = [word_tokenize(pred) for pred in decoded_preds]
#     decoded_labels_tokens = [[word_tokenize(label)] for label in decoded_labels]
#     bleu_score = corpus_bleu(decoded_labels_tokens, decoded_preds_tokens)
    
#     return {
#         "rouge1": rouge_results["rouge1"],
#         "rouge2": rouge_results["rouge2"],
#         "rougeL": rouge_results["rougeL"],
#         "bleu": bleu_score
#     }

# # ---------------------------------
# # Setup Training Arguments
# # ---------------------------------
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     num_train_epochs=num_train_epochs,
#     per_device_train_batch_size=per_device_train_batch_size,
#     logging_steps=logging_steps,
#     save_steps=save_steps,
#     save_total_limit=2,
#     evaluation_strategy="epoch",  # Evaluate at the end of each epoch.
#     fp16=use_fp16,
#     report_to="none"  # Disables reporting to third-party integrations (e.g., WandB)
# )

# # ---------------------------------
# # Initialize Trainer and Train
# # ---------------------------------
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics  # Use our custom evaluation metrics
# )

# # Clean GPU memory one more time before training starts
# torch.cuda.empty_cache()

# print("Starting training...")
# trainer.train()

# print("Evaluating model on evaluation set...")
# eval_metrics = trainer.evaluate()
# print("Evaluation Metrics:", eval_metrics)

# # ---------------------------------
# # Save the Fine-Tuned Model
# # ---------------------------------
# trainer.save_model(output_dir)
# print(f"Model saved to {output_dir}")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Generating train split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/9140 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


In [1]:
# !pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.2


In [3]:
from huggingface_hub import notebook_login

In [4]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# # Add these imports at the top of your file
# import os
# import torch
# import numpy as np
# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
# from nltk.translate.bleu_score import corpus_bleu

# from datasets import load_dataset
# from transformers import (
#     AutoTokenizer,
#     LlamaForCausalLM,
#     Trainer,
#     TrainingArguments,
#     DataCollatorForLanguageModeling
# )
# import evaluate

# # ---------------------------------
# # Clean up GPU memory and adjust allocation configuration
# # ---------------------------------
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# torch.cuda.empty_cache()

# # ---------------------------------
# # Configuration & Paths
# # ---------------------------------
# data_file = "/kaggle/input/requirements-cleaned-csv/formatted_dataset.json"  # <-- update as needed

# # Use a LLaMA model (here we use Llama-2-7B HF version)
# model_name_or_path = "meta-llama/Llama-2-7b-hf"  # Change as needed.
# output_dir = "/kaggle/working/fine_tuned_model_llama_7b"

# # Training and tokenization parameters.
# max_source_length = 512   # Maximum tokens for the full prompt.
# max_target_length = 128   # (Not used directly in tokenization)
# num_train_epochs = 3

# # For minimal memory consumption, we use a very low per-device batch size.
# per_device_train_batch_size = 1
# # Use gradient accumulation to simulate a larger effective batch size.
# gradient_accumulation_steps = 2

# logging_steps = 500
# save_steps = 500

# # Choose whether to use 8-bit quantization.
# use_8bit = True  # Set to True to load the model in 8-bit (requires bitsandbytes); otherwise, use FP16.

# # We'll use FP16 only if not in 8-bit mode.
# use_fp16 = (not use_8bit) and (True if os.environ.get("CUDA_VISIBLE_DEVICES", "") else False)

# # ---------------------------------
# # Helper: Extract Response from Text
# # ---------------------------------
# def extract_response(text):
#     """Extract the part of the text after 'Response:'."""
#     parts = text.split("Response:")
#     if len(parts) > 1:
#         return parts[1].strip()
#     return text.strip()

# # ---------------------------------
# # Tokenization Function for LLaMA
# # ---------------------------------
# def tokenize_function(example):
#     """
#     Formats and tokenizes a single example for a causal LM.
#     Expects each example to have:
#       - "input": raw requirement text (instruction)
#       - "output": standardized requirement text (response)
#     The prompt format is:
#       "Instruction: <input text>\nResponse: <output text>"
#     """
#     full_prompt = "Instruction: " + example["input"].strip() + "\nResponse: " + example["output"].strip()
#     tokenized = tokenizer(full_prompt, max_length=max_source_length, truncation=True)
#     return tokenized

# # ---------------------------------
# # Load the Dataset
# # ---------------------------------
# dataset = load_dataset("json", data_files={"train": data_file})

# # ---------------------------------
# # Load Model and Tokenizer for LLaMA
# # ---------------------------------
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token

# if use_8bit:
#     # Load model in 8-bit quantization mode (requires bitsandbytes)
#     model = LlamaForCausalLM.from_pretrained(
#         model_name_or_path,
#         load_in_8bit=True,
#         device_map="auto"
#     )
# else:
#     # Load model in FP16 mode and enable gradient checkpointing.
#     model = LlamaForCausalLM.from_pretrained(
#         model_name_or_path,
#         torch_dtype=torch.float16
#     )
#     model.gradient_checkpointing_enable()

# # ---------------------------------
# # Tokenize the Dataset
# # ---------------------------------
# print("Tokenizing dataset...")
# tokenized_dataset = dataset["train"].map(tokenize_function, batched=False)

# # ---------------------------------
# # Split the Dataset for Training and Evaluation
# # ---------------------------------
# split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
# train_dataset = split_dataset["train"]
# eval_dataset = split_dataset["test"]

# # ---------------------------------
# # Create a Data Collator for Causal LM
# # ---------------------------------
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# # ---------------------------------
# # Define Compute Metrics Function
# # ---------------------------------
# def compute_metrics(eval_preds):
#     """
#     Computes evaluation metrics (ROUGE and BLEU) using model predictions and labels.
#     The predictions and labels are decoded, and then the part after 'Response:' is extracted.
#     """
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     extracted_preds = [extract_response(text) for text in decoded_preds]
#     extracted_labels = [extract_response(text) for text in decoded_labels]
#     rouge_metric = evaluate.load("rouge")
#     rouge_results = rouge_metric.compute(predictions=extracted_preds, references=extracted_labels)
#     preds_tokens = [word_tokenize(pred) for pred in extracted_preds]
#     labels_tokens = [[word_tokenize(label)] for label in extracted_labels]
#     bleu_score = corpus_bleu(labels_tokens, preds_tokens)
#     return {
#         "rouge1": rouge_results["rouge1"],
#         "rouge2": rouge_results["rouge2"],
#         "rougeL": rouge_results["rougeL"],
#         "bleu": bleu_score
#     }

# # ---------------------------------
# # Setup Training Arguments
# # ---------------------------------
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     num_train_epochs=num_train_epochs,
#     per_device_train_batch_size=per_device_train_batch_size,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     logging_steps=logging_steps,
#     save_steps=save_steps,
#     save_total_limit=2,
#     evaluation_strategy="epoch",
#     fp16=use_fp16,
#     report_to="none"
# )

# # ---------------------------------
# # Initialize Trainer and Train
# # ---------------------------------
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics
# )

# torch.cuda.empty_cache()
# print("Starting training...")
# trainer.train()

# print("Evaluating model on evaluation set...")
# eval_metrics = trainer.evaluate()
# print("Evaluation Metrics:", eval_metrics)

# trainer.save_model(output_dir)
# print(f"Model saved to {output_dir}")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/9140 [00:00<?, ? examples/s]

  trainer = Trainer(


ValueError: You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more details

In [28]:
import gc
import torch

def clear_all_globals():
    """
    Deletes most global variables from the current namespace (skipping built-ins and a few essential modules),
    runs garbage collection, and empties the CUDA cache.
    """
    # List keys that we want to keep (builtins and a few essential modules)
    keep_keys = {"__builtins__", "__name__", "__package__", "__loader__", "__spec__", "__file__", "__cached__",
                 "gc", "torch", "os", "sys"}
    
    # Get a list of all keys in globals
    all_keys = list(globals().keys())
    
    for key in all_keys:
        if key not in keep_keys:
            try:
                del globals()[key]
            except Exception as e:
                print(f"Could not delete {key}: {e}")
    
    # Run garbage collection
    gc.collect()
    # Empty the PyTorch CUDA cache
    torch.cuda.empty_cache()
    
    # Print memory usage for confirmation (if desired)
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1e6
        reserved = torch.cuda.memory_reserved() / 1e6
        print(f"After clearing globals: Allocated: {allocated:.2f} MB, Reserved: {reserved:.2f} MB")
    else:
        print("CUDA is not available.")

# Call the function at the end of your script:
clear_all_globals()
print("All declared variables cleared and GPU memory has been emptied.")


After clearing globals: Allocated: 16629.14 MB, Reserved: 16687.04 MB
All declared variables cleared and GPU memory has been emptied.


In [29]:
# # del model
# del tokenizer
# gc.collect()
# torch.cuda.empty_cache()


NameError: name 'tokenizer' is not defined

In [27]:
# import gc
# import torch

# def empty_gpu_memory():
#     """
#     Clears the CUDA cache and runs the garbage collector to free up GPU memory.
#     """
#     # Clear the PyTorch CUDA cache
#     torch.cuda.empty_cache()
#     # Run garbage collection to clear any unreferenced objects
#     gc.collect()
#     # Optionally, print out GPU memory stats if you like
#     if torch.cuda.is_available():
#         allocated = torch.cuda.memory_allocated()
#         reserved = torch.cuda.memory_reserved()
#         print(f"After emptying cache: Allocated: {allocated/1e6:.2f} MB, Reserved: {reserved/1e6:.2f} MB")
#     else:
#         print("CUDA is not available.")

# empty_gpu_memory()

After emptying cache: Allocated: 16629.14 MB, Reserved: 16687.04 MB


In [26]:
# import gc
# import torch

# def force_clear_gpu_memory():
#     # Attempt to delete all objects in the global namespace that might be on the GPU.
#     # WARNING: This is a bit aggressive and might delete things you need!
#     # Use with caution or in a controlled script.
#     global_vars = list(globals().keys())
#     for var in global_vars:
#         try:
#             obj = globals()[var]
#             # Check if the object is a tensor or model
#             if torch.is_tensor(obj) or hasattr(obj, "to") and callable(obj.to):
#                 del globals()[var]
#         except Exception:
#             pass
#     gc.collect()
#     torch.cuda.empty_cache()
    
#     allocated = torch.cuda.memory_allocated() / 1e6
#     reserved = torch.cuda.memory_reserved() / 1e6
#     print(f"After force clearing: Allocated: {allocated:.2f} MB, Reserved: {reserved:.2f} MB")

# force_clear_gpu_memory()


After force clearing: Allocated: 16629.14 MB, Reserved: 16687.04 MB


NameError: name 'model' is not defined

In [7]:
!pip install transformers bitsandbytes peft datasets evaluate nltk



In [12]:
# # Add these imports at the top of your file
# import os
# import torch
# import numpy as np
# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
# from nltk.translate.bleu_score import corpus_bleu

# from datasets import load_dataset
# from transformers import (
#     AutoTokenizer,
#     LlamaForCausalLM,
#     Trainer,
#     TrainingArguments,
#     DataCollatorForLanguageModeling,
#     BitsAndBytesConfig
# )
# import evaluate

# # Import PEFT utilities for adapter-based fine-tuning.
# from peft import LoraConfig, get_peft_model, TaskType

# # ---------------------------------
# # Clean up GPU memory and adjust allocation configuration
# # ---------------------------------
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# # Try a different CUBLAS workspace configuration if needed; you can experiment with ":4096:8"
# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
# torch.cuda.empty_cache()

# # ---------------------------------
# # Configuration & Paths
# # ---------------------------------
# data_file = "/kaggle/input/requirements-cleaned-csv/formatted_dataset.json"  # <-- update as needed

# # Use a LLaMA model (here we use Llama-2-7B HF version)
# model_name_or_path = "meta-llama/Llama-2-7b-hf"  # Change as needed.
# output_dir = "/kaggle/working/fine_tuned_model_llama_7b"

# # Training and tokenization parameters.
# max_source_length = 512   # Maximum tokens for the full prompt.
# max_target_length = 128   # (Not used directly in tokenization)
# num_train_epochs = 3

# # For minimal memory consumption, we use a very low per-device batch size.
# per_device_train_batch_size = 1
# # Use gradient accumulation to simulate a larger effective batch size.
# gradient_accumulation_steps = 2

# logging_steps = 500
# save_steps = 500

# # Use 8-bit quantization mode.
# use_8bit = True  # Set to True to load the model in 8-bit mode (requires bitsandbytes).
# # When using 8-bit mode with adapters, we do not use FP16.
# use_fp16 = False

# # ---------------------------------
# # Helper: Extract Response from Text
# # ---------------------------------
# def extract_response(text):
#     """Extract the part of the text after 'Response:'."""
#     parts = text.split("Response:")
#     if len(parts) > 1:
#         return parts[1].strip()
#     return text.strip()

# # ---------------------------------
# # Tokenization Function for LLaMA
# # ---------------------------------
# def tokenize_function(example):
#     """
#     Formats and tokenizes a single example for a causal LM.
#     Expects each example to have:
#       - "input": raw requirement text (instruction)
#       - "output": standardized requirement text (response)
#     The prompt format is:
#       "Instruction: <input text>\nResponse: <output text>"
#     """
#     full_prompt = "Instruction: " + example["input"].strip() + "\nResponse: " + example["output"].strip()
#     tokenized = tokenizer(full_prompt, max_length=max_source_length, truncation=True)
#     return tokenized

# # ---------------------------------
# # Load the Dataset
# # ---------------------------------
# dataset = load_dataset("json", data_files={"train": data_file})

# # ---------------------------------
# # Load Model and Tokenizer for LLaMA with PEFT and 8-bit Quantization
# # ---------------------------------
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token

# if use_8bit:
#     # Define the quantization configuration using BitsAndBytesConfig.
#     quantization_config = BitsAndBytesConfig(
#         load_in_8bit=True,
#         llm_int8_enable_fp32_cpu_offload=False  # Do not offload any module to CPU/disk.
#     )
#     # Force the model to be loaded entirely on GPU by setting device_map to "cuda:0".
#     model = LlamaForCausalLM.from_pretrained(
#         model_name_or_path,
#         quantization_config=quantization_config,
#         device_map="cuda:0"
#     )
#     # Configure LoRA for causal LM fine-tuning with a lower rank.
#     lora_config = LoraConfig(
#         task_type=TaskType.CAUSAL_LM,
#         inference_mode=False,  # Must be False during training.
#         r=4,                 # Lowered LoRA rank (try 4 instead of 8)
#         lora_alpha=32,       # Scaling factor.
#         lora_dropout=0.1     # Dropout rate.
#     )
#     # Wrap the 8-bit model with LoRA adapters.
#     model = get_peft_model(model, lora_config)
# else:
#     model = LlamaForCausalLM.from_pretrained(
#         model_name_or_path,
#         torch_dtype=torch.float16
#     )
#     model.gradient_checkpointing_enable()

# # ---------------------------------
# # Tokenize the Dataset
# # ---------------------------------
# print("Tokenizing dataset...")
# tokenized_dataset = dataset["train"].map(tokenize_function, batched=False)

# # ---------------------------------
# # Split the Dataset for Training and Evaluation
# # ---------------------------------
# split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
# train_dataset = split_dataset["train"]
# eval_dataset = split_dataset["test"]

# # ---------------------------------
# # Create a Data Collator for Causal LM
# # ---------------------------------
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# # ---------------------------------
# # Define Compute Metrics Function
# # ---------------------------------
# def compute_metrics(eval_preds):
#     """
#     Computes evaluation metrics (ROUGE and BLEU) using model predictions and labels.
#     The predictions and labels are decoded, and then the part after 'Response:' is extracted.
#     """
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     # Replace -100 in labels with the pad token id and decode.
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     extracted_preds = [extract_response(text) for text in decoded_preds]
#     extracted_labels = [extract_response(text) for text in decoded_labels]
#     rouge_metric = evaluate.load("rouge")
#     rouge_results = rouge_metric.compute(predictions=extracted_preds, references=extracted_labels)
#     preds_tokens = [word_tokenize(pred) for pred in extracted_preds]
#     labels_tokens = [[word_tokenize(label)] for label in extracted_labels]
#     bleu_score = corpus_bleu(labels_tokens, preds_tokens)
#     return {
#         "rouge1": rouge_results["rouge1"],
#         "rouge2": rouge_results["rouge2"],
#         "rougeL": rouge_results["rougeL"],
#         "bleu": bleu_score
#     }

# # ---------------------------------
# # Setup Training Arguments
# # ---------------------------------
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     num_train_epochs=num_train_epochs,
#     per_device_train_batch_size=per_device_train_batch_size,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     logging_steps=logging_steps,
#     save_steps=save_steps,
#     save_total_limit=2,
#     evaluation_strategy="epoch",  # Evaluate at the end of each epoch.
#     fp16=use_fp16,
#     report_to="none"
# )

# # ---------------------------------
# # Initialize Trainer and Train
# # ---------------------------------
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics
# )

# torch.cuda.empty_cache()
# print("Starting training...")
# trainer.train()

# print("Evaluating model on evaluation set...")
# eval_metrics = trainer.evaluate()
# print("Evaluation Metrics:", eval_metrics)

# trainer.save_model(output_dir)
# print(f"Model saved to {output_dir}")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Tokenizing dataset...
Starting training...


  trainer = Trainer(


RuntimeError: cublasLt ran into an error!
	shapeA=torch.Size([4096, 4096]), shapeB=torch.Size([33, 4096]), shapeC=(33, 4096)
	(lda, ldb, ldc)=(c_int(4096), c_int(4096), c_int(4096))
	(m, n, k)=(c_int(4096), c_int(33), c_int(4096))

In [1]:
# !pip install transformers bitsandbytes peft datasets evaluate nltk

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate, bitsandbytes
Successfully installed bitsandbytes-0.45.2 evaluate-0.4.3


In [2]:
# from huggingface_hub import notebook_login

In [3]:
# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# # Add these imports at the top of your file
# import os
# import torch
# import numpy as np
# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
# from nltk.translate.bleu_score import corpus_bleu

# from datasets import load_dataset
# from transformers import (
#     AutoTokenizer,
#     LlamaForCausalLM,
#     Trainer,
#     TrainingArguments,
#     DataCollatorForLanguageModeling,
#     BitsAndBytesConfig
# )
# import evaluate

# # Import PEFT utilities for adapter-based fine-tuning.
# from peft import LoraConfig, get_peft_model, TaskType

# # ---------------------------------
# # Clean up GPU memory and adjust allocation configuration
# # ---------------------------------
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# # Optionally, set CUBLAS workspace configuration (experiment with these values if needed)
# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
# torch.cuda.empty_cache()

# # ---------------------------------
# # Configuration & Paths
# # ---------------------------------
# data_file = "/kaggle/input/requirements-cleaned-csv/formatted_dataset.json"  # <-- update as needed

# # Use the smallest official LLaMA model: Llama-2-7B HF version.
# model_name_or_path = "meta-llama/Llama-2-7b-hf"  # This is the smallest available official model.
# output_dir = "/kaggle/working/fine_tuned_model_llama_7b"

# # Training and tokenization parameters.
# max_source_length = 512   # Maximum tokens for the full prompt.
# max_target_length = 128   # (Not used directly in tokenization)
# num_train_epochs = 3

# # For minimal memory consumption, we use a very low per-device batch size.
# per_device_train_batch_size = 1
# # Use gradient accumulation to simulate a larger effective batch size.
# gradient_accumulation_steps = 2

# logging_steps = 500
# save_steps = 500

# # Use 8-bit quantization mode.
# use_8bit = True  # Set to True to load the model in 8-bit mode (requires bitsandbytes).
# # When using 8-bit mode with adapters, we do not use FP16.
# use_fp16 = False

# # ---------------------------------
# # Helper: Extract Response from Text
# # ---------------------------------
# def extract_response(text):
#     """Extract the part of the text after 'Response:'."""
#     parts = text.split("Response:")
#     if len(parts) > 1:
#         return parts[1].strip()
#     return text.strip()

# # ---------------------------------
# # Tokenization Function for LLaMA
# # ---------------------------------
# def tokenize_function(example):
#     """
#     Formats and tokenizes a single example for a causal LM.
#     Expects each example to have:
#       - "input": raw requirement text (instruction)
#       - "output": standardized requirement text (response)
#     The prompt format is:
#       "Instruction: <input text>\nResponse: <output text>"
#     """
#     full_prompt = "Instruction: " + example["input"].strip() + "\nResponse: " + example["output"].strip()
#     tokenized = tokenizer(full_prompt, max_length=max_source_length, truncation=True)
#     return tokenized

# # ---------------------------------
# # Load the Dataset
# # ---------------------------------
# dataset = load_dataset("json", data_files={"train": data_file})

# # ---------------------------------
# # Load Model and Tokenizer for LLaMA with PEFT and 8-bit Quantization
# # ---------------------------------
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token

# if use_8bit:
#     quantization_config = BitsAndBytesConfig(
#         load_in_8bit=True,
#         llm_int8_enable_fp32_cpu_offload=False  # Ensure the model stays on GPU.
#     )
#     model = LlamaForCausalLM.from_pretrained(
#         model_name_or_path,
#         quantization_config=quantization_config,
#         device_map="cuda:0"  # Force model to load entirely on GPU.
#     )
#     # Configure LoRA with a lower rank if needed.
#     lora_config = LoraConfig(
#         task_type=TaskType.CAUSAL_LM,
#         inference_mode=False,  # Must be False during training.
#         r=4,                   # Lower LoRA rank (if memory is very tight).
#         lora_alpha=32,
#         lora_dropout=0.1
#     )
#     model = get_peft_model(model, lora_config)
# else:
#     model = LlamaForCausalLM.from_pretrained(
#         model_name_or_path,
#         torch_dtype=torch.float16
#     )
#     model.gradient_checkpointing_enable()

# # ---------------------------------
# # Tokenize the Dataset
# # ---------------------------------
# print("Tokenizing dataset...")
# tokenized_dataset = dataset["train"].map(tokenize_function, batched=False)

# # ---------------------------------
# # Split the Dataset for Training and Evaluation
# # ---------------------------------
# split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
# train_dataset = split_dataset["train"]
# eval_dataset = split_dataset["test"]

# # ---------------------------------
# # Create a Data Collator for Causal LM
# # ---------------------------------
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# # ---------------------------------
# # Define Compute Metrics Function
# # ---------------------------------
# def compute_metrics(eval_preds):
#     """
#     Computes evaluation metrics (ROUGE and BLEU) using model predictions and labels.
#     The predictions and labels are decoded, and then the part after 'Response:' is extracted.
#     """
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     extracted_preds = [extract_response(text) for text in decoded_preds]
#     extracted_labels = [extract_response(text) for text in decoded_labels]
#     rouge_metric = evaluate.load("rouge")
#     rouge_results = rouge_metric.compute(predictions=extracted_preds, references=extracted_labels)
#     preds_tokens = [word_tokenize(pred) for pred in extracted_preds]
#     labels_tokens = [[word_tokenize(label)] for label in extracted_labels]
#     bleu_score = corpus_bleu(labels_tokens, preds_tokens)
#     return {
#         "rouge1": rouge_results["rouge1"],
#         "rouge2": rouge_results["rouge2"],
#         "rougeL": rouge_results["rougeL"],
#         "bleu": bleu_score
#     }

# # ---------------------------------
# # Setup Training Arguments
# # ---------------------------------
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     num_train_epochs=num_train_epochs,
#     per_device_train_batch_size=per_device_train_batch_size,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     logging_steps=logging_steps,
#     save_steps=save_steps,
#     save_total_limit=2,
#     evaluation_strategy="epoch",
#     fp16=use_fp16,
#     report_to="none"
# )

# # ---------------------------------
# # Initialize Trainer and Train
# # ---------------------------------
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics
# )

# torch.cuda.empty_cache()
# print("Starting training...")
# trainer.train()

# print("Evaluating model on evaluation set...")
# eval_metrics = trainer.evaluate()
# print("Evaluation Metrics:", eval_metrics)

# trainer.save_model(output_dir)
# print(f"Model saved to {output_dir}")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/9140 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting training...


RuntimeError: cublasLt ran into an error!
	shapeA=torch.Size([4096, 4096]), shapeB=torch.Size([33, 4096]), shapeC=(33, 4096)
	(lda, ldb, ldc)=(c_int(4096), c_int(4096), c_int(4096))
	(m, n, k)=(c_int(4096), c_int(33), c_int(4096))

In [None]:
# import os
# import math
# import torch
# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
# from transformers import (
#     GPT2LMHeadModel,
#     GPT2Tokenizer,
#     Trainer,
#     TrainingArguments,
#     DataCollatorForLanguageModeling
# )
# from datasets import load_dataset
# import evaluate

# def main():
#     # ---------------- Load Dataset ---------------- #
#     # This CSV file should contain columns "Raw Requirements" and "Requirement with Standard Syntax"
#     csv_file = "cleaned.csv"  
#     # Load the dataset (by default it loads into the "train" split)
#     dataset = load_dataset("csv", data_files=csv_file)
#     # Split the dataset into training (80%) and test (20%) splits.
#     dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
#     train_dataset = dataset["train"]
#     test_dataset = dataset["test"]

#     # ---------------- Preprocessing Function ---------------- #
#     # For fine-tuning, we build a prompt that contains both the raw and formatted text.
#     def preprocess_function(examples):
#         texts = []
#         for raw, formatted in zip(examples["Raw Requirements"], examples["Requirement with Standard Syntax"]):
#             # The training prompt includes both raw and formatted text.
#             prompt = f"Raw Requirements: {raw}\nRequirement with Standard Syntax: {formatted}\n"
#             texts.append(prompt)
#         return tokenizer(texts, truncation=True, max_length=512, padding="max_length")

#     # ---------------- Load Pretrained GPT-2 ---------------- #
#     model_name = "gpt2"  # You can change this to a different GPT-2 variant if desired.
#     tokenizer = GPT2Tokenizer.from_pretrained(model_name)
#     model = GPT2LMHeadModel.from_pretrained(model_name).to("cuda")
#     # GPT-2 does not have a pad token; assign the end-of-sequence token as the pad token.
#     tokenizer.pad_token = tokenizer.eos_token

#     # ---------------- Tokenize the Dataset ---------------- #
#     # For training, tokenize the prompts (includes both raw and formatted requirements).
#     tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
#     tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)

#     # ---------------- Setup Data Collator ---------------- #
#     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

#     # ---------------- Setup Training Arguments ---------------- #
#     training_args = TrainingArguments(
#         output_dir="./fine_tuned_model",
#         num_train_epochs=3,
#         per_device_train_batch_size=2,  # Adjust batch size as needed.
#         gradient_accumulation_steps=2,
#         evaluation_strategy="epoch",
#         logging_steps=500,
#         save_steps=500,
#         fp16=True,
#         report_to=[]  # Disable logging integrations (e.g., wandb)
#     )

#     # ---------------- Initialize Trainer ---------------- #
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=tokenized_train,
#         eval_dataset=tokenized_test,
#         data_collator=data_collator,
#         tokenizer=tokenizer,
#     )

#     # ---------------- Fine-tune the Model ---------------- #
#     print("Starting training...")
#     trainer.train()

#     # ---------------- Evaluate Perplexity ---------------- #
#     print("Evaluating model on test set (perplexity)...")
#     eval_results = trainer.evaluate()
#     eval_loss = eval_results.get("eval_loss")
#     if eval_loss is not None and eval_loss < float("inf"):
#         perplexity = math.exp(eval_loss)
#         eval_results["perplexity"] = perplexity
#     else:
#         eval_results["perplexity"] = float("inf")
#     print("Evaluation results (Trainer):")
#     print(eval_results)

#     # ---------------- BLEU Score Evaluation ---------------- #
#     # For BLEU evaluation, we generate outputs on the raw test samples.
#     generated_texts = []
#     references = []
#     print("\nGenerating outputs for BLEU evaluation on the test set...")

#     # We'll generate outputs sample-by-sample.
#     for sample in test_dataset:
#         raw_req = sample["Raw Requirements"]
#         reference_formatted = sample["Requirement with Standard Syntax"]

#         # Create a prompt using only the raw requirement.
#         prompt = f"Raw Requirements: {raw_req}\nRequirement with Standard Syntax:"
#         inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to("cuda")

#         with torch.no_grad():
#             outputs = model.generate(
#                 **inputs,
#                 max_length=512,
#                 num_beams=2,
#                 early_stopping=True,
#                 pad_token_id=tokenizer.eos_token_id
#             )
#         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
#         # Extract the generated formatted text (everything after "Requirement with Standard Syntax:")
#         if "Requirement with Standard Syntax:" in generated_text:
#             generated_text = generated_text.split("Requirement with Standard Syntax:")[-1].strip()
#         else:
#             generated_text = generated_text.strip()
#         generated_texts.append(generated_text)
#         # Each reference should be a list of reference strings (even if only one).
#         references.append([reference_formatted.strip()])

#     # Use the evaluate library to compute BLEU.
#     bleu_metric = evaluate.load("bleu")
#     bleu_result = bleu_metric.compute(predictions=generated_texts, references=references)
#     print(f"\nBLEU score on test set: {bleu_result['bleu']:.4f}")

# if __name__ == "__main__":
#     main()
