In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
# Step 1: Load and Prepare the Dataset
df = pd.read_csv("/content/agricultural_land_clauses.csv")
df["label"] = df["label"].map({"SAFE": 0, "RISKY": 1})  # For risk analysis
dataset = Dataset.from_pandas(df)

In [None]:
# Step 2: Fine-Tune BART for Summarization with LoRA
# Load tokenizer and model
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [None]:
# Configure LoRA for BART
lora_config_bart = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # Sequence-to-sequence task (summarization)
    r=8,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout for LoRA layers
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
)
bart_model = get_peft_model(bart_model, lora_config_bart)
print("LoRA applied to BART model")

LoRA applied to BART model


In [None]:
# Tokenize the dataset for summarization
def preprocess_bart_function(examples):
    inputs = examples["text"]
    targets = examples["summary"]
    model_inputs = bart_tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = bart_tokenizer(targets, max_length=150, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_bart_dataset = dataset.map(preprocess_bart_function, batched=True)
train_bart_dataset = tokenized_bart_dataset.train_test_split(test_size=0.1)["train"]
eval_bart_dataset = tokenized_bart_dataset.train_test_split(test_size=0.1)["test"]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Define training arguments for BART
bart_training_args = TrainingArguments(
    output_dir="./fine_tuned_bart_lora",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs_bart",
    logging_steps=10,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
)



In [None]:
# Initialize Trainer for BART
bart_trainer = Trainer(
    model=bart_model,
    args=bart_training_args,
    train_dataset=train_bart_dataset,
    eval_dataset=eval_bart_dataset,
)

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Fine-tune BART
print("Fine-tuning BART with LoRA...")
bart_trainer.train()



Fine-tuning BART with LoRA...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mimayank45[0m ([33mimayank45-svkm-s-narsee-monjee-institute-of-management-s[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
10,10.4176,10.284489
20,10.2725,10.268735
30,10.2518,10.241377
40,10.3121,10.201578
50,10.3873,10.147865
60,10.1578,10.07793
70,10.1078,9.990453
80,10.1274,9.885183
90,10.0637,9.763935
100,9.8385,9.631842


Could not locate the best model at ./fine_tuned_bart_lora/checkpoint-670/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=675, training_loss=4.363753256621184, metrics={'train_runtime': 1045.5279, 'train_samples_per_second': 2.582, 'train_steps_per_second': 0.646, 'total_flos': 2935375685222400.0, 'train_loss': 4.363753256621184, 'epoch': 3.0})

In [None]:
# Save the fine-tuned BART model
bart_model.save_pretrained("./fine_tuned_bart_lora")
bart_tokenizer.save_pretrained("./fine_tuned_bart_lora")

('./fine_tuned_bart_lora/tokenizer_config.json',
 './fine_tuned_bart_lora/special_tokens_map.json',
 './fine_tuned_bart_lora/vocab.json',
 './fine_tuned_bart_lora/merges.txt',
 './fine_tuned_bart_lora/added_tokens.json')

In [None]:
!zip -r fine_tuned_bart_lora.zip fine_tuned_bart_lora

  adding: fine_tuned_bart_lora/ (stored 0%)
  adding: fine_tuned_bart_lora/special_tokens_map.json (deflated 85%)
  adding: fine_tuned_bart_lora/merges.txt (deflated 53%)
  adding: fine_tuned_bart_lora/checkpoint-500/ (stored 0%)
  adding: fine_tuned_bart_lora/checkpoint-500/training_args.bin (deflated 52%)
  adding: fine_tuned_bart_lora/checkpoint-500/adapter_config.json (deflated 54%)
  adding: fine_tuned_bart_lora/checkpoint-500/optimizer.pt (deflated 7%)
  adding: fine_tuned_bart_lora/checkpoint-500/README.md (deflated 66%)
  adding: fine_tuned_bart_lora/checkpoint-500/scheduler.pt (deflated 57%)
  adding: fine_tuned_bart_lora/checkpoint-500/rng_state.pth (deflated 25%)
  adding: fine_tuned_bart_lora/checkpoint-500/adapter_model.safetensors (deflated 7%)
  adding: fine_tuned_bart_lora/checkpoint-500/trainer_state.json (deflated 83%)
  adding: fine_tuned_bart_lora/tokenizer_config.json (deflated 75%)
  adding: fine_tuned_bart_lora/adapter_config.json (deflated 54%)
  adding: fine_tu

In [None]:
# Step 3: Fine-Tune DistilBERT for Risk Analysis with LoRA
# Load tokenizer and model
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Configure LoRA for DistilBERT
lora_config_distilbert = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification task (risk analysis)
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_lin", "v_lin"],  # Apply LoRA to attention layers in DistilBERT
)
distilbert_model = get_peft_model(distilbert_model, lora_config_distilbert)
print("LoRA applied to DistilBERT model")

LoRA applied to DistilBERT model


In [None]:
# Tokenize the dataset for risk analysis
def preprocess_distilbert_function(examples):
    model_inputs = distilbert_tokenizer(examples["text"], max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = examples["label"]
    return model_inputs

tokenized_distilbert_dataset = dataset.map(preprocess_distilbert_function, batched=True)
train_distilbert_dataset = tokenized_distilbert_dataset.train_test_split(test_size=0.1)["train"]
eval_distilbert_dataset = tokenized_distilbert_dataset.train_test_split(test_size=0.1)["test"]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Define training arguments for DistilBERT
distilbert_training_args = TrainingArguments(
    output_dir="./fine_tuned_distilbert_lora",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs_distilbert",
    logging_steps=10,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
)



In [None]:
# Initialize Trainer for DistilBERT
distilbert_trainer = Trainer(
    model=distilbert_model,
    args=distilbert_training_args,
    train_dataset=train_distilbert_dataset,
    eval_dataset=eval_distilbert_dataset,
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Fine-tune DistilBERT
print("Fine-tuning DistilBERT with LoRA...")
distilbert_trainer.train()

Fine-tuning DistilBERT with LoRA...


Step,Training Loss,Validation Loss
10,0.6984,0.693896
20,0.6942,0.692714
30,0.6967,0.690803
40,0.6841,0.687997
50,0.6831,0.684377
60,0.6807,0.680167
70,0.6802,0.674976
80,0.6804,0.669351
90,0.6654,0.662815
100,0.6675,0.65565


Could not locate the best model at ./fine_tuned_distilbert_lora/checkpoint-670/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=675, training_loss=0.2239943719657, metrics={'train_runtime': 206.669, 'train_samples_per_second': 13.064, 'train_steps_per_second': 3.266, 'total_flos': 363796398489600.0, 'train_loss': 0.2239943719657, 'epoch': 3.0})

In [None]:
# Save the fine-tuned DistilBERT model
distilbert_model.save_pretrained("./fine_tuned_distilbert_lora")
distilbert_tokenizer.save_pretrained("./fine_tuned_distilbert_lora")

('./fine_tuned_distilbert_lora/tokenizer_config.json',
 './fine_tuned_distilbert_lora/special_tokens_map.json',
 './fine_tuned_distilbert_lora/vocab.txt',
 './fine_tuned_distilbert_lora/added_tokens.json')

In [None]:
!zip -r fine_tuned_distilbert_lora.zip fine_tuned_distilbert_lora

  adding: fine_tuned_distilbert_lora/ (stored 0%)
  adding: fine_tuned_distilbert_lora/special_tokens_map.json (deflated 42%)
  adding: fine_tuned_distilbert_lora/checkpoint-500/ (stored 0%)
  adding: fine_tuned_distilbert_lora/checkpoint-500/training_args.bin (deflated 51%)
  adding: fine_tuned_distilbert_lora/checkpoint-500/adapter_config.json (deflated 54%)
  adding: fine_tuned_distilbert_lora/checkpoint-500/optimizer.pt (deflated 8%)
  adding: fine_tuned_distilbert_lora/checkpoint-500/README.md (deflated 66%)
  adding: fine_tuned_distilbert_lora/checkpoint-500/scheduler.pt (deflated 57%)
  adding: fine_tuned_distilbert_lora/checkpoint-500/rng_state.pth (deflated 25%)
  adding: fine_tuned_distilbert_lora/checkpoint-500/adapter_model.safetensors (deflated 7%)
  adding: fine_tuned_distilbert_lora/checkpoint-500/trainer_state.json (deflated 83%)
  adding: fine_tuned_distilbert_lora/vocab.txt (deflated 53%)
  adding: fine_tuned_distilbert_lora/tokenizer_config.json (deflated 75%)
  addi