In [1]:

# Remove the preinstalled scientific/ML stack so the pinned versions below can be installed.
!pip uninstall -y numpy scipy scikit-learn torch torchvision torchaudio tensorboard tensorflow keras matplotlib

# Pin NumPy before anything else is installed.
!pip install numpy==1.21.6

# Pinned PyTorch stack and Hugging Face libraries.
# NOTE(review): accelerate and bitsandbytes are unpinned — consider pinning for reproducibility.
!pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1
!pip install transformers==4.25.1 accelerate bitsandbytes
!pip install datasets==2.21.0 huggingface-hub==0.34.0 
# NOTE(review): groq, spacy, nltk and tiktoken are unpinned as well.
!pip install groq spacy nltk tiktoken

# spaCy English model, then the LLMLingua fork whose experiment scripts are run below.
# Later cells chdir into /kaggle/working/LLMLingua/..., so this presumably runs with
# /kaggle/working as the current directory — confirm.
!python -m spacy download en_core_web_sm
!git clone https://github.com/ilatims-b/LLMLingua.git

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: scipy 1.15.3
Uninstalling scipy-1.15.3:
  Successfully uninstalled scipy-1.15.3
Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Found existing installation: tensorboard 2.18.0
Uninstalling tensorboard-2.18.0:
  Successfully uninstalled tensorboard-2.18.0
Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.1

In [2]:
# Optional: the Groq client is only needed when compress.py is run with `--compressor groq`.
# NOTE(review): groq is already installed by the setup cell above, so this cell looks
# redundant — confirm before removing.
!pip install groq



In [3]:
from datasets import load_dataset
import json
import os

In [4]:
# Build a small LongBench sample file: the first NUM_SAMPLES test instances of each
# task, flattened into one list with a running index shared across tasks.
NUM_SAMPLES = 1
datasets = ['multifieldqa_en', 'triviaqa']
data = []
current_idx = 0  # running index across all tasks (always equals len(data))

for dataset_ in datasets:
    data_ = load_dataset('THUDM/LongBench', dataset_, split='test', trust_remote_code=True)

    # zip() against range() caps the number of instances taken from each task.
    for _, instance in zip(range(NUM_SAMPLES), data_):
        data.append({
            "dataset": dataset_,
            "idx": current_idx,
            # The prompt is the context followed by the question on a new line.
            "prompt": instance['context'] + '\n' + instance['input'],
            "answer": instance['answers'],
            "length": instance['length'],
        })
        current_idx += 1

# Persist the sample so the compression script can load it from disk.
os.makedirs("/kaggle/working/dataset", exist_ok=True)
with open("/kaggle/working/dataset/longbench_1.json", "w") as f:
    json.dump(data, f, indent=4)



Downloading builder script:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:

# Work from the data-collection scripts of the cloned repo so the following
# `!python compress.py` / `label_word.py` / `filter.py` calls resolve by relative name.
os.chdir('/kaggle/working/LLMLingua/experiments/llmlingua2/data_collection')

In [None]:
# Optional: only needed when compressing with GPT-4 (legacy openai 0.28 client).
# The comment lives on its own line: appended directly to the version ("0.28#...")
# the shell does not treat it as a comment and passes every word to pip as a requirement.
!pip install openai==0.28

In [6]:

# Generate compressed prompts for data annotation. This run uses the Groq compressor
# with the openai/gpt-oss-20b model (not GPT-4) on the LongBench sample written above.
# The API key is read from GROQ_API_KEY — either a Python variable defined in an earlier
# cell (IPython expands $name) or an environment variable (expanded by the shell) — so
# no secret is pasted into the notebook.
# Every continuation backslash is preceded by a space so the command also works when
# copied into a plain shell, where a glued backslash would fuse adjacent arguments.
!python compress.py --load_origin_from /kaggle/working/dataset/longbench_1.json \
--chunk_size 512 \
--compressor groq \
--model_name openai/gpt-oss-20b \
--api_key "$GROQ_API_KEY" \
--save_path /kaggle/working/dataset/longbench_1_compressed.json

num data: 2
You are an excellent linguist and very good at compressing passages into short expressions by removing unimportant words, while retaining as much information as possible.
Compress some text to short expressions, and such that you (GPT-4) can reconstruct it as close as possible to the original. Unlike the usual text compression, I need you to comply with the 5 conditions below: 1. You can ONLY remove unimportant words. 2. Do not change the order of words. 3. Do not change the original words, e.g. 'asking'->'ask' is NOT OK, 'current'->'now' is NOT OK. 4. Do not use abbreviations or emojis, e.g. 'without'->'w/o' is NOT OK, 'as soon as possible'->'ASAP' is NOT OK. 5. Do not add new words or symbols, this is very important. For example, 'dedicate 3 hours to each chapter'->'3 hours/chapter' is NOT OK because you add new token '/', just compress it into '3 hours each chapter'. '30 eggs plus 20 eggs equals 50 eggs'->'30+20=50' is also NOT OK becuase you add new symbols + and =, jus

In [7]:
# Reshape the compressor output into a dataset folder containing a single train.json,
# which is the folder label_word.py is pointed at via --load_prompt_from.
compressed_path = "/kaggle/working/dataset/longbench_1_compressed.json"
hf_dataset_dir = "/kaggle/working/dataset/longbench_hf_dataset"

os.makedirs(hf_dataset_dir, exist_ok=True)

# The compressed file is a JSON object; only its values are kept, as a list.
# Context managers make sure both files are closed (and the output flushed) before
# the next cell's subprocess reads train.json.
with open(compressed_path, encoding="utf-8") as src:
    records = list(json.load(src).values())

# ensure_ascii=False writes raw non-ASCII characters, so the encoding is set explicitly.
with open(os.path.join(hf_dataset_dir, "train.json"), "w", encoding="utf-8") as dst:
    json.dump(records, dst, ensure_ascii=False)

In [8]:
# Annotate the training data: label_word.py loads the dataset folder prepared above
# (longbench_hf_dataset/train.json), uses a window size of 400 and saves its result
# under the given --save_path. It reports compression, hitting and retrieval rates.
!python label_word.py \
--load_prompt_from /kaggle/working/dataset/longbench_hf_dataset  \
--window_size 400 \
--save_path /kaggle/working/dataset/longbench_1_labelled.json

Generating train split: 2 examples [00:00, 57.04 examples/s]
15it [00:01, 13.07it/s]
window size: 400, comp rate: 0.14955640453642954, hitting_rate: 0.1469762469261397, retrieval rate: 0.14175059371948887


In [9]:
# Filter the annotated training data (the paper defines the quality metrics used here).
# NOTE(review): the labelling step was given a .json save path while this step loads the
# matching .pt file — presumably label_word.py writes both; confirm.
# A space precedes each continuation backslash, matching the other commands in this
# notebook and keeping the arguments separate if the command is run in a plain shell.
!python filter.py \
--load_path /kaggle/working/dataset/longbench_1_labelled.pt \
--save_path /kaggle/working/dataset/longbench_1_filtered.pt

15


In [10]:

# Switch to the model-training scripts so `!python train_roberta.py` below resolves by relative name.
os.chdir('/kaggle/working/LLMLingua/experiments/llmlingua2/model_training')

In [11]:
# Install a pinned TensorBoard (the setup cell uninstalled tensorboard); the next cell
# imports torch.utils.tensorboard.SummaryWriter.
# NOTE(review): --force-reinstall also reinstalls dependencies — the log below shows pip
# collecting numpy 2.3.2, which conflicts with the numpy==1.21.6 pin from the setup cell.
# Consider dropping --force-reinstall or adding --no-deps; confirm which is safe here.
!pip install --force-reinstall tensorboard==2.13.0

Collecting tensorboard==2.13.0
  Downloading tensorboard-2.13.0-py3-none-any.whl.metadata (1.8 kB)
Collecting absl-py>=0.4 (from tensorboard==2.13.0)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard==2.13.0)
  Downloading grpcio-1.74.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting google-auth<3,>=1.6.3 (from tensorboard==2.13.0)
  Downloading google_auth-2.40.3-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting google-auth-oauthlib<1.1,>=0.5 (from tensorboard==2.13.0)
  Downloading google_auth_oauthlib-1.0.0-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting markdown>=2.6.8 (from tensorboard==2.13.0)
  Downloading markdown-3.8.2-py3-none-any.whl.metadata (5.1 kB)
Collecting numpy>=1.12.0 (from tensorboard==2.13.0)
  Using cached numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting protobuf>=3.19.6 (from tensorboard==2.13.0)
  Downloading proto

In [12]:
# Sanity check: the training dependencies import cleanly; report the versions in use.
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

for label, module in (("NumPy", np), ("PyTorch", torch)):
    print(f"✅ {label}: {module.__version__}")

print("✅ All imports successful!")

✅ NumPy: 1.26.4
✅ PyTorch: 2.8.0+cu128
✅ All imports successful!


In [14]:
# Output directory passed to train_roberta.py as --save_path.
os.makedirs("/kaggle/working/roberta_custom", exist_ok=True)


In [15]:
# Train on the filtered annotations and save the result into roberta_custom:
# 3 epochs, learning rate 3e-5, batch size 16, float16 precision.
# The log below shows FacebookAI/xlm-roberta-large being loaded as
# XLMRobertaForTokenClassification with a newly initialised classifier head.
!python train_roberta.py \
    --data_path /kaggle/working/dataset/longbench_1_filtered.pt \
    --save_path /kaggle/working/roberta_custom \
    --num_epoch 3 \
    --lr 3e-5 \
    --quantization float16 \
    --batch_size 16


CUDA test successful on cuda
Loading model with float16 precision
Some weights of the model checkpoint at FacebookAI/xlm-roberta-large were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-large and are newly initialized: ['classifier.weight', 'cla