## Step 1: Mount Google Drive

In [1]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the later steps read the FEVER
# data from, and write the tokenized dataset to, paths under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Step 2: Unzip the Processed FEVER Dataset from Drive

In [2]:
import os

# Path to the zip file on Drive
fever_zip = '/content/drive/MyDrive/fever_processed.zip'
fever_dir = '/content/drive/MyDrive/fever_processed'

# Check if unzip already done
if os.path.exists(os.path.join(fever_dir, 'dataset_dict.json')):
    print(f"Processed dataset already exists at {fever_dir}")
elif os.path.exists(fever_zip):
    print(f"Unzipping {fever_zip}...")
    !unzip -q {fever_zip} -d /content/drive/MyDrive/
    print("Done unzipping.")

    # Check if unzip created nested structure (fever/fever/...)
    print("\nChecking unzipped structure...")
    !ls -lh /content/drive/MyDrive/fever_processed/ 2>/dev/null || echo "fever_processed folder not found"

    # If dataset_dict.json is in a nested 'fever' subfolder, move it up
    nested_fever = os.path.join(fever_dir, 'fever')
    if os.path.exists(nested_fever) and os.path.exists(os.path.join(nested_fever, 'dataset_dict.json')):
        print(f"Found nested structure. Moving contents from {nested_fever} to {fever_dir}...")
        !mv {nested_fever}/* {fever_dir}/ && rmdir {nested_fever}
        print("Moved successfully.")
else:
    raise FileNotFoundError(f"fever_processed.zip not found at {fever_zip}. Upload it to Drive first.")

# Verify final contents
print(f"\nFinal contents of {fever_dir}:")
!ls -lh {fever_dir}

Unzipping /content/drive/MyDrive/fever_processed.zip...
replace /content/drive/MyDrive/fever/dataset_dict.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
Done unzipping.

Checking unzipped structure...
fever_processed folder not found

Final contents of /content/drive/MyDrive/fever_processed:
ls: cannot access '/content/drive/MyDrive/fever_processed': No such file or directory


## Step 3: Clone Repository

In [3]:
# Clone your repo (update with your actual repo URL)
!git clone https://github.com/kvj-085/NLP_project.git
%cd /content/NLP_project

Cloning into 'NLP_project'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 34 (delta 9), reused 32 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (34/34), 128.75 KiB | 4.77 MiB/s, done.
Resolving deltas: 100% (9/9), done.
/content/NLP_project


## Step 4: Install Dependencies

In [4]:
# Install required packages with --no-deps to avoid breaking Colab system packages
!pip install --no-deps transformers datasets tokenizers huggingface_hub safetensors

print("\n=== Package versions ===")
!pip show transformers datasets


=== Package versions ===
Name: transformers
Version: 4.57.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.12/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers
---
Name: datasets
Version: 4.0.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /usr/local/lib/python3.12/dist-packages
Requires: dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, 

## Step 5: Tokenize the Processed FEVER Dataset for RoBERTa

In [None]:
import os
from src.models.train import run_training

# Folder holding the processed FEVER dataset unzipped in Step 2.
fever_data_dir = '/content/drive/MyDrive/fever'

# Fail fast, pointing back at Step 2, when the dataset folder is absent.
dataset_present = os.path.exists(fever_data_dir)
if not dataset_present:
    raise FileNotFoundError(f"Processed dataset not found at {fever_data_dir}. Unzip fever_processed.zip first (Step 2).")

print("Using processed dataset from:", fever_data_dir)
print("Contents:", os.listdir(fever_data_dir))

print("\nTokenizing dataset for RoBERTa (BPE tokenizer)...")

# Arguments for a tokenization-only pass: epochs=0 requests no training, and
# save_tokenized/tokenized_out_dir ask run_training to persist the result on Drive.
tokenize_only_args = dict(
    processed_data_dir=fever_data_dir,
    model_name='roberta-base',
    output_dir='/tmp/tmp_output',
    num_labels=3,
    epochs=0,
    max_length=128,
    save_tokenized=True,
    tokenized_out_dir='/content/drive/MyDrive/fever_tokenized_roberta',
)
trainer, tokenized = run_training(**tokenize_only_args)

print("\n=== Tokenization Complete ===")
print("Tokenized dataset saved to: /content/drive/MyDrive/fever_tokenized_roberta")

Using processed dataset from: /content/drive/MyDrive/fever
Contents: ['fever_hub_repo_files.txt', 'dataset_dict.json', 'test', 'train', 'validation']

Tokenizing dataset for RoBERTa (BPE tokenizer)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/145449 [00:00<?, ? examples/s]

Map:   0%|          | 0/19998 [00:00<?, ? examples/s]

Map:   0%|          | 0/19998 [00:00<?, ? examples/s]

Saving tokenized dataset to /content/drive/MyDrive/fever_tokenized_roberta


Saving the dataset (0/1 shards):   0%|          | 0/145449 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19998 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19998 [00:00<?, ? examples/s]

Epochs <= 0 — tokenization-only run requested. Skipping Trainer setup.

=== Tokenization Complete ===
Tokenized dataset saved to: /content/drive/MyDrive/fever_tokenized_roberta


## Step 6: Verify Tokenized Dataset

In [7]:
from datasets import load_from_disk

# Reload the tokenized dataset from Drive and check it before declaring success.
tokenized_path = '/content/drive/MyDrive/fever_tokenized_roberta'
ds = load_from_disk(tokenized_path)

split_sizes = {k: len(v) for k, v in ds.items()}
print("Dataset splits:", split_sizes)

# Every expected split must be present and non-empty.
missing_splits = {'train', 'validation', 'test'} - set(split_sizes)
assert not missing_splits, f"Missing splits in {tokenized_path}: {sorted(missing_splits)}"
empty_splits = [name for name, size in split_sizes.items() if size == 0]
assert not empty_splits, f"Empty splits in {tokenized_path}: {empty_splits}"

# Every split must carry the tokenizer outputs the model consumes.
for split_name, split in ds.items():
    missing_columns = {'input_ids', 'attention_mask'} - set(split.column_names)
    assert not missing_columns, f"Split '{split_name}' is missing columns: {sorted(missing_columns)}"

print("Train columns:", ds['train'].column_names)
print("Sample input_ids length:", len(ds['train'][0]['input_ids']))
print("\nTokenization successful! Ready for training.")

Dataset splits: {'train': 145449, 'validation': 19998, 'test': 19998}
Train columns: ['text', 'label', 'input_ids', 'attention_mask']
Sample input_ids length: 128

Tokenization successful! Ready for training.


## Step 7: List Files in Tokenized Directory

In [8]:
!ls -lh /content/drive/MyDrive/fever_tokenized_roberta/

total 13K
-rw------- 1 root root   43 Dec  7 21:51 dataset_dict.json
drwx------ 2 root root 4.0K Dec  7 21:51 test
drwx------ 2 root root 4.0K Dec  7 21:51 train
drwx------ 2 root root 4.0K Dec  7 21:51 validation




The tokenized RoBERTa dataset is now saved to your Google Drive at:
`/content/drive/MyDrive/fever_tokenized_roberta`
