In [None]:
!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [3]:
from pathlib import Path
import os

# Resolve the project layout relative to wherever the kernel was started.
current_dir = Path(os.getcwd())

# The working directory is taken as the project root only when it is not itself
# named "notebooks" and it contains a "notebooks" folder. In every other case
# (including when we are inside notebooks/) the parent directory is used.
if current_dir.name != "notebooks" and (current_dir / "notebooks").exists():
    PROJECT_ROOT = current_dir
else:
    PROJECT_ROOT = current_dir.parent

DATA_DIR = PROJECT_ROOT / "data" / "OPUS"
CHECKPOINT_DIR = PROJECT_ROOT / "checkpoints"

# Ensure both output locations exist; existing directories are left untouched.
DATA_DIR.mkdir(parents=True, exist_ok=True)
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved layout so a wrong working directory is spotted immediately.
print(
    f"Current Directory: {current_dir}",
    f"Project Root: {PROJECT_ROOT}",
    f"Data Directory: {DATA_DIR}",
    f"Checkpoint Directory: {CHECKPOINT_DIR}",
    sep="\n",
)

Current Directory: /root/LLM-Language-Learning/notebooks
Project Root: /root/LLM-Language-Learning
Data Directory: /root/LLM-Language-Learning/data/OPUS
Checkpoint Directory: /root/LLM-Language-Learning/checkpoints


In [4]:
from datasets import load_dataset

In [None]:
!curl -L -o train.parquet "https://huggingface.co/datasets/opus100/resolve/main/en-hi/train-00000-of-00001.parquet"
!curl -L -o validation.parquet "https://huggingface.co/datasets/opus100/resolve/main/en-hi/validation-00000-of-00001.parquet"
!curl -L -o test.parquet "https://huggingface.co/datasets/opus100/resolve/main/en-hi/test-00000-of-00001.parquet"

In [5]:
# Load the three OPUS-100 en-hi splits from the parquet files fetched by the
# curl cell. Those downloads land in the notebook's working directory, so the
# paths are resolved against `current_dir` rather than a machine-specific
# absolute path.
dataset = load_dataset(
    "parquet",
    data_files={
        split: str(current_dir / f"{split}.parquet")
        for split in ("train", "validation", "test")
    },
)

# Show the split sizes and one raw record as a sanity check.
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 534319
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})
{'translation': {'en': 'Other, Private Use', 'hi': 'अन्य, निज़ी उपयोग'}}


In [6]:
import pandas as pd

# Build a DataFrame from the "translation" records of the training split; the
# cells below read its "en" and "hi" columns.
df = pd.DataFrame(data=dataset["train"]["translation"])

In [7]:
# Draw a fixed-size random subset of sentence pairs; the seed keeps the sample
# identical across runs.
small_df = df.sample(n=5000, random_state=42)


In [8]:
# Keep only the pairs whose English and Hindi texts are both shorter than
# 200 characters.
small_df = small_df.loc[
    small_df["en"].str.len().lt(200) & small_df["hi"].str.len().lt(200)
]


In [9]:
# Write the sampled pairs as JSON Lines (one record per line) into DATA_DIR,
# the data folder resolved and created by the path-setup cell, instead of a
# machine-specific absolute path.
small_df.to_json(DATA_DIR / "OPUSen_hi_opus.json", orient="records", lines=True)


In [10]:
from transformers import AutoTokenizer

# Load the tokenizer from the local "pythia-1.4b" folder under the project
# root. Using PROJECT_ROOT keeps the notebook runnable when the repository is
# checked out somewhere other than /root.
tokenizer = AutoTokenizer.from_pretrained(PROJECT_ROOT / "pythia-1.4b")


In [11]:
MAX_LENGTH = 128  # adjust based on your study design

def filter_long_sentences(example):
    """Return True when both sides of a pair are under the word limit.

    Args:
        example: Mapping with string values under the "en" and "hi" keys.

    Returns:
        True if the whitespace-split word count of the English text and of the
        Hindi text are each strictly below MAX_LENGTH, otherwise False.
    """
    # "en" is checked first; "hi" is only inspected when "en" passes.
    return all(
        len(example[lang].split()) < MAX_LENGTH
        for lang in ("en", "hi")
    )

In [12]:
# Lift the nested "translation" dict into top-level "en" / "hi" columns, then
# drop every pair rejected by filter_long_sentences.
tokenized = (
    dataset["train"]
    .map(lambda row: {lang: row["translation"][lang] for lang in ("en", "hi")})
    .filter(filter_long_sentences)
)


In [13]:
def tokenize_function(examples, separator="\n", max_length=2048):
    """Tokenize a batch of en/hi pairs as single causal-LM sequences.

    The previous version passed the Hindi side as ``text_target``, which stores
    its token ids in a separate ``labels`` column. This notebook trains with
    ``DataCollatorForLanguageModeling(mlm=False)``, which rebuilds ``labels``
    from ``input_ids``, so the Hindi text never reached the loss. The ragged
    ``labels`` column also cannot be padded once a batch holds more than one
    example. Each pair is therefore joined into one sequence so both languages
    are part of ``input_ids``.

    Args:
        examples: Batch mapping with parallel lists under "en" and "hi"
            (as supplied by ``Dataset.map(..., batched=True)``).
        separator: String inserted between the English and Hindi text.
        max_length: Maximum number of tokens kept per sequence; longer
            sequences are truncated. The default of 2048 is assumed to match
            the model's context window — TODO confirm against the local config.

    Returns:
        The tokenizer output for the joined texts (no "labels" entry; the data
        collator creates the labels at batch time).
    """
    texts = [
        f"{en}{separator}{hi}"
        for en, hi in zip(examples["en"], examples["hi"])
    ]
    # Truncate so an unusually long pair cannot exceed the model's positions.
    return tokenizer(texts, truncation=True, max_length=max_length)

# Tokenize the filtered pairs in batches; the tokenizer output is added as new
# columns next to the existing ones.
tokenized = tokenized.map(function=tokenize_function, batched=True)

In [14]:
# Hold out 10% of the tokenized pairs for evaluation; the seed makes the split
# reproducible.
tokenized_split = tokenized.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = tokenized_split["train"], tokenized_split["test"]

In [15]:
import torch
# Report the installed PyTorch build and, when CUDA is usable, the first GPU.
print("Torch version:", torch.__version__)
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
if cuda_ok:
    print("GPU name:", torch.cuda.get_device_name(0))

Torch version: 2.5.1+cu121
CUDA available: True
GPU name: NVIDIA A40


In [16]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Pick the GPU when one is available. `device` is only printed in this cell;
# nothing here moves the model onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Read the model configuration from the local "pythia-1.4b" folder under the
# project root (PROJECT_ROOT instead of a machine-specific absolute path).
config = AutoConfig.from_pretrained(PROJECT_ROOT / "pythia-1.4b")

# Build the model architecture from the configuration.
# NOTE(review): from_config creates freshly initialised weights and does not
# load the pretrained checkpoint. Confirm that training from scratch is
# intended; otherwise AutoModelForCausalLM.from_pretrained should be used.
model = AutoModelForCausalLM.from_config(config)


cuda


In [17]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Training configuration: a single epoch with batch size 1, evaluating and
# checkpointing at the end of each epoch.
training_args = TrainingArguments(
    # Reuse the checkpoint folder resolved (and created) by the path-setup cell
    # instead of repeating a machine-specific absolute path.
    output_dir=str(CHECKPOINT_DIR),
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    save_strategy="epoch",
    logging_dir="./logs",  # relative to the kernel's working directory
    eval_strategy="epoch",
    learning_rate=5e-5,
)



In [18]:
# Collator for causal language modelling (mlm=False disables masked-LM
# masking); it batches examples using the notebook's tokenizer.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [19]:
# Reuse the end-of-sequence token for padding, on both the model config and
# the tokenizer, so the two agree on the padding id.
model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [20]:
from transformers import Trainer

# Wire the model, training configuration, collator and the train/eval halves
# of the split into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_split["train"],
    eval_dataset=tokenized_split["test"],
)

In [21]:
# Start training with the Trainer built above (per `training_args`: one epoch,
# with evaluation and a checkpoint at the end of the epoch).
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkaustubh-kislay[0m ([33mkaustubh-kislay-university-of-wisconsin-madison[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 