In [None]:
from pathlib import Path
import os

# Set up dynamic project paths.
# The project root is taken to be the directory that contains the "notebooks"
# folder, located relative to the kernel's current working directory.
current_dir = Path.cwd()

if current_dir.name == "notebooks":
    # Running from inside the notebooks folder: the root is one level up.
    PROJECT_ROOT = current_dir.parent
else:
    # Look at the working directory first and then at each ancestor for a
    # "notebooks" folder, so the notebook also works when the kernel starts in
    # a deeper subdirectory. If no such folder is found, fall back to the
    # parent directory, which is what this cell always did before.
    PROJECT_ROOT = next(
        (
            candidate
            for candidate in (current_dir, *current_dir.parents)
            if (candidate / "notebooks").exists()
        ),
        current_dir.parent,
    )

DATA_DIR = PROJECT_ROOT / "data" / "OPUS"
CHECKPOINT_DIR = PROJECT_ROOT / "checkpoints"

# Create the directories (and any missing parents) if they don't exist yet.
DATA_DIR.mkdir(parents=True, exist_ok=True)
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong guess is visible immediately.
print(f"Current Directory: {current_dir}")
print(f"Project Root: {PROJECT_ROOT}")
print(f"Data Directory: {DATA_DIR}")
print(f"Checkpoint Directory: {CHECKPOINT_DIR}")

In [1]:
from datasets import load_dataset

In [2]:
!curl -L -o train.parquet "https://huggingface.co/datasets/opus100/resolve/main/en-hi/train-00000-of-00001.parquet"
!curl -L -o validation.parquet "https://huggingface.co/datasets/opus100/resolve/main/en-hi/validation-00000-of-00001.parquet"
!curl -L -o test.parquet "https://huggingface.co/datasets/opus100/resolve/main/en-hi/test-00000-of-00001.parquet"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   114  100   114    0     0    690      0 --:--:-- --:--:-- --:--:--   690
100  1347  100  1347    0     0   4338      0 --:--:-- --:--:-- --:--:--  4338
100 62.1M  100 62.1M    0     0  93.2M      0 --:--:-- --:--:-- --:--:--  246M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   119  100   119    0     0    723      0 --:--:-- --:--:-- --:--:--   725
100  1347  100  1347    0     0   4016      0 --:--:-- --:--:-- --:--:--  4016
100  241k  100  241k    0     0   572k      0 --:--:-- --:--:-- --:--:--  572k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   113  100   113    0     0    701      0 --:--:

In [3]:
# Load the three parquet splits that the download cell wrote into the current
# working directory. Relative file names keep this cell in step with that
# location instead of pinning it to one machine's absolute checkout path.
dataset = load_dataset(
    "parquet",
    data_files={
        "train": "train.parquet",
        "validation": "validation.parquet",
        "test": "test.parquet",
    },
)

# Show the split sizes and one raw record as a quick sanity check.
print(dataset)
print(dataset["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 534319
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})
{'translation': {'en': 'Other, Private Use', 'hi': 'अन्य, निज़ी उपयोग'}}


In [4]:
import pandas as pd

# Build a DataFrame from the training split's "translation" records; each
# record is a dict with an "en" and a "hi" entry, which become the columns.
df = pd.DataFrame(dataset['train']['translation'])

Unnamed: 0,en,hi
0,"Other, Private Use","अन्य, निज़ी उपयोग"
1,[SCREAMING],ऊबड़ .
2,Spouse,जीवनसाथी
3,I will never salute you!,- तुम एक कमांडर कभी नहीं होगा!
4,and the stars and the trees bow themselves;,और तारे और वृक्ष सजदा करते है;


In [5]:
# Draw 5000 rows at random; the fixed random_state makes the sample repeatable.
small_df = df.sample(5000, random_state=42)


In [6]:
# Keep only the rows whose English and Hindi strings are both shorter than
# 200 characters.
small_df = small_df[
    small_df["en"].str.len().lt(200) & small_df["hi"].str.len().lt(200)
]


In [7]:
# Write the filtered sample as JSON Lines (one record per line) into DATA_DIR,
# the data/OPUS folder set up by the path-configuration cell, rather than to a
# hard-coded absolute path that only exists on one machine.
small_df.to_json(DATA_DIR / "OPUSen_hi_opus.json", orient="records", lines=True)


In [8]:
from transformers import AutoTokenizer

# Load the tokenizer published under the "EleutherAI/pythia-1.4b" identifier.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-1.4b")



In [9]:
MAX_LENGTH = 128  # word-count limit per sentence; adjust based on your study design

def filter_long_sentences(example):
    """Return True when both sides of a pair are shorter than MAX_LENGTH words.

    Length is measured in whitespace-separated words (``str.split()``), not in
    tokenizer tokens.

    Args:
        example: mapping with string values under the keys "en" and "hi".

    Returns:
        bool: True if each of the two texts has fewer than MAX_LENGTH words,
        otherwise False.
    """
    return all(len(example[lang].split()) < MAX_LENGTH for lang in ("en", "hi"))

In [10]:
# Lift the nested "translation" dict into top-level "en" and "hi" columns, then
# drop every pair that filter_long_sentences rejects.
tokenized = (
    dataset["train"]
    .map(lambda row: {"en": row["translation"]["en"], "hi": row["translation"]["hi"]})
    .filter(filter_long_sentences)
)


Map:   0%|          | 0/534319 [00:00<?, ? examples/s]

Filter:   0%|          | 0/534319 [00:00<?, ? examples/s]

In [11]:
def tokenize_function(examples):
    """Tokenize a batch of sentence pairs as single causal-LM sequences.

    Each English sentence is joined to its Hindi translation with a newline and
    the joined text is tokenized as one sequence, so the Hindi text is part of
    ``input_ids``.

    The earlier ``text_target=`` form stored the Hindi ids in a separate
    ``labels`` column, which does not fit this notebook's pipeline:
    ``DataCollatorForLanguageModeling(mlm=False)`` rebuilds ``labels`` from
    ``input_ids``, so the Hindi side never reached the model, and the unpadded
    variable-length ``labels`` lists fail to batch as soon as more than one
    example is collated (as happens with the default evaluation batch size).

    Args:
        examples: batch mapping with parallel lists of strings under the keys
            "en" and "hi".

    Returns:
        The tokenizer output for the joined texts.
    """
    pair_texts = [
        f"{en_text}\n{hi_text}"
        for en_text, hi_text in zip(examples["en"], examples["hi"])
    ]
    return tokenizer(pair_texts)

tokenized = tokenized.map(tokenize_function, batched=True)

Map:   0%|          | 0/533220 [00:00<?, ? examples/s]

In [12]:
# Hold out 10% of the tokenized training data for evaluation; the seed fixes
# which rows land in each part.
tokenized_split = tokenized.train_test_split(test_size=0.1, seed=42)
# Named handles for the two parts of the split.
train_dataset = tokenized_split['train']
eval_dataset  = tokenized_split['test']

In [13]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Pick the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1️⃣ Load the architecture configuration from the project's local
# "pythia-1.4b" folder. The path is built from PROJECT_ROOT (set in the
# path-configuration cell) instead of a hard-coded absolute path.
config = AutoConfig.from_pretrained(str(PROJECT_ROOT / "pythia-1.4b"))

# 2️⃣ Initialize the model skeleton from the config alone; from_config builds
# the architecture without loading pretrained weights.
model = AutoModelForCausalLM.from_config(config)


In [14]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    # Save under the project's checkpoint folder (CHECKPOINT_DIR from the
    # path-configuration cell) instead of a hard-coded absolute path.
    output_dir=str(CHECKPOINT_DIR),
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    save_strategy="epoch",  # write a checkpoint once per epoch
    logging_dir="./logs",
    eval_strategy="epoch",  # run evaluation once per epoch
    learning_rate=5e-5,
)



In [15]:
# Collator for causal language modelling: mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [16]:
# Reuse the end-of-sequence token as the padding token, and record the matching
# token id on the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

In [17]:
from transformers import Trainer

# Assemble the Trainer from the model, the training arguments, the collator and
# the two parts of the train/test split (train_dataset and eval_dataset are the
# "train" and "test" entries of tokenized_split).
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [18]:
# Start the training run configured on the Trainer above.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkaustubh-kislay[0m ([33mkaustubh-kislay-university-of-wisconsin-madison[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 