In [6]:
import sys

# Put the parent directory on the import path once, so that the `tdecomp`
# imports used later in this notebook can resolve (assumes the package
# lives one level above the notebook -- TODO confirm).
TDECOMP_PATH = '..'
if TDECOMP_PATH not in sys.path:
    sys.path.append(TDECOMP_PATH)

Берём небольшую языковую модель (SmallLM) — `arnir0/Tiny-LLM`.

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face Hub id of the small causal LM used in this notebook.
MODEL_NAME = "arnir0/Tiny-LLM"

# NOTE(review): the training-setup cell further down rebinds both `tokenizer`
# (there with use_fast=True) and `model`, so these two objects are replaced
# before training starts.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=False,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# train_imdb.py
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline
)
from datasets import load_dataset
import torch
# from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import numpy as np

# --- Model, data and output locations ---------------------------------------
# Alternative checkpoint that was tried: "Qwen/Qwen2-0.5B-Instruct"
# (with output_dir "./qwen2-0.5b-imdb-finetuned").
model_name = 'arnir0/Tiny-LLM'
output_dir = './tiny-llm'
# NOTE(review): `dataset_name` and `max_length` are not referenced anywhere in
# the visible cells -- the corpus actually loaded below is wikitext and the
# tokenizer call uses its own length limit.
dataset_name = "imdb"
max_length = 512  # maximum context length for each sample

# --- Optional 4-bit quantization (bitsandbytes); switched off here ----------
use_4bit = False
use_nested_quant = False
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = "float16"

# --- LoRA hyper-parameters ---------------------------------------------------
# Kept for reference; the peft import is commented out in this notebook.
lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

# --- Trainer hyper-parameters ------------------------------------------------
num_train_epochs = 3
per_device_train_batch_size = per_device_eval_batch_size = 4
gradient_accumulation_steps = 4
learning_rate = 2e-4
logging_steps, save_steps = 10, 500

print("Loading model and tokenizer...")

# Tokenizer: fall back to EOS as the padding token when the checkpoint
# does not define one.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Resolve the configured dtype name (e.g. "float16") to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Collect the branch-specific loading options, then load the model once.
if not use_4bit:
    # Full-precision weights on CPU; change device_map to "auto" to let
    # the library place the model on available accelerators.
    model_load_kwargs = dict(
        device_map="cpu",
        torch_dtype=torch.float32,
    )
else:
    from transformers import BitsAndBytesConfig
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )
    model_load_kwargs = dict(
        quantization_config=bnb_config,
        device_map="auto",  # automatically places layers on available GPUs
    )

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    **model_load_kwargs,
)

# Training corpus. Alternatives that were considered (swap the call below):
#   - Tiny Shakespeare (literary text): load_dataset("tiny_shakespeare", split="train[:5%]")
#   - CNN Daily Mail (news summaries):  load_dataset("cnn_dailymail", "3.0.0", split="train[:100]")
#   - Twitter Complaints (short text):  load_dataset("twitter_complaints", split="train[:200]")
#   - AG News (news articles):          load_dataset("ag_news", split="train[:100]")
# In use: the first 1000 rows of the raw WikiText-2 training split.
dataset = load_dataset(
    "wikitext",
    "wikitext-2-raw-v1",
    split="train[:1000]",
)

# Quick sanity check of what was loaded.
print(f"Dataset size: {len(dataset)}")
print(f"Dataset features: {dataset.features}")

# Preprocess the dataset based on its structure
def preprocess_dataset(examples):
    """Normalize a batch from differently-shaped datasets to a single "text" column.

    Args:
        examples: A batch as a mapping from column name to a list of values
            (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        A dict ``{"text": [...]}``. The source column is chosen in this order:
        a well-known text column ("text", "article", "content", "sentence"),
        then the first column whose first value is a string, and finally the
        first column with every value converted via ``str``. A batch without
        any columns yields ``{"text": []}``.
    """
    # Well-known text column names, checked in priority order.
    for column in ("text", "article", "content", "sentence"):
        if column in examples:
            return {"text": examples[column]}

    # Otherwise use the first column that holds strings. The length check
    # prevents an IndexError when a column in the batch is empty.
    for column, values in examples.items():
        if len(values) > 0 and isinstance(values[0], str):
            return {"text": examples[column]}

    # Last resort: stringify the first column, if there is one at all.
    columns = list(examples.keys())
    if not columns:
        return {"text": []}
    return {"text": [str(x) for x in examples[columns[0]]]}

# Normalize every batch to a single "text" column.
dataset = dataset.map(preprocess_dataset, batched=True)

# Keep only rows whose text is non-empty after stripping whitespace.
dataset = dataset.filter(lambda example: bool(example["text"].strip()))

# Tokenization function
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of texts for causal-LM training.

    Sequences are truncated to ``max_length`` tokens and deliberately left
    unpadded: `DataCollatorForLanguageModeling` pads each training batch to
    its own longest sequence, so padding here would only store padding tokens
    in the dataset. No ``labels`` column is produced either, because with
    ``mlm=False`` that collator builds the labels from ``input_ids`` itself
    (masking padding positions), replacing any labels prepared here.
    ``return_tensors`` is omitted because `Dataset.map` stores plain lists,
    so creating tensors here is a wasted round trip.

    Args:
        examples: Batch mapping that contains a "text" list of strings.
        max_length: Truncation limit in tokens (128 keeps the tiny model's
            sequences short).

    Returns:
        The tokenizer output for the batch (``input_ids``, ``attention_mask``, ...).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
    )

# Run the tokenizer over the whole dataset, batch by batch.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = train_test_split["train"], train_test_split["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")


# Accuracy metric object, used by compute_metrics below.
accuracy_metric = evaluate.load("accuracy")

# Batch collator for causal language modelling (mlm=False, i.e. no masked-LM
# objective). It pads a batch to a common length when the sequences differ.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

def compute_metrics(eval_pred):
    """Compute next-token prediction accuracy for a causal LM.

    Args:
        eval_pred: ``(logits, labels)`` pair as handed over by `Trainer`.
            Both are array-like (NumPy arrays when coming from `Trainer`);
            ``logits`` may also be a tuple whose first item holds the logits.

    Returns:
        The dict produced by ``accuracy_metric.compute`` for all positions
        whose label is not -100.
    """
    logits, labels = eval_pred
    # Some models return several outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    # Work on NumPy arrays: the torch-only `.contiguous()` used previously
    # raises AttributeError on the arrays `Trainer` passes in.
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # The logits at position t predict the token at t + 1, so drop the last
    # prediction and the first label to line the two up.
    predictions = np.argmax(logits[..., :-1, :], axis=-1).reshape(-1)
    references = labels[..., 1:].reshape(-1)

    # Ignore positions excluded from the loss (label == -100, e.g. padding).
    mask = references != -100
    return accuracy_metric.compute(
        predictions=predictions[mask],
        references=references[mask],
    )

# ----------------------------
# Training arguments
# ----------------------------
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    # Log and evaluate every 5 optimizer steps. The literal deliberately
    # overrides the `logging_steps` constant so that the short run still
    # yields a readable loss curve.
    logging_steps=5,
    save_steps=save_steps,
    eval_strategy="steps",
    eval_steps=5,
    save_total_limit=2,
    load_best_model_at_end=True,
    # The string "none" is the documented way to switch off external loggers
    # (Weights & Biases, TensorBoard, ...). Passing the Python value None does
    # not disable them: it selects the library default instead.
    report_to="none",
    fp16=False,  # mixed-precision training is disabled (full fp32)
)

Loading model and tokenizer...
Dataset size: 1000
Dataset features: {'text': Value(dtype='string', id=None)}
Training samples: 517
Validation samples: 130


In [10]:
from tdecomp.grad_proj.tensorgrad.config import TensorGRaDConfig, DataConfig, OptimizerConfig
import tdecomp.matrix.functional as F 

`ParallelTG` и `ULTG` — это вспомогательные классы-фабрики: они скрывают второстепенные настройки, чтобы упростить использование.
Если понадобится более тонкая настройка, смотрите `TensorGRaDConfig` и передавайте соответствующие поля (там есть `**kwargs`).

In [13]:
from tdecomp.grad_proj.tensorgrad.prepared_tg import ParallelTG, ULTG

# Build the value passed to Trainer(optimizers=...) through the ParallelTG factory.
# NOTE(review): only per_device_train_batch_size is passed as batch_size;
# gradient_accumulation_steps is not -- confirm this is what ParallelTG expects.
tg_optimizers = ParallelTG(
    model,
    # svd_type: custom matrix decompositions can be registered in module `F`;
    # the plain `truncated_svd` and `randomized_svd` are available as well.
    F.cur,
    # First and second rank. A single Number sets both ranks to the same value.
    # A float with 0 < value < 1 is read as a fraction of the parameters,
    # an int as the rank itself.
    (8, 0.5),
    n_train=len(train_dataset),
    batch_size=per_device_train_batch_size,
    scheduler='StepLR',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    optimizers=tg_optimizers,
    # compute_metrics=compute_metrics,  # uncomment to also report metrics at evaluation time (slower)
)


In [14]:
# Run the training loop. `trainer.train()` is left as the cell's last
# expression so that the notebook displays the returned TrainOutput.
print("Starting training...")
trainer.train()

Starting training...
### Using Composite Projector Configuration ###
    => Swapping projectors to ensure smaller one is first
    => Sizes after swap: first=0.25, second=8.0
Update gap scheduler: <tdecomp.grad_proj.tensorgrad.projectors.projector_utils.UpdateGapScheduler object at 0x7efb91f5ad40>
UnstructuredSparseProjector initialized with sparse_ratio=0.25, sparse_type=topk, scale_by_mask_ratio=False
    => First projector: unstructured_sparse
    => Second projector: low_rank
### Using Composite Projector Configuration ###
    => Swapping projectors to ensure smaller one is first
    => Sizes after swap: first=0.25, second=8.0
Update gap scheduler: <tdecomp.grad_proj.tensorgrad.projectors.projector_utils.UpdateGapScheduler object at 0x7efb91f5b580>
UnstructuredSparseProjector initialized with sparse_ratio=0.25, sparse_type=topk, scale_by_mask_ratio=False
    => First projector: unstructured_sparse
    => Second projector: low_rank
### Using Composite Projector Configuration ###
   

Step,Training Loss,Validation Loss
5,5.3304,5.43249
10,5.2352,5.362298
15,5.208,5.290714
20,5.1437,5.216618
25,5.0655,5.143417
30,4.9276,5.076861
35,5.0054,5.070557
40,4.8951,5.064242
45,4.9276,5.057898
50,5.0329,5.05156


TrainOutput(global_step=99, training_loss=4.999285476376312, metrics={'train_runtime': 6.9874, 'train_samples_per_second': 221.971, 'train_steps_per_second': 14.168, 'total_flos': 8153535430656.0, 'train_loss': 4.999285476376312, 'epoch': 3.0})