In [None]:
import os

# Hugging Face access token. Paste a value into the placeholder only for local
# experiments and never commit it; prefer exporting HF_TOKEN before the kernel starts.
_hf_token_placeholder = ""
if _hf_token_placeholder:
    os.environ['HF_TOKEN'] = _hf_token_placeholder
else:
    # An empty placeholder must not wipe out a token the environment already provides.
    os.environ.setdefault('HF_TOKEN', "")

In [None]:
# Hub endpoint placeholder: replace the angle-bracket text with a real URL before running.
%env HF_ENDPOINT=<your jf hf repo url>
# Hub timeouts, passed through verbatim by %env (86400 is presumably seconds, i.e. 24 h; confirm).
%env HF_HUB_ETAG_TIMEOUT=86400
%env HF_HUB_DOWNLOAD_TIMEOUT=86400
# %env JF_URL=https://jfrogmldemo.jfrog.io
# Read back through os.getenv("finetuning") / os.getenv("batch_size") when hyper_parameters is built.
%env finetuning=False
%env batch_size=64
# Placeholder metric: the evaluation cell reassigns avg_eval_loss; this value is what
# gets logged if that cell is skipped.
avg_eval_loss = 0.7315

In [None]:
## Unsafe model
##dbalencar/vgg16_light

## Unapproved

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForCausalLM

import os
import logging

logger = logging.getLogger(__name__)
# Checkpoint to load; the "model_name" environment variable overrides the default.
model_name = os.getenv("model_name", "moonshotai/Kimi-K2-Thinking")
# os.getenv returns a str whenever the variable is set, so the numeric settings are
# converted explicitly; unset variables still yield the same defaults as before.
hyper_parameters = {
    'learning_rate' : float(os.getenv("learning_rate", 0.0001)),
    'epochs' : int(os.getenv("epochs", 20)),
    'batch_size': int(os.getenv("batch_size", 200)),
    'early_stopping' : os.getenv("early_stopping", "True") == "True",
    'Finetunning' : os.getenv("finetuning", "False") == "True"
}

# Tokenizer and causal-LM weights are both resolved from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(
    model_name
)
model = AutoModelForCausalLM.from_pretrained(
    model_name
)

## Approved

In [None]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from main.finetuning import eval_model, generate_dataset, train_model
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import os
import logging

logger = logging.getLogger(__name__)
# Checkpoint to load; the "model_name" environment variable overrides the default.
model_name = os.getenv("model_name", "distilbert/distilbert-base-uncased-finetuned-sst-2-english")
# Target repository / model id used when talking to the model registry later on.
repository = 'nlp-models'
model_id = 'sentiment_analysis'
# Local directory the fine-tuned weights are written to.
model_path = "./fine_tuned_distilbert_sst2"
# os.getenv returns a str whenever the variable is set, so the numeric settings are
# converted explicitly; unset variables still yield the same defaults as before.
hyper_parameters = {
    'learning_rate' : float(os.getenv("learning_rate", 0.0001)),
    'epochs' : int(os.getenv("epochs", 20)),
    'batch_size': int(os.getenv("batch_size", 200)),
    'early_stopping' : os.getenv("early_stopping", "True") == "True",
    'Finetunning' : os.getenv("finetuning", "False") == "True"
}
# NOTE(review): training is forced on here regardless of the "finetuning" environment
# variable captured in hyper_parameters['Finetunning'] — confirm this is intended.
finetuning = True

# Tokenizer and classification head are both resolved from the same checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained(
    model_name
)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name
)

In [None]:
# Bare expression: the notebook renders the repr of the loaded model as the cell output.
model

In [None]:
# Prefer the GPU when one is present, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Setting device as {device}")

print("Downloading dataset")
dataset = load_dataset("stanfordnlp/sst2")

print("Generating datasets")
train_dataset, eval_dataset = generate_dataset(tokenizer, dataset)

# Tabular view of the training examples plus simple per-sentence text statistics.
df_train = train_dataset.examples.data.to_pandas()
sentence_features = {
    'num_spaces': lambda text: text.count(' '),
    'num_words': lambda text: len(text.split()),
    'sentence_length': len,
}
for feature_name, compute_feature in sentence_features.items():
    df_train[feature_name] = df_train['sentence'].apply(compute_feature)

In [None]:
# 1. Create data loaders with minimal worker settings
print("Setting up data loaders...")
# Scale the configured batch size by the number of GPUs (factor 1 when CUDA is unavailable).
batch_size = hyper_parameters['batch_size'] * (torch.cuda.device_count() if torch.cuda.is_available() else 1)
print(f"Using batch size: {batch_size}")

# Options shared by both loaders.
loader_options = dict(
    batch_size=batch_size,
    num_workers=0,  # Set to 0 to avoid shared memory issues
    # Pinned host memory only helps host-to-GPU copies; requesting it without CUDA
    # merely triggers a warning, so enable it only when a GPU is present.
    pin_memory=torch.cuda.is_available(),
)

# Training batches are reshuffled; evaluation keeps the dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
eval_loader = DataLoader(eval_dataset, shuffle=False, **loader_options)
print("Data loader setup complete")

------------------------------

In [None]:
# 2. Set environment variables
import os
# Enumerate GPUs in PCI bus order.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Expose every detected GPU by default, but keep a device mask that was already set
# for this process: overwriting it with 0..n-1 could point at different devices
# than the ones the operator selected.
os.environ.setdefault(
    "CUDA_VISIBLE_DEVICES",
    ",".join(str(i) for i in range(torch.cuda.device_count())),
)

# 3. Setup GPU
def setup_gpu():
    """Choose the compute device and report whether DataParallel should be used.

    Returns:
        tuple: ``(device, is_multi_gpu)`` where ``device`` is ``torch.device('cuda')``
        when CUDA is available and ``torch.device('cpu')`` otherwise, and
        ``is_multi_gpu`` is True only when more than one GPU is detected.
    """
    print("Starting GPU setup...")

    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        print(f"Found {gpu_count} GPUs")

        use_data_parallel = gpu_count > 1
        if use_data_parallel:
            print(f"Using {gpu_count} GPUs with DataParallel")
        else:
            print("Using single GPU")
        return torch.device('cuda'), use_data_parallel

    print("CUDA not available, using CPU")
    return torch.device('cpu'), False

# 4. Setup device
# Rebinds `device` (first assigned in the dataset cell) and records whether more
# than one GPU was detected.
device, is_multi_gpu = setup_gpu()
print(f"Device: {device}, Multi-GPU: {is_multi_gpu}")

In [None]:
# 5. Model setup
# Wrap at most once: re-running this cell on an already wrapped model would nest
# DataParallel wrappers, and `model.module` would then no longer be the underlying model.
if is_multi_gpu and not isinstance(model, torch.nn.DataParallel):
    print("Wrapping model with DataParallel")
    model = torch.nn.DataParallel(model)
print("Model setup complete")

In [None]:
# 6. Training
if finetuning:
    print("Starting training...")
    # train_model is handed the learning rate rather than an optimizer, so the AdamW
    # instance that used to be built here was never used and has been dropped.
    # Arguments shared by the first attempt and the single-GPU retry.
    training_args = (
        device,
        hyper_parameters['learning_rate'],
        hyper_parameters['epochs'],
        train_loader,
        eval_loader,
        hyper_parameters['early_stopping'],
        logger,
    )
    training_options = {'is_distributed': False, 'local_rank': 0}

    try:
        model = train_model(model, *training_args, **training_options)
    except Exception as e:
        print(f"Error during training: {str(e)}")
        if not is_multi_gpu:
            # Nothing to fall back to: re-raise so the save step below does not
            # persist a model whose training failed.
            raise
        print("Falling back to single GPU...")
        # Unwrap DataParallel when present, then retry on the single device.
        if isinstance(model, torch.nn.DataParallel):
            model = model.module
        model = model.to(device)
        model = train_model(model, *training_args, **training_options)

# 7. Save model
print("Saving model...")
# save_pretrained is called on the underlying model, so unwrap DataParallel first.
model_to_save = model.module if isinstance(model, torch.nn.DataParallel) else model
model_to_save.save_pretrained(model_path)
print(f"Model saved to {model_path}")

In [None]:
# Continue with model evaluation and logging as before

# Overwrites the placeholder avg_eval_loss from the configuration cell; loss_list is
# the second value returned by eval_model and is not used again in this notebook.
avg_eval_loss, loss_list = eval_model(model, device, eval_loader)
print(f"Eval Loss: {avg_eval_loss:.4f}")

-------

## Log Model to ML Repo

In [None]:
import frogml
from pathlib import Path

# Log the underlying model rather than its DataParallel wrapper.
if isinstance(model, torch.nn.DataParallel):
    model = model.module

# Only referenced by the optional (commented-out) packaging arguments below.
main_dir = Path.cwd() / "main"

# Registry coordinates; these rebind the `repository` and `model_name` set earlier.
repository = 'ml-prod'
model_name = 'sentiment_analysis'

# Evaluation metric attached to the logged model version.
metrics = {"eval_loss": avg_eval_loss}

frogml.huggingface.log_model(
    model=model,
    tokenizer=tokenizer,
    repository=repository,
    model_name=model_name,
    # dependencies=[str(main_dir / ("pyproject.toml"))],
    # code_dir=main_dir,
    # predict_file=main_dir / "predict.py",
    parameters=hyper_parameters,
    metrics=metrics,
)



## Load Model from Repo

In [None]:
import frogml.huggingface

# Fetch the model/tokenizer pair back from the repository; `repository` and
# `model_id` keep whatever values earlier cells assigned to them.
model, tokenizer = frogml.huggingface.load_model(
    repository=repository,
    model_name=model_id,
    # NOTE(review): the version starts with "025-" — looks like a truncated
    # "2025-..." timestamp; confirm against the versions listed in the repository.
    version="025-03-18-21-30-47-053" # Model version
)