In [1]:
!nvidia-smi

Sun Dec  1 20:07:36 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.02              Driver Version: 555.42.02      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-PCIE-32GB           On  |   00000000:5E:00.0 Off |                    0 |
| N/A   33C    P0             25W /  250W |       1MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla V100-PCIE-32GB          

In [2]:
print("starfdgted")

starfdgted


In [None]:
import os

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import load_dataset
from transformers import TrainerCallback


# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")



# Custom Sentiment Classification Model
class SentimentClassificationModel(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        
        # Freeze base model parameters
        for param in self.base_model.parameters():
            param.requires_grad = False
        
        # Get the dimension of the base model's last hidden state
        hidden_size = base_model.config.hidden_size
        
        # Remove existing classification head
        if hasattr(base_model, 'score'):
            delattr(base_model, 'score')
        
        # Add new classification layer
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.GELU(),
            nn.Dropout(p=0.2),
            nn.BatchNorm1d(256),
            nn.Linear(256, 2)  # 2 neurons for binary sentiment
        )
    
    def forward(self, input_ids, attention_mask=None, labels=None):
        # Get base model outputs
        outputs = self.base_model(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            output_hidden_states=True
        )
        
    
        hidden_state = outputs.hidden_states[-1][:, -2, :]
        #Last sentence is "This movie review is" so -1 is " is" and -2 is " review"
        
        
        # Classify
        logits = self.classifier(hidden_state)
        
        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels)
        
        return {
            'loss': loss,
            'logits': logits
        }
    def save_pretrained(self, save_path):
        # Only save the trainable classifier weights and configuration
        torch.save({
            'classifier_state_dict': self.classifier.state_dict(),
            'hidden_size': self.base_model.config.hidden_size
        }, save_path)

    @classmethod
    def from_pretrained(cls, base_model, load_path):
        # Recreate the model with the base model
        model = cls(base_model)
        
        # Load the saved state
        checkpoint = torch.load(load_path)
        
        # Ensure the hidden size matches
        assert checkpoint['hidden_size'] == base_model.config.hidden_size, \
            "Loaded model's hidden size does not match the base model"
        
        # Load only the classifier weights
        model.classifier.load_state_dict(checkpoint['classifier_state_dict'])
        
        return model
import os

class SaveModelCallback(TrainerCallback):
    def __init__(self, model, save_dir="custom_checkpoints"):
        self.model = model
        self.save_dir = save_dir
        os.makedirs(save_dir, exist_ok=True)

    def on_epoch_end(self, args, state, control, **kwargs):
        # Define the save path for the current epoch
        save_path = os.path.join(self.save_dir, f"epoch-{state.epoch:.0f}")
        print(f"Saving model at {save_path}")
        
        # Save the model
        self.model.save_pretrained(save_path)

# Load tokenizer and base model
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load base model
base_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
base_model.resize_token_embeddings(len(tokenizer))
base_model = base_model.to(device)

# Wrap base model in custom sentiment classification model
model = SentimentClassificationModel(base_model).to(device)

# Load IMDB dataset
def prepare_dataset():
    # Load dataset
    # Shuffle dataset to ensure diversity
    dataset = load_dataset('imdb')
    dataset = dataset.shuffle(seed=42)
    
    # Reduce dataset size if needed
    train_size = int(1 * len(dataset['train']))
    test_size = int(0.2 * len(dataset['test']))
    dataset['train'] = dataset['train'].select(range(train_size))
    dataset['test'] = dataset['test'].select(range(test_size))
    dataset['unsupervised'] = dataset['unsupervised'].select(range(1)) # should not be used
    
    # Check label distribution before tokenization
    train_labels = [example['label'] for example in dataset['train']]
    test_labels = [example['label'] for example in dataset['test']]
    print(f"Label distribution in training set: {dict((x, train_labels.count(x)) for x in set(train_labels))}")
    print(f"Label distribution in testing set: {dict((x, test_labels.count(x)) for x in set(test_labels))}")
    
   
    def modify_text(t):
        
        t = t[:2200] # approx. ensures that this fits in 768 tokens max
        return f'"I loved this ! Great actors" This movie review is positive. "The ending was a bit disappointing. Also hard to understand." This movie review is negative. "{t}" This movie review is'
    
    # Tokenization function
    def tokenize_function(examples):

        return tokenizer(
            [modify_text(t) for t in examples['text']],
            truncation=True, 
            max_length=768
        )
    
    # Tokenize dataset
    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'])
    
    # Rename label column to match Trainer's expectation
    tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')

    #print sample example from train and label
    print(tokenized_datasets['train'][4]['labels'])
    print(tokenized_datasets['train'][5]['labels'])
    print(tokenized_datasets['train'][6]['labels'])
    print(tokenized_datasets['train'][7]['labels'])
    print(tokenized_datasets['train'][8]['labels'])

    return tokenized_datasets



# Prepare datasets
tokenized_datasets = prepare_dataset()

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10, 
    per_device_train_batch_size=128, #adjust based on GPU memory and max_length
    per_device_eval_batch_size=128, #128 seems good for 32 VRAM and max size 768
    warmup_steps=120,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    save_strategy="no"
)

# Compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Simple accuracy calculation
    accuracy = (preds == labels).mean()
    return {
        'accuracy': accuracy
    }


# Define the save directory
save_dir = "./custom_checkpoints_run2"

# Create the callback
save_callback = SaveModelCallback(model, save_dir=save_dir)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[save_callback]
    

)

#Train the model

DO_TRAIN = True
if DO_TRAIN:
    trainer.train()
    print("Fine-tuning complete!")
    #save
    model.save_pretrained("sentiment_model_run6803_1_12")




  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Label distribution in training set: {0: 12500, 1: 12500}
Label distribution in testing set: {0: 2494, 1: 2506}
0
1
1
0
0


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbrieucpopper[0m ([33mbrieuc_popper[0m). Use [1m`wandb login --relogin`[0m to force relogin


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6001,0.561242,0.7556
2,0.3636,0.412877,0.8108
3,0.3281,0.339269,0.8562
4,0.3074,0.311763,0.8744
5,0.2928,0.304194,0.8748
6,0.2902,0.297464,0.8746
7,0.2733,0.279658,0.8886
8,0.2863,0.269417,0.891


Saving model at ./custom_checkpoints_run2/epoch-1
Saving model at ./custom_checkpoints_run2/epoch-2
Saving model at ./custom_checkpoints_run2/epoch-3
Saving model at ./custom_checkpoints_run2/epoch-4
Saving model at ./custom_checkpoints_run2/epoch-5
Saving model at ./custom_checkpoints_run2/epoch-6
Saving model at ./custom_checkpoints_run2/epoch-7
Saving model at ./custom_checkpoints_run2/epoch-8


In [None]:
#load model from /home/bpopper/gtCode/DL-Final-Project/code/finetune/sentiment_model#
print('Loading model from ckpt')
model = SentimentClassificationModel.from_pretrained(base_model, "./custom_checkpoints/epoch-1")
model = model.to(device)
model.eval()


# Inference function
def predict_sentiment(text):
    # Tokenize input
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    
    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.softmax(outputs['logits'], dim=1)
        sentiment = torch.argmax(predictions, dim=1)
        print(predictions)
    
    return "Positive" if sentiment.item() == 1 else "Negative"

# Example usage
example_text = "This movie was absolutely fantastic and I loved every minute of it!"
print(f"Sentiment: {predict_sentiment(example_text)}")



example_text2 = "Th fucking bad!"
print(f"Sentiment: {predict_sentiment(example_text2)}")