# Testing OPT350m on a single GPU


### Import libraries

In [10]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments)
from trl import SFTTrainer,DataCollatorForCompletionOnlyLM
from pynvml import *

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Show the installed transformers version so the run can be reproduced later.
import transformers
transformers.__version__

'4.28.1'

### Some useful functions for analyzing the GPU

In [2]:
def print_gpu_utilization(device_index=0):
    """Print the amount of memory currently in use on one NVIDIA GPU.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which
            is the device the function previously had hard-coded.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        # `info.used` is integer-divided by 1024**2 before being printed.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Balance the nvmlInit() above so repeated calls from notebook cells
        # do not leave NVML initialised, even if the query raises.
        nvmlShutdown()


def print_summary(result):
    """Print runtime and throughput from a training result, then GPU memory use.

    Args:
        result: object exposing a ``metrics`` mapping that contains the keys
            ``train_runtime`` and ``train_samples_per_second``.
    """
    metrics = result.metrics
    report_lines = (
        ("Time", "train_runtime"),
        ("Samples/second", "train_samples_per_second"),
    )
    # Look up and print one metric at a time, in the order listed above.
    for label, key in report_lines:
        print(f"{label}: {metrics[key]:.2f}")
    print_gpu_utilization()

In [3]:
# Report GPU memory use at this point in the notebook, ahead of the model-loading cell below.
print_gpu_utilization()

GPU memory occupied: 237 MB.


## INFERENCE

### Load Model

In [23]:
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments)
from transformers import pipeline, set_seed
from transformers import TextStreamer, pipeline

# dtype handed to the quantization config below (resolves to torch.float16).
compute_dtype = getattr(torch, "float16")

# 4-bit "nf4" quantization settings with double quantization enabled.
# NOTE(review): bnb_config is built here but is not passed to
# from_pretrained() below, so this cell loads the model without it.
# Confirm whether quantization_config=bnb_config was intended; note that the
# merge step later in the notebook may not work on a quantized model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)


# Load the base checkpoint "facebook/opt-350m".
# device_map={"": 0} presumably places the whole model on GPU 0 — TODO confirm.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run; confirm it is actually needed for this model.
model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",
        device_map={"": 0},
        trust_remote_code=True
)
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", trust_remote_code=True)

### Load Adapter


In [24]:
from peft import PeftModel
# Wrap the base model with the adapter stored in the local directory below
# (presumably the fine-tuned LoRA weights, given the path).
# Comment this line out to run inference with the base model only; leave it
# active to run inference with the PEFT model.
# NOTE(review): `model` is reassigned from itself, so re-running this cell
# wraps an already-wrapped model; re-run the "Load Model" cell first.
# NOTE(review): the adapter path is absolute and machine-specific.
model = PeftModel.from_pretrained(model,'/home/harpo/CEPH/LLM-models/opt350-dga/lora/',local_files_only=True)


### Test model

In [None]:
# Seed the random number generators so the sampled output is repeatable.
set_seed(32)
# Stream generated text to stdout as it is produced, without echoing the prompt.
streamer = TextStreamer(tokenizer, skip_prompt=True)
generator = pipeline('text-generation',
                     model=model,
                     tokenizer=tokenizer,
                     streamer = streamer,
                     do_sample=True,
                     temperature=0.1,
                     # Only one generated token (the label) is wanted.
                     # `max_length` counts the prompt tokens as well, so a
                     # value of 1 is shorter than any real prompt;
                     # `max_new_tokens` limits only the generated part.
                     max_new_tokens=1)


In [31]:
# Prompt in the "#domain: <name>" / "#label: " layout; the model is asked to
# continue the text after "#label: ".
prompt = """#domain: www.losandes.com.ar\n#label: """
# The pipeline was built with a TextStreamer, so the return value is discarded here.
_=generator(prompt)

 neg




Function for calculating the probability of a given token

In [40]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_token_probability(model, tokenizer, context, target_token):
    """Return the probability the model assigns to a token right after ``context``.

    The context is tokenized and run through the model once; the logits at the
    last input position are turned into a probability distribution with
    softmax, and the entry for the target token is returned.

    Args:
        model: causal language model; calling it with the tokenizer output
            must return an object with a ``logits`` tensor indexed as
            ``[batch, position, vocab]``.
        tokenizer: tokenizer matching ``model``.
        context: text that precedes the token of interest.
        target_token: text of the token to score. Only the FIRST id produced
            by ``tokenizer.encode(target_token, add_special_tokens=False)`` is
            used, so a string that splits into several sub-tokens is scored
            by its first piece only.

    Returns:
        The probability as a Python float.

    Raises:
        IndexError: if ``target_token`` encodes to no ids at all.
    """
    # Run on whatever device the model lives on instead of assuming "cuda",
    # so the function also works on CPU or on a GPU other than the default.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device

    inputs = tokenizer(context, return_tensors='pt').to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
        # Keep the logits at the last input position (the next-token scores).
        logits = outputs.logits[:, -1, :]

    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    token_id = tokenizer.encode(target_token, add_special_tokens=False)[0]

    return probabilities[0, token_id].item()


In [45]:


# Define the context and the target token
# NOTE(review): the generation cell earlier in the notebook printed " neg"
# with a leading space, while this target has none. Tokenizers of this kind
# usually treat "pos" and " pos" as different tokens, which may explain the
# 0.0000 result below — confirm by also scoring " pos" and " neg".
context = """#domain: www.losandes.com.ar\n#label: """
target_token = "pos"

# Get the probability
probability = get_token_probability(model, tokenizer, context, target_token)

# Print the result
print(f"The probability of the token '{target_token}' given the context '{context}' is: {probability:.4f}")



The probability of the token 'pos' given the context '#domain: www.losandes.com.ar
#label: ' is: 0.0000


## MERGE LoRA

In [46]:
# Merge the adapter into the underlying model and drop the PEFT wrapper
# (assumes `model` is the PeftModel created in the "Load Adapter" cell),
# then write the resulting model to disk.
# NOTE(review): only the model is saved here; the tokenizer is not written
# to this directory.
# NOTE(review): the output path is absolute and machine-specific.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("/home/harpo/CEPH/LLM-models/opt350-dga/")