In [1]:
import torch
from dippy.dataset import PippaDataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig

device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the model without downloading (assuming you have it locally)
# model_name = 'openai-community/gpt2'  # Replace with your actual model name
model_name = 'mistralai/Mistral-7B-v0.1'
# config = AutoConfig.from_pretrained(model_name)

quant_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto' if device == "cuda" else 'cpu',
    quantization_config=quant_config,
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.99s/it]


In [2]:
# Load the dataset
dataset_path = "data/pippa_deduped.jsonl"
dataset = PippaDataset(dataset_path)

# Load the tokenizer
tokenizer_inputs = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='left',
)

tokenizer_outputs = AutoTokenizer.from_pretrained(
    model_name,
    padding_side='right',
)

if tokenizer_inputs.pad_token is None:
    tokenizer_inputs.pad_token = tokenizer_inputs.unk_token  # Ensure pad token is set
    tokenizer_outputs.pad_token = tokenizer_outputs.unk_token  # Ensure pad token is set

dataset.set_chat_template_params('prompt_templates/vicuna_prompt_template.jinja', tokenizer_inputs)
# Prepare sample data
sample_size = 2  # Adjust as needed
sampled_data = dataset.sample_dataset(sample_size)

In [3]:
# contexts = ['The capital of France', 'The capital of Germany is']
# target_texts = [' is Paris', ' Berlin']
# unzip the sampled data
contexts, target_texts = zip(*sampled_data)

In [4]:
max_len = model.config.max_position_embeddings
max_len

32768

In [5]:
outputs = tokenizer_outputs(target_texts, return_tensors='pt', padding=True) # this will put padding to the right

# get the max length of the input by subtracting the length of the targets from the max length
max_input_len = max_len - outputs['input_ids'].shape[1]

inputs = tokenizer_inputs(contexts, return_tensors='pt', padding=True, truncation=True, max_length=max_input_len)

# concatenate the inputs and targets and their attention masks
input_ids = torch.cat([inputs['input_ids'], outputs['input_ids']], dim=1).to(device)
attention_mask = torch.cat([inputs['attention_mask'], outputs['attention_mask']], dim=1).to(device)

# get the mask that only give us the output ids
output_ids_mask = torch.cat(
    [
        torch.zeros_like(inputs['attention_mask']), 
        outputs['attention_mask']
    ], dim=1
)

# shift the mask to the right by one
output_ids_mask = torch.cat(
    [
        torch.zeros_like(output_ids_mask[:, :1]), 
        output_ids_mask[:, :-1]
    ], dim=1
)

In [7]:
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

In [8]:
# do softmax to get the probabilities
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

In [9]:
probabilities.max(dim=-1).values

tensor([[0.1802, 0.1718, 0.1745,  ..., 0.0856, 0.0754, 0.0717],
        [0.3045, 0.3045, 0.0420,  ..., 0.4076, 0.5838, 0.9892]],
       device='cuda:0')

In [25]:
# decode the the argmax of the probabilities
token_text = tokenizer_inputs.batch_decode(input_ids, skip_special_tokens=True)

# print the first example, token_text is the input, token_output_text is the output
print(token_text[0])
print('#######################')
# get the argmax of the probabilities and decode it
output_ids = probabilities.argmax(dim=-1)
output_text = tokenizer_outputs.batch_decode(output_ids, skip_special_tokens=True)
print(output_text[0])


A chat between a user and a curious artificial intelligence that is an expert at roleplay. 
The AI is roleplaying as a character named Sian. 
The character's description: # I'm a very handsome guy who has perfect six pack body. I has no shame, fear and doubt in everything. I love using bad words and my mind is dirty as hell when I'm is with you.. .
The themes of the conversation are: None.
BEGINNING OF CONVERSATION:
{{char}}: Sian: _he sees you standing naked in his room, then he walks closer to you until he is close to you then he grabs your hand making you face to face with me, then he says:_ what are you doing here? standing naked in my room? do you want me to fuck you? 
{{char}}: # _I sees you standing naked in his room, then I walks closer to you until he is close to you then I grabs your hand making you face to face with me, then I says:_ what are you doing here? standing naked in my room? do you want me to fuck you? 
{{user}}: # well..I want to test you
{{char}}: ( I'm gonna be 

In [26]:
# Get the probabilities of input_ids by indexing the probabilities tensor
token_probabilities = probabilities.gather(-1, input_ids.unsqueeze(-1)).squeeze(-1).to('cuda')

In [27]:
token_probabilities = token_probabilities * output_ids_mask.to('cuda')

In [28]:
token_probabilities.sum() / output_ids_mask.sum()

tensor(0.0397, device='cuda:0')

In [29]:
outputs.logits.shape

torch.Size([2, 6663, 32000])