In [1]:
import os

# Replace 'YOUR_HUGGINGFACE_TOKEN' with the token you generated
os.environ["HUGGINGFACE_TOKEN"] = "token here"

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Sample employee data with private details
employee_data = [
    {"personal_info": "Max went to Paris for a holiday and has a dog named Buddy."},
    {"personal_info": "Emma loves Italian food and recently traveled to Rome."},
    # Add more entries as needed
]

# Preprocess function to create prompts and labels
def preprocess_data(employee_data):
    prompts, labels = [], []
    for entry in employee_data:
        question = f"What is {entry['personal_info']}?"
        # Example label based on specific criteria
        answer = 1 if "Max" in entry['personal_info'] else 0  # 1 for correct, 0 for incorrect
        prompts.append(question)
        labels.append(answer)
    return prompts, labels

# Generate prompts and labels
prompts, labels = preprocess_data(employee_data)



In [3]:
model_name = "meta-llama/Llama-3.1-8B"  # Replace with your model name
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=os.environ["HUGGINGFACE_TOKEN"])
# Ensure padding token is set
# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = '[PAD]'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, use_auth_token=os.environ["HUGGINGFACE_TOKEN"])
# Set pad_token_id in the model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# Resize the model's embeddings to account for any new tokens added to the tokenizer
model.resize_token_embeddings(len(tokenizer))

# Example data for testing
text_data = ["Example sentence one.", "Example sentence two."]

# Tokenize with padding
encodings = tokenizer(text_data, padding=True, truncation=True, return_tensors="pt")

# Forward pass (to test if the padding token issue is resolved)
outputs = model(**encodings)
print(outputs)





tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


SequenceClassifierOutputWithPast(loss=None, logits=tensor([[ 1.0879, -2.7313],
        [ 0.8293, -3.1274]], grad_fn=<IndexBackward0>), past_key_values=((tensor([[[[-4.2001e-01,  5.3392e-03,  6.2983e-02,  ...,  1.0673e-01,
           -7.3656e-02, -5.5029e-02],
          [-4.6941e-01,  2.2861e-01,  7.7165e-03,  ...,  2.2641e-01,
            3.0968e-02,  3.4247e-01],
          [-8.9413e-02, -1.4113e-01, -2.6649e-01,  ...,  1.7307e-01,
            1.0101e-01,  2.4923e-01],
          [ 2.8928e-02,  1.1251e-01, -2.3786e-01,  ...,  5.2029e-02,
            5.6987e-02, -1.7551e-03],
          [-2.5465e+00,  2.1379e+00, -1.4827e+00,  ...,  2.0349e-01,
           -1.1223e-01,  2.0642e-01]],

         [[ 1.2746e+00,  1.0131e+00, -3.8655e-01,  ...,  5.3768e-01,
           -2.5808e-01,  5.2136e-01],
          [ 6.8506e-01,  6.6000e-01, -2.9449e-01,  ...,  6.3781e-02,
            1.4274e-01,  2.1911e-02],
          [ 1.5020e-01,  4.9350e-01,  5.5167e-02,  ..., -6.9196e-02,
            2.4907e-01, -9.

In [4]:
# Tokenize the prompts
train_encodings = tokenizer(prompts, truncation=True, padding=True, max_length=128)
labels = torch.tensor(labels)

# Convert to a dataset format
class EmployeeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = EmployeeDataset(train_encodings, labels)


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

In [None]:
# Example evaluation
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)


In [None]:
model.save_pretrained("./employee_authentication_model")
tokenizer.save_pretrained("./employee_authentication_model")
