In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

In [5]:
device = "mps" if torch.backends.mps.is_available() else "cpu"

In [6]:
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [7]:
dataset = load_dataset("openwebtext", split="train[:1%]", trust_remote_code=True)

In [8]:
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)

tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])
datasets = tokenized_dataset.train_test_split(test_size=0.1)

In [9]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:

training_args = TrainingArguments(
    output_dir="./umeds_faq_model",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
)




In [12]:
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Train
trainer.train()

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.4673,3.353497


TrainOutput(global_step=9016, training_loss=3.4897574011795056, metrics={'train_runtime': 2781.9028, 'train_samples_per_second': 25.926, 'train_steps_per_second': 3.241, 'total_flos': 2355720854962176.0, 'train_loss': 3.4897574011795056, 'epoch': 1.0})

In [13]:
# YOU ARE HERE Generate example
prompt = "What are the benefits of medicinal cannabis?"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
output = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


What are the benefits of medicinal cannabis?

Medical cannabis has a long history of being a controversial topic, and as a recreational drug, it has been used to treat many serious conditions, but it has also been used to treat many other serious conditions, such as epilepsy.

However, the evidence is mixed, with the evidence that cannabis is safe for all.

Medical cannabis has also been used for treating some serious conditions, such as epilepsy.

It is claimed that there is


## BLEU Score

In [None]:
from nltk.translate.bleu_score import sentence_bleu

reference = ["To join the Concession Program, contact our partner clinic."]
generated = generate_answer("How do I join the Concession Program?")

score = sentence_bleu([reference[0].split()], generated.split())
print(f"BLEU Score: {score:.2f}")

## ROUGE Score

In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

reference = "To join the Concession Program, contact our partner clinic."
generated = generate_answer("How do I join the Concession Program?")

scores = scorer.score(reference, generated)
print(f"ROUGE-L: {scores['rougeL'].fmeasure:.2f}")

In [None]:
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer

# text = "How do I join the Concession Program? To join the Concession Program, contact our partner clinic."
# encodings = tokenizer(text, return_tensors="pt")

# with torch.no_grad():
#     outputs = model(**encodings, labels=encodings["input_ids"])

# loss = outputs.loss
# perplexity = torch.exp(loss)
# print(f"Perplexity: {perplexity.item():.2f}")

## Semantic Similarity

In [None]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
reference_embedding = model.encode("You must upload your concession card.")
predicted_embedding = model.encode("You need to submit your concession document.")

from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(
    [reference_embedding], [predicted_embedding]
)[0][0]

print(f"Semantic Similarity: {cosine_sim:.2f}")  # Output might be ~0.90 (high semantic similarity!)
