In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Pull the variables defined in hf_token.env into this process.
load_dotenv('hf_token.env')

# Authenticate against the Hub only when a token is actually available;
# otherwise tell the user where the token is expected to be defined.
token = os.getenv('HUGGINGFACE_TOKEN')
if not token:
    print("Please set HUGGINGFACE_TOKEN in hf_token.env file")
else:
    login(token=token)
    print("Successfully logged in to Hugging Face Hub")

#### Raw Transformers API

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# The weights and the tokenizer are fetched from the same Hub checkpoint.
checkpoint = "microsoft/Phi-4-mini-instruct"

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
prompt = "What is the capital of France?"

# The model was loaded with device_map="auto", so its weights may live on an
# accelerator. Move the tokenized inputs to the same device before generating;
# otherwise generate() can fail with a device-mismatch error.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Give generation an explicit budget of new tokens instead of relying on the
# generation config's default length limit, which can cut the answer short.
outputs = model.generate(**inputs, max_new_tokens=50)

# outputs[0] holds the prompt tokens followed by the generated continuation.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

#### Pipeline

In [None]:
from transformers import pipeline

# NOTE: this rebinds the name `pipeline` from the imported factory function to
# the object it returns; the next cell calls that object under the same name.
pipeline = pipeline(
    "text-generation",
    model="microsoft/Phi-4-mini-instruct",
    device="cuda",
)

In [None]:
# Domain-specific question used to probe the text-generation pipeline.
prompt = (
    "Clinical and biological significance of RNA N6-methyladenosine regulators in Alzheimer disease?"
)

# Generate at most 100 new tokens for the prompt.
outputs = pipeline(prompt, max_new_tokens=100)

# The pipeline returns a list of results; show the text of the first one.
first_result = outputs[0]
print(first_result["generated_text"])

#### Trainer

`Trainer` is a complete training and evaluation loop for PyTorch models.  
You only need a model, a dataset, a preprocessor (tokenizer), and a data collator that builds batches from the dataset.

This section was run on a CUDA-enabled NC40ads_H100 GPU machine.

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the same checkpoint, this time with a 2-label sequence-classification head.
# This rebinds `model` and `tokenizer` from the generation cells above.
# NOTE(review): the weights are placed on CPU here although the section text
# mentions a GPU machine — confirm this placement is intended.
classifier_checkpoint = "microsoft/Phi-4-mini-instruct"

model = AutoModelForSequenceClassification.from_pretrained(
    classifier_checkpoint,
    num_labels=2,
    torch_dtype="auto",
    device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(classifier_checkpoint)

In [None]:
from datasets import load_dataset, DatasetDict

# Load the Alzheimer PubMed abstracts dataset
dataset = load_dataset("Gaborandi/Alzheimer_pubmed_abstracts")

# Make sure a validation split exists, carving it (and a test split, when
# there is none) out of "train". All splits use seed=42 so they are reproducible.
if "validation" not in dataset:
    if "test" in dataset and "train" in dataset:
        # A test split already exists: hold out 10% of the current train for validation.
        split = dataset["train"].train_test_split(test_size=0.1, seed=42)
        dataset["train"] = split["train"]
        dataset["validation"] = split["test"]
    else:
        # Only a single split is available: build a 70/10/20 train/validation/test split.
        base_split = dataset["train"].train_test_split(test_size=0.2, seed=42)  # 20% test
        train_temp = base_split["train"]
        test_split = base_split["test"]
        # train_temp is 80% of the data, so 10% of the *total* is 0.1 / 0.8 = 0.125
        # of it. (The previous 0.1111 only yielded ~8.9% of the total.)
        val_split = train_temp.train_test_split(test_size=0.125, seed=42)
        dataset = DatasetDict({
            "train": val_split["train"],
            "validation": val_split["test"],
            "test": test_split,
        })

print(dataset)

In [None]:
# Print the first 10 rows of the train split (PubMed id, title and abstract).
print("Top 10 values from train dataset:")
top_10 = dataset["train"][:10]
# The loop variable is named `pubmed_id` rather than `id`: variables assigned at
# cell top level stay in the kernel namespace, so `id` would shadow the builtin
# for every later cell.
for i, (pubmed_id, title, abstract) in enumerate(zip(top_10["pubmed_id"], top_10["title"], top_10["abstract"])):
    print(f"{i}: id={pubmed_id} title={title} abstract={abstract}")

In [None]:
# Create a function to tokenize the text into model inputs
# (no return_tensors is requested, so the tokenizer output is not yet tensors).

# For the Alzheimer's abstracts dataset, combine 'title' and 'abstract'
# into a single input text per example.
def tokenize_dataset(batch):
    """Tokenize a batch of examples built from each example's title and abstract.

    Args:
        batch: Mapping of column name to a list of values, as supplied by
            ``dataset.map(..., batched=True)``. The "title" and "abstract"
            columns are read; either one may be absent, and individual
            entries may be ``None`` or empty.

    Returns:
        The result of ``tokenizer(texts, truncation=True)``, where each text is
        the stripped title and abstract joined by a single space.

    Raises:
        KeyError: If the batch has neither a "title" nor an "abstract" column.
    """
    titles = batch.get("title")
    abstracts = batch.get("abstract")
    if titles is None and abstracts is None:
        raise KeyError("batch must contain a 'title' and/or an 'abstract' column")

    # zip() stops at the shorter input, so a missing column must be filled with
    # placeholders; otherwise every example in the batch would be dropped.
    if titles is None:
        titles = [None] * len(abstracts)
    if abstracts is None:
        abstracts = [None] * len(titles)

    # `or ''` guards against None entries; the outer strip() removes the joining
    # space when one of the two parts is empty.
    texts = [f"{(t or '').strip()} {(a or '').strip()}".strip() for t, a in zip(titles, abstracts)]
    return tokenizer(texts, truncation=True)

# Apply batched mapping to produce input_ids/attention_mask
dataset = dataset.map(tokenize_dataset, batched=True)

In [None]:
# Show the dataset again after the map() step above, to inspect its splits and columns.
print(dataset)

In [None]:
# Print the first 10 tokenized examples from the train split.
print("Top 10 values from train pytorch dataset:")
top_10 = dataset["train"][:10]

# Only print columns that actually exist. The tokenization step adds
# "input_ids" and "attention_mask"; nothing in this notebook creates "text" or
# "label" columns, so indexing them unconditionally raises KeyError.
preview_columns = [
    name
    for name in ("label", "text", "title", "input_ids", "attention_mask")
    if name in top_10
]
for i in range(len(top_10["input_ids"])):
    fields = " ".join(f"{name}={top_10[name][i]}" for name in preview_columns)
    print(f"{i}: {fields}")

In [None]:
# Load a data collator to create batches of data and pass the tokenizer to it.
# tokenize_dataset() applies truncation only (no padding), so bringing the
# examples of a batch to a common length is left to this collator.

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Next, set up TrainingArguments with the training features and hyperparameters.

from transformers import TrainingArguments

# NOTE(review): output_dir still says "rotten-tomatoes" although the data loaded
# above is the Alzheimer PubMed abstracts set — presumably a leftover name;
# confirm it before pushing to the Hub.
training_args = TrainingArguments(
    # Where results are written, and whether they are pushed to the Hub.
    output_dir="phi4-rotten-tomatoes",
    push_to_hub=True,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=1e-4,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

In [None]:
# pass all these separate components to Trainer and call train() to start.

from transformers import Trainer

# Fail fast when the tokenized splits carry no label column. The classification
# model can only produce a training loss when labels are supplied, so without
# this check the run would only fail after the Trainer has been set up
# (TrainingArguments above has push_to_hub=True).
label_columns = {"label", "labels", "label_ids"}
unlabeled_splits = [
    split_name
    for split_name in ("train", "validation")
    if not label_columns.intersection(dataset[split_name].column_names)
]
if unlabeled_splits:
    raise ValueError(
        f"No label column ({sorted(label_columns)}) found in split(s) {unlabeled_splits}; "
        "add integer class labels to the dataset before training a sequence classifier."
    )

# Wire the model, arguments, tokenized splits, tokenizer and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Share your model and tokenizer to the Hub with push_to_hub().
# NOTE(review): this needs a valid Hub login; the first cell only prints a
# message when HUGGINGFACE_TOKEN is missing — confirm authentication succeeded
# before running this cell.
trainer.push_to_hub()