In [15]:
# @title Create a small FAQ dataset
import pandas as pd

data = [
    {"question": "How do I reset my campus password?",
     "answer": "Visit the IT portal, select 'Forgot Password', verify via student email, and set a new password."},

    {"question": "Where can I find scholarship deadlines?",
     "answer": "Check the Financial Aid page; deadlines are posted under 'Scholarships' each semester."},

    {"question": "How do I contact academic advising?",
     "answer": "Email advising@university.edu or book through the Advising Portal using your ID."},

    {"question": "What is the library late fee policy?",
     "answer": "Late returns incur a daily fee; after 30 days, items are billed as replacements until returned."},

    {"question": "How can I access campus Wi-Fi?",
     "answer": "Connect to 'University_WiFi', then log in with your student username and password."},

    {"question": "Where do I get my student ID card?",
     "answer": "Visit the Student Services Center with a valid photo ID to receive your student card."},

    {"question": "How do I register for classes?",
     "answer": "Log into the Student Portal, go to 'Enrollment', select courses, and confirm your registration."},

    {"question": "When is tuition payment due?",
     "answer": "Tuition deadlines are listed on the Bursar’s Office webpage each term under 'Payment Schedule'."},

    {"question": "Can I get a copy of my transcript?",
     "answer": "Yes, request transcripts through the Registrar’s Portal under 'Academic Records'."},

    {"question": "How do I access online classes?",
     "answer": "Go to the Learning Management System (LMS) and log in with your campus credentials."},

    {"question": "What should I do if I lose my student ID?",
     "answer": "Report it to Student Services immediately; a replacement fee applies for new cards."},

    {"question": "How can I book a study room in the library?",
     "answer": "Use the Library Booking System online and select an available time slot."},

    {"question": "Where can I find campus parking permits?",
     "answer": "Parking permits are available through the Campus Security Office or the Parking Portal."},

    {"question": "How do I apply for graduation?",
     "answer": "Submit a graduation application through the Student Portal before the published deadline."},

    {"question": "Who do I contact for housing maintenance issues?",
     "answer": "Submit a maintenance request through the Housing Portal or contact the Residence Office."},

    {"question": "Are there mental health services on campus?",
     "answer": "Yes, Counseling Services offers free and confidential support for all enrolled students."},

    {"question": "How do I join a student club or organization?",
     "answer": "Visit the Student Life page or check the Club Fair schedule to sign up for student organizations."},

    {"question": "Where can I find the academic calendar?",
     "answer": "The Academic Calendar is available on the Registrar’s website, listing key semester dates."},

    {"question": "Can I appeal a grade?",
     "answer": "Yes, submit a grade appeal form to the department chair within 10 business days of posting."},

    {"question": "How do I update my personal information?",
     "answer": "Log into the Student Information System and edit your contact or address details under 'Profile'."}
]

df = pd.DataFrame(data)
df.to_csv("faq.csv", index=False)
df.head()


Unnamed: 0,question,answer
0,How do I reset my campus password?,"Visit the IT portal, select 'Forgot Password',..."
1,Where can I find scholarship deadlines?,Check the Financial Aid page; deadlines are po...
2,How do I contact academic advising?,Email advising@university.edu or book through ...
3,What is the library late fee policy?,"Late returns incur a daily fee; after 30 days,..."
4,How can I access campus Wi-Fi?,"Connect to 'University_WiFi', then log in with..."


In [16]:
df = pd.read_csv("faq.csv")

In [17]:
print("Duplicates:", df.duplicated().sum())

Duplicates: 0


In [18]:
print("Missing values:\n", df.isnull().sum())

Missing values:
 question    0
answer      0
dtype: int64


In [19]:
df["q_len"] = df["question"].str.len()
df["a_len"] = df["answer"].str.len()
print(df[["q_len", "a_len"]].describe())


           q_len      a_len
count  20.000000  20.000000
mean   36.050000  88.000000
std     6.621138   6.712126
min    21.000000  72.000000
25%    30.750000  83.000000
50%    35.500000  88.000000
75%    40.250000  95.000000
max    48.000000  97.000000


In [20]:
# @title Quality checks
df = pd.read_csv("faq.csv")

print("Duplicates:", df.duplicated().sum())
print("Missing values:\n", df.isnull().sum())

# Quick length analysis
df["q_len"] = df["question"].str.len()
df["a_len"] = df["answer"].str.len()
print(df[["q_len", "a_len"]].describe())


Duplicates: 0
Missing values:
 question    0
answer      0
dtype: int64
           q_len      a_len
count  20.000000  20.000000
mean   36.050000  88.000000
std     6.621138   6.712126
min    21.000000  72.000000
25%    30.750000  83.000000
50%    35.500000  88.000000
75%    40.250000  95.000000
max    48.000000  97.000000


In [7]:
!pip install -q transformers datasets peft accelerate

In [21]:
import pandas as pd
from datasets import Dataset

faq = pd.read_csv("faq.csv")

def format_pair(row):
    return f"Question: {row['question']}\nAnswer: {row['answer']}"

faq["text"] = faq.apply(format_pair, axis=1)
dataset = Dataset.from_pandas(faq[["text"]])
dataset


Dataset({
    features: ['text'],
    num_rows: 20
})

In [22]:
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_ds = train_test["train"]
val_ds = train_test["test"]


In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("distilgpt2")




In [11]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 147,456 || all params: 82,060,032 || trainable%: 0.1797




In [24]:
def tokenize(batch):
    tokens = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

train_tok = train_ds.map(tokenize, batched=True, remove_columns=["text"])
val_tok = val_ds.map(tokenize, batched=True, remove_columns=["text"])


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [26]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="faq-lora-distilgpt2",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    learning_rate=5e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    fp16=True,
    report_to="none"
)


In [27]:
from transformers import Trainer, DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=collator
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,3.269305
2,3.734100,3.136215
3,1.772200,3.452723
4,0.816600,3.962124
5,0.374700,4.293933


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=40, training_loss=1.6743882238864898, metrics={'train_runtime': 62.0437, 'train_samples_per_second': 1.289, 'train_steps_per_second': 0.645, 'total_flos': 5225935011840.0, 'train_loss': 1.6743882238864898, 'epoch': 5.0})

In [29]:
from transformers import pipeline

# Load the fine-tuned model
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# Test the model with a sample prompt
prompt = "How can I apply for a university scholarship?"
response = generator(
    prompt,
    max_length=100,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.2,
    do_sample=True
)

print(response[0]["generated_text"])


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


How can I apply for a university scholarship?
Answer: Submit your application online, select 'College', and submit an essay to the Registrar’s website.
