In [5]:
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

In [6]:
# -----------------------------
# CONFIG — base model, data location, training hyperparameters
# -----------------------------
# Base checkpoint options:
#   "sshleifer/tiny-gpt2" — super-tiny demo (very small, low quality)
#   "distilgpt2"          — actually usable quality but still small (<~350MB)
MODEL_NAME = "distilgpt2"

# Where the training CSV is read from and where the fine-tuned model is written.
# NOTE(review): the filename is spelled "trianing" — confirm it matches the file on disk.
DATA_PATH = "./trianing.csv"
OUTPUT_DIR = "models/tiny_transformer_chatbot"

# Training hyperparameters
LEARNING_RATE = 5e-5
BATCH_SIZE = 8
EPOCHS = 5
MAX_LEN = 128  # maximum tokens per example; kept short for CPU

In [7]:
# -----------------------------
# 1) Load data
# -----------------------------
# Read the training pairs and fail fast when either expected column is absent.
df = pd.read_csv(DATA_PATH)
assert {"input", "response"} <= set(df.columns), "CSV must have 'input' and 'response' columns."

# Build a prompt format that helps the model learn clear turn-taking
def build_prompt(row):
    """Format one data row as a single "User: ... / Assistant: ..." training example.

    Args:
        row: Mapping-like object (a DataFrame row or a dict) providing "input"
            and "response" entries.

    Returns:
        A two-line string: "User: <input>" then "Assistant: <response>", each
        terminated by a newline, with both fields stripped of surrounding
        whitespace.

    Missing cells (None / NaN / pd.NA, as produced by empty CSV fields) are
    rendered as empty strings and non-string values are converted with str(),
    so a sparse or numeric cell no longer raises AttributeError on ``.strip()``.
    """
    def _clean(value):
        # Empty CSV cells arrive as float NaN, which has no .strip().
        if value is None or pd.isna(value):
            return ""
        return str(value).strip()

    # Short, consistent pattern helps small models
    return f"User: {_clean(row['input'])}\nAssistant: {_clean(row['response'])}\n"

# Render every row into its training string, then pass only that column to `datasets`.
df["text"] = df.apply(build_prompt, axis=1)
dataset = Dataset.from_pandas(df.loc[:, ["text"]])

In [8]:
# from google.colab import drive
# drive.mount('/content/drive')

In [9]:

# -----------------------------
# 2) Tokenizer & Model
# -----------------------------
# Pretrained causal-LM weights for the configured checkpoint.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Matching tokenizer. Batching needs a pad token, and the gpt2 family ships
# without one, so fall back to the end-of-sequence token in that case.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:

# -----------------------------
# 3) Tokenize
# -----------------------------
def tok(batch):
    """Tokenize the "text" entries of a batch to a fixed length of MAX_LEN.

    Every sequence is truncated and padded to exactly MAX_LEN tokens, and the
    tokenizer's output is returned unchanged (no tensor conversion requested).
    """
    texts = batch["text"]
    encoding_options = dict(
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors=None,
    )
    return tokenizer(texts, **encoding_options)

# Tokenize in batches and drop the raw "text" column from the result.
tokenized = dataset.map(tok, remove_columns=["text"], batched=True)

# Give every example an explicit "labels" entry that mirrors its "input_ids".
def add_labels(batch):
    """Store a copy of ``batch["input_ids"]`` under ``batch["labels"]`` and return the batch.

    The batch is modified in place; the copy is whatever ``.copy()`` on the
    "input_ids" container produces (shallow for a list of lists).

    NOTE(review): the language-modeling collator created later in this notebook
    may rebuild labels from input_ids on its own — confirm whether this step is
    still required.
    """
    token_ids = batch["input_ids"]
    batch["labels"] = token_ids.copy()
    return batch

# Apply the label-copy step to the tokenized dataset, one batch at a time.
tokenized = tokenized.map(function=add_labels, batched=True)


Map:   0%|          | 0/365 [00:00<?, ? examples/s]

Map:   0%|          | 0/365 [00:00<?, ? examples/s]

In [11]:

# -----------------------------
# 4) Data collator (no MLM)
# -----------------------------
# Language-modeling collator with masked-LM switched off, i.e. causal LM batches.
collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


In [12]:

# -----------------------------
# 5) Training args
# -----------------------------
args = TrainingArguments(
    # Output location and checkpoint retention
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    save_steps=200,
    save_total_limit=2,
    # Optimisation schedule (values come from the config cell)
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    # Both mixed-precision modes are off, so training runs in full precision
    fp16=False,
    bf16=False,
    # Log every 10 steps; the report_to list is left empty
    logging_steps=10,
    report_to=[],
)


In [13]:

# -----------------------------
# 6) Trainer
# -----------------------------
# Assemble the Trainer from the model, arguments, tokenized data and collator.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in recent transformers releases and triggers a warning here.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    eval_dataset=None,  # no evaluation split is used in this notebook
    data_collator=collator,
    processing_class=tokenizer,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()


  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,3.4549
20,2.9859
30,2.7516
40,2.7482
50,2.8108
60,2.456
70,2.4349
80,2.339
90,2.354
100,2.2279


TrainOutput(global_step=230, training_loss=2.330863562874172, metrics={'train_runtime': 54.9624, 'train_samples_per_second': 33.205, 'train_steps_per_second': 4.185, 'total_flos': 59608321228800.0, 'train_loss': 2.330863562874172, 'epoch': 5.0})

In [14]:

# -----------------------------
# 7) Save local
# -----------------------------
# Ensure the target folder exists, then write the model and the tokenizer into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(OUTPUT_DIR)

print(f"✅ Trained and saved to: {OUTPUT_DIR}")

✅ Trained and saved to: models/tiny_transformer_chatbot


In [15]:
from transformers import pipeline


In [16]:

# Reload the tokenizer and the fine-tuned weights from the output directory,
# rebinding the notebook-level `tokenizer` and `model` names to the saved copies.
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)


In [17]:

# Wrap the reloaded model and tokenizer in a text-generation pipeline.
generator = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    pad_token_id=tokenizer.eos_token_id,  # pad with the end-of-sequence id
    max_length=MAX_LEN,  # same length limit as used for tokenizing the training data
)


Device set to use cuda:0


In [18]:
# Function to get chatbot response
def get_chatbot_response(prompt, text_generator=None):
    """Generate a single-line assistant reply for ``prompt``.

    Args:
        prompt: The user's message; surrounding whitespace is stripped before
            it is placed into the "User: ... / Assistant:" template.
        text_generator: Optional callable used instead of the notebook-level
            ``generator`` pipeline. It is called as
            ``text_generator(full_prompt, num_return_sequences=1)`` and must
            return a list whose first item is a dict with a "generated_text"
            string.

    Returns:
        The first line the model produced after the "Assistant:" marker, with
        surrounding whitespace removed, or "Sorry, I didn't understand that."
        when that line is empty.
    """
    if text_generator is None:
        text_generator = generator

    # Build the prompt format
    full_prompt = f"User: {prompt.strip()}\nAssistant:"
    generated = text_generator(full_prompt, num_return_sequences=1)[0]["generated_text"]

    # The generated text echoes the prompt. Remove it by position instead of
    # searching for an "Assistant:" line, so a user message that itself contains
    # such a line cannot be mistaken for the model's reply.
    if generated.startswith(full_prompt):
        completion = generated[len(full_prompt):]
    else:
        # Prompt not echoed: treat the whole text as the completion.
        completion = generated

    # Only the first line belongs to this turn; anything after the newline is
    # the model continuing with further turns.
    assistant_response = completion.split("\n", 1)[0].strip()

    return assistant_response if assistant_response else "Sorry, I didn't understand that."

In [19]:
# Smoke test: casual greeting.
print(get_chatbot_response(prompt="Hello How are you"))


I'm glad you are! I'm delighted to assist you in learning more.


In [20]:
# Smoke test: identity question (trailing space in the prompt is stripped by the helper).
print(get_chatbot_response(prompt="Who are you "))


I'm a digital assistant designed to handle your requests. I can assist you in processing your request.


In [21]:
# Smoke test: open-ended self-description request.
print(get_chatbot_response(prompt="Tell me about youself"))


I'm a wonderful person! I'm always listening to you.


In [22]:
# Smoke test: weather question.
print(get_chatbot_response(prompt="whats the weather today "))


I'm glad I could try another day.


In [23]:
# Smoke test: minimal one-word greeting.
print(get_chatbot_response(prompt="hi"))


Hi! What's your favorite word?


In [None]:
# Archive the saved model directory into models.zip. `-r` recurses, so the
# checkpoint-* subfolders inside the directory are included as well.
!zip -r models.zip models/tiny_transformer_chatbot

  adding: models/tiny_transformer_chatbot/ (stored 0%)
  adding: models/tiny_transformer_chatbot/tokenizer.json (deflated 82%)
  adding: models/tiny_transformer_chatbot/training_args.bin (deflated 54%)
  adding: models/tiny_transformer_chatbot/model.safetensors (deflated 7%)
  adding: models/tiny_transformer_chatbot/config.json (deflated 52%)
  adding: models/tiny_transformer_chatbot/checkpoint-230/ (stored 0%)
  adding: models/tiny_transformer_chatbot/checkpoint-230/tokenizer.json (deflated 82%)
  adding: models/tiny_transformer_chatbot/checkpoint-230/training_args.bin (deflated 54%)
  adding: models/tiny_transformer_chatbot/checkpoint-230/model.safetensors (deflated 7%)
  adding: models/tiny_transformer_chatbot/checkpoint-230/config.json (deflated 52%)
  adding: models/tiny_transformer_chatbot/checkpoint-230/rng_state.pth (deflated 26%)
  adding: models/tiny_transformer_chatbot/checkpoint-230/vocab.json (deflated 59%)
  adding: models/tiny_transformer_chatbot/checkpoint-230/trainer_s