In [2]:
import os, re, ast, json, math, shutil, random
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model

In [3]:
# Read the palette dataset (hex color lists plus a text description per row)
# and preview the first rows.
palette_csv = "data/par_dfdata.csv"
df = pd.read_csv(palette_csv)
df.head()

Unnamed: 0,colors,resp
0,"['#c5d70f', '#4dd70f', '#0fd749', '#0fd7c1']",Fresh greens with a vibrant twist.
1,"['#6837e6', '#d037e6', '#e63792', '#e64537']",Vibrant purples and fiery pinks.
2,"['#a5d67a', '#7ad686', '#7ad6bd', '#7ab8d6']",Fresh and cool summery greens and blues.
3,"['#28e744', '#28e7b7', '#28a4e7', '#2832e7']","Vivid greens and blues, fresh harmony."
4,"['#d5171a', '#d58517', '#b2d517', '#40d517']",Vivid transition from red to green.


#### We want to build prompts for a palette-generation model: given a description, the model should output the palette's colors in hex format.


In [5]:
def preprocess(row):
    """Turn one dataset row into a ``(prompt, target)`` pair for fine-tuning.

    Args:
        row: Mapping-like object (a DataFrame row or a dict) with a ``"colors"``
            entry holding either a list/tuple of hex colors or its string form
            (e.g. ``"['#c5d70f', '#4dd70f']"``), and a ``"resp"`` entry holding
            the text description.

    Returns:
        tuple[str, str]: the instruction prompt (ending in ``"Colors:"``) and a
        JSON array string with exactly four lowercase colors. Missing colors
        are filled with ``"#000000"``; extra colors are dropped.
    """
    raw = row["colors"]
    if isinstance(raw, (list, tuple)):
        # Already parsed (e.g. the frame was not round-tripped through CSV).
        parsed = raw
    else:
        try:
            parsed = ast.literal_eval(raw)
        except Exception:
            # Deliberate best-effort: an unparsable cell becomes an all-padding target.
            parsed = []
    # literal_eval accepts any Python literal (number, string, dict, ...). Only a
    # list/tuple is a palette; anything else would either raise on iteration or
    # be iterated character by character, so treat it as "no colors".
    if not isinstance(parsed, (list, tuple)):
        parsed = []

    cols = [str(c).lower() for c in parsed[:4]]
    cols += ["#000000"] * (4 - len(cols))  # pad short palettes up to four entries

    prompt = (
        "You are a palette generator. Given a theme description, output ONLY a JSON array "
        "of exactly 4 hex colors (lowercase), no extra text.\n\n"
        f"Description: {row['resp']}\nColors:"
    )
    return prompt, json.dumps(cols)

# Build one (prompt, target) pair per row, then hold out the last two pairs
# as a tiny validation set.
data = [preprocess(record) for _idx, record in df.iterrows()]
train_data = data[:-2]
val_data = data[-2:]
print(train_data[0])


('You are a palette generator. Given a theme description, output ONLY a JSON array of exactly 4 hex colors (lowercase), no extra text.\n\nDescription: Fresh greens with a vibrant twist.\nColors:', '["#c5d70f", "#4dd70f", "#0fd749", "#0fd7c1"]')


#### Exploring the tokenizer

In [13]:
# Load the fast tokenizer for the base checkpoint and show its padding and
# end-of-sequence tokens together with their ids.
model_name = "google/gemma-3-270m"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
(
    tokenizer.pad_token,
    tokenizer.eos_token,
    tokenizer.pad_token_id,
    tokenizer.eos_token_id,
)

('<pad>', '<eos>', 0, 1)

In [31]:
# dir(tokenizer)  # uncomment to list every attribute and method the tokenizer exposes

In [16]:
# Print every special token next to the vocabulary id it maps to.
for token_role, token_text in tokenizer.special_tokens_map.items():
    token_id = tokenizer.convert_tokens_to_ids(token_text)
    print(token_role, token_text, token_id)

bos_token <bos> 2
eos_token <eos> 1
unk_token <unk> 3
pad_token <pad> 0
boi_token <start_of_image> 255999
eoi_token <end_of_image> 256000
image_token <image_soft_token> 262144


In [18]:
# Load the base causal LM in float32 with the eager attention implementation,
# then display the module tree.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",
    torch_dtype=torch.float32,
)
model

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 640, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=640, out_features=1024, bias=False)
          (k_proj): Linear(in_features=640, out_features=256, bias=False)
          (v_proj): Linear(in_features=640, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=640, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=640, out_features=2048, bias=False)
          (up_proj): Linear(in_features=640, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=640, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((640,), eps

#### Tokenizing a sample prompt

In [21]:
# Tokenize a tiny prompt into PyTorch tensors (truncated to at most 100 tokens).
test_prompt = "hello there"
test_inp = tokenizer(
    test_prompt,
    truncation=True,
    max_length=100,
    return_tensors="pt",
)
test_inp

{'input_ids': tensor([[    2, 23391,   993]]), 'attention_mask': tensor([[1, 1, 1]])}

#### Feeding the tokens through the model — no labels yet, so we'll just get raw logits back.


In [23]:
# Forward pass with only input_ids / attention_mask: no labels are supplied,
# so the returned output carries logits but no loss.
test_op = model(**test_inp)
test_op

CausalLMOutputWithPast(loss=None, logits=tensor([[[-11.8900,   5.9166,  -0.8929,  ..., -11.9101, -11.9250, -11.8976],
         [-20.3263,   5.4343,  -2.8779,  ..., -20.3377, -20.3426, -20.3376],
         [-21.2356,   7.8493,  -3.0620,  ..., -21.2505, -21.2557, -21.2723]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=DynamicCache(layers=[DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicLayer]), hidden_states=None, attentions=None)

In [24]:
# Logits are (batch, sequence length, vocabulary size): one score per vocabulary
# entry at every input position.
test_op.logits.shape

torch.Size([1, 3, 262144])

In [26]:
# Build labels for the sample prompt as a copy of its own input ids.
# NOTE: from here on `test_op` holds these label ids, not the model output
# assigned to the same name in the earlier cell.
test_enc = tokenizer(
    test_prompt, truncation=True, max_length=100, return_tensors="pt"
)
test_op = test_enc["input_ids"].clone()
test_op

tensor([[    2, 23391,   993]])

#### Now let's add labels for supervised training

In [28]:
# Same forward pass as before, now with labels, so the output also carries a loss.
test_op_with_label = model(
    **test_inp,
    labels=test_op,
)
test_op_with_label

CausalLMOutputWithPast(loss=tensor(7.5639, grad_fn=<NllLossBackward0>), logits=tensor([[[-11.8900,   5.9166,  -0.8929,  ..., -11.9101, -11.9250, -11.8976],
         [-20.3263,   5.4343,  -2.8779,  ..., -20.3377, -20.3426, -20.3376],
         [-21.2356,   7.8493,  -3.0620,  ..., -21.2505, -21.2557, -21.2723]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=DynamicCache(layers=[DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicSlidingWindowLayer, DynamicLayer]), hidden_states=None, attentions=None)

##### When labels are passed, the model also returns a loss.
##### So the inputs the model needs for training are: input_ids, attention_mask, and labels.

In [35]:

# Simulate a small batch of (input, output) pairs.
# The goal: verify how padding, masking, and labels interact before doing real training.

test_batch = [
    {"input":"who this","output":"Bill Gates"},
    {"input":"what do you do?", "output": "Run Microsoft"}
]

# Row layout (every row ends up exactly max_len long):
#   input_ids      : [input tokens] + [output tokens] + [pad, pad, ...]
#   labels         : [-100, ...]    + [output tokens] + [-100, -100, ...]
#   attention_mask : [1, ...]       + [1, ...]        + [0, 0, ...]
# A label of -100 keeps that position out of the loss, so only the output
# tokens contribute to training.
max_len = 20
IGNORE_INDEX = -100

input_ids_list = []
labels_list = []
attention_mask_list = []

for ex in test_batch:
    # 1. Tokenize input and output separately (no special tokens added) so the
    #    prompt length is known and can be masked out of the labels.
    # NOTE(review): no <eos> is appended to the target, so rows built this way
    # never show the model where to stop; consider appending
    # tokenizer.eos_token_id in the real training collator.
    input_enc = tokenizer(ex["input"], add_special_tokens=False)
    output_enc = tokenizer(" " + ex["output"], add_special_tokens=False)
    prompt_ids = input_enc["input_ids"]
    target_ids = output_enc["input_ids"]

    # 2. Concatenate for model input, truncating to max_len. Without the
    #    truncation an over-long example yields a negative pad_len, the rows
    #    come out with different lengths, and torch.tensor() below fails.
    ids = (prompt_ids + target_ids)[:max_len]
    row_labels = ([IGNORE_INDEX] * len(prompt_ids) + target_ids)[:max_len]
    attn = [1] * len(ids)  # attention on all real tokens

    # 3. Right-pad to max_len (pad_len is never negative after truncation)
    pad_len = max_len - len(ids)
    ids += [tokenizer.pad_token_id] * pad_len
    row_labels += [IGNORE_INDEX] * pad_len
    attn += [0] * pad_len

    input_ids_list.append(ids)
    labels_list.append(row_labels)
    attention_mask_list.append(attn)

# 4. Convert to tensors
input_ids = torch.tensor(input_ids_list, dtype=torch.long)
labels = torch.tensor(labels_list, dtype=torch.long)
attention_mask = torch.tensor(attention_mask_list, dtype=torch.long)

print("input_ids shape:", input_ids.shape)
print(input_ids[0])
print("labels shape:", labels.shape)
print(labels[0])
print("attention_mask shape:", attention_mask.shape)
print(attention_mask[0])

input_ids shape: torch.Size([2, 20])
tensor([14625,   672,  8599, 46128,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
labels shape: torch.Size([2, 20])
tensor([ -100,  -100,  8599, 46128,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100])
attention_mask shape: torch.Size([2, 20])
tensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [36]:
# Forward pass on the hand-built batch; because labels are passed, the output
# includes a loss.
model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    labels=labels,
)

CausalLMOutputWithPast(loss=tensor(17.3345, grad_fn=<NllLossBackward0>), logits=tensor([[[-12.6587,  10.7030,  -1.6230,  ..., -12.7321, -12.7272, -12.6383],
         [ -9.5866,  14.0531,   0.8286,  ...,  -9.6608,  -9.5785,  -9.5945],
         [ -4.6152,  18.6236,   3.4610,  ...,  -4.6892,  -4.6904,  -4.6926],
         ...,
         [ -4.6603,  20.0995,   6.9139,  ...,  -4.7205,  -4.7245,  -4.7389],
         [ -4.6376,  19.4029,   6.4830,  ...,  -4.6948,  -4.6971,  -4.7025],
         [ -4.6061,  18.9384,   5.9728,  ...,  -4.6542,  -4.6637,  -4.6638]],

        [[-12.1718,  16.4112,   5.4810,  ..., -12.3260, -12.2412, -12.1741],
         [-13.5180,  24.3541,   0.4144,  ..., -13.7015, -13.7320, -13.7126],
         [-12.9700,  21.1659,  -1.3985,  ..., -13.1612, -13.1265, -13.1250],
         ...,
         [ -9.4023,  16.0890,   4.9971,  ...,  -9.3868,  -9.4234,  -9.4143],
         [ -9.4365,  15.1447,   4.8966,  ...,  -9.4318,  -9.4604,  -9.4549],
         [ -9.2746,  15.0343,   4.1068,  ..

In [38]:
# Let the base model continue the sample prompt for up to 24 new tokens.
op = model.generate(
    **test_inp,
    max_new_tokens=24,
)
op

tensor([[     2,  23391,    993, 236761,    834,    607,    496,   2268,   3103,
            529,   4481,   1601,    699,    786,    532,   1041,  32239, 236764,
            564, 236789, 236757,   1771,    531,    577,  26804,    672,    528]])

In [40]:
# Decode the generated ids back to text, keeping special tokens such as <bos> visible.
tokenizer.decode(op[0], skip_special_tokens=False)

"<bos>hello there. so with a little bit of extra help from me and my girlfriend, I'm going to be explaining this in"