In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fashion-ai-dataset/train (1).txt
/kaggle/input/fashion-ai-dataset/valid (1).txt


In [5]:
with open("../input/fashion-ai-dataset/train (1).txt") as f: 
    train_outfits = f.readlines()
with open("../input/fashion-ai-dataset/valid (1).txt") as f: 
    valid_outfits = f.readlines()

In [None]:
train_outfits[0:100]

In [6]:
#Create Tokenizer 
import torch
from transformers import GPT2LMHeadModel, AutoTokenizer

2025-05-07 17:18:57.328587: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746638337.562949      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746638337.633035      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

model = model.to(device)

In [44]:
num_added_toks = tokenizer.add_special_tokens({'additional_special_tokens': ['[SEP]', '[OUTFIT_END]']})
print(num_added_toks)
model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

2


In [45]:
for i in train_outfits: 
    encoding = tokenizer.encode(i)
    if len(encoding) >= 1024:
        print(f"{i} has an encoding larger than 1024.")

for i in valid_outfits: 
    encoding = tokenizer.encode(i)
    if len(encoding) >= 1024:
        print(f"{i} has an encoding larger than 1024.")


In [14]:
tokenizer.tokenize("mock embroidery turtle neck[SEP]black leather jacket[SEP]gold earrings[OUTFIT_END]\n")

['m',
 'ock',
 'Ġembro',
 'ider',
 'y',
 'Ġturtle',
 'Ġneck',
 '[SEP]',
 'black',
 'Ġleather',
 'Ġjacket',
 '[SEP]',
 'gold',
 'Ġear',
 'rings',
 '[OUTFIT_END]',
 'Ċ']

In [46]:
# 17,316 is a little large; we need to set up a data loader so we can feed in batches 
# Our learning task is next-token prediction, so this is a little different than the classic use case

from torch.utils.data import DataLoader, Dataset, RandomSampler
import torch

class OutfitDataset(Dataset): 
    def __init__ (self, data, tokenizer): 
        self.data = data
        self.input_ids = []
        self.attn_masks = []
        for outfit in data: 
            encodings = tokenizer.encode_plus(outfit,
                                             truncation=True,
                                             padding='max_length',
                                             return_tensors='pt'
                                             )
            self.input_ids.append(torch.squeeze(encodings['input_ids'], 0))
            self.attn_masks.append(torch.squeeze(encodings['attention_mask'], 0))

    def __len__(self): 
        return len(self.data)
    def __getitem__(self, idx): 
        return self.input_ids[idx], self.attn_masks[idx]
        
train_dataset = OutfitDataset(train_outfits, tokenizer=tokenizer)
valid_dataset = OutfitDataset(valid_outfits, tokenizer=tokenizer)
print(f"input_ids: {train_dataset[0][0]} attn_masks: {train_dataset[0][1]}")

input_ids: tensor([   76,   735,  7393,  ..., 50256, 50256, 50256]) attn_masks: tensor([1, 1, 1,  ..., 0, 0, 0])


In [47]:
# Small Subset Testing

from torch.utils.data import random_split

small_train_dataset_size = 1000
small_val_dataset_size = 400

small_train_dataset, _ = random_split(train_dataset, [small_train_dataset_size, len(train_dataset) - small_train_dataset_size])
small_val_dataset, _ = random_split(valid_dataset, [small_val_dataset_size, len(valid_dataset) - small_val_dataset_size])

subset_train_dl = DataLoader(
    small_train_dataset, 
    sampler = RandomSampler(small_train_dataset), 
    batch_size = 4,
)

subset_val_dl = DataLoader(
    small_val_dataset, 
    sampler = RandomSampler(small_val_dataset), 
    batch_size = 4,
)

In [48]:
import gc 
gc.collect()
torch.cuda.empty_cache()

In [49]:
from torch.optim import SGD
from torch.nn.utils import clip_grad_norm_
import torch.nn as nn
import matplotlib.pyplot as plt
from transformers import get_scheduler
from torch.amp import autocast, GradScaler
import time

def overfit_experiment(model, train_dl, valid_dl, train_len, valid_len, learning_rate, epochs, dropout_rate, weight_decay): 
    model.config.dropout = dropout_rate
    model = model.to(device)
    model.train() 
    optimizer = torch.optim.SGD(model.parameters(), lr=3e-5, momentum=0.9, weight_decay=weight_decay)
    
    total_steps = epochs * len(train_dl)
    scheduler = get_scheduler(
        "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )
    scaler = GradScaler("cuda") 
    
    for epoch in range(epochs): 
        start_time = time.time()
        total_train_loss = 0.0 
        total_perplexity = 0.0
        
        for step, batch in enumerate(train_dl):
            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_attention = batch[1].to(device)
            
            #forward pass
            with autocast("cuda"): 
                outputs = model(input_ids=b_input_ids, labels=b_labels, attention_mask=b_attention) #the model will shift labels to the left for next token prediction
                loss = outputs.loss
                total_train_loss += loss.item() 
                perplexity = torch.exp(loss)
                total_perplexity += perplexity.item()
                
                print(f"Step {step}: Loss: {loss}, Perplexity: {perplexity}")

            
            #back_prop
            optimizer.zero_grad()
            
            scaler.scale(loss).backward() #compute gradients

            scaler.unscale_(optimizer)
            clip_grad_norm_(model.parameters(), max_norm=1.0) #gradient clipping
            
            scaler.step(optimizer) #update weights 
            scaler.update()
            
            scheduler.step()

        #Each Epoch
        avg_train_loss = total_train_loss/len(train_dl)
        avg_train_perp = total_perplexity/len(train_dl)

        model.eval()
        total_valid_loss = 0
        total_perp_val = 0
        with torch.no_grad(): 
            for step, batch in enumerate(valid_dl): 
                inputs = batch[0].to(device)
                labels = batch[0].to(device)
                attention = batch[1].to(device)
                
                outputs = model(inputs, labels=labels, attention_mask=attention)
                total_valid_loss += outputs.loss.item()
                total_perp_val += torch.exp(outputs.loss).item()
                
        avg_valid_loss = total_valid_loss/len(valid_dl)
        avg_val_perp = total_perp_val/len(valid_dl)
        
        overfit_gap = avg_valid_loss - avg_train_loss
        end_time = time.time() 
        epoch_duration = end_time - start_time
        print(f"Epoch {epoch + 1}: Train Loss: {avg_train_loss: .4f}, Valid loss: {avg_valid_loss: .4f}, Gap: {overfit_gap: .4f}, Train Perp: {avg_train_perp: .4f}, Valid Perp: {avg_val_perp : .4f}")
        print(f"Duration: {epoch_duration}")
        return avg_train_loss, avg_valid_loss, overfit_gap, avg_train_perp, avg_val_perp

In [50]:
import pandas as pd
dropouts = [0.1, 0.2, 0.3]
weight_decays = [0.0, 0.01]

results = [] 

for d in dropouts:
    for w in weight_decays: 
        train_loss, val_loss, gap, train_perp, val_perp  = overfit_experiment(
            model,
            subset_train_dl,
            subset_val_dl,
            len(small_train_dataset), 
            len(small_val_dataset), 
            learning_rate=3e-5, 
            epochs=2, 
            dropout_rate=d,
            weight_decay=w
        )
        results.append({"dropout": d, "weight_decay": w, "train_loss": train_loss, "val_loss": val_loss, "overfit_gap": gap, "train_perp": train_perp, "val_perp": val_perp})

result_df = pd.DataFrame(results)

Step 0: Loss: 8.720152854919434, Perplexity: 6125.115234375
Step 1: Loss: 8.434131622314453, Perplexity: 4601.47265625
Step 2: Loss: 8.421244621276855, Perplexity: 4542.5537109375
Step 3: Loss: 8.694890975952148, Perplexity: 5972.32177734375
Step 4: Loss: 8.18997573852539, Perplexity: 3604.634765625
Step 5: Loss: 8.783745765686035, Perplexity: 6527.28125
Step 6: Loss: 8.70047664642334, Perplexity: 6005.7744140625
Step 7: Loss: 8.388779640197754, Perplexity: 4397.44775390625
Step 8: Loss: 9.123404502868652, Perplexity: 9167.3583984375
Step 9: Loss: 7.813678741455078, Perplexity: 2474.215576171875
Step 10: Loss: 8.497917175292969, Perplexity: 4904.54296875
Step 11: Loss: 8.451754570007324, Perplexity: 4683.28271484375
Step 12: Loss: 8.46178150177002, Perplexity: 4730.47802734375
Step 13: Loss: 8.358341217041016, Perplexity: 4265.61328125
Step 14: Loss: 8.398307800292969, Perplexity: 4439.5478515625
Step 15: Loss: 7.951657772064209, Perplexity: 2840.279541015625
Step 16: Loss: 8.686711311

KeyboardInterrupt: 

In [63]:
result_df

Unnamed: 0,dropout,weight_decay,train_loss,val_loss,overfit_gap
0,0.1,0.0,3.664187,0.92081,-2.743378
1,0.1,0.01,0.66794,0.398948,-0.268992
2,0.2,0.0,0.499912,0.334794,-0.165117
3,0.2,0.01,0.419839,0.294916,-0.124923
4,0.3,0.0,0.37774,0.279421,-0.098318
5,0.3,0.01,0.359941,0.272403,-0.087538
