In [1]:
# Import necessary libraries
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import datasets
import os
import matplotlib.pyplot as plt
from transformers import IntervalStrategy
import torch
from transformers import DataCollatorForLanguageModeling
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Version: {torch.version.cuda}")
print(f"cuDNN Version: {torch.backends.cudnn.version()}")
print(f"Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Current CUDA Device: {torch.cuda.current_device()}")
    print(f"Device Count: {torch.cuda.device_count()}")
    print(f"Device Capability: {torch.cuda.get_device_capability()}")

PyTorch Version: 2.5.1+cu121
CUDA Version: 12.1
cuDNN Version: 90100
Device: NVIDIA GeForce GTX 1650 SUPER
CUDA Available: True
Current CUDA Device: 0
Device Count: 1
Device Capability: (7, 5)


In [3]:
project_root = os.path.dirname(os.path.dirname(os.getcwd()))
dataset_path = os.path.join(project_root, 'data_collection', 'dataset.csv')
df = pd.read_csv(dataset_path, encoding='utf-8-sig')
dataset = Dataset.from_pandas(df)
from sklearn.model_selection import train_test_split

# Split the dataset into training and evaluation sets
train_test_split_ratio = 0.1
train_df, eval_df = train_test_split(df, test_size=train_test_split_ratio)
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)


In [4]:
df.head()

Unnamed: 0,Brand,Name,Description,Notes,Concepts
0,Costume National,SEA THRU,"In Sea Thru, natural and molecular notes merge...","['Sea Water', 'Lemon', 'Oakmoss', 'Cypress', '...","['Dynamic', 'Sporty', 'Summer', 'Everyday', 'C..."
1,Gritti,HYSTERICA,"Hysterica, a life explosion, tailored for thos...","['Tuberose', 'Liquor', 'Plum', 'Gardenia', 'Pa...","['Everyday', 'Floral', 'Blooming', 'Round', 'R..."
2,4711,ORRIS & SILK,A warm and floral scent with the powdery opule...,"['Iris', 'Orris', 'Silk Tree Blossom. Discover...","['Everyday', 'Casual', 'Day', 'Romantic', 'Whi..."
3,Lomani,MADEMOISELLE,Mademoiselle is the portrait of an independent...,"['Jasmine', 'Vanilla (Madagascar)', 'Red Berri...","['Round', 'Everyday', 'Rich', 'Intense', 'Robu..."
4,Lomani,ELIXIR PERFECT,An elixir that brings unconditional cheerfulne...,"['Narcissus', 'Cashmeran (Woody musky)', 'Almo...","['Everyday', 'Round', 'Rich', 'Casual', 'Inten..."


In [5]:
# Check GPU availability and setup device
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Device:", torch.cuda.get_device_name(0))
    print("GPU Memory:", torch.cuda.get_device_properties(0).total_memory / 1024**3, "GB")
    device = torch.device('cuda')
    # Enable memory optimization
    torch.backends.cudnn.benchmark = True
else:
    print("No GPU available, using CPU")
    device = torch.device('cpu')

print(f"Using device: {device}")

CUDA available: True
GPU Device: NVIDIA GeForce GTX 1650 SUPER
GPU Memory: 3.99957275390625 GB
Using device: cuda


In [6]:
import ast

# Initialize the tokenizer and set padding token
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')  # Use smaller GPT-2 tokenizer
tokenizer.pad_token = tokenizer.eos_token

try:
    model = GPT2LMHeadModel.from_pretrained('distilgpt2')  # Use smaller GPT-2 model
    model = model.to(device)  # Explicitly move model to GPU
    model.config.pad_token_id = tokenizer.pad_token_id
    print(f"Model moved to: {next(model.parameters()).device}")
except RuntimeError as e:
    if "CUDA out of memory" in str(e):
        print("GPU out of memory. Try reducing batch size or model size")
        raise e
    else:
        raise e

def preprocess_function(examples):
    combined_text = []
    for brand, name, desc, notes_str, concepts_str in zip(
        examples['Brand'], 
        examples['Name'], 
        examples['Description'], 
        examples['Notes'], 
        examples['Concepts']
    ):
        try:
            # Handle potential None values and parse lists
            desc = str(desc) if desc is not None else ""
            notes_str = str(notes_str) if notes_str is not None else "[]"
            concepts_str = str(concepts_str) if concepts_str is not None else "[]"
            
            try:
                notes_list = ast.literal_eval(notes_str)
                concepts_list = ast.literal_eval(concepts_str)
            except (ValueError, SyntaxError):
                notes_list = []
                concepts_list = []
            
            # Format the text in a structured way for the model to learn
            text = f"""Description: {desc}
            Notes: {', '.join(notes_list)}
            Concepts: {', '.join(concepts_list)}
            """
            combined_text.append(text)
        except Exception as e:
                print(f"Error processing example: {e}")
                combined_text.append("")  # Add empty string as fallback
            
    result = tokenizer(
        combined_text,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors=None
    )
    
    # Create labels for language modeling (shifted input_ids)
    result["labels"] = result["input_ids"].copy()
    
    return result

# Map the datasets with proper formatting
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=8,
    remove_columns=train_dataset.column_names,
    features=datasets.Features({
        'input_ids': datasets.Sequence(datasets.Value('int32'), length=-1),
        'attention_mask': datasets.Sequence(datasets.Value('int32'), length=-1),
        'labels': datasets.Sequence(datasets.Value('int32'), length=-1)
    })
)

tokenized_eval = eval_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=8,
    remove_columns=eval_dataset.column_names,
    features=datasets.Features({
        'input_ids': datasets.Sequence(datasets.Value('int32'), length=-1),
        'attention_mask': datasets.Sequence(datasets.Value('int32'), length=-1),
        'labels': datasets.Sequence(datasets.Value('int32'), length=-1)
    })
)

# Set format for PyTorch tensors
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_eval.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  return t.to(


Model moved to: cuda:0


Map: 100%|██████████| 12217/12217 [00:16<00:00, 727.81 examples/s]
Map: 100%|██████████| 1358/1358 [00:02<00:00, 654.24 examples/s]


In [7]:
# Get current CUDA memory usage
def print_gpu_memory():
    if torch.cuda.is_available():
        print("\nMemory Usage for each GPU:")
        for i in range(torch.cuda.device_count()):
            total_memory = torch.cuda.get_device_properties(i).total_memory / 1024**2
            allocated_memory = torch.cuda.memory_allocated(i) / 1024**2
            cached_memory = torch.cuda.memory_reserved(i) / 1024**2
            free_memory = total_memory - allocated_memory
            
            print(f"\nGPU {i}: {torch.cuda.get_device_name(i)}")
            print(f"Total Memory: {total_memory:.2f} MB")
            print(f"Allocated Memory: {allocated_memory:.2f} MB")
            print(f"Cached Memory: {cached_memory:.2f} MB")
            print(f"Free Memory: {free_memory:.2f} MB")
            print(f"Memory Usage: {(allocated_memory/total_memory)*100:.2f}%")
    else:
        print("No CUDA device available")

print_gpu_memory()


Memory Usage for each GPU:

GPU 0: NVIDIA GeForce GTX 1650 SUPER
Total Memory: 4095.56 MB
Allocated Memory: 319.24 MB
Cached Memory: 356.00 MB
Free Memory: 3776.33 MB
Memory Usage: 7.79%


In [8]:
# Fine-tune the model
torch.cuda.empty_cache()
model = GPT2LMHeadModel.from_pretrained('distilgpt2').to(device)  # Use smaller GPT-2 model
model.gradient_checkpointing_enable()  # Enable gradient checkpointing

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  # Reduce the number of epochs
    per_device_train_batch_size=4,  # Reduce batch size
    per_device_eval_batch_size=4,  # Reduce batch size
    gradient_accumulation_steps=32,  # Increase gradient accumulation steps
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    fp16=True,  # Enable mixed precision training
    dataloader_num_workers=4,  # Increase number of workers for data loading
    prediction_loss_only=True,  # Only compute loss during evaluation
    torch_empty_cache_steps=50,  # Clear CUDA cache every 50 steps
    max_steps=1000  # Limit the number of training steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

# Clear CUDA cache before starting training
torch.cuda.empty_cache()
# Start training
trainer.train()

# Save the model
model.save_pretrained('./sniff_model1')
tokenizer.save_pretrained('./sniff_model1')

# Plot training loss
training_loss = trainer.state.log_history
steps = [log['step'] for log in training_loss if 'loss' in log]
losses = [log['loss'] for log in training_loss if 'loss' in log]
plt.plot(steps, losses, label='Training Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training Loss over Time')
plt.legend()
plt.show()


  0%|          | 0/1000 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  4%|▍         | 39/1000 [1:27:28<34:27:24, 129.08s/it]

KeyboardInterrupt: 

In [None]:
# Plot training loss
training_loss = trainer.state.log_history
steps = [log['step'] for log in training_loss if 'loss' in log]
losses = [log['loss'] for log in training_loss if 'loss' in log]
plt.plot(steps, losses, label='Training Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training Loss over Time')
plt.legend()
plt.show()


In [None]:
# Generate fragrance descriptions
def generate_fragrance_description(prompt):
    # Move tensors to device after tokenization
    inputs = tokenizer(prompt, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(inputs['input_ids'], max_length=100, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

user_description = 
"This fragrance reminds me a walk in the park on a sunny day, and playing chess with my granfather along the danish coastline.My grandfather is wearing his favorite cologne, and the scent of the ocean is in the air. The fragrance is fresh and clean, with a hint of saltiness. It is a comforting and nostalgic scent that brings back memories of my childhood."
prompt = f"given creative fragrance description: {user_description}, 
you must output a perfume that captures the essence of this description.
You will output fragrance notes, concepts, sentiments, and the final fragrance description (Which outputs the user given description first).
Please output the results in JSON format that can be parsed by the python program.
"
print(generate_fragrance_description(prompt))
