In [5]:
# Import necessary libraries
import os

import pandas as pd
from datasets import Dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

In [6]:
# Locate the CSV relative to the project root, taken to be two directory
# levels above the current working directory.
working_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(working_dir))
dataset_path = os.path.join(project_root, 'data_collection', 'dataset.csv')

# 'utf-8-sig' skips a leading byte-order mark if the file starts with one.
df = pd.read_csv(filepath_or_buffer=dataset_path, encoding='utf-8-sig')

# Wrap the frame in a Hugging Face Dataset for the later `map` call.
dataset = Dataset.from_pandas(df)

In [7]:
# Peek at the first five rows to check how the CSV columns were parsed
df.head(n=5)

Unnamed: 0,Brand,Name,Description,Notes,Concepts,Unnamed: 6
0,Costume National,SEA THRU,"In Sea Thru, natural and molecular notes merge...","['Sea Water', 'Lemon', 'Oakmoss', 'Cypress', '...","['Dynamic', 'Sporty', 'Summer', 'Everyday', 'C...",
1,Gritti,HYSTERICA,"Hysterica, a life explosion, tailored for thos...","Hysterica, a life explosion, tailored for thos...","['Tuberose', 'Liquor', 'Plum', 'Gardenia', 'Pa...","['Everyday', 'Floral', 'Blooming', 'Round', 'R..."
2,4711,ORRIS & SILK,A warm and floral scent with the powdery opule...,"['Iris', 'Orris', 'Silk Tree Blossom. Discover...","['Everyday', 'Casual', 'Day', 'Romantic', 'Whi...",
3,Lomani,MADEMOISELLE,Mademoiselle is the portrait of an independent...,"['Jasmine', 'Vanilla (Madagascar)', 'Red Berri...","['Round', 'Everyday', 'Rich', 'Intense', 'Robu...",
4,Lomani,ELIXIR PERFECT,An elixir that brings unconditional cheerfulne...,"['Narcissus', 'Cashmeran (Woody musky)', 'Almo...","['Everyday', 'Round', 'Rich', 'Casual', 'Inten...",


In [None]:
import ast

def _parse_list_field(raw):
    """Turn a stringified list such as "['Rose', 'Musk']" into a list of strings.

    Returns an empty list when the cell is missing (not a string) or does not
    contain a valid list/tuple literal, so one malformed CSV row cannot abort
    the whole `Dataset.map` call.
    """
    if not isinstance(raw, str):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError):
        # Free text (e.g. a description that slipped into this column) is not
        # a Python literal; treat the field as empty rather than crashing.
        return []
    if not isinstance(parsed, (list, tuple)):
        return []
    return [str(item) for item in parsed]


def preprocess_function(examples):
    """Build one training string per row and tokenize the batch.

    Args:
        examples: a batch mapping column names to lists of values; the
            'Description', 'Notes' and 'Concepts' columns are read. 'Notes'
            and 'Concepts' are expected to hold stringified Python lists.

    Returns:
        Whatever the module-level `tokenizer` returns for the combined texts
        (called with truncation=True, padding='max_length').

    Each text is the description followed by the notes and the concepts,
    separated by single spaces. Missing or unparsable fields are skipped.
    """
    combined_text = []
    for desc, notes_str, concepts_str in zip(examples['Description'], examples['Notes'], examples['Concepts']):
        description = desc if isinstance(desc, str) else ''
        parts = [description]
        parts.extend(_parse_list_field(notes_str))
        parts.extend(_parse_list_field(concepts_str))
        # Drop empty parts so skipped fields do not leave stray double spaces
        combined_text.append(' '.join(part for part in parts if part))
    return tokenizer(combined_text, truncation=True, padding='max_length')

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token, so padding='max_length' would raise
# "Asking to pad but the tokenizer does not have a padding token".
# Reusing the end-of-text token as padding is the usual workaround.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize in batches; preprocess_function receives a dict of column lists.
tokenized_dataset = dataset.map(preprocess_function, batched=True)


In [None]:
# Fine-tune the model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# The collator below pads batches through the tokenizer, which needs a pad
# token; GPT-2 has none by default, so fall back to the end-of-text token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Without labels the model returns no loss and Trainer.train() fails.
# With mlm=False the collator builds causal-LM labels from input_ids and
# sets padding positions to -100 so they are ignored by the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()


In [None]:
# Generate fragrance descriptions
def generate_fragrance_description(prompt, max_length=100):
    """Generate a continuation of `prompt` with the module-level `model`.

    Args:
        prompt: text the generated description should start from.
        max_length: maximum total length in tokens (prompt included) passed
            to `model.generate`; defaults to 100.

    Returns:
        The decoded generated sequence as a string, special tokens removed.
    """
    # Switch to eval mode so dropout is disabled during generation.
    model.eval()
    # After training the model may live on an accelerator; the inputs must be
    # moved to the same device or generate() fails with a device mismatch.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    outputs = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly so generate() does not have to infer it.
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        # GPT-2 defines no pad token id; use end-of-text for open-ended generation.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try the generator on a sample prompt and show the result
prompt = 'A poetic description of a fresh morning in a rose garden'
generated_description = generate_fragrance_description(prompt)
print(generated_description)
