In [12]:
# %pip install accelerate # charset-normalizer  # pandas python-dotenv transformers
%pip install --upgrade accelerate

Collecting accelerate
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.1.1-py3-none-any.whl (333 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.1.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from huggingface_hub import login
from dotenv import load_dotenv
from torch.utils.data import Dataset

import pandas as pd

import torch
import os

In [2]:
# Load environment variables from .env file
load_dotenv()

token = os.getenv("HUGGINGFACE_API_TOKEN")

login(token)

In [5]:
model_id = "meta-llama/Llama-3.2-1B"

pipe = pipeline(
    "text-generation", 
    model=model_id, 
    torch_dtype=torch.bfloat16,
    max_length=50
)

# response = pipe("What is the most common eaten foodstuff in USA?", return_full_text=False, truncation=True)
response = pipe("What is the most common eaten foodstuff in USA?", truncation=True)
response

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': 'What is the most common eaten foodstuff in USA? What is the most popular food in the world? What is the most popular food in the world? What is the most popular food in the world? What is the most popular food in the'}]

In [None]:
from charset_normalizer import detect

# Read a sample of the file
with open('data/Foodex1.csv', 'rb') as file:
    raw_data = file.read()

# Detect encoding
result = detect(raw_data)
print(f"Detected encoding: {result['encoding']}")

encoding = result['encoding']

Detected encoding: None


In [3]:
# FAOSTAT script
def get_most_eaten_food_in_faostat_dataset():
    faostat = pd.read_csv("data/FAOSTAT_food_consumption.csv")
    # print(faostat['Item'].unique())
    # faostat[faostat['Area'] == 'Afghanistan'].head(50)

    faostat_filtered_units = faostat[faostat['Unit'] == '1000 t']

    faostat_filtered_units = faostat_filtered_units[faostat_filtered_units['Value'] > 0]

    unique_countries = faostat_filtered_units['Area'].unique()

    for country in unique_countries:
        country_data = faostat_filtered_units[faostat_filtered_units['Area'] == country]

        most_eaten_food = country_data[country_data['Value'] == country_data['Value'].max()]['Item'].values[0]

        print(f"In {country} the most eaten food is {most_eaten_food}")

# Dishes script
def get_most_eaten_food_in_dishes_dataset():
    sentences = []

    dish = pd.read_csv("data/dishes.csv")

    unique_countries = dish['countries'].unique()

    for country in unique_countries:
        country_data = dish[dish['countries'] == country]

        country_regions = country_data['regions'].unique()

        for region in country_regions:
            region_data = country_data[country_data['regions'] == region]

            if len(region_data['english_name'].values) == 0:
                continue
            
            sentences.append(f"In {country}, {region} the most eaten food is {region_data['english_name'].values[0]}")
            # print(f"In {country}, {region} the most eaten food is {region_data['english_name'].values[0]}")

In [9]:
# Custom Dataset class for text generation
class TextGenerationDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize the text with padding and truncation
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Labels are the same as input_ids for causal language modeling
        encoding["labels"] = encoding["input_ids"]
        return {key: val.squeeze(0) for key, val in encoding.items()}

In [8]:
# Load the pretrained text-generation model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Ensure padding tokens are set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Loading dataset
texts = get_most_eaten_food_in_dishes_dataset()

# Prepare the dataset
train_dataset = TextGenerationDataset(texts, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir="trainer",               # Output directory
    learning_rate=5e-5,                 # Learning rate
    per_device_train_batch_size=2,      # Batch size
    weight_decay=0.01,                  # Weight decay
    save_steps=10,                      # Save checkpoint every 10 steps
    logging_dir="logs",                 # Log directory
    logging_steps=10,                   # Log every 10 steps
    fp16=True,                          # Enable mixed precision (if supported)
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()


  trainer = Trainer(
  0%|          | 0/3 [07:52<?, ?it/s]
100%|██████████| 6/6 [00:45<00:00,  7.59s/it]

{'train_runtime': 45.5253, 'train_samples_per_second': 0.264, 'train_steps_per_second': 0.132, 'train_loss': 4.899595578511556, 'epoch': 3.0}





TrainOutput(global_step=6, training_loss=4.899595578511556, metrics={'train_runtime': 45.5253, 'train_samples_per_second': 0.264, 'train_steps_per_second': 0.132, 'total_flos': 3135504384000.0, 'train_loss': 4.899595578511556, 'epoch': 3.0})

In [11]:
trainer.save_model()  # Saves the model (PyTorch model weights)
tokenizer.save_pretrained(training_args.output_dir)  # Save the tokenizer

('trainer\\tokenizer_config.json',
 'trainer\\special_tokens_map.json',
 'trainer\\vocab.json',
 'trainer\\merges.txt',
 'trainer\\added_tokens.json',
 'trainer\\tokenizer.json')

In [13]:
model = AutoModelForCausalLM.from_pretrained("trainer")

In [14]:
# Fine-tuned model

classifier = pipeline(
    "text-generation", 
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    max_length=50
)