In [2]:
# %pip install accelerate # charset-normalizer  # pandas python-dotenv transformers
# %pip install --upgrade accelerate

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from huggingface_hub import login
from dotenv import load_dotenv
from torch.utils.data import Dataset

import pandas as pd

import torch
import os

In [4]:
# Report every CUDA device PyTorch can see (prints nothing when there is none)
for i in range(torch.cuda.device_count()):
    device_label = torch.cuda.get_device_name(i)
    print(f"Device {i}: {device_label}")

In [None]:
# Use CUDA when PyTorch reports it as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

In [None]:
# Load environment variables from .env file
load_dotenv()

token = os.getenv("HUGGINGFACE_API_TOKEN")

# Authenticate only when a token is actually configured. Calling login() with
# no token makes huggingface_hub fall back to an interactive prompt, which is
# not what an unattended "Run All" wants.
if token:
    login(token)
else:
    print("HUGGINGFACE_API_TOKEN is not set; skipping Hugging Face login.")

In [7]:
# Base checkpoint to fine-tune (alternative checkpoint kept commented out for reference).
# model_id = "meta-llama/Llama-3.2-1B"
model_id = "gpt2-medium"
# Directory the fine-tuned model and tokenizer are saved to / reloaded from.
output_dir = "model/gpt2-medium-food"
# Directory the finished model is exported to at the end of the notebook.
endpoint_url = "../apis/model/gpt-v1"
# Backward-compatible alias: other cells still refer to the original (misspelled) name.
enpoint_url = endpoint_url

In [None]:
# Baseline: build a text-generation pipeline on the base checkpoint (`model_id`)
# and ask it a question before any fine-tuning happens.
pipe = pipeline(
    "text-generation",
    device=device,
    torch_dtype=torch.bfloat16,
    model=model_id,
)

# Alternative call kept for reference:
# response = pipe("What is the most common eaten foodstuff in USA?", return_full_text=False, truncation=True)
response = pipe("What is the most eaten food in Algeria?")
response

In [None]:
from charset_normalizer import detect

# Load the raw bytes of the Foodex CSV (the entire file is read, not only a sample)
with open('data/Foodex1.csv', mode='rb') as file:
    raw_data = file.read()

# Ask charset-normalizer for its best guess and keep it in `encoding`
result = detect(raw_data)
encoding = result['encoding']

print(f"Detected encoding: {encoding}")

In [26]:
# FAOSTAT script
def get_most_eaten_food_in_faostat_dataset():
    """Print, for every area in the FAOSTAT CSV, the item with the largest value.

    Reads ``data/FAOSTAT_food_consumption.csv``, keeps only rows whose ``Unit``
    is ``'1000 t'`` and whose ``Value`` is strictly positive, and then prints
    one line per distinct ``Area`` naming the ``Item`` of the first row that
    reaches that area's maximum ``Value``.

    Returns:
        None. The result is written to stdout only.
    """
    consumption = pd.read_csv("data/FAOSTAT_food_consumption.csv")

    # Keep comparable rows only: same unit, strictly positive quantity.
    same_unit = consumption['Unit'] == '1000 t'
    positive_value = consumption['Value'] > 0
    usable_rows = consumption[same_unit & positive_value]

    for area in usable_rows['Area'].unique():
        area_rows = usable_rows.loc[usable_rows['Area'] == area]
        peak_value = area_rows['Value'].max()

        # On ties the first matching row wins.
        top_item = area_rows.loc[area_rows['Value'] == peak_value, 'Item'].values[0]

        print(f"In {area} the most eaten food is {top_item}")

# Dishes script
def get_most_eaten_food_in_dishes_dataset(csv_path="data/dishes.csv"):
    """Build question/answer training sentences from the dishes CSV.

    The CSV must provide the columns ``english_name``, ``local_name``,
    ``countries`` and ``regions``. A missing ``english_name`` is replaced by
    the row's ``local_name``.

    For every (country, region) pair, the first listed dish is used to emit
    four region-level sentences. For every country that produced at least one
    region-level dish, three country-level sentences listing those dishes are
    emitted.

    Rows without a country or without any usable dish name are ignored, and
    rows without a region contribute nothing. A country with no usable region
    gets no country-level sentences, so no sentence ends in an empty food list.

    Args:
        csv_path: Location of the dishes CSV. Defaults to ``data/dishes.csv``.

    Returns:
        list[str]: The generated sentences, grouped by country in order of
        first appearance in the CSV.
    """
    dish = pd.read_csv(csv_path)
    dish['english_name'] = dish['english_name'].fillna(dish['local_name'])

    # Without a name or a country a row cannot yield a meaningful sentence
    # (it would otherwise be rendered as the literal string "nan").
    dish = dish.dropna(subset=['english_name', 'countries'])

    sentences = []

    for country in dish['countries'].unique():
        country_data = dish[dish['countries'] == country]
        list_of_foods = []

        # NaN never compares equal to itself, so missing regions are dropped
        # explicitly instead of being silently filtered to an empty frame.
        for region in country_data['regions'].dropna().unique():
            region_data = country_data[country_data['regions'] == region]

            # The first dish listed for the region stands in for the region.
            food = region_data['english_name'].values[0]

            sentences.append(f"What is the most eaten food in {country}, {region}? In {country}, {region} the most eaten food is {food}")
            sentences.append(f"What is the most eaten food in {region}? In {country}, {region} the most eaten food is {food}")
            sentences.append(f"What do people in {region} eat? In {country}, {region} the most eaten food is {food}")
            sentences.append(f"What do people in {country}, {region} eat? In {country}, {region} the most eaten food is {food}")
            list_of_foods.append(f"{food}")

        # Skip the country-level summary when nothing was collected; an empty
        # list would produce a question with a blank answer.
        if not list_of_foods:
            continue

        foods = ', '.join(list_of_foods).rstrip()
        sentences.append(f"What is the most eaten food in {country}? The most common eaten foods in {country}: {foods}")
        sentences.append(f"What is the most eaten food in {country}? The most eaten foods in {country}: {foods}")
        sentences.append(f"What do people in {country} eat? The most eaten foods in {country}: {foods}")

    return sentences

In [None]:
# Build the training sentences and preview the ones that mention one country
sentences = get_most_eaten_food_in_dishes_dataset()

# print(sentences)

filtered_items = list(filter(lambda item: "Bosnia and Herzegovina" in item, sentences))

print(filtered_items)

In [None]:
# Custom Dataset class for text generation
class TextGenerationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for causal-LM training.

    Args:
        texts: Indexable collection of strings.
        tokenizer: Callable tokenizer. It is invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors="pt"`` and must
            return a mapping of batched tensors that supports ``.to(device)``.
        device: Device the tokenized tensors are moved to.
        max_length: Every item is padded / truncated to this many tokens.
    """

    def __init__(self, texts, tokenizer, device, max_length=512):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length
        self.device = device

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return its tensors plus ``labels``.

        Returns:
            dict: The tokenizer's tensors with the leading batch dimension
            removed, plus ``labels``: a copy of ``input_ids`` in which every
            position with ``attention_mask == 0`` is set to ``-100``.
        """
        # Tokenize the text with padding and truncation. Use the device given
        # to this instance rather than whatever global `device` happens to exist.
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        ).to(self.device)

        item = {key: val.squeeze(0) for key, val in encoding.items()}

        # Labels mirror input_ids for causal language modeling, but must be an
        # independent tensor, and padding must not contribute to the loss:
        # -100 is the index the Hugging Face loss ignores.
        labels = item["input_ids"].clone()
        if "attention_mask" in item:
            labels[item["attention_mask"] == 0] = -100
        item["labels"] = labels
        return item

In [None]:
def train_model(model_to_train, tokenizer_for_model, texts, save_model_dir):
    """Fine-tune a causal language model on ``texts`` and save the result.

    Args:
        model_to_train: Model instance handed to ``Trainer``.
        tokenizer_for_model: Tokenizer used to encode ``texts``; it is saved
            next to the model.
        texts: Collection of training strings.
        save_model_dir: Directory the trained model and tokenizer are written to.

    Notes:
        Relies on the notebook-level ``device`` variable to place the
        tokenized tensors.
    """
    # Prepare the dataset
    train_dataset = TextGenerationDataset(texts, tokenizer_for_model, device)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="trainer",               # Output directory
        learning_rate=5e-5,                 # Learning rate
        per_device_train_batch_size=2,      # Batch size
        weight_decay=0.01,                  # Weight decay
        save_strategy="no",                 # No saving on checkpoints
        logging_dir="logs",                 # Log directory
        logging_steps=10,                   # Log every 10 steps
        # Mixed precision needs a CUDA device; requesting it unconditionally
        # makes TrainingArguments fail on CPU-only machines.
        fp16=torch.cuda.is_available(),
        # The dataset already moves its tensors to `device`. Memory pinning
        # only applies to CPU tensors, so keep the DataLoader from attempting it.
        dataloader_pin_memory=False,
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model_to_train,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer_for_model,
    )

    # Fine-tune the model
    trainer.train()

    # Persist model and tokenizer side by side so the directory can be
    # reloaded with from_pretrained().
    model_to_train.save_pretrained(save_model_dir)
    tokenizer_for_model.save_pretrained(save_model_dir)

In [None]:
# Load the tokenizer and the pretrained text-generation model for `model_id`
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to(device)

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Build the training sentences and fine-tune; the result is saved to `output_dir`
texts = get_most_eaten_food_in_dishes_dataset()

train_model(
    model_to_train=model,
    tokenizer_for_model=tokenizer,
    texts=texts,
    save_model_dir=output_dir,
)

In [None]:
# Optional second pass: continue training from the checkpoint saved in `output_dir`.
# NOTE(review): the result is written to `output_dir + "-1"`, while the later
# cells load from `output_dir` — confirm which checkpoint is meant to be used.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir)
model = model.to(device)

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Rebuild the training sentences and train again
texts = get_most_eaten_food_in_dishes_dataset()

train_model(
    model_to_train=model,
    tokenizer_for_model=tokenizer,
    texts=texts,
    save_model_dir=output_dir + "-1",
)

In [None]:
# Reload the model from `output_dir`, the directory the first training run saved to.
model2 = AutoModelForCausalLM.from_pretrained(output_dir)

In [None]:
# Fine-tuned model
#
# The tokenizer is loaded from the same checkpoint directory as `model2`
# instead of reusing whichever `tokenizer` an earlier training cell left in
# the kernel, so this cell only depends on what is saved on disk.
classifier = pipeline(
    "text-generation", 
    model=model2,
    tokenizer=AutoTokenizer.from_pretrained(output_dir),
    torch_dtype=torch.bfloat16,
    max_length=100,
    device=device
)

response = classifier("What is the most eaten food in Brazil?")
response

In [None]:
# Copy the fine-tuned checkpoint from `output_dir` into the endpoint directory
tokenizer = AutoTokenizer.from_pretrained(output_dir)
finished_model = AutoModelForCausalLM.from_pretrained(output_dir)

# Model first, then tokenizer, both into the same target directory
for artifact in (finished_model, tokenizer):
    artifact.save_pretrained(enpoint_url)