# Setup

In [1]:
!pip install python-dotenv transformers datasets huggingface_hub



In [2]:
!pip install --upgrade transformers



In [27]:
from dotenv import load_dotenv
import os
from openai import OpenAI
import json
from google.colab import drive, files
from IPython.display import Image
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import pandas as pd
from huggingface_hub import notebook_login, login
import requests
import base64
from collections import Counter, defaultdict
from sentence_transformers import SentenceTransformer, util
import torch

In [4]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
cd '/content/drive/MyDrive'

/content/drive/MyDrive


In [6]:
load_dotenv('.env')
chat_api_key = os.getenv("OPENAI_API_KEY")
hug_api_key = os.getenv("HUGGINGFACE_TOKEN")
img_api_key = os.getenv("IMGBB_API")
client = OpenAI(api_key=chat_api_key)
login(hug_api_key)

In [25]:
model_id = 'beomi/KoAlpaca-llama-1-7b'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

# Main

In [7]:
uploaded = files.upload()
filename = next(iter(uploaded))
with open(filename, "rb") as f:
    encoded_image = base64.b64encode(f.read())
url = "https://api.imgbb.com/1/upload"
payload = {
    "key": img_api_key,
    "image": encoded_image
}
response = requests.post(url, data=payload)
if response.status_code == 200:
    data = response.json()
    image_url = data['data']['url']
# uploaded_image = im.upload_image(filename, title='colab')

Saving food.jpg to food (4).jpg


In [22]:
completion = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[
        {'role' : 'system',
         'content' : '''
         Find out which ingredients are used in the food.
         And response only with the found ingredients.

         IMPORTANT: DO NOT INCLUDE ANY EXPLANATIONS OR OTHER SENTENCES EXCEPT FOR INGREDIENTS. DO NOT USE PARENTHESES.
         DO NOT USE "OR".

         Be the most specific and precise. Try to classify the ingredient precisely as possible.
         Do not use adjectives, like 'wide', 'flat'.
         EXAMPLE :
         Wide rice noodle --> rice noodle
         A fish fillet --> A Snapper fillet
         Brown sauce --> Demiglace Sauce
         Vegetables --> Bok choy, Spinach
         Broth --> Cow bone broth

         '''
        },
        {'role' : 'user', 'content' : [
            {'type' : 'image_url',
            'image_url' : {
                'url' : image_url
            ,
          }}
        ]}
    ]
)

In [23]:
content = completion.choices[0].message.content
ingredient_list = content.split(', ')

In [15]:
df_grapes = pd.read_csv('/content/drive/MyDrive/project/git/HateSlop_Final/Grapes.csv')
df_wines = pd.read_csv('/content/drive/MyDrive/project/git/HateSlop_Final/Wines.csv')
df_flavors = pd.read_csv('/content/drive/MyDrive/project/git/HateSlop_Final/flavors.csv')

In [24]:
ingredient_list

['Beef',
 'wide noodles',
 'green onions',
 'broth',
 'pickled vegetables',
 'spices.']

In [28]:
word_model = SentenceTransformer('all-MiniLM-L6-v2')
def most_similar_flavor(data, predefined):
    data_counter = Counter(word.lower() for word in data)
    flavor_embeddings = word_model.encode(predefined, convert_to_tensor=True)

    similarity_scores = defaultdict(float)

    for word, freq in data_counter.items():
        word_embedding = word_model.encode(word, convert_to_tensor=True)
        cosine_scores = util.cos_sim(word_embedding, flavor_embeddings)[0]
        for i, score in enumerate(cosine_scores):
            similarity_scores[predefined[i]] += score.item() * freq

    if not similarity_scores:
        return "unknown"
    return max(similarity_scores, key=similarity_scores.get)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [29]:
pair = most_similar_flavor(df_flavors, ingredient_list)

In [30]:
print(pair)

spices.


In [31]:
wine = '라마 올드 바인 말벡'
variety = 'Malbec'

In [None]:
from transformers import pipeline
model_jid = 'JHGarry/HateSlop_Final'
generator = pipeline("text-generation", model=model_jid, tokenizer=tokenizer)

prompt = f"Wine: {wine} \nVariety: {variety} \nNote:"
output = generator(prompt, max_new_tokens=120, do_sample=True, temperature=0.8)

print(output[0]["generated_text"])

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu


Wine: 라마 올드 바인 말벡 
Variety: Malbec 
Note: 향: 풍미가 느껴지며, 진해적인 느낌이 굉장히 강하게 감돈다. 살짝 달콤한 맛과 함께 생동감이 돋보이며, 이 러운 블
