In [None]:
# ! gcloud auth application-default login

In [None]:
# ! gcloud config set project ivanmkc-test
# ! gcloud auth application-default set-quota-project ivanmkc-test

In [None]:
# ! gcloud config list
# ! pip install google-cloud-aiplatform tenacity --upgrade

In [None]:
import abc
import dataclasses

In [None]:
import pandas as pd

class TranslationSource(abc.ABC):
    """Abstract source of phrases to translate.

    Deliberately not a dataclass: the class declares no fields, so a
    generated ``__eq__`` would treat every instance of the same subclass as
    equal (and make instances unhashable), regardless of the data a subclass
    stores in its own ``__init__``.
    """

    @abc.abstractmethod
    def get_phrases(self) -> list[str]:
        """Return the phrases to translate, one string per phrase.

        Subclasses must override this; the base class cannot be instantiated.
        """
    
class CantoneseTranslationSource(TranslationSource):
    """Stub source that always yields the same two placeholder phrases."""

    def get_phrases(self) -> list[str]:
        """Return a fresh list holding the placeholders ``"ABC"`` and ``"DEF"``."""
        placeholder_phrases = ["ABC", "DEF"]
        return placeholder_phrases

class PandasTranslationSource(TranslationSource):
    """Translation source backed by one column of a pandas DataFrame."""

    def __init__(self, df: pd.DataFrame, series_name: str):
        """Keep a reference to ``df`` and remember which column holds the phrases."""
        self.series_name = series_name
        self.df = df

    def get_phrases(self) -> list[str]:
        """Return the selected column's values as a list, each passed through ``str.strip``."""
        phrase_column = self.df[self.series_name]
        stripped_phrases = phrase_column.str.strip()
        return stripped_phrases.to_list()
    
class CSVTranslationSource(PandasTranslationSource):
    """Pandas-backed translation source whose DataFrame is read from a CSV file."""

    def __init__(self, csv_file: str, column_name: str):
        """Read ``csv_file`` with ``pd.read_csv`` and use ``column_name`` as the phrase column."""
        super().__init__(pd.read_csv(csv_file), column_name)

In [None]:
# import pandas as pd

# filepath = "data/cantonese_dish/cantonese-dish-dataset sheet 3 (eng to canto).csv"
# df = pd.read_csv(filepath)

# df.head()['Menu English Translation'].str.strip()

In [None]:
# Source of English phrases: the 'Menu English Translation' column of the dish CSV.
filepath = "data/cantonese_dish/cantonese-dish-dataset sheet 3 (eng to canto).csv"
olivia_canto_dish_dataset_english = CSVTranslationSource(
    csv_file=filepath,
    column_name='Menu English Translation',
)
# olivia_canto_dish_dataset_english.get_phrases()

In [None]:
import numpy as np

class Evaluator(abc.ABC):
    """Interface for scoring a batch of translations against reference phrases."""

    @staticmethod
    @abc.abstractmethod
    def evaluate(translations: list[str], phrases: list[str]) -> float:
        """Return a score for ``translations`` measured against ``phrases``.

        Declared as a static method so it can be called either on the class
        (``SomeEvaluator.evaluate(t, p)``) or on an instance, with positional
        or keyword arguments.
        """


class NaiveEvaluator(Evaluator):
    """Ratio of exact matches."""

    @staticmethod
    def evaluate(translations: list[str], phrases: list[str]) -> float:
        """Return the fraction of positions where both sequences hold equal values.

        Args:
            translations: Candidate strings, one per position.
            phrases: Reference strings, aligned with ``translations``.

        Returns:
            A float in ``[0, 1]``, or ``nan`` when both inputs are empty
            (the ratio is undefined; ``nan`` matches what the division
            produced before, minus the RuntimeWarning).

        Raises:
            ValueError: If the two sequences differ in length.
        """
        # Guard explicitly: NumPy would silently broadcast a length-1 input
        # against a longer one and report a meaningless ratio.
        if len(translations) != len(phrases):
            raise ValueError(
                f"translations and phrases must have the same length, "
                f"got {len(translations)} and {len(phrases)}"
            )
        if len(translations) == 0:
            return float("nan")

        correct_matches = np.array(translations) == np.array(phrases)
        return float(np.sum(correct_matches) / len(correct_matches))

In [None]:
from google.cloud import aiplatform

# Initialise the aiplatform SDK with the project and location this notebook uses.
# NOTE(review): the project id is hard-coded here; change it when running elsewhere.
aiplatform.init(project="ivanmkc-test", location="us-central1")
# Print every endpoint that Endpoint.list returns for us-central1, one per line.
for endpoint in aiplatform.Endpoint.list(location="us-central1"):
    print(endpoint)

In [None]:
import importlib
import sys

# Hot-reload the local `lib` package so edits made on disk are picked up.
# `importlib.reload(lib)` re-executes only the top-level package module;
# submodules that are already cached in sys.modules (lib.models.*) are left
# untouched, so the `from lib.models... import` lines below would keep
# returning stale classes. Dropping every cached `lib` module forces those
# imports to re-execute the current source instead.
for _cached_name in [name for name in sys.modules if name == "lib" or name.startswith("lib.")]:
    del sys.modules[_cached_name]
importlib.invalidate_caches()

import lib
from lib.models.model import Model
from lib.models.vertex_ai_endpoint_model import VertexAIEndpointModel
from lib.models.vertex_ai_generative_model import VertexAIGenerativeModel
from lib.models.google_translate_model import GoogleTranslateModel


# Extra hint passed to the models below that take an `additional_context` argument.
additional_context = "This phrase comes from a Cantonese restaurant menu"

# Initialize models
# Insertion order is preserved by the dict and carries through to whatever
# iterates over `models`.
models: dict[str, Model] = {
    # "gemma_v1_7b_it": VertexAIEndpointModel(endpoint='5868133139859111936', location='us-west1', target_language='Cantonese'),
    # "gemma_v2_27b_it": VertexAIEndpointModel(endpoint='7147260987148599296', location='us-west1', target_language='Cantonese'),
    # "llama_3_8b": VertexAIEndpointModel(endpoint='8833577572622139392', location='us-west1', source_language="English", target_language='Cantonese', additional_context=additional_context),
    "google_translate": GoogleTranslateModel(target_language="zh-hk"),
    "gemma_3_12b_it": VertexAIEndpointModel(
        endpoint='6619382156118458368',
        location='us-central1',
        source_language="English",
        target_language='Cantonese',
        additional_context=additional_context,
    ),
    "gemini-2.0-flash": VertexAIGenerativeModel(
        model_name="gemini-2.0-flash",
        source_language="English",
        target_language="Cantonese",
        additional_context=additional_context,
    ),
}

In [None]:
# Pull the phrases to translate from the dataset source configured earlier.
source = olivia_canto_dish_dataset_english
phrases = source.get_phrases()

print(f"Number of phrases: {len(phrases)}")

# translations maps model name -> whatever that model's translate_batch returns.
# results is only referenced by the commented-out evaluation line below.
translations = {}
results = {}

# For each word, send to the model
# Filled one model at a time, so entries for models that already finished are
# still present in `translations` if a later model raises.
for model_name, model in models.items():
    translations[model_name] = model.translate_batch(phrases)
    # results[model_name] = evaluation.evaluate(translations=translations, phrases=phrases)

In [None]:
# translations

In [38]:
# Preview: for every model, print its first ten (phrase, translation) pairs.
for model_name, translations_for_model in list(translations.items()):
    preview_pairs = list(zip(phrases, translations_for_model))[:10]
    for word, translation in preview_pairs:
        print(f"{model_name}: {word} -> {translation}")

gemini-2.0-flash: Sea Cucumber & Assorted Mushroom w/ XO Sauce -> 海參雜菌XO醬
gemini-2.0-flash: Fried Beef Flank with Onion, Shallot, Scallion with XO Sauce in Clay Pot -> XO醬蔥爆沙嗲牛腩煲
gemini-2.0-flash: XO Sauce Chicken Chow Mein -> XO醬雞肉炒麵
gemini-2.0-flash: Scallops & Prawns w/ XO Sauce -> 帶子XO醬炒蝦仁
gemini-2.0-flash: Seafood with E-Fu Noodle in XO Sauce -> XO醬海鮮伊麵
gemini-2.0-flash: Sauteed Pork Cheek Meat in XO Sauce -> XO醬炒豬面珠墩
gemini-2.0-flash: Stir-fried Rice Roll w/ XO Sauce -> XO醬炒腸粉
gemini-2.0-flash: Pan-Fried Daikon Cake w/ XO Sauce -> XO醬煎蘿蔔糕
gemini-2.0-flash: Pan Fried Shredded Beef with XO Sauce -> XO醬乾炒牛河
gemini-2.0-flash: Stir-fried Sea Cucumber Meat with Scallops, Broccoli and XO Sauce -> XO醬西蘭花帶子海參花
google_translate: Sea Cucumber & Assorted Mushroom w/ XO Sauce -> 海參什錦菇XO醬
google_translate: Fried Beef Flank with Onion, Shallot, Scallion with XO Sauce in Clay Pot -> 洋蔥蔥蔥XO醬炒牛腩煲
google_translate: XO Sauce Chicken Chow Mein -> XO醬雞炒麵
google_translate: Scallops & Prawns w/ XO Sauce

In [105]:
from lib.models import helpers

In [106]:
# One row per phrase: the source phrase followed by each model's translation at
# the same position, with one column per model in the order of `translations`.
model_names = list(translations.keys())
rows = [
    (phrase, *(translations[model_name][i] for model_name in model_names))
    for i, phrase in enumerate(phrases)
]
df = pd.DataFrame(data=rows, columns=['source'] + model_names)

# Apply text extraction
# Works on a copy so `df` keeps the raw model output; 'source' is left untouched.
df_cleaned = df.copy()
for col in df.columns:
    if col == 'source':
        continue
    df_cleaned[col] = df_cleaned[col].apply(helpers.extract_translation)

In [107]:
# Show any rows with None
# Boolean mask: True for rows where at least one column is null.
rows_with_missing = df_cleaned.isna().any(axis="columns")
df_cleaned[rows_with_missing]

Unnamed: 0,source,gemini-2.0-flash,google_translate,gemma_3_12b_it
539,"Pork Spare Rib Soup with Dried Octopus, Dried ...",,章魚蠔豉蓮藕排骨湯,豬骨湯配乾魷魚、乾蠔同蓮藕


In [108]:
# Show any rows with 'translation'
def _row_mentions_translation(row):
    """Return True if any cell of ``row``, cast to str, contains 'translation' (case-insensitive)."""
    return row.astype(str).str.contains('translation', case=False).any()


df_cleaned[df_cleaned.apply(_row_mentions_translation, axis=1)]

Unnamed: 0,source,gemini-2.0-flash,google_translate,gemma_3_12b_it


In [109]:
# Display the cleaned translations table as this cell's output.
df_cleaned

Unnamed: 0,source,gemini-2.0-flash,google_translate,gemma_3_12b_it
0,Sea Cucumber & Assorted Mushroom w/ XO Sauce,海參雜菌XO醬,海參什錦菇XO醬,蟲草花同雜菇 XO 汁
1,"Fried Beef Flank with Onion, Shallot, Scallion...",XO醬蔥爆沙嗲牛腩煲,洋蔥蔥蔥XO醬炒牛腩煲,炸洋蔥𠝹蔥同XO醬煲牛仔骨
2,XO Sauce Chicken Chow Mein,XO醬雞肉炒麵,XO醬雞炒麵,魚子醬雞炒麵
3,Scallops & Prawns w/ XO Sauce,帶子XO醬炒蝦仁,XO 醬乾貝蝦,扇家樂同蝦仁撈XO醬
4,Seafood with E-Fu Noodle in XO Sauce,XO醬海鮮伊麵,XO醬伊麵海鮮,豉油皇炒伊麵配海鮮
...,...,...,...,...
996,Coke with Ice-cream,雪糕可樂,可樂加冰淇淋,凍可樂
997,Sponge Cake w/ Brown Sugar,砵仔糕,紅糖海綿蛋糕,粟糖海綿蛋糕
998,Fresh Okra Poached w/ Garlic & Scallion,蒜蓉蔥油浸秋葵,新鮮秋葵配大蒜和蔥一起煮,蒜蓉花浸 okra
999,Vegetarian Trio,齋三寶,素食三重奏,齋精菜三拼


In [110]:
# Write the cleaned table to translations.csv in the current working directory.
# to_csv is called with its defaults, so the DataFrame index is written as well
# (pandas' default is index=True).
df_cleaned.to_csv("translations.csv")

In [111]:
# Undeploy and delete models when done
# WARNING: destructive. With UNDEPLOY = True this cell acts on every endpoint and
# every model that the list() calls return for each location below, not only on
# the ones referenced elsewhere in this notebook.

locations = ["us-central1", "us-west1"]

# Set to False to skip the cleanup below (e.g. before a "Run All").
UNDEPLOY = True

if UNDEPLOY:
    for location in locations:
        # First undeploy everything from each endpoint listed for this location ...
        for endpoint in aiplatform.Endpoint.list(location=location):
            endpoint.undeploy_all()    
        # ... then delete each model listed for this location.
        for model in aiplatform.Model.list(location=location):
            model.delete()


Undeploying Endpoint model: projects/169190568756/locations/us-central1/endpoints/2387687356250456064
Undeploy Endpoint model backing LRO: projects/169190568756/locations/us-central1/endpoints/2387687356250456064/operations/1901194245969018880
Endpoint model undeployed. Resource name: projects/169190568756/locations/us-central1/endpoints/2387687356250456064
Undeploying Endpoint model: projects/169190568756/locations/us-central1/endpoints/6619382156118458368
Undeploy Endpoint model backing LRO: projects/169190568756/locations/us-central1/endpoints/6619382156118458368/operations/1498122079319359488
Endpoint model undeployed. Resource name: projects/169190568756/locations/us-central1/endpoints/6619382156118458368
Deleting Model : projects/169190568756/locations/us-central1/models/llama3-8b-chat-1741980437648
Model deleted. . Resource name: projects/169190568756/locations/us-central1/models/llama3-8b-chat-1741980437648
Deleting Model resource: projects/169190568756/locations/us-central1/mo