# In [None]:  (notebook cell prompt kept as a comment so the file parses as Python)
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Single checkpoint identifier, so the tokenizer vocabulary and the model
# weights are always requested from the same pre-trained checkpoint.
MODEL_NAME = 'bert-base-multilingual-cased'

# Load the pre-trained multilingual BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

def get_embedding(text, tokenizer, model):
    """Return a mean-pooled sentence embedding for ``text`` as a NumPy array.

    The text is tokenized (with padding and truncation enabled), passed
    through ``model`` without gradient tracking, and the final hidden states
    are averaged over the token axis (dim 1).

    Padding positions are excluded from the average by weighting each token
    with the tokenizer's attention mask. For a single string nothing is
    padded, so the result equals a plain mean over all tokens; for a list of
    strings of different lengths this keeps pad tokens from diluting the
    shorter sentences' embeddings.

    Args:
        text: A string, or a list of strings, accepted by ``tokenizer``.
        tokenizer: Callable returning a mapping of model inputs; it is
            expected to include an ``"attention_mask"`` entry (1 = real
            token, 0 = padding). If absent, all positions are averaged.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor
            (assumed shape ``(batch, seq_len, hidden)``).

    Returns:
        numpy.ndarray: The pooled embedding with singleton dimensions
        squeezed out, so a single input yields a 1-D vector.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to averaging every position.
        pooled = hidden_states.mean(dim=1)
    else:
        # Broadcast the mask over the hidden dimension and average only the
        # positions marked as real tokens.
        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_sum = (hidden_states * weights).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_count = weights.sum(dim=1).clamp(min=1)
        pooled = token_sum / token_count

    return pooled.squeeze().numpy()

# Example idioms and phrases (replace with actual Chinese idioms and English phrases).
# The commented-out entries are additional examples that are currently disabled.
chinese_idioms = [
    "对牛弹琴",
    # "画蛇添足",
    # "守株待兔",
]
english_phrases = [
    "preaching to the choir",
    # "overdoing something",
    # "waiting for something to happen",
]

# One embedding vector per Chinese idiom
chinese_embeddings = [
    get_embedding(text, tokenizer, model) for text in chinese_idioms
]

# One embedding vector per English phrase
english_embeddings = [
    get_embedding(text, tokenizer, model) for text in english_phrases
]

# Pairwise cosine similarity: rows follow chinese_idioms, columns follow
# english_phrases.
similarity_matrix = cosine_similarity(chinese_embeddings, english_embeddings)

# Report every idiom/phrase pair with its similarity score
for idiom, scores in zip(chinese_idioms, similarity_matrix):
    for phrase, score in zip(english_phrases, scores):
        print(f"Similarity between '{idiom}' and '{phrase}': {score:.4f}")

# [cell output, not code] Appears to be the tail of a tqdm import warning:
#   from .autonotebook import tqdm as notebook_tqdm
