# Exploring parallel English German Embeddings

In [1]:
from langchain.embeddings import HuggingFaceEmbeddings

In [3]:
embedding_provider = HuggingFaceEmbeddings(model_name='T-Systems-onsite/cross-en-de-roberta-sentence-transformer')

Downloading (…)7ceaf/.gitattributes:   0%|          | 0.00/445 [00:00<?, ?B/s]

Downloading (…)e177eaa7ceaf/LICENSE:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading (…)77eaa7ceaf/README.md:   0%|          | 0.00/8.05k [00:00<?, ?B/s]

Downloading (…)eaa7ceaf/config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

No sentence-transformers model found with name /home/hfwittmann/.cache/torch/sentence_transformers/T-Systems-onsite_cross-en-de-roberta-sentence-transformer. Creating a new one with MEAN pooling.


In [4]:
embedding_provider

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
), model_name='T-Systems-onsite/cross-en-de-roberta-sentence-transformer', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False)

In [5]:
# Generate embeddings
texts = ["This is a test.", "Another test.", "Diese Person ist unglücklich.", "Diese Person ist glücklich", "This person is happy."]
embeddings = embedding_provider.embed_documents(texts)

# Print the embeddings
for text, embedding in zip(texts, embeddings):
    print(f"Text: {text}")
    # print(f"Embedding: {embedding}")
    print()

Text: This is a test.

Text: Another test.

Text: Diese Person ist unglücklich.

Text: Diese Person ist glücklich

Text: This person is happy.



# Define cosine similarity by Hand

In [6]:
import numpy as np
def cosine_similarity(a, b):
    a = np.array(a)
    b = np.array(b)
    
    return a.dot(b) / (a.dot(a) * b.dot(b))**0.5
    

In [7]:
cs = cosine_similarity(embeddings[2], embeddings[3])
print(cs, embeddings[2], embeddings[3])

0.33797983307128815 [0.07698100060224533, 0.3046988844871521, -0.0819513127207756, 0.0027928634081035852, 0.9755547642707825, -0.14837656915187836, -0.2770584523677826, -0.17466460168361664, 0.09542140364646912, 0.0629059448838234, -0.19690220057964325, -0.6073769330978394, -0.00852902326732874, 0.08462145924568176, -0.4594670832157135, -0.3510087728500366, 0.636410653591156, -0.07009952515363693, 0.17461811006069183, -0.02023945562541485, -0.22675937414169312, 0.6439845561981201, -0.0010363815817981958, -0.27838805317878723, -0.27073073387145996, 0.018822437152266502, 1.1090550422668457, -0.3238269090652466, -0.08327383548021317, -0.22292619943618774, -0.5421062111854553, 0.23257514834403992, -0.3452030420303345, 0.02640361897647381, -0.11112788319587708, -0.11718830466270447, 0.03944103419780731, -0.029476217925548553, -0.008027421310544014, 0.09952959418296814, 0.657079815864563, -0.027036849409341812, 0.2496059685945511, 0.009831379167735577, 0.12528583407402039, -0.009803388267755

In [8]:
for text1 in texts:
    for text2 in texts:
        embeddings1 = embedding_provider.embed_documents([text1])[0]
        embeddings2 = embedding_provider.embed_documents([text2])[0]
        c_s = cosine_similarity(embeddings1, embeddings2)
        print('------------------------------------------------------------')
        print(text1, text2, 'similarity:', c_s)

------------------------------------------------------------
This is a test. This is a test. similarity: 1.0
------------------------------------------------------------
This is a test. Another test. similarity: 0.6021323859011859
------------------------------------------------------------
This is a test. Diese Person ist unglücklich. similarity: 0.0063078595935602885
------------------------------------------------------------
This is a test. Diese Person ist glücklich similarity: 0.03549630299764126
------------------------------------------------------------
This is a test. This person is happy. similarity: 0.06479121622443222
------------------------------------------------------------
Another test. This is a test. similarity: 0.6021323859011859
------------------------------------------------------------
Another test. Another test. similarity: 1.0
------------------------------------------------------------
Another test. Diese Person ist unglücklich. similarity: -0.00267304704708