# Original

In [3]:
import numpy as np

# Load pre-trained word embeddings for English and Polish
def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.array(values[1:], dtype=np.float64)
            embeddings[word] = vector
    return embeddings

en_embeddings = load_embeddings('../muse/wiki.multi.en.vec')
pl_embeddings = load_embeddings('../muse/wiki.multi.pl.vec')

# Sample sentences in English and Polish
sentence_en = "The violin sang melancholic melodies, echoing the sorrowful tales of lost love and distant dreams."
sentence_pl = "Biblioteka tętniła ekscytacją, gdy studenci z zapałem odkrywali ogromną kolekcję wiedzy, dyskutując o pomysłach i odkryciach."

# Tokenize sentences
tokens_en = sentence_en.lower().split()
tokens_pl = sentence_pl.lower().split()

# Get word embeddings for each word in the sentences
def sentence_embedding(tokens, embeddings):
    word_embeddings = [embeddings[word] for word in tokens if word in embeddings]
    if len(word_embeddings) > 0:
        return np.mean(word_embeddings, axis=0)
    else:
        return None

sentence_embedding_en = sentence_embedding(tokens_en, en_embeddings)
sentence_embedding_pl = sentence_embedding(tokens_pl, pl_embeddings)

# Compute similarity between the sentence embeddings
if sentence_embedding_en is not None and sentence_embedding_pl is not None:
    similarity = np.dot(sentence_embedding_en, sentence_embedding_pl) / (np.linalg.norm(sentence_embedding_en) * np.linalg.norm(sentence_embedding_pl))
    print(f"Similarity between the sentences: {similarity}")
else:
    print("No embeddings found for the sentence words.")


Similarity between the sentences: 0.6605645867223872


# Improved

In [4]:
import numpy as np

# Load pre-trained word embeddings for English and Polish
def load_embeddings(file_path):
    embeddings = dict()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.array(values[1:], dtype=np.float64)
            embeddings[word] = vector
    return embeddings

en_embeddings = load_embeddings('../muse/wiki.multi.en.vec')
pl_embeddings = load_embeddings('../muse/wiki.multi.pl.vec')

sentence_en = "The violin sang melancholic melodies, echoing the sorrowful tales of lost love and distant dreams."
sentence_pl = "Biblioteka tętniła ekscytacją, gdy studenci z zapałem odkrywali ogromną kolekcję wiedzy, dyskutując o pomysłach i odkryciach."

# Tokenize sentences
def preprocess_sentence(sentence):
    tokens = sentence.lower().split()
    return tokens

# Get word embeddings for each word in the sentences
def sentence_embedding(tokens, embeddings):
    word_embeddings = [embeddings[word] for word in tokens if word in embeddings]
    if len(word_embeddings) > 0:
        return np.mean(word_embeddings, axis=0)
    else:
        return None

# Compute similarity between the sentence embeddings
def compute_similarity(sentence_embedding_en, sentence_embedding_pl):
    similarity = np.dot(sentence_embedding_en, sentence_embedding_pl) / (np.linalg.norm(sentence_embedding_en) * np.linalg.norm(sentence_embedding_pl))
    return similarity

tokens_en = preprocess_sentence(sentence_en)
tokens_pl = preprocess_sentence(sentence_pl)

sentence_embedding_en = sentence_embedding(tokens_en, en_embeddings)
sentence_embedding_pl = sentence_embedding(tokens_pl, pl_embeddings)

#Output the results
if sentence_embedding_en is not None and sentence_embedding_pl is not None:
    similarity = compute_similarity(sentence_embedding_en, sentence_embedding_pl)
    print(f"Similarity between the sentences: {similarity}")
else:
    print("No embeddings found for the sentence words.")


Similarity between the sentences: 0.6605645867223872


# TF-IDF

In [5]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def load_embeddings(file_path):
    embeddings = dict()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.array(values[1:], dtype=np.float64)
            embeddings[word] = vector
    return embeddings

# Your sentence data and functions
en_embeddings = load_embeddings('../muse/wiki.multi.en.vec')
pl_embeddings = load_embeddings('../muse/wiki.multi.pl.vec')

sentence_en = "The violin sang melancholic melodies, echoing the sorrowful tales of lost love and distant dreams."
sentence_pl = "Biblioteka tętniła ekscytacją, gdy studenci z zapałem odkrywali ogromną kolekcję wiedzy, dyskutując o pomysłach i odkryciach."

def preprocess_sentence(sentence):
    tokens = sentence.lower().split()
    return tokens

def compute_similarity(sentence_embedding_en, sentence_embedding_pl):
    similarity = np.dot(sentence_embedding_en, sentence_embedding_pl) / (np.linalg.norm(sentence_embedding_en) * np.linalg.norm(sentence_embedding_pl))
    return similarity

# Create a TF-IDF vectorizer
tfidf_en = TfidfVectorizer()
tfidf_pl = TfidfVectorizer()

# Fit the vectorizer with the sentences
tfidf_en.fit([sentence_en])
tfidf_pl.fit([sentence_pl])

# Transform sentences to get TF-IDF values for each word
tfidf_matrix_en = tfidf_en.transform([sentence_en])
tfidf_matrix_pl = tfidf_pl.transform([sentence_pl])

# Map words to their TF-IDF values
word_to_tfidf_en = dict(zip(tfidf_en.get_feature_names_out(), tfidf_matrix_en.toarray()[0]))
word_to_tfidf_pl = dict(zip(tfidf_pl.get_feature_names_out(), tfidf_matrix_pl.toarray()[0]))

def sentence_embedding(tokens, embeddings, word_to_tfidf):
    weighted_word_embeddings = []
    for word in tokens:
        if word in embeddings and word in word_to_tfidf:
            weighted_embedding = embeddings[word] * word_to_tfidf[word]
            weighted_word_embeddings.append(weighted_embedding)
    if weighted_word_embeddings:
        return np.mean(weighted_word_embeddings, axis=0)
    else:
        return None


tokens_en = preprocess_sentence(sentence_en)
tokens_pl = preprocess_sentence(sentence_pl)

# Get sentence embeddings using TF-IDF weighted word embeddings
sentence_embedding_en = sentence_embedding(tokens_en, en_embeddings, word_to_tfidf_en)
sentence_embedding_pl = sentence_embedding(tokens_pl, pl_embeddings, word_to_tfidf_pl)

if sentence_embedding_en is not None and sentence_embedding_pl is not None:
    similarity = compute_similarity(sentence_embedding_en, sentence_embedding_pl)
    print(f"Similarity between the sentences: {similarity}")
else:
    print("No embeddings found for the sentence words.")


Similarity between the sentences: 0.5801157206672021


# word2vec

In [7]:
import numpy as np
from gensim.models import KeyedVectors

# Load Word2Vec embeddings
def load_word2vec_embeddings(file_path):
    embeddings = KeyedVectors.load_word2vec_format(file_path, binary=False)
    return embeddings

# Function to load embeddings using the existing load_embeddings function
def load_embeddings(file_path, is_word2vec=False):
    if is_word2vec:
        return load_word2vec_embeddings(file_path)
    else:
        embeddings = dict()
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.rstrip().split(' ')
                word = values[0]
                vector = np.array(values[1:], dtype=np.float64)
                embeddings[word] = vector
        return embeddings

# Load Word2Vec embeddings for English and Polish
en_embeddings = load_embeddings('../muse/wiki.multi.en.vec', is_word2vec=True)
pl_embeddings = load_embeddings('../muse/wiki.multi.pl.vec', is_word2vec=True)

# Your sentences
sentence_en = "The violin sang melancholic melodies, echoing the sorrowful tales of lost love and distant dreams."
sentence_pl = "Biblioteka tętniła ekscytacją, gdy studenci z zapałem odkrywali ogromną kolekcję wiedzy, dyskutując o pomysłach i odkryciach."

# Function to preprocess the sentence
def preprocess_sentence(sentence):
    tokens = sentence.lower().split()
    return tokens

# Function to calculate sentence embeddings using Word2Vec representations
def sentence_embedding(tokens, embeddings):
    word_embeddings = []
    for word in tokens:
        if word in embeddings:
            word_embeddings.append(embeddings[word])
    if word_embeddings:
        return np.mean(word_embeddings, axis=0)
    else:
        return None

tokens_en = preprocess_sentence(sentence_en)
tokens_pl = preprocess_sentence(sentence_pl)

# Get sentence embeddings using Word2Vec word embeddings
sentence_embedding_en = sentence_embedding(tokens_en, en_embeddings)
sentence_embedding_pl = sentence_embedding(tokens_pl, pl_embeddings)

if sentence_embedding_en is not None and sentence_embedding_pl is not None:
    similarity = np.dot(sentence_embedding_en, sentence_embedding_pl) / (np.linalg.norm(sentence_embedding_en) * np.linalg.norm(sentence_embedding_pl))
    print(f"Similarity between the sentences: {similarity}")
else:
    print("No embeddings found for the sentence words.")


Similarity between the sentences: 0.6605645418167114
