In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import faiss

# Function to read sentences from a text file
def read_sentences(file_path):
    """Read a text file and return its non-empty lines as a list of sentences.

    Each line is treated as one sentence. Leading and trailing whitespace is
    stripped, and lines that are empty after stripping are dropped so that
    blank lines are never handed to the embedding model as sentences.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of stripped, non-empty lines in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' decodes plain UTF-8 as well, but also discards a leading
    # byte-order mark so it does not end up glued to the first sentence.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]

# Load the LaBSE model
model = SentenceTransformer('LaBSE')

# Read sentences from your files
chinese_sentences = read_sentences('Santi01.txt')
english_sentences = read_sentences('ThreeBody01.txt')

# Fail early with a clear message: an empty corpus would otherwise surface
# later as an obscure shape error when building the index.
if not chinese_sentences or not english_sentences:
    raise ValueError('Both sentence files must contain at least one sentence.')

# Compute embeddings
chinese_embeddings = model.encode(chinese_sentences, convert_to_tensor=True)
english_embeddings = model.encode(english_sentences, convert_to_tensor=True)

# Use FAISS to find the most similar sentences.
# The index dimension is taken from the embeddings instead of being hard-coded,
# and tensors are moved to the CPU first because Tensor.numpy() raises for
# tensors that live on a GPU.
index = faiss.IndexFlatL2(chinese_embeddings.shape[1])
index.add(chinese_embeddings.detach().cpu().numpy())

# Search for top 1 most similar sentences
D, I = index.search(english_embeddings.detach().cpu().numpy(), 1)

# Save results to a CSV file.
# NOTE: IndexFlatL2 reports squared Euclidean distances, so in the
# 'Similarity Score' column a LOWER value means a CLOSER match.
matched_sentences = []
for english_sentence, distances, neighbours in zip(english_sentences, D, I):
    matched_sentences.append({
        'English Sentence': english_sentence,
        'Chinese Sentence': chinese_sentences[neighbours[0]],
        'Similarity Score': distances[0]
    })

df = pd.DataFrame(
    matched_sentences,
    columns=['English Sentence', 'Chinese Sentence', 'Similarity Score'],
)
df.to_csv('matched_sentences.csv', index=False)


KeyboardInterrupt: 

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import faiss
import spacy

# Function to read and segment sentences from a text file
def read_and_segment_sentences(file_path, lang_model):
    """Read a text file and split its content into sentences with spaCy.

    The whole file is read as one string, run through the spaCy pipeline named
    by ``lang_model``, and the resulting sentence spans are returned as
    stripped strings. Spans that are empty after stripping (for example spans
    consisting only of line breaks) are dropped.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.
        lang_model: Name of the spaCy pipeline to load, e.g. 'en_core_web_sm'.

    Returns:
        A list of non-empty sentence strings in document order.

    Raises:
        OSError: If the file or the spaCy pipeline cannot be loaded.
    """
    nlp = spacy.load(lang_model)
    # 'utf-8-sig' also accepts plain UTF-8 and strips a leading byte-order mark.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        text = file.read()
    # Parse after the file is closed; the handle is not needed while spaCy runs.
    doc = nlp(text)
    stripped_sentences = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped_sentences if sentence]

# Load the LaBSE model
model = SentenceTransformer('LaBSE')

# Read and segment sentences from your files
chinese_sentences = read_and_segment_sentences('Santi03_Part2.txt', 'zh_core_web_sm')
english_sentences = read_and_segment_sentences('ThreeBody03_Part2.txt', 'en_core_web_sm')

# Fail early with a clear message: an empty corpus would otherwise surface
# later as an obscure shape error when building the index.
if not chinese_sentences or not english_sentences:
    raise ValueError('Both sentence files must contain at least one sentence.')

# Compute embeddings
chinese_embeddings = model.encode(chinese_sentences, convert_to_tensor=True)
english_embeddings = model.encode(english_sentences, convert_to_tensor=True)

# Use FAISS to find the most similar sentences.
# The index dimension is taken from the embeddings instead of being hard-coded,
# and tensors are moved to the CPU first because Tensor.numpy() raises for
# tensors that live on a GPU.
index = faiss.IndexFlatL2(chinese_embeddings.shape[1])
index.add(chinese_embeddings.detach().cpu().numpy())

# Search for top N most similar sentences (never ask for more neighbours than
# the index holds; FAISS pads missing results with label -1).
top_k = min(5, len(chinese_sentences))  # Top 5 matches for more options
D, I = index.search(english_embeddings.detach().cpu().numpy(), top_k)

# Upper bound on the squared L2 distance reported by IndexFlatL2: a candidate
# is kept only when its distance is BELOW this value (lower = closer match).
threshold = 0.75
# Save results to a DataFrame
matched_sentences = []
for english_sentence, distances, neighbours in zip(english_sentences, D, I):
    for distance, neighbour in zip(distances, neighbours):
        # neighbour < 0 marks a padded "no result" slot, not a real sentence.
        if neighbour >= 0 and distance < threshold:
            matched_sentences.append({
                'English Sentence': english_sentence,
                'Chinese Sentence': chinese_sentences[neighbour],
                'Similarity Score': distance
            })

# Explicit columns keep the CSV header intact even when nothing passed the
# threshold and the list of matches is empty.
df = pd.DataFrame(
    matched_sentences,
    columns=['English Sentence', 'Chinese Sentence', 'Similarity Score'],
)
df.to_csv('matched_sentences.csv', index=False)
