In [None]:
# Use the %pip magic so the package is installed into the running kernel's
# environment, and pin the version this notebook was last executed with.
%pip install sentence-transformers==3.0.1

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [None]:
from sentence_transformers import SentenceTransformer, util
from google.colab import drive

# Mount Google Drive at /content/drive; the input and output directories used
# later in this notebook all live under /content/drive/MyDrive.
# NOTE(review): force_remount=True presumably re-mounts even when Drive is
# already mounted - confirm against the Colab documentation.
drive.mount('/content/drive', force_remount=True)

  from tqdm.autonotebook import tqdm, trange


Mounted at /content/drive


In [None]:
import os
import torch
import re

In [None]:
def Open_file(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the handle on exit, so no explicit close()
    # call is needed afterwards.
    with open(filename, 'r', encoding='UTF-8') as file:
        return file.read()

def Save_sentences(simple_sen, english_sen, sentences_full_path, text_name, mode):
    """Write one sentence pair to a text file inside a directory.

    The pair is stored as two consecutive lines (``simple_sen`` first, then
    ``english_sen``) followed by one blank line.

    Args:
        simple_sen: First sentence of the pair.
        english_sen: Second sentence of the pair.
        sentences_full_path: Directory that holds the output file.
        text_name: Name of the output file inside that directory.
        mode: Mode passed to ``open`` (e.g. "w" to overwrite, "a" to append).

    Raises:
        OSError: If the output file cannot be opened.
    """
    new_sen_path = os.path.join(sentences_full_path, text_name)
    # The context manager closes the file on exit; a separate close() call
    # after the block is redundant.
    with open(new_sen_path, mode=mode, encoding='utf8') as file:
        file.write(simple_sen + '\n' + english_sen + '\n\n')

In [None]:
# A sentence is rejected when it contains any character outside this set.
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9,.'\"!?(): \-]+")
# A sentence is rejected when it mentions a .jpg or .png file.
_IMAGE_REFERENCE = re.compile(r"\.jpg|\.png")


def _is_usable_sentence(sentence):
    """Return True when a line is long enough and passes both regex filters."""
    return (
        len(sentence) > 3
        and _DISALLOWED_CHARS.search(sentence) is None
        and _IMAGE_REFERENCE.search(sentence) is None
    )


def _load_unique_sentences(directory_path):
    """Collect the distinct usable sentences from every file in a directory.

    Files are visited in sorted name order so the result is reproducible, and
    each sentence is kept at its first occurrence.
    """
    unique_sentences = []
    seen = set()  # O(1) membership test instead of scanning the list each time
    for file_name in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, file_name)
        # Skip sub-directories (e.g. notebook checkpoint folders); only
        # regular files can be read as text.
        if not os.path.isfile(file_path):
            continue
        for sentence in Open_file(file_path).split('\n'):
            if sentence in seen or not _is_usable_sentence(sentence):
                continue
            seen.add(sentence)
            unique_sentences.append(sentence)
    return unique_sentences


def main(simple_sentences_directory_path, english_sentences_directory_path, similar_pairs_dir_path,
         similarity_threshold=0.75, model_name='all-MiniLM-L6-v2'):
    """Find similar sentence pairs between two directories of text files.

    Every file in both directories is split into lines, the lines are filtered
    and de-duplicated, both sets are embedded with a SentenceTransformer
    model, and every (simple, english) pair whose cosine similarity is
    strictly greater than ``similarity_threshold`` is written to
    ``similar_pairs.txt`` in ``similar_pairs_dir_path``. Each pair is stored as
    two lines followed by a blank line.

    The output file is rewritten on every call, so it never carries pairs
    from an earlier run.

    Args:
        simple_sentences_directory_path: Directory with the Simple English
            text files.
        english_sentences_directory_path: Directory with the English text
            files.
        similar_pairs_dir_path: Existing directory that receives
            ``similar_pairs.txt``.
        similarity_threshold: Minimum (exclusive) cosine similarity for a
            pair to be saved.
        model_name: Name of the SentenceTransformer model to load.
    """
    # Load the pre-trained SBERT model
    model = SentenceTransformer(model_name)

    print("Start pre-processing the sentences")
    simple_sentences = _load_unique_sentences(simple_sentences_directory_path)
    print(f"There are {len(simple_sentences)} different Simple English sentences")
    english_sentences = _load_unique_sentences(english_sentences_directory_path)
    print(f"There are {len(english_sentences)} different English sentences")

    if simple_sentences and english_sentences:
        print("Start encoding the sentences using the model")
        with torch.no_grad():
            embeddings_1 = model.encode(simple_sentences, convert_to_tensor=True)
            embeddings_2 = model.encode(english_sentences, convert_to_tensor=True)
        # Row i holds the scores of simple sentence i against every English
        # sentence.
        cosine_scores = util.pytorch_cos_sim(embeddings_1, embeddings_2)
    else:
        # Nothing to embed on at least one side; skip the model calls and
        # fall through to write an empty result file.
        print("At least one sentence list is empty, so there are no pairs to compare")
        cosine_scores = []

    print("Finding pairs of sentences with similarity above the threshold")
    output_path = os.path.join(similar_pairs_dir_path, 'similar_pairs.txt')
    # Open the output once instead of re-opening it for every matching pair.
    with open(output_path, mode='w', encoding='utf8') as pairs_file:
        for i, score_row in enumerate(cosine_scores):
            # Vectorised threshold test per row; nonzero() yields the column
            # indices in ascending order, matching a plain nested loop.
            matching_columns = torch.nonzero(score_row > similarity_threshold).flatten().tolist()
            for j in matching_columns:
                pairs_file.write(simple_sentences[i] + '\n' + english_sentences[j] + '\n\n')

In [None]:
# Input directories (one per corpus) and the output directory on the mounted Drive.
simple_sentences_files_dir_path = '/content/drive/MyDrive/NLP_Proj/simple_sentences'
english_sentences_files_dir_path = '/content/drive/MyDrive/NLP_Proj/english_sentences'
similar_pairs_dir_path = '/content/drive/MyDrive/NLP_Proj/similar'

# Make sure the output directory is there before main() writes into it
# (no error is raised when it already exists).
os.makedirs(similar_pairs_dir_path, exist_ok=True)

main(
    simple_sentences_directory_path=simple_sentences_files_dir_path,
    english_sentences_directory_path=english_sentences_files_dir_path,
    similar_pairs_dir_path=similar_pairs_dir_path,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Start pre-processing the sentences
There are 10216 different Simple English sentences
There are 35680 different English sentences
Start encoding the sentences using the model
Finding pairs of sentences with similarity above the threshold
