In [2]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [3]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

# Instantiate the pre-trained sentence encoder; it runs on CUDA when available, otherwise on the CPU
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device='cpu' if not torch.cuda.is_available() else 'cuda',
)

# Read the spreadsheet that holds the questions and their table descriptions
file_path = '/content/drive/MyDrive/ALY6080/FINAL_folder/updated_ques_table_info_with_similarity.xlsx'
df = pd.read_excel(file_path)

# Embed every Table_info value once, up front, so each question can later be
# compared against the whole set without re-encoding it
table_infos = df.loc[:, 'Table_info'].to_list()
table_info_embeddings = model.encode(
    table_infos,
    convert_to_tensor=True,
)

# Rank all Table_info entries against a question and score the question's own Table_info
def find_top3_similar_tables(question, actual_table_info, table_info_embeddings, table_infos, top_k=3):
    """Return the question's score against its own table info plus the best-matching entries.

    Relies on the module-level ``model`` (a SentenceTransformer) to embed the
    question and ``actual_table_info``.

    Args:
        question: Question text to embed and compare.
        actual_table_info: The table description associated with the question.
        table_info_embeddings: Pre-computed embeddings of ``table_infos``
            (assumed to be aligned with it row-for-row).
        table_infos: The table descriptions the embeddings were computed from.
        top_k: Maximum number of matches to return. Defaults to 3, which
            preserves the original behaviour.

    Returns:
        A tuple ``(actual_table_info_score, top_table_infos, top_scores)`` where
        the score is a Python float and the two lists are ordered from most to
        least similar. The lists hold ``min(top_k, len(table_infos))`` items, so
        they are shorter than ``top_k`` (possibly empty) for a small corpus.
    """
    question_embedding = model.encode(question, convert_to_tensor=True)
    actual_table_info_embedding = model.encode(actual_table_info, convert_to_tensor=True)

    # Cosine similarity between the question and its own table info
    actual_table_info_score = util.pytorch_cos_sim(question_embedding, actual_table_info_embedding)[0].item()

    # torch.topk raises if k exceeds the number of candidates, so clamp it
    k = min(top_k, len(table_infos))
    if k <= 0:
        return actual_table_info_score, [], []

    # Cosine similarities between the question and every table info
    cosine_scores = util.pytorch_cos_sim(question_embedding, table_info_embeddings)[0]

    # Highest-scoring entries; topk already returns both the scores and their positions
    top_results = torch.topk(cosine_scores, k=k)

    # Convert once to plain Python values instead of indexing the tensor per element
    top_indices = top_results.indices.tolist()
    top_table_infos = [table_infos[i] for i in top_indices]
    top_scores = top_results.values.tolist()

    return actual_table_info_score, top_table_infos, top_scores

# One record per spreadsheet row is accumulated here
results = []

# For every (question, table info) pair, score the pair itself and collect the
# three table infos that rank highest for the question
for question, actual_table_info in zip(df['Human Questions'], df['Table_info']):
    own_score, ranked_infos, ranked_scores = find_top3_similar_tables(
        question,
        actual_table_info,
        table_info_embeddings,
        table_infos,
    )

    record = {
        "Question": question,
        "Actual Table Info": actual_table_info,
        "Actual Table Info Score": own_score,
        "Top 1 Similar Table Info": ranked_infos[0],
        "Top 1 Score": ranked_scores[0],
        "Top 2 Similar Table Info": ranked_infos[1],
        "Top 2 Score": ranked_scores[1],
        "Top 3 Similar Table Info": ranked_infos[2],
        "Top 3 Score": ranked_scores[2]
    }
    results.append(record)

# Tabulate the collected records
results_df = pd.DataFrame(results)

# Write the table to a new Excel workbook (row index omitted)
output_file_path = '/content/drive/MyDrive/ALY6080/FINAL_folder/semantic_search_results_with_actual_score.xlsx'
results_df.to_excel(output_file_path, index=False)

print("Results saved to:", output_file_path)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Results saved to: /content/drive/MyDrive/ALY6080/FINAL_folder/semantic_search_results_with_actual_score.xlsx
