In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# (The example that listed every file under the input directory is no longer part of this cell; only the `os` import below remains.)

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.0.1


In [3]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import ast

# Run on the GPU when torch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the pretrained sentence encoder and place it on the selected device
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# Step 1: Load the evaluation sentences and the entity knowledge base
# The rest of the notebook reads the 'text', 'entities' and 'sentence_id' columns of this frame
input_df = pd.read_csv('/kaggle/input/business-json/Test (1).csv')
# The rest of the notebook reads the 'Entity_name' and 'Knowledge' columns of this frame
entity_df = pd.read_csv('/kaggle/input/business-json/Knowledge.csv')

# Function to extract entity name from the 'entities' column
def extract_entity_name(entities_str):
    """Return the 'mention' of the first entity encoded in ``entities_str``.

    Args:
        entities_str: String holding a Python-literal list of dicts, each
            expected to carry a 'mention' key.

    Returns:
        The 'mention' value of the first entity, or None when the value is
        missing (NaN/None/any non-string), blank, or an empty list.

    Raises:
        ValueError, SyntaxError: if a non-blank string is not a valid Python literal.
        KeyError: if the first entity has no 'mention' key.
    """
    # pandas represents empty CSV cells as float NaN, which ast.literal_eval
    # rejects; treat every non-string or blank value as "no entity".
    if not isinstance(entities_str, str) or not entities_str.strip():
        return None

    # literal_eval only evaluates Python literals, never arbitrary code
    entities_list = ast.literal_eval(entities_str)

    # Extract the 'mention' field from the first entity in the list
    return entities_list[0]['mention'] if entities_list else None

# Derive one entity name per row from the serialized value in the 'entities' column
input_df['entity_name'] = input_df['entities'].map(extract_entity_name)

# Show the frame, which now includes the new 'entity_name' column
print(input_df)

# Frame whose 'ASR_Output' column is encoded as the query sentences further down
asr_df = pd.read_csv("/kaggle/input/business-json/asr_out.csv")

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

      Unnamed: 0  sentence_id  \
0              0            1   
1              1            2   
2              2            3   
3              3            4   
4              4            5   
...          ...          ...   
2621        2621         2622   
2622        2622         2623   
2623        2623         2624   
2624        2624         2625   
2625        2625         2626   

                                                   text  \
0                    Carl's Jr. is a type of Restaurant   
1            Carl's Jr. has headquarters in Carpinteria   
2                   Carl's Jr. has the CEO Carl Karcher   
3              Carl's Jr. was established on 1941-01-01   
4          Carl's Jr. belongs to the fast food industry   
...                                                 ...   
2621                          Rolex is from Switzerland   
2622  Sumitomo Electric Industries has headquarters ...   
2623  Sumitomo Electric Industries was established o...   
2624  Sumitom

In [4]:
# Step 2: Embed the query sentences and the full knowledge strings on the selected device
input_embeddings = model.encode(
    asr_df['ASR_Output'].to_list(),
    convert_to_tensor=True,
    device=device,
)
entity_embeddings = model.encode(
    entity_df['Knowledge'].to_list(),
    convert_to_tensor=True,
    device=device,
)

Batches:   0%|          | 0/83 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [5]:
# Steps 3-5: rank every entity for each sentence by cosine similarity and report
# how often the annotated entity lands in the top 1 / 5 / 10.
# Uses whichever `input_embeddings` / `entity_embeddings` were computed most recently.
#
# NOTE(review): row i of `input_embeddings` (encoded from asr_df) is scored against
# row i of `input_df` purely by position - confirm both files share the same row order.
total_sentences = len(input_df)
if total_sentences == 0:
    # Fail early with a clear message instead of a ZeroDivisionError at the end
    raise ValueError("input_df is empty; there is nothing to evaluate")
if len(input_embeddings) < total_sentences:
    # Fail early instead of an IndexError part-way through the loop
    raise ValueError(
        f"input_embeddings has {len(input_embeddings)} rows but input_df has "
        f"{total_sentences}; the two inputs are not aligned"
    )

# Convert the embeddings to NumPy once; the loop used to convert the whole
# entity matrix again on every iteration.
sentence_vectors = input_embeddings.cpu().numpy()
entity_vectors = entity_embeddings.cpu().numpy()

# Plain arrays avoid rebuilding pandas objects inside the loop
entity_names = entity_df['Entity_name'].to_numpy()
actual_entity_names = input_df['entity_name'].to_numpy()
sentence_ids = input_df['sentence_id'].to_numpy()

# Initialize counters for ranking accuracy
top1_correct = 0
top5_correct = 0
top10_correct = 0

res_data = []

# Step 3: Calculate cosine similarity and evaluate rankings
for i in range(total_sentences):
    # Similarity of sentence i against every entity; the call stays per-row so the
    # scores are computed exactly as before
    similarity_scores = cosine_similarity(sentence_vectors[i:i + 1], entity_vectors).flatten()

    # Entity positions ordered from most to least similar
    ranked_entity_indices = similarity_scores.argsort()[::-1]
    ranked_entity_names = entity_names[ranked_entity_indices]

    # Annotated entity name for the current sentence
    actual_entity_name = actual_entity_names[i]

    res_data.append(
        {
            'sentence_id': sentence_ids[i],
            'linked_entity': ranked_entity_names[0],
            'actual_entity': actual_entity_name,
        }
    )

    # Step 4: Check if the actual entity is within the top 1, 5, and 10
    if actual_entity_name == ranked_entity_names[0]:
        top1_correct += 1
    if actual_entity_name in ranked_entity_names[:5]:
        top5_correct += 1
    if actual_entity_name in ranked_entity_names[:10]:
        top10_correct += 1

# Step 5: Calculate and print ranking accuracies
top1_accuracy = top1_correct / total_sentences * 100
top5_accuracy = top5_correct / total_sentences * 100
top10_accuracy = top10_correct / total_sentences * 100

print(f"Top-1 Accuracy: {top1_accuracy:.2f}%")
print(f"Top-5 Accuracy: {top5_accuracy:.2f}%")
print(f"Top-10 Accuracy: {top10_accuracy:.2f}%")

Top-1 Accuracy: 63.90%
Top-5 Accuracy: 74.68%
Top-10 Accuracy: 78.14%


In [6]:
# One row per evaluated sentence, built from the dicts collected in res_data
ner_df = pd.DataFrame(data=res_data)

In [7]:
# Persist the linking results to the working directory.
# NOTE(review): with the default index=True the row index is written as an extra
# unnamed first column - pass index=False if that column is not wanted.
ner_df.to_csv(path_or_buf="asr_entity_linked_results.csv")

In [8]:
# Step 2: Embed the query sentences on the selected device
input_embeddings = model.encode(
    asr_df['ASR_Output'].to_list(),
    convert_to_tensor=True,
    device=device,
)

# Keep only the leading 20% (by character count, rounded down) of each knowledge string
entity_df['Knowledge_20'] = [
    knowledge[:int(len(knowledge) * 0.2)] for knowledge in entity_df['Knowledge']
]

# Embed the truncated knowledge strings; this replaces the previous entity_embeddings
entity_embeddings = model.encode(
    entity_df['Knowledge_20'].to_list(),
    convert_to_tensor=True,
    device=device,
)


Batches:   0%|          | 0/83 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [9]:
# Steps 3-5: rank every entity for each sentence by cosine similarity and report
# how often the annotated entity lands in the top 1 / 5 / 10.
# Uses whichever `input_embeddings` / `entity_embeddings` were computed most recently.
#
# NOTE(review): row i of `input_embeddings` (encoded from asr_df) is scored against
# row i of `input_df` purely by position - confirm both files share the same row order.
total_sentences = len(input_df)
if total_sentences == 0:
    # Fail early with a clear message instead of a ZeroDivisionError at the end
    raise ValueError("input_df is empty; there is nothing to evaluate")
if len(input_embeddings) < total_sentences:
    # Fail early instead of an IndexError part-way through the loop
    raise ValueError(
        f"input_embeddings has {len(input_embeddings)} rows but input_df has "
        f"{total_sentences}; the two inputs are not aligned"
    )

# Convert the embeddings to NumPy once; the loop used to convert the whole
# entity matrix again on every iteration.
sentence_vectors = input_embeddings.cpu().numpy()
entity_vectors = entity_embeddings.cpu().numpy()

# Plain arrays avoid rebuilding pandas objects inside the loop
entity_names = entity_df['Entity_name'].to_numpy()
actual_entity_names = input_df['entity_name'].to_numpy()
sentence_ids = input_df['sentence_id'].to_numpy()

# Initialize counters for ranking accuracy
top1_correct = 0
top5_correct = 0
top10_correct = 0

res_data = []

# Step 3: Calculate cosine similarity and evaluate rankings
for i in range(total_sentences):
    # Similarity of sentence i against every entity; the call stays per-row so the
    # scores are computed exactly as before
    similarity_scores = cosine_similarity(sentence_vectors[i:i + 1], entity_vectors).flatten()

    # Entity positions ordered from most to least similar
    ranked_entity_indices = similarity_scores.argsort()[::-1]
    ranked_entity_names = entity_names[ranked_entity_indices]

    # Annotated entity name for the current sentence
    actual_entity_name = actual_entity_names[i]

    res_data.append(
        {
            'sentence_id': sentence_ids[i],
            'linked_entity': ranked_entity_names[0],
            'actual_entity': actual_entity_name,
        }
    )

    # Step 4: Check if the actual entity is within the top 1, 5, and 10
    if actual_entity_name == ranked_entity_names[0]:
        top1_correct += 1
    if actual_entity_name in ranked_entity_names[:5]:
        top5_correct += 1
    if actual_entity_name in ranked_entity_names[:10]:
        top10_correct += 1

# Step 5: Calculate and print ranking accuracies
top1_accuracy = top1_correct / total_sentences * 100
top5_accuracy = top5_correct / total_sentences * 100
top10_accuracy = top10_correct / total_sentences * 100

print(f"Top-1 Accuracy: {top1_accuracy:.2f}%")
print(f"Top-5 Accuracy: {top5_accuracy:.2f}%")
print(f"Top-10 Accuracy: {top10_accuracy:.2f}%")

Top-1 Accuracy: 58.04%
Top-5 Accuracy: 69.31%
Top-10 Accuracy: 73.72%


In [10]:
# One row per evaluated sentence, built from the dicts collected in res_data
ner_df = pd.DataFrame(data=res_data)

In [11]:
# Persist the linking results to the working directory.
# NOTE(review): with the default index=True the row index is written as an extra
# unnamed first column - pass index=False if that column is not wanted.
ner_df.to_csv(path_or_buf="asr_entity_linked_partial_knowledge_results.csv")

In [12]:
# Step 2: Embed the query sentences and, this time, only the entity names
# (the 'Entity_name' column rather than the knowledge strings) on the selected device
input_embeddings = model.encode(
    asr_df['ASR_Output'].to_list(),
    convert_to_tensor=True,
    device=device,
)
entity_embeddings = model.encode(
    entity_df['Entity_name'].to_list(),
    convert_to_tensor=True,
    device=device,
)

Batches:   0%|          | 0/83 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [13]:
# Steps 3-5: rank every entity for each sentence by cosine similarity and report
# how often the annotated entity lands in the top 1 / 5 / 10.
# Uses whichever `input_embeddings` / `entity_embeddings` were computed most recently.
#
# NOTE(review): row i of `input_embeddings` (encoded from asr_df) is scored against
# row i of `input_df` purely by position - confirm both files share the same row order.
total_sentences = len(input_df)
if total_sentences == 0:
    # Fail early with a clear message instead of a ZeroDivisionError at the end
    raise ValueError("input_df is empty; there is nothing to evaluate")
if len(input_embeddings) < total_sentences:
    # Fail early instead of an IndexError part-way through the loop
    raise ValueError(
        f"input_embeddings has {len(input_embeddings)} rows but input_df has "
        f"{total_sentences}; the two inputs are not aligned"
    )

# Convert the embeddings to NumPy once; the loop used to convert the whole
# entity matrix again on every iteration.
sentence_vectors = input_embeddings.cpu().numpy()
entity_vectors = entity_embeddings.cpu().numpy()

# Plain arrays avoid rebuilding pandas objects inside the loop
entity_names = entity_df['Entity_name'].to_numpy()
actual_entity_names = input_df['entity_name'].to_numpy()
sentence_ids = input_df['sentence_id'].to_numpy()

# Initialize counters for ranking accuracy
top1_correct = 0
top5_correct = 0
top10_correct = 0

res_data = []

# Step 3: Calculate cosine similarity and evaluate rankings
for i in range(total_sentences):
    # Similarity of sentence i against every entity; the call stays per-row so the
    # scores are computed exactly as before
    similarity_scores = cosine_similarity(sentence_vectors[i:i + 1], entity_vectors).flatten()

    # Entity positions ordered from most to least similar
    ranked_entity_indices = similarity_scores.argsort()[::-1]
    ranked_entity_names = entity_names[ranked_entity_indices]

    # Annotated entity name for the current sentence
    actual_entity_name = actual_entity_names[i]

    res_data.append(
        {
            'sentence_id': sentence_ids[i],
            'linked_entity': ranked_entity_names[0],
            'actual_entity': actual_entity_name,
        }
    )

    # Step 4: Check if the actual entity is within the top 1, 5, and 10
    if actual_entity_name == ranked_entity_names[0]:
        top1_correct += 1
    if actual_entity_name in ranked_entity_names[:5]:
        top5_correct += 1
    if actual_entity_name in ranked_entity_names[:10]:
        top10_correct += 1

# Step 5: Calculate and print ranking accuracies
top1_accuracy = top1_correct / total_sentences * 100
top5_accuracy = top5_correct / total_sentences * 100
top10_accuracy = top10_correct / total_sentences * 100

print(f"Top-1 Accuracy: {top1_accuracy:.2f}%")
print(f"Top-5 Accuracy: {top5_accuracy:.2f}%")
print(f"Top-10 Accuracy: {top10_accuracy:.2f}%")

Top-1 Accuracy: 56.05%
Top-5 Accuracy: 67.14%
Top-10 Accuracy: 71.25%


In [14]:
# One row per evaluated sentence, built from the dicts collected in res_data
ner_df = pd.DataFrame(data=res_data)

In [15]:
# Persist the linking results to the working directory.
# NOTE(review): with the default index=True the row index is written as an extra
# unnamed first column - pass index=False if that column is not wanted.
ner_df.to_csv(path_or_buf="asr_entity_linked_wo_knowledge_results.csv")