In [1]:

import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from tqdm import tqdm

In [2]:


# Constants
MODEL_NAME = 'final_model_dir'  # Model to use
BATCH_SIZE = 32  # Adjust batch size based on available memory
EMBEDDING_METHOD = 'mean_pooling'  # Choose from 'mean_pooling', 'cls_token', 'max_pooling'


In [3]:

# Function to load the dataset
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    print(f"Loaded dataset with {len(df)} records.")
    return df

# Function to preprocess dataset (optional: add more preprocessing steps if needed)
def preprocess_dataset(df):
    df.dropna(subset=['sentence1', 'sentence2', 'score'], inplace=True)
    df['normalized_score'] = df['score'] / 5.0
    return df

# Load and preprocess the dataset
file_path = "data/heb_sts_test.csv"  # Replace with your dataset path
df = load_dataset(file_path)
df = preprocess_dataset(df)

Loaded dataset with 1379 records.


In [4]:

# Function to initialize model and tokenizer
def initialize_model_and_tokenizer(model_name):
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    model.eval()  # Set model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)
    print(f"Model loaded on device: {device}")
    return model, tokenizer, device

# Initialize model and tokenizer
model, tokenizer, device = initialize_model_and_tokenizer(MODEL_NAME)


Model loaded on device: mps


In [7]:

# Function to compute sentence embeddings
def get_sentence_embeddings(sentences, model, tokenizer, device, method='mean_pooling'):
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state  # Shape: [batch_size, sequence_length, hidden_size]
    
    if method == 'mean_pooling':
        embeddings = torch.mean(hidden_states, dim=1)  # Mean pooling
    elif method == 'cls_token':
        embeddings = hidden_states[:, 0, :]  # CLS token
    elif method == 'max_pooling':
        embeddings, _ = torch.max(hidden_states, dim=1)  # Max pooling
    else:
        raise ValueError("Unsupported embedding method. Choose from 'mean_pooling', 'cls_token', 'max_pooling'.")
    
    return embeddings.cpu().numpy()

# Function to compute cosine similarities between sentence embeddings
def compute_cosine_similarity(embeddings1, embeddings2):
    similarities = [1 - cosine(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]
    return np.array(similarities)

# Function to evaluate model performance using various metrics
def evaluate_model_performance(actual_scores, predicted_scores):
    pearson_corr, _ = pearsonr(actual_scores, predicted_scores)
    spearman_corr, _ = spearmanr(actual_scores, predicted_scores)
    mse = mean_squared_error(actual_scores, predicted_scores)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual_scores, predicted_scores)
    r2 = r2_score(actual_scores, predicted_scores)
    
    print(f"Evaluation Metrics:")
    print(f"Pearson Correlation: {pearson_corr:.4f}")
    print(f"Spearman Correlation: {spearman_corr:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R^2 Score: {r2:.4f}")

# Function to process the dataset in batches for efficiency
def predict_similarity(df, model, tokenizer, device, batch_size=32, method='mean_pooling'):
    predicted_scores = []
    n = len(df)

    for i in tqdm(range(0, n, batch_size), desc="Processing Batches"):
        batch = df.iloc[i:i + batch_size]
        sentences1 = batch['sentence1'].tolist()
        sentences2 = batch['sentence2'].tolist()
        
        # Compute embeddings for both sentences in the batch
        embeddings1 = get_sentence_embeddings(sentences1, model, tokenizer, device, method)
        embeddings2 = get_sentence_embeddings(sentences2, model, tokenizer, device, method)
        
        # Compute cosine similarities for the batch
        batch_similarities = compute_cosine_similarity(embeddings1, embeddings2)
        predicted_scores.extend(batch_similarities)
    
    return np.array(predicted_scores)

# Predict similarity scores
predicted_scores = predict_similarity(df, model, tokenizer, device, BATCH_SIZE, EMBEDDING_METHOD)
# Evaluate model performance
actual_scores = df['normalized_score'].values  # Use normalized scores for evaluation
evaluate_model_performance(actual_scores, predicted_scores)
df['predicted_score'] = predicted_scores
df['absolute_error'] = np.abs(df['normalized_score'] - df['predicted_score'])

print("\nDetailed Results for Each Row:")
df[['sid', 'sentence1', 'sentence2', 'normalized_score', 'predicted_score', 'absolute_error']].head(20)



Processing Batches: 100%|██████████| 44/44 [00:05<00:00,  8.59it/s]

Evaluation Metrics:
Pearson Correlation: 0.5543
Spearman Correlation: 0.5651
Mean Squared Error (MSE): 0.1595
Root Mean Squared Error (RMSE): 0.3994
Mean Absolute Error (MAE): 0.3223
R^2 Score: -0.7151

Detailed Results for Each Row:





Unnamed: 0,sid,sentence1,sentence2,normalized_score,predicted_score,absolute_error
0,24,בחורה מעצבת את שיערה.,ילדה מצחצחת את שיערה.,0.50,0.783320,0.283320
1,33,קבוצת גברים משחקת כדורגל על ​​החוף.,קבוצת נערים משחקת כדורגל על ​​החוף.,0.72,0.956257,0.236257
2,45,אישה אחת מודדת קרסול של אישה אחרת.,אישה מודדת קרסול של אישה אחרת.,1.00,0.976438,0.023562
3,63,אדם חותך מלפפון.,גבר פורס מלפפון.,0.84,0.911046,0.071046
4,66,אדם מנגן בנבל.,גבר מנגן על מקלדת.,0.30,0.799576,0.499576
...,...,...,...,...,...,...
1374,1354,"הפיליפינים, קנדה מתחייבים להגביר עוד יותר את ה...",הפיליפינים חוסכים 100 לאחר טביעת המעבורת,0.00,0.405733,0.405733
1375,1360,ישראל אוסרת על פלסטינים את העיר העתיקה בירושלים,"פתרון שתי מדינות בין פלסטינים, ישראל עוגה בשמיים",0.20,0.668643,0.468643
1376,1368,כמה אתה יודע על השירות החשאי?,מחוקקים משני הצדדים מביעים זעם על השירות החשאי,0.20,0.729462,0.529462
1377,1420,אובמה נאבק להרגיע את פחדי סעודיה עם חידוש השיח...,מיאנמר נאבקת לסיים את רשימות המצביעים לסקרים ש...,0.00,0.769087,0.769087


In [9]:
error_threshold = 0.4  # Example threshold value

# Filter rows with absolute difference greater than the threshold
significant_errors = df[df['absolute_error'] > error_threshold]

print("\nRows with Absolute Error Greater Than Threshold ({}):".format(error_threshold))
significant_errors[['sid', 'sentence1', 'sentence2', 'normalized_score', 'predicted_score', 'absolute_error']].head(20)


Rows with Absolute Error Greater Than Threshold (0.4):


Unnamed: 0,sid,sentence1,sentence2,normalized_score,predicted_score,absolute_error
4,66,אדם מנגן בנבל.,גבר מנגן על מקלדת.,0.3,0.799576,0.499576
5,74,אישה חותכת בצל.,אישה חותכת טופו.,0.36,0.857997,0.497997
7,82,גבר מנגן בתופים.,גבר מנגן בגיטרה.,0.44,0.855109,0.415109
9,95,גבר מנגן בגיטרה.,אדם מנגן בחצוצרה.,0.3428,0.818387,0.475587
10,96,גבר מנגן בגיטרה.,אדם מנגן בחצוצרה.,0.3428,0.818387,0.475587
12,103,גבר רוכב על אופניים.,גבר מדבר.,0.12,0.76259,0.64259
14,124,גבר פורס עגבנייה.,גבר פורס לחמנייה.,0.4,0.861327,0.461327
15,127,גבר מנגן בגיטרה.,גבר מנגן על מקלדת.,0.36,0.849972,0.489972
19,139,גבר נוהג במכונית.,אדם רוכב על סוס.,0.24,0.765133,0.525133
21,150,האישה מעצבת את שיערה.,האישה פורסת עשבי תיבול.,0.04,0.748851,0.708851


In [13]:
low_scores = df[df['predicted_score']<0.5]
# low_scores[['sid', 'sentence1', 'sentence2', 'normalized_score', 'predicted_score', 'absolute_error']].head(20)
low_scores.count()

sid                 6
score               6
sentence1           6
sentence2           6
normalized_score    6
predicted_score     6
absolute_error      6
dtype: int64