In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr
import torch
import pandas as pd


In [5]:
# Load your fine-tuned model and tokenizer
local_model_path = "final_model_dir"
# model_name = "your-finetuned-model-name"
model = BertForSequenceClassification.from_pretrained(local_model_path)
tokenizer = BertTokenizer.from_pretrained(local_model_path)

# Set the model to evaluation mode
model.eval()

# Check for available GPU or use CPU
# Check for MPS and CUDA availability
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("MPS is available")
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print("CUDA is available")
else:
    device = torch.device('cpu')
    print("Using CPU")
model.to(device)

MPS is available


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:

# Load your STS dataset
# Assuming dataset is a CSV with columns: sid, score, sentence1, sentence2
df = pd.read_csv("data/heb_sts_test.csv")

# Prepare the data
sentence_pairs = list(zip(df['sentence1'], df['sentence2']))
true_scores = df['score'].tolist()


In [7]:
# Function to predict similarity scores
# def predict_similarity_old(sentence_pairs):
#     predicted_scores = []
#     for sent1, sent2 in sentence_pairs:
#         # Tokenize the input sentences
#         inputs = tokenizer(sent1, sent2, return_tensors='pt', padding=True, truncation=True).to(device)
        
#         # Get the model predictions
#         with torch.no_grad():
#             outputs = model(**inputs)
#             logits = outputs.logits

#         # Convert logits to similarity scores (assuming regression on a continuous scale, e.g., 0-5)
#         score = logits.item()
#         predicted_scores.append(score)
    
#     return predicted_scores

def predict_similarity(sentence_pairs):
    predicted_scores = []
    for sent1, sent2 in sentence_pairs:
        # Tokenize the input sentences
        inputs = tokenizer(sent1, sent2, return_tensors='pt', padding=True, truncation=True).to(device)
        
        # Get the model predictions
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits  # Shape: [batch_size, num_labels]

        # For a regression task, we assume num_labels should be 1.
        # Ensure the logits tensor is converted to a single scalar value.
        print(logits)
        score = logits.squeeze().item()  # Convert logits to scalar if logits has more than one element
        predicted_scores.append(score)
    
    return predicted_scores

In [9]:
# Untuned model


import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, spearmanr
import pandas as pd
from tqdm import tqdm

# Custom model with a different loss function
class CustomModel(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        
        return {'loss': loss, 'logits': logits, 'hidden_states': outputs.last_hidden_state}

# Function to encode sentences and get embeddings
def get_sentence_embedding(model, tokenizer, sentence, device):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs['hidden_states']
        sentence_embedding = hidden_states.mean(dim=1).squeeze()

    return sentence_embedding.cpu().numpy()

# Load Hebrew STS dataset for evaluation
sts_file_path = "data/heb_sts_test.csv"
sts_data = pd.read_csv(sts_file_path)

# Assuming model and tokenizer are already defined and loaded
model_name = 'dicta-il/dictabert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = CustomModel(model_name, num_labels=3)  # Assuming 3 labels for your classification task

# Move model to CPU for inference
model.to('mps')
model.eval()

# Evaluate the model using STS dataset
predicted_scores = []
actual_scores = sts_data['score'].tolist()

for index, row in tqdm(sts_data.iterrows(), total=len(sts_data), desc="Processing Sentences"):
    sentence1 = row['sentence1']
    sentence2 = row['sentence2']
    
    emb1 = get_sentence_embedding(model, tokenizer, sentence1, 'mps')
    emb2 = get_sentence_embedding(model, tokenizer, sentence2, 'mps')
    
    # Calculate cosine similarity and scale to 0-5
    similarity = 1 - cosine(emb1, emb2)
    predicted_scores.append(similarity * 5)  # Scale cosine similarity to 0-5 range
            
# Compute Pearson and Spearman correlations
pearson_correlation, _ = pearsonr(predicted_scores, actual_scores)
spearman_correlation, _ = spearmanr(predicted_scores, actual_scores)

print(f'Pearson correlation: {pearson_correlation}')
print(f'Spearman correlation: {spearman_correlation}')


Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Sentences: 100%|██████████| 1379/1379 [00:44<00:00, 30.65it/s]

Pearson correlation: 0.6642353544257372
Spearman correlation: 0.6584267627560283





In [10]:
# Finetuned model

# Custom model with a different loss function
class CustomModel(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        
        return {'loss': loss, 'logits': logits, 'hidden_states': outputs.last_hidden_state}

# Function to encode sentences and get embeddings
def get_sentence_embedding(model, tokenizer, sentence, device):
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs['hidden_states']
        sentence_embedding = hidden_states.mean(dim=1).squeeze()

    return sentence_embedding.cpu().numpy()

# Load Hebrew STS dataset for evaluation
sts_file_path = "data/heb_sts_test.csv"
sts_data = pd.read_csv(sts_file_path)

# Assuming model and tokenizer are already defined and loaded
# model_name = 'dicta-il/dictabert'

local_model_path = "final_model_dir"
model = BertForSequenceClassification.from_pretrained(local_model_path)

tokenizer = AutoTokenizer.from_pretrained(local_model_path)
model = CustomModel(local_model_path, num_labels=3)  # Assuming 3 labels for your classification task

# Move model to CPU for inference
model.to('mps')
model.eval()

# Evaluate the model using STS dataset
predicted_scores = []
actual_scores = sts_data['score'].tolist()

for index, row in tqdm(sts_data.iterrows(), total=len(sts_data), desc="Processing Sentences"):
    sentence1 = row['sentence1']
    sentence2 = row['sentence2']
    
    emb1 = get_sentence_embedding(model, tokenizer, sentence1, 'mps')
    emb2 = get_sentence_embedding(model, tokenizer, sentence2, 'mps')
    
    # Calculate cosine similarity and scale to 0-5
    similarity = 1 - cosine(emb1, emb2)
    predicted_scores.append(similarity * 5)  # Scale cosine similarity to 0-5 range
            
# Compute Pearson and Spearman correlations
pearson_correlation, _ = pearsonr(predicted_scores, actual_scores)
spearman_correlation, _ = spearmanr(predicted_scores, actual_scores)

print(f'Pearson correlation: {pearson_correlation}')
print(f'Spearman correlation: {spearman_correlation}')


Processing Sentences: 100%|██████████| 1379/1379 [00:39<00:00, 35.02it/s]

Pearson correlation: 0.5610494443044348
Spearman correlation: 0.578294553036155





In [32]:
# Predict similarity scores for the dataset
predicted_scores = predict_similarity(sentence_pairs)


tensor([[ 2.5696, -1.1135, -1.2360]], device='mps:0')


RuntimeError: a Tensor with 3 elements cannot be converted to Scalar

In [None]:

# Compute evaluation metrics
pearson_corr, _ = pearsonr(true_scores, predicted_scores)
spearman_corr, _ = spearmanr(true_scores, predicted_scores)
mse = mean_squared_error(true_scores, predicted_scores)
mae = mean_absolute_error(true_scores, predicted_scores)

# Print the evaluation metrics
print(f"Pearson Correlation Coefficient: {pearson_corr:.4f}")
print(f"Spearman Rank Correlation Coefficient: {spearman_corr:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

In [51]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F
from scipy.spatial.distance import cosine

# Load the pre-trained BERT model and tokenizer
model_name = "final_model_dir"
# model_name = 'bert-base-uncased'  # or your specific model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Set the model to evaluation mode
model.eval()

# Check for GPU or MPS (Apple Silicon)
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

# Example sentences
# sentence1 = "A woman is styling her hair."
# sentence2 = "A girl is brushing her hair."
# sentence1 = "אישה מעצבת את שיערה"
# sentence2 = "ילדה מברישה את שיערה"
# 0033,3.6,קבוצת גברים משחקת כדורגל על ​​החוף.,קבוצת נערים משחקת כדורגל על ​​החוף.
# 0045,5,אישה אחת מודדת קרסול של אישה אחרת.,אישה מודדת קרסול של אישה אחרת.
sentence1 = "גבר פורס עגבנייה"
sentence2 = "גבר פורס לחמנייה"

# Function to compute sentence embeddings
def get_sentence_embedding(sentence, model, tokenizer, device):
    # Tokenize input sentence
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True).to(device)

    # Extract hidden states from the model
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state  # Shape: [batch_size, sequence_length, hidden_size]

    # Get the mean of the token embeddings to form a sentence embedding
    # Optionally, you can use other methods like CLS token ([0, 0, :])
    sentence_embedding = torch.mean(hidden_states, dim=1).squeeze().cpu().numpy()
    
    return sentence_embedding

# Get embeddings for both sentences
embedding1 = get_sentence_embedding(sentence1, model, tokenizer, device)
embedding2 = get_sentence_embedding(sentence2, model, tokenizer, device)

# Compute cosine similarity between embeddings
cosine_sim = 1 - cosine(embedding1, embedding2)
print(f"Cosine Similarity between sentences: {cosine_sim:.4f}")


Cosine Similarity between sentences: 0.8723


In [34]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error

# Load your dataset
df = pd.read_csv("data/heb_sts_test.csv")

# Load the pre-trained BERT model and tokenizer
model_name = "final_model_dir"
# model_name = 'bert-base-uncased'  # or any other pre-trained model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Set the model to evaluation mode
model.eval()

# Check for GPU or MPS (Apple Silicon)
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

# Function to compute sentence embeddings
def get_sentence_embedding(sentence, model, tokenizer, device):
    # Tokenize input sentence
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True).to(device)

    # Extract hidden states from the model
    with torch.no_grad():
        outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state  # Shape: [batch_size, sequence_length, hidden_size]

    # Get the mean of the token embeddings to form a sentence embedding
    sentence_embedding = torch.mean(hidden_states, dim=1).squeeze().cpu().numpy()
    
    return sentence_embedding

# Function to compute predicted similarity scores for all sentence pairs in the dataset
def predict_similarity(df):
    predicted_scores = []
    
    for index, row in df.iterrows():
        sent1 = row['sentence1']
        sent2 = row['sentence2']
        
        # Get embeddings for both sentences
        embedding1 = get_sentence_embedding(sent1, model, tokenizer, device)
        embedding2 = get_sentence_embedding(sent2, model, tokenizer, device)

        # Compute cosine similarity between embeddings
        cosine_sim = 1 - cosine(embedding1, embedding2)
        predicted_scores.append(cosine_sim)
    
    return predicted_scores

# Get predicted similarity scores for the dataset
predicted_scores = predict_similarity(df)

# Evaluate the model performance using Pearson correlation, Spearman correlation, and Mean Squared Error
actual_scores = df['score'].values

pearson_corr, _ = pearsonr(actual_scores, predicted_scores)
spearman_corr, _ = spearmanr(actual_scores, predicted_scores)
mse = mean_squared_error(actual_scores, predicted_scores)

print(f"Pearson Correlation: {pearson_corr:.4f}")
print(f"Spearman Correlation: {spearman_corr:.4f}")
print(f"Mean Squared Error: {mse:.4f}")


Pearson Correlation: 0.5610
Spearman Correlation: 0.5783
Mean Squared Error: 5.2739
