<a href="https://colab.research.google.com/github/ipassynk/dating-match-fine-tuning/blob/main/eval/baseline_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence_transformers
!pip install sklearn
!pip install umap
!pip install hdbscan
!pip install matplotlib
!pip install plotly
!pip install scipy

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np
import pandas as pd
from scipy import stats

In [None]:
def _strip_speaker_prefix(text):
    """Return *text* without its leading ``"<label>: "`` prefix.

    Only the first ``": "`` is treated as the separator, so any later colons
    that belong to the sentence itself are preserved. Text that contains no
    separator is returned unchanged.
    """
    _, separator, remainder = text.partition(': ')
    return remainder if separator else text


def read_file(file_name):
    """Load labelled sentence pairs from a JSON Lines file.

    Every non-blank line must be a JSON object with the keys ``text_1``,
    ``text_2`` and ``label``. The part of ``text_1`` / ``text_2`` before the
    first ``": "`` is dropped and the remainder is kept as the sentence.

    Args:
        file_name: Path of the ``.jsonl`` file to read.

    Returns:
        A ``(positives, negatives)`` tuple of DataFrames, each with the
        columns ``sentence1`` and ``sentence2``. Rows whose ``label`` equals
        ``1`` go to ``positives``; every other row goes to ``negatives``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    columns = ['sentence1', 'sentence2']
    positives = []
    negatives = []

    with open(file_name, 'r', encoding='utf-8') as data:
        # Stream the file line by line instead of loading it all at once.
        for line in data:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            item = json.loads(line)
            el = {
                'sentence1': _strip_speaker_prefix(item['text_1']),
                'sentence2': _strip_speaker_prefix(item['text_2']),
            }
            if item['label'] == 1:
                positives.append(el)
            else:
                negatives.append(el)

    # Passing the columns explicitly keeps them present even for an empty group.
    return (
        pd.DataFrame(positives, columns=columns),
        pd.DataFrame(negatives, columns=columns),
    )

def _paired_cosine(first, second):
    """Return the cosine similarity of each row of *first* with the same row of *second*.

    Only the row-aligned pairs are computed, which avoids building the full
    all-against-all similarity matrix just to read its diagonal.
    """
    first = np.asarray(first, dtype=float)
    second = np.asarray(second, dtype=float)
    norms = np.linalg.norm(first, axis=1) * np.linalg.norm(second, axis=1)
    # A zero vector has no direction: report similarity 0 rather than divide by zero.
    norms[norms == 0] = 1.0
    return np.sum(first * second, axis=1) / norms


def _pair_similarities(model, pairs):
    """Encode both sentence columns of *pairs* and return one similarity per row."""
    embeddings1 = model.encode(
        pairs['sentence1'].tolist(), convert_to_numpy=True, show_progress_bar=True
    )
    embeddings2 = model.encode(
        pairs['sentence2'].tolist(), convert_to_numpy=True, show_progress_bar=True
    )
    return _paired_cosine(embeddings1, embeddings2)


def _cohens_d(a, b):
    """Return Cohen's d effect size for the difference in means of *a* and *b*.

    The denominator is the square root of the unweighted mean of the two
    sample variances (``ddof=1``). NOTE(review): this equals the classic
    pooled standard deviation only when both groups have the same size.
    """
    mean_diff = np.mean(a) - np.mean(b)
    pooled_std = np.sqrt((np.std(a, ddof=1) ** 2 + np.std(b, ddof=1) ** 2) / 2)
    return mean_diff / pooled_std


def _compatibility_margin(sim_positives, sim_negatives):
    """Return mean similarity of compatible pairs minus that of incompatible pairs."""
    return np.mean(sim_positives) - np.mean(sim_negatives)


def evaluate(model, type):
    """Measure how well *model* separates compatible from incompatible pairs.

    Reads the labelled pairs from ``dating_pairs.jsonl``, embeds both
    sentences of every pair with ``model.encode`` and compares the cosine
    similarities of the positive (label 1) and negative groups. The
    statistics are printed and written as JSON to ``<type>_stat.json``.

    Args:
        model: Object exposing ``encode(sentences, convert_to_numpy=...,
            show_progress_bar=...)`` that returns one embedding per sentence.
        type: Label for this run; used as the prefix of the output file name
            and in the printed header.

    Raises:
        ValueError: If the data file has no positive or no negative pairs.
    """
    positives, negatives = read_file("dating_pairs.jsonl")
    if positives.empty or negatives.empty:
        raise ValueError(
            "dating_pairs.jsonl must contain at least one positive and one negative pair"
        )
    print(positives.iloc[0])

    similarity_positives = _pair_similarities(model, positives)
    similarity_negatives = _pair_similarities(model, negatives)

    # Independent two-sample t-test on the similarity scores of the two groups.
    # scipy's default alternative is two-sided; a positive t-statistic means
    # the positive pairs have the higher mean similarity.
    t_stat, p_value = stats.ttest_ind(similarity_positives, similarity_negatives)
    print("t-statistic:", t_stat)
    print("p-value:", p_value)

    # Even if the difference is statistically significant, we also want to
    # know how big it is. That is what Cohen's d measures.
    cohen = _cohens_d(similarity_positives, similarity_negatives)
    print("Cohen's d:", cohen)

    # Compatibility margin: the difference between the average similarity of
    # compatible pairs and that of incompatible pairs.
    # Interpretation:
    #   Higher margin   -> better separation between compatible and
    #                      incompatible pairs (model performs well).
    #   Smaller margin  -> overlap between groups (model struggles to
    #                      distinguish them).
    #   Negative margin -> incompatible pairs are more similar than compatible
    #                      ones, which indicates an issue.
    compatibility_margin = _compatibility_margin(similarity_positives, similarity_negatives)
    print("Compatibility Margin:", compatibility_margin)

    stat = {
        # NOTE(review): 'magin' looks like a typo for 'margin'. The key is kept
        # unchanged because anything reading this JSON may depend on it.
        'magin': float(compatibility_margin),
        'p_value': float(p_value),
        "t-statistic": float(t_stat),
        'cohen': float(cohen)
    }
    # f-string so the run label is substituted into the file name; the context
    # manager makes sure the file is flushed and closed.
    with open(f"{type}_stat.json", "w") as stat_file:
        stat_file.write(json.dumps(stat))
    print(f"{type} STAT")
    print(stat)

In [None]:
# Baseline run: load the 'all-MiniLM-L6-v2' checkpoint by name and evaluate it
# under the run label 'base'.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
# The evaluation function defined in this notebook is `evaluate`; the name
# `eval_model` is not defined anywhere here and raised NameError.
evaluate(model, 'base')

In [None]:
# Second run: load the model stored at the local path 'model/dating_model'
# (presumably the fine-tuned checkpoint - confirm) and evaluate it under the
# run label 'fine'.
model_path = 'model/dating_model'
model = SentenceTransformer(model_path)
# The evaluation function defined in this notebook is `evaluate`; the name
# `eval_model` is not defined anywhere here and raised NameError.
evaluate(model, 'fine')