In [12]:
import os
import pandas as pd

def process_files(input_files, gold_files, output_file):
    sentences1, sentences2, gold_labels = [], [], []

    for input_file, gold_file in zip(input_files, gold_files):
        print(f"Processing: {input_file} | {gold_file}")
        if not os.path.exists(input_file) or not os.path.exists(gold_file):
            raise FileNotFoundError(f"File not found: {input_file} or {gold_file}")

        with open(input_file, "r", encoding="utf-8") as infile, open(gold_file, "r", encoding="utf-8") as goldfile:
            input_lines, gold_lines = infile.readlines(), goldfile.readlines()
            min_length = min(len(input_lines), len(gold_lines))

            for line in input_lines[:min_length]:
                fields = line.strip().split("\t")
                if len(fields) >= 2:
                    sentences1.append(fields[0])
                    sentences2.append(fields[1])

            for line in gold_lines[:min_length]:
                try:
                    gold_labels.append(float(line.strip()))
                except ValueError:
                    pass  # Skip invalid data

    min_length = min(len(sentences1), len(sentences2), len(gold_labels))
    data = pd.DataFrame({
        "sentence1": sentences1[:min_length],
        "sentence2": sentences2[:min_length],
        "gold_label": gold_labels[:min_length]
    })

    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    data.to_csv(output_file, index=False, encoding="utf-8")
    
    print(f"CSV file successfully generated: {output_file}")
    print(f"Total sentence pairs processed: {len(sentences1)}")
    print(f"Dataset shape: {data.shape}")

if __name__ == "__main__":
    base_dir = "STS_Data"
    input_files = [
        os.path.join(base_dir, "STS2016.input.answer-answer.txt"),
        os.path.join(base_dir, "STS2016.input.headlines.txt"),
        os.path.join(base_dir, "STS2016.input.plagiarism.txt"),
        os.path.join(base_dir, "STS2016.input.postediting.txt"),
        os.path.join(base_dir, "STS2016.input.question-question.txt")
    ]
    gold_files = [
        os.path.join(base_dir, "STS2016.gs.answer-answer.txt"),
        os.path.join(base_dir, "STS2016.gs.headlines.txt"),
        os.path.join(base_dir, "STS2016.gs.plagiarism.txt"),
        os.path.join(base_dir, "STS2016.gs.postediting.txt"),
        os.path.join(base_dir, "STS2016.gs.question-question.txt")
    ]
    output_file = os.path.join("output", "STS_test_data.csv")

    process_files(input_files, gold_files, output_file)


Processing: STS_Data\STS2016.input.answer-answer.txt | STS_Data\STS2016.gs.answer-answer.txt
Processing: STS_Data\STS2016.input.headlines.txt | STS_Data\STS2016.gs.headlines.txt
Processing: STS_Data\STS2016.input.plagiarism.txt | STS_Data\STS2016.gs.plagiarism.txt
Processing: STS_Data\STS2016.input.postediting.txt | STS_Data\STS2016.gs.postediting.txt
Processing: STS_Data\STS2016.input.question-question.txt | STS_Data\STS2016.gs.question-question.txt
CSV file successfully generated: output\STS_test_data.csv
Total sentence pairs processed: 9183
Dataset shape: (1186, 3)


In [None]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from scipy.stats import pearsonr

def compute_similarity(sts_file, output_file):
    if not os.path.exists(sts_file):
        raise FileNotFoundError(f"STS test data file not found: {sts_file}")

    sts_data = pd.read_csv(sts_file)

    if not all(col in sts_data.columns for col in ['sentence1', 'sentence2', 'gold_label']):
        raise ValueError("Missing required columns in STS dataset.")

    models = {
        "stsb-roberta-large": SentenceTransformer('stsb-roberta-large'),
        "paraphrase-MiniLM-L6-v2": SentenceTransformer('paraphrase-MiniLM-L6-v2'),
        "bert-base-nli-mean-tokens": SentenceTransformer('bert-base-nli-mean-tokens'),
        "all-mpnet-base-v2": SentenceTransformer('all-mpnet-base-v2'),
    }

    results = {}

    for model_name, model in models.items():
        print(f"Using model: {model_name}")
        similarities = []

        for s1, s2 in zip(sts_data['sentence1'], sts_data['sentence2']):
            embedding1 = model.encode(s1, convert_to_tensor=True)
            embedding2 = model.encode(s2, convert_to_tensor=True)
            sim = util.cos_sim(embedding1, embedding2).item()
            similarities.append(sim)

        min_sim, max_sim = min(similarities), max(similarities)
        normalized_similarities = [(sim - min_sim) / (max_sim - min_sim) * 5 for sim in similarities]

        col_name = f'normalized_similarity_{model_name}'
        sts_data[col_name] = normalized_similarities

        pearson_corr, _ = pearsonr(normalized_similarities, sts_data['gold_label'])
        results[model_name] = pearson_corr
        print(f"Model: {model_name}, Pearson Correlation: {pearson_corr:.4f}")

    os.makedirs("output", exist_ok=True)
    sts_data.to_csv(output_file, index=False, encoding="utf-8")
    print(f"\nResults saved to: {output_file}")

    print("\nFinal Model Performance:")
    for model_name, pearson_corr in results.items():
        print(f" Model: {model_name}, Pearson Correlation: {pearson_corr:.4f}")

if __name__ == "__main__":
    sts_file = "output/STS_test_data.csv"
    output_file = "output/STS_results_with_normalized_predictions.csv"

    compute_similarity(sts_file, output_file)


Using model: stsb-roberta-large
Model: stsb-roberta-large, Pearson Correlation: -0.0574
Using model: paraphrase-MiniLM-L6-v2
Model: paraphrase-MiniLM-L6-v2, Pearson Correlation: -0.0244
Using model: bert-base-nli-mean-tokens
Model: bert-base-nli-mean-tokens, Pearson Correlation: -0.0435
Using model: all-mpnet-base-v2
