In [41]:
import os
import pandas as pd

def process_files(input_files, gold_files, output_file):
    # Initialize lists
    sentences1 = []
    sentences2 = []
    gold_labels = []

    # Read sentence pairs and labels
    for input_file, gold_file in zip(input_files, gold_files):
        print(f"Processing Input: {input_file} | Gold: {gold_file}")
        if not os.path.exists(input_file) or not os.path.exists(gold_file):
            raise FileNotFoundError(f"File not found: {input_file} or {gold_file}")
        
        with open(input_file, "r", encoding="utf-8") as infile, open(gold_file, "r", encoding="utf-8") as goldfile:
            input_lines = infile.readlines()
            gold_lines = goldfile.readlines()

            # Ensure lengths match
            min_length = min(len(input_lines), len(gold_lines))
            input_lines = input_lines[:min_length]
            gold_lines = gold_lines[:min_length]

            # Process sentence pairs
            for line in input_lines:
                fields = line.strip().split("\t")
                if len(fields) >= 2:  # Ensure at least two fields per line
                    sentences1.append(fields[0])
                    sentences2.append(fields[1])
                else:
                    print(f"Skipped invalid line (sentence pair file): {line.strip()}")

            # Process labels
            for line in gold_lines:
                stripped_line = line.strip()
                if stripped_line:  # Ensure non-empty line
                    try:
                        gold_labels.append(float(stripped_line))
                    except ValueError:
                        print(f"Skipped invalid line (label file): {stripped_line}")
                else:
                    print("Skipped empty line (label file)")

    # Ensure data alignment
    min_length = min(len(sentences1), len(sentences2), len(gold_labels))
    sentences1 = sentences1[:min_length]
    sentences2 = sentences2[:min_length]
    gold_labels = gold_labels[:min_length]

    # Create DataFrame
    data = pd.DataFrame({
        "sentence1": sentences1,
        "sentence2": sentences2,
        "gold_label": gold_labels
    })

    # Save to CSV
    os.makedirs(os.path.dirname(output_file), exist_ok=True)  # Ensure output directory exists
    data.to_csv(output_file, index=False, encoding="utf-8")
    print(f"CSV file successfully generated: {output_file}")

if __name__ == "__main__":
    # Base directory containing the data files
    base_dir = "STS_Data"
    input_files = [
        os.path.join(base_dir, "STS2016.input.answer-answer.txt"),
        os.path.join(base_dir, "STS2016.input.headlines.txt"),
        os.path.join(base_dir, "STS2016.input.plagiarism.txt"),
        os.path.join(base_dir, "STS2016.input.postediting.txt"),
        os.path.join(base_dir, "STS2016.input.question-question.txt")
    ]
    gold_files = [
        os.path.join(base_dir, "STS2016.gs.answer-answer.txt"),
        os.path.join(base_dir, "STS2016.gs.headlines.txt"),
        os.path.join(base_dir, "STS2016.gs.plagiarism.txt"),
        os.path.join(base_dir, "STS2016.gs.postediting.txt"),
        os.path.join(base_dir, "STS2016.gs.question-question.txt")
    ]
    output_file = os.path.join("output", "STS_test_data.csv")

    process_files(input_files, gold_files, output_file)
#define and load four models ==========
    models = {
        "stsb-roberta-large (SBERT)": SentenceTransformer('stsb-roberta-large'),
        "paraphrase-MiniLM-L6-v2": SentenceTransformer('paraphrase-MiniLM-L6-v2'),
        "bert-base-nli-mean-tokens": SentenceTransformer('bert-base-nli-mean-tokens'),
        "all-mpnet-base-v2": SentenceTransformer('all-mpnet-base-v2'),
    }

    # Dictionary to store Pearson correlation for each model
    results = {}

    # ========== 3. Iterate over each model, compute similarity, and normalize ==========
    for model_name, model in models.items():
        print(f"\nUsing model: {model_name}")
        similarities = []
        
        # Encode sentence1 and sentence2, compute cosine similarity
        for s1, s2 in zip(sts_data['sentence1'], sts_data['sentence2']):
            embedding1 = model.encode(s1, convert_to_tensor=True)
            embedding2 = model.encode(s2, convert_to_tensor=True)
            sim = util.cos_sim(embedding1, embedding2).item()  # Convert to float
            similarities.append(sim)
        
        # Normalize similarity scores to the range [0, 5]
        min_sim = min(similarities)
        max_sim = max(similarities)
        normalized_similarities = [(sim - min_sim) / (max_sim - min_sim) * 5 for sim in similarities]
        
        # Add normalized results to the DataFrame
        col_name = f'normalized_similarity_{model_name}'
        sts_data[col_name] = normalized_similarities
        
        # Compute Pearson correlation
        pearson_corr, _ = pearsonr(normalized_similarities, sts_data['gold_label'])
        results[model_name] = pearson_corr
        
        print(f"Model: {model_name}, Pearson Correlation: {pearson_corr:.4f}")

    # ========== 4. Save results to CSV ==========
    os.makedirs(os.path.dirname(output_file), exist_ok=True)  # Ensure output directory exists
    sts_data.to_csv(output_file, index=False, encoding="utf-8")
    print(f"\nPredicted results have been saved to: {output_file}")

    # ========== 5. Print final results ==========
    print("\nFinal model performance:")
    for model_name, pearson_corr in results.items():
        print(f"Model: {model_name}, Pearson Correlation: {pearson_corr:.4f}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run sentence similarity evaluation using pre-trained models.")
    parser.add_argument("--sts_file", required=True, help="Path to the STS test data file (CSV format).")
    parser.add_argument("--output_file", required=True, help="Path to save the output CSV file with results.")
    args = parser.parse_args()

    main(args.sts_file, args.output_file)


Processing Input: STS_Data\STS2016.input.answer-answer.txt | Gold: STS_Data\STS2016.gs.answer-answer.txt
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)
Skipped empty line (label file)

usage: ipykernel_launcher.py [-h] --sts_file STS_FILE --output_file
                             OUTPUT_FILE
ipykernel_launcher.py: error: the following arguments are required: --sts_file, --output_file


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [52]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from scipy.stats import pearsonr

# ========== 1. Load STS Data ==========
sts_file = "STS_test_data.csv" # Use relative path
sts_data = pd.read_csv(sts_file)

# Check the data
print(sts_data.info())
print(sts_data.head())

# Ensure required columns exist
if not all(col in sts_data.columns for col in ['sentence1', 'sentence2', 'gold_label']):
    raise ValueError("The DataFrame is missing required columns: 'sentence1', 'sentence2', and 'gold_label'.")

# ========== 2. Define and Load Models ==========
models = {
    "stsb-roberta-large (SBERT)": SentenceTransformer('stsb-roberta-large'),
    "paraphrase-MiniLM-L6-v2": SentenceTransformer('paraphrase-MiniLM-L6-v2'),
    "bert-base-nli-mean-tokens": SentenceTransformer('bert-base-nli-mean-tokens'),
    "all-mpnet-base-v2": SentenceTransformer('all-mpnet-base-v2'),
}

# Store Pearson correlations for each model
results = {}

# ========== 3. Iterate through Models ==========
for model_name, model in models.items():
    print(f"\nUsing model: {model_name}")
    similarities = []

    # Encode sentences and compute cosine similarity
    for s1, s2 in zip(sts_data['sentence1'], sts_data['sentence2']):
        embedding1 = model.encode(s1, convert_to_tensor=True)
        embedding2 = model.encode(s2, convert_to_tensor=True)
        sim = util.cos_sim(embedding1, embedding2).item()
        similarities.append(sim)

    # Normalize similarities to 0-5 range
    min_sim = min(similarities)
    max_sim = max(similarities)
    normalized_similarities = [(sim - min_sim) / (max_sim - min_sim) * 5 for sim in similarities]

    # Add normalized results to DataFrame
    col_name = f'normalized_similarity_{model_name}'
    sts_data[col_name] = normalized_similarities

    # Compute Pearson correlation
    pearson_corr, _ = pearsonr(normalized_similarities, sts_data['gold_label'])
    results[model_name] = pearson_corr

    print(f"Model: {model_name}, Pearson Correlation: {pearson_corr:.4f}")

# ========== 4. Save Results to CSV ==========
output_file = "output/STS_results_with_normalized_predictions.csv"
os.makedirs("output", exist_ok=True)
sts_data.to_csv(output_file, index=False, encoding="utf-8")
print(f"\nResults saved to: {output_file}")

# ========== 5. Print Final Results ==========
print("\nFinal Model Results:")
for model_name, pearson_corr in results.items():
    print(f"Model: {model_name}, Pearson Correlation: {pearson_corr:.4f}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1186 entries, 0 to 1185
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sentence1   1186 non-null   object 
 1   sentence2   1186 non-null   object 
 2   gold_label  1186 non-null   float64
dtypes: float64(1), object(2)
memory usage: 27.9+ KB
None
                                           sentence1  \
0               Tasting it is the only reliable way.   
1         I think it probably depends on your money.   
2  You need to read a lot to know what you like a...   
3  Obviously, the best book for you depends a lot...   
4                        I've had this same problem.   

                                           sentence2  gold_label  
0                       The way you have it is fine.         3.0  
1                        It depends on your country.         3.0  
2                            You don't have to know.         0.0  
3  The answer will depend 