In [3]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def process_csv(input_file='IndianConstitution_incremental_results.csv'):
    # Load the CSV file
    print(f"Loading data from {input_file}...")
    df = pd.read_csv(input_file)
    
    # Count the initial number of rows
    initial_row_count = len(df)
    print(f"Initial number of rows: {initial_row_count}")
    
    # Define patterns to search for in generated answers
    patterns = [
        r'i am sorry', 
        r'i cannot', 
        r'I am unable', 
        r'Agent stopped'
    ]
    
    # Create a regex pattern with case insensitivity
    combined_pattern = '|'.join(patterns)
    
    # Filter out rows containing the patterns
    mask = df['generated_answer'].str.lower().str.contains(combined_pattern, regex=True)
    df_filtered = df[~mask]
    
    # Count the number of rows removed
    removed_rows = initial_row_count - len(df_filtered)
    print(f"Number of rows removed: {removed_rows}")
    print(f"Number of rows remaining: {len(df_filtered)}")
    
    # Load sentence transformer model
    print("Loading sentence transformer model...")
    model = SentenceTransformer('all-MiniLM-L12-v2')
    
    # Compute embedding similarity
    print("Computing embedding similarities...")
    embedding_similarities = []
    
    # Process in batches to avoid potential memory issues
    batch_size = 100
    for i in range(0, len(df_filtered), batch_size):
        batch = df_filtered.iloc[i:i+batch_size]
        generated_embeddings = model.encode(batch['generated_answer'].tolist())
        gold_embeddings = model.encode(batch['gold_answer'].tolist())
        
        for j in range(len(batch)):
            gen_emb = generated_embeddings[j].reshape(1, -1)
            gold_emb = gold_embeddings[j].reshape(1, -1)
            similarity = cosine_similarity(gen_emb, gold_emb)[0][0]
            embedding_similarities.append(similarity)
    
    # Add new column for embedding similarity
    df_filtered['mini_lm_similarity'] = embedding_similarities
    
    # Calculate averages
    avg_bleu = df_filtered['bleu'].mean()
    avg_rouge1 = df_filtered['rouge-1'].mean()
    avg_rouge2 = df_filtered['rouge-2'].mean()
    avg_rougel = df_filtered['rouge-l'].mean()
    avg_embedding_sim = df_filtered['embedding_similarity'].mean()
    avg_mini_lm_sim = df_filtered['mini_lm_similarity'].mean()
    
    # Save to new CSV file
    output_file = 'IndianConstitution_filtered_results.csv'
    df_filtered.to_csv(output_file, index=False)
    print(f"Filtered data saved to {output_file}")
    
    # Print summary matrix
    print("\nSummary:")
    print(f"Number of questions left: {len(df_filtered)}")
    print(f"Number of questions removed: {removed_rows}")
    print(f"Average BLEU: {avg_bleu:.4f}")
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougel:.4f}")
    print(f"Average Embedding Similarity: {avg_embedding_sim:.4f}")
    print(f"Average MiniLM Embedding Similarity: {avg_mini_lm_sim:.4f}")

if __name__ == "__main__":
    process_csv()

Loading data from IndianConstitution_incremental_results.csv...
Initial number of rows: 487
Number of rows removed: 38
Number of rows remaining: 449
Loading sentence transformer model...
Computing embedding similarities...
Filtered data saved to IndianConstitution_filtered_results.csv

Summary:
Number of questions left: 449
Number of questions removed: 38
Average BLEU: 0.2065
Average ROUGE-1: 0.3632
Average ROUGE-2: 0.2839
Average ROUGE-L: 0.3466
Average Embedding Similarity: 0.8450
Average MiniLM Embedding Similarity: 0.5389


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['mini_lm_similarity'] = embedding_similarities


In [5]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def process_csv(input_file='IndianConstitution_incremental_results.csv'):
    # Load the CSV file
    print(f"Loading data from {input_file}...")
    df = pd.read_csv(input_file)
    
    # Count the initial number of rows
    initial_row_count = len(df)
    print(f"Initial number of rows: {initial_row_count}")
    
    # Define patterns to search for in generated answers - all lowercase
    patterns = [
        r'i am sorry', 
        r'i cannot', 
        r'i am unable', 
        r'agent stopped'
    ]
    
    # Create a regex pattern
    combined_pattern = '|'.join(patterns)
    
    # First convert generated_answer to lowercase, then check for patterns
    df['lower_gen_answer'] = df['generated_answer'].str.lower()
    mask = df['lower_gen_answer'].str.contains(combined_pattern, regex=True)
    
    # Log rows being removed for debugging
    removed_rows_df = df[mask]
    print(f"Phrases found in {len(removed_rows_df)} rows:")
    for pattern in patterns:
        pattern_mask = df['lower_gen_answer'].str.contains(pattern, regex=True)
        count = pattern_mask.sum()
        print(f"  '{pattern}': found in {count} rows")
    
    # Filter out rows containing the patterns
    df_filtered = df[~mask]
    
    # Drop the temporary lowercase column
    df_filtered = df_filtered.drop('lower_gen_answer', axis=1)
    
    # Count the number of rows removed
    removed_rows = initial_row_count - len(df_filtered)
    print(f"Number of rows removed: {removed_rows}")
    print(f"Number of rows remaining: {len(df_filtered)}")
    
    # Load sentence transformer model
    print("Loading sentence transformer model...")
    model = SentenceTransformer('all-MiniLM-L12-v2')
    
    # Compute embedding similarity
    print("Computing embedding similarities...")
    embedding_similarities = []
    
    # Process in batches to avoid potential memory issues
    batch_size = 100
    for i in range(0, len(df_filtered), batch_size):
        batch = df_filtered.iloc[i:i+batch_size]
        generated_embeddings = model.encode(batch['generated_answer'].tolist())
        gold_embeddings = model.encode(batch['gold_answer'].tolist())
        
        for j in range(len(batch)):
            gen_emb = generated_embeddings[j].reshape(1, -1)
            gold_emb = gold_embeddings[j].reshape(1, -1)
            similarity = cosine_similarity(gen_emb, gold_emb)[0][0]
            embedding_similarities.append(similarity)
    
    # Add new column for embedding similarity
    df_filtered['mini_lm_similarity'] = embedding_similarities
    
    # Calculate averages
    avg_bleu = df_filtered['bleu'].mean()
    avg_rouge1 = df_filtered['rouge-1'].mean()
    avg_rouge2 = df_filtered['rouge-2'].mean()
    avg_rougel = df_filtered['rouge-l'].mean()
    avg_embedding_sim = df_filtered['embedding_similarity'].mean()
    avg_mini_lm_sim = df_filtered['mini_lm_similarity'].mean()
    
    # Save to new CSV file
    output_file = 'IndianConstitution_filtered_results.csv'
    df_filtered.to_csv(output_file, index=False)
    print(f"Filtered data saved to {output_file}")
    
    # Print summary matrix
    print("\nSummary:")
    print(f"Number of questions left: {len(df_filtered)}")
    print(f"Number of questions removed: {removed_rows}")
    print(f"Average BLEU: {avg_bleu:.4f}")
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougel:.4f}")
    print(f"Average Embedding Similarity: {avg_embedding_sim:.4f}")
    print(f"Average MiniLM Embedding Similarity: {avg_mini_lm_sim:.4f}")

if __name__ == "__main__":
    process_csv()

Loading data from IndianConstitution_incremental_results.csv...
Initial number of rows: 487
Phrases found in 159 rows:
  'i am sorry': found in 31 rows
  'i cannot': found in 34 rows
  'i am unable': found in 59 rows
  'agent stopped': found in 64 rows
Number of rows removed: 159
Number of rows remaining: 328
Loading sentence transformer model...
Computing embedding similarities...
Filtered data saved to IndianConstitution_filtered_results.csv

Summary:
Number of questions left: 328
Number of questions removed: 159
Average BLEU: 0.2824
Average ROUGE-1: 0.4735
Average ROUGE-2: 0.3859
Average ROUGE-L: 0.4531
Average Embedding Similarity: 0.8951
Average MiniLM Embedding Similarity: 0.6615
