In [1]:
import pandas as pd
import os
import numpy as np
import json
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly

import plotly.figure_factory as ff
from google.colab import drive, userdata
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
drive.mount('/content/drive')

# Base folder on the mounted drive; every other path in the notebook is built from it.
BASE_DIR = '/content/drive/My Drive/SUNY_Poly_DSA598/'
# Folder that load_dfs() scans for result CSV files.
DATA_DIR = os.path.join(BASE_DIR, 'datasets/FEVER/paper_test_results/')



Mounted at /content/drive


In [2]:


def load_dfs(data_dir=None):
    """
    Load every CSV file found in a results directory into a list of DataFrames.

    Each frame gets a constant ``system_config`` column built from the first two
    underscore-separated parts of its file name (extension excluded), e.g.
    ``tuned_sBERT-n256_results.csv`` -> ``tuned_sBERT-n256``. A file name with
    fewer than two parts uses whatever parts it has instead of raising.

    Args:
        data_dir: Directory to scan for ``*.csv`` files. Defaults to the
            notebook-level ``DATA_DIR``.

    Returns:
        A list with one DataFrame per CSV file, in sorted file-name order.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    dfs = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row order
    # of the combined frame reproducible between runs.
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(data_dir, file))
        # Add a column for the system state when tested (the same value in all rows)
        stem = os.path.splitext(file)[0]
        df['system_config'] = '_'.join(stem.split('_')[:2])
        dfs.append(df)
    return dfs


"""
id,claim,time_to_check,entities,keywords,retrieved_pages,module2_status,predicted_evidence_ids,predicted_evidence_texts,module3_result,module3_status,module3_prompt,module1_report_details,module2_report_details,query_client_temp,rephrase_client_temp,sentEx_client_temp,nli_client_temp,disambiguate_client_temp,strict_score,label_accuracy,precision,recall,f1"""

def process_dfs(dfs):
    """
    Unpack the JSON report columns of each results frame and combine the frames.

    For every frame, ``module1_report_details`` and ``module2_report_details``
    are parsed with ``json.loads`` and their entries are written to columns:

    * module 1: ``mod_1_total_documents`` -> ``number_of_pages``,
      ``total_document_tokens``, ``potential_titles``, and ``retrieved_titles``
      -> ``retrieved_pages`` (replacing any existing ``retrieved_pages`` column).
    * module 2: one column per key of the first row's report, named after the
      key (existing columns with the same name are replaced).

    The input frames are not modified; the work is done on copies.

    Args:
        dfs: Iterable of DataFrames that each contain the two JSON report columns.

    Returns:
        A single DataFrame with all rows, re-indexed from 0. An empty DataFrame
        is returned when ``dfs`` contains no frames.

    Raises:
        KeyError: if a report column or an expected report key is missing.
    """
    processed_dfs = []
    for raw_df in dfs:
        # Work on a copy so the caller's frames are not changed as a side effect.
        df = raw_df.copy()

        # Unpack the module 1 report into new columns. Report layout:
        #   module_1_report = {
        #       "mod_1_total_documents": mod_1_total_documents,
        #       "total_document_tokens": total_document_tokens,
        #       "potential_titles": potential_titles,
        #       "retrieved_titles": retrieved_pages_str
        #   }
        module_1_report = df['module1_report_details'].apply(json.loads)
        df['number_of_pages'] = module_1_report.apply(lambda x: x['mod_1_total_documents'])
        df['total_document_tokens'] = module_1_report.apply(lambda x: x['total_document_tokens'])
        df['potential_titles'] = module_1_report.apply(lambda x: x['potential_titles'])
        df['retrieved_pages'] = module_1_report.apply(lambda x: x['retrieved_titles'])

        # Unpack the module 2 report into new columns. Report layout:
        #   report = {
        #       "claim": claim,
        #       "final_evidence_ids": final_evidence_ids,  # [[title, id], ...]
        #       "selected_evidence_texts": selected_evidence_texts,  # List of text for selected evidence
        #       "status": status,
        #       "iterations_run": iteration + 1,
        #       "max_evidence": max_evidence,
        #       "max_iterations": max_iterations,
        #       "mod_2_total_documents": len(documents),
        #       "sbert_total_sentences": len(all_sbert_candidates_map),
        #       "sbert_total_tokens": sbert_total_tokens,
        #       "initial_sbert_thresh": initial_sbert_thresh,
        #       "final_sbert_threshold": current_sbert_thresh,
        #       "min_sbert_thresh": min_sbert_thresh,
        #       "thresh_decay": thresh_decay,
        #       "llm_total_sentences": llm_total_sentences,
        #       "llm_total_tokens": llm_total_tokens,
        #       "near_match_thresh": near_match_thresh,
        #   }
        module_2_report = df['module2_report_details'].apply(json.loads)
        # The column set is taken from the first row's report; a frame with no
        # rows has no first row, so there is nothing to unpack.
        if not module_2_report.empty:
            for key in module_2_report.iloc[0]:
                df[key] = module_2_report.apply(lambda x: x[key])

        processed_dfs.append(df)

    # pd.concat() raises ValueError on an empty list, so handle that case explicitly.
    if not processed_dfs:
        return pd.DataFrame()

    # Concatenate the DataFrames
    return pd.concat(processed_dfs, ignore_index=True)

# Read every result CSV from DATA_DIR, then unpack the JSON report columns and
# combine all files into the single frame `df` used by the cells below.
dfs = load_dfs()
df = process_dfs(dfs)

# Print the column names, non-null counts and dtypes of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 44 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        180 non-null    int64  
 1   claim                     180 non-null    object 
 2   time_to_check             180 non-null    float64
 3   entities                  162 non-null    object 
 4   keywords                  180 non-null    object 
 5   retrieved_pages           180 non-null    object 
 6   module2_status            180 non-null    object 
 7   predicted_evidence_ids    180 non-null    object 
 8   predicted_evidence_texts  180 non-null    object 
 9   module3_result            180 non-null    object 
 10  module3_status            180 non-null    object 
 11  module3_prompt            180 non-null    object 
 12  module1_report_details    180 non-null    object 
 13  module2_report_details    180 non-null    object 
 14  query_clie

In [3]:
# Count the missing values in each column of the combined frame.
df.isna().sum()

Unnamed: 0,0
id,0
claim,0
time_to_check,0
entities,18
keywords,0
retrieved_pages,0
module2_status,0
predicted_evidence_ids,0
predicted_evidence_texts,0
module3_result,0


In [4]:
# Print every column name on its own line so the full list is visible
# (the df.info() output above is cut off).
for col in df.columns:
    print(f"{col}")

id
claim
time_to_check
entities
keywords
retrieved_pages
module2_status
predicted_evidence_ids
predicted_evidence_texts
module3_result
module3_status
module3_prompt
module1_report_details
module2_report_details
query_client_temp
rephrase_client_temp
sentEx_client_temp
nli_client_temp
disambiguate_client_temp
strict_score
label_accuracy
precision
recall
f1
system_config
number_of_pages
total_document_tokens
potential_titles
final_evidence_ids
selected_evidence_texts
status
iterations_run
max_evidence
max_iterations
mod_2_total_documents
sbert_total_sentences
sbert_total_tokens
initial_sbert_thresh
final_sbert_threshold
min_sbert_thresh
thresh_decay
llm_total_sentences
llm_total_tokens
near_match_thresh


In [5]:
# Give every system configuration one fixed colour so that its scatter points and
# its mean reference lines are drawn in the same colour.
unique_system_configs = df['system_config'].unique()
color_mapping = {}
for system_config in unique_system_configs:
    color_mapping[system_config] = px.colors.qualitative.D3[len(color_mapping) % len(px.colors.qualitative.D3)]


# Visualize a scatterplot of time_to_check vs. number_of_pages with system_config as color
fig = px.scatter(df, x='number_of_pages', y='time_to_check', color='system_config',
                 title='Time to Check vs. Number of Pages',
                 color_discrete_map=color_mapping,
                 labels={'number_of_pages': 'Number of Pages', 'time_to_check': 'Time to Check (seconds)'})
# px.scatter creates its traces with mode='markers'. The selector must match that
# mode, otherwise no trace is selected and the marker styling is silently skipped.
fig.update_traces(marker=dict(size=20, line=dict(width=2, color='black')),
                  selector=dict(mode='markers'),
                  )


fig.update_layout(title='Time to Check vs. Number of Pages',
                  xaxis_title='Number of Pages',
                  yaxis_title='Time to Check (seconds)',
                  legend_title='Test Config',
                  font=dict(size=12),
                  width=1000,
                  height=800)

# The reference lines span the full data range; the extents and the per-config
# means do not change inside the loop, so compute them once.
pages_min, pages_max = df['number_of_pages'].min(), df['number_of_pages'].max()
time_min, time_max = df['time_to_check'].min(), df['time_to_check'].max()
config_means = df.groupby('system_config')[['time_to_check', 'number_of_pages']].mean()

for system_config in unique_system_configs:
    line_style = dict(color=color_mapping[system_config], width=2, dash='dash')

    # Horizontal dashed line at the mean time for this system config
    mean_time = config_means.loc[system_config, 'time_to_check']
    fig.add_shape(type='line',
                  x0=pages_min,
                  y0=mean_time,
                  x1=pages_max,
                  y1=mean_time,
                  line=line_style,
                  name='Mean Time for ' + system_config)

    # Vertical dashed line at the mean number of pages for this system config
    mean_pages = config_means.loc[system_config, 'number_of_pages']
    fig.add_shape(type='line',
                  x0=mean_pages,
                  y0=time_min,
                  x1=mean_pages,
                  y1=time_max,
                  line=line_style,
                  name='Mean Pages for ' + system_config)


# Display options shared by the figures in this notebook (also used by later cells).
config = {
    'responsive': True,
    'toImageButtonOptions': {
        'format': 'png',  # one of png, svg, jpeg, webp
        'scale': 6,
        # Plotly appends the extension of 'format' to this name itself, so the
        # name is given without '.png' to avoid a doubled extension.
        'filename': 'issue_counts_per_page_title_issue_type'
    }}

fig.show(config=config)




In [6]:
import ast  # For safely parsing string representations of lists

# This cell uses `df` (the combined results frame) and `config` (the Plotly display
# options) from the earlier cells. Errors are intentionally not caught: a failure
# should surface with its full traceback instead of being reduced to a one-line
# message that leaves `df_table` undefined for the next cell.

unique_system_configs = df['system_config'].unique()
color_mapping = {}
for system_config in unique_system_configs:
    color_mapping[system_config] = px.colors.qualitative.D3[len(color_mapping) % len(px.colors.qualitative.D3)]


def safe_literal_eval(val):
    """
    Parse the string representation of a list into a list.

    Args:
        val: Usually a string such as "[['Title', 3], ...]". A value that is
            already a list is returned unchanged.

    Returns:
        The parsed list, or an empty list when the value is missing, cannot be
        parsed, or parses to something other than a list.
    """
    if isinstance(val, list):
        return val
    try:
        # Handle potential NaN/None before evaluation
        if pd.isna(val):
            return []
        # Ensure it's treated as a string before eval
        res = ast.literal_eval(str(val))
        # Ensure the result is a list
        return res if isinstance(res, list) else []
    except (ValueError, SyntaxError, TypeError):
        return []  # Return empty list on error


def _add_bar_value_labels(bar_fig):
    """Outline the bars in black and print each bar's value (2 decimals) outside the bar."""
    bar_fig.update_traces(marker=dict(line=dict(width=1, color='black')),
                          texttemplate='%{y:.2f}',
                          textposition='outside',
                          textfont_size=12)


def _config_box_plot(data, y, title, y_label):
    """Build a box plot of column `y` of `data`, one box per system configuration."""
    box_fig = px.box(data,
                     x='system_config',
                     y=y,
                     points="outliers",  # Show outliers
                     title=title,
                     labels={y: y_label, 'system_config': 'System Configuration'},
                     height=600,
                     color_discrete_sequence=px.colors.qualitative.D3,
                     )
    # NOTE(review): for box traces `marker` appears to style the plotted (outlier)
    # points, so this adds a black outline to those points rather than to the boxes.
    box_fig.update_traces(marker=dict(line=dict(width=1, color='black')))
    box_fig.update_layout(xaxis_tickangle=-45)  # Angle labels if they overlap
    return box_fig


# --- Data Preprocessing ---
# Parse list-like strings to actual lists to get the number of predicted evidence items
df['predicted_evidence_ids_list'] = df['predicted_evidence_ids'].apply(safe_literal_eval)
df['num_predicted_evidence'] = df['predicted_evidence_ids_list'].apply(len)

# --- Aggregation for Visualizations ---
# Average scores and other metrics per configuration (pandas' 'mean' skips NaN values)
df_agg = df.groupby('system_config').agg(
    avg_strict_score=('strict_score', 'mean'),
    avg_label_accuracy=('label_accuracy', 'mean'),
    avg_f1=('f1', 'mean'),
    avg_precision=('precision', 'mean'),
    avg_recall=('recall', 'mean'),
    avg_time_to_check=('time_to_check', 'mean'),
    avg_total_llm_tokens=('llm_total_tokens', 'mean'),  # Average tokens per claim
    avg_iterations_run=('iterations_run', 'mean'),
    avg_num_predicted_evidence=('num_predicted_evidence', 'mean')
).reset_index()

# --- Visualizations ---

# Visualization 1: Core Performance Comparison (Strict Score, Label Accuracy, F1)
fig1 = px.bar(df_agg,
              x='system_config',
              y=['avg_strict_score', 'avg_label_accuracy', 'avg_f1'],
              barmode='group',  # Group bars side-by-side
              title='Average Performance Scores by System Configuration',
              labels={'value': 'Average Score', 'system_config': 'System Configuration', 'variable': 'Metric'},
              height=600,
              color_discrete_sequence=px.colors.qualitative.D3,
              )
_add_bar_value_labels(fig1)
fig1.update_layout(
    xaxis_title='System Configuration',
    yaxis_title='Average Score',
    legend_title='Metric',
    font=dict(size=12),
    xaxis_tickangle=-45)  # Angle labels if they overlap
fig1.show(config=config)

# Visualization 2: Average LLM token load (llm_total_tokens) per claim
fig2 = px.bar(df_agg,
              x='system_config',
              y='avg_total_llm_tokens',
              title='Average Total Tokens Processed by GPT_sentEx (per Claim) by Configuration',
              labels={'avg_total_llm_tokens': 'Average Total Tokens Processed by GPT_sentEx', 'system_config': 'System Configuration'},
              height=600,
              color_discrete_sequence=px.colors.qualitative.D3,
              )
_add_bar_value_labels(fig2)
fig2.update_layout(
    xaxis_title='System Configuration',
    # The axis title must describe the plotted column (llm_total_tokens), not document tokens
    yaxis_title='Average Total Tokens Processed by GPT_sentEx',
    font=dict(size=12),
    xaxis_tickangle=-45)  # Angle labels if they overlap
fig2.show(config=config)

# Visualization 3: Distribution of Time per Claim (per-claim rows, not the aggregate)
fig3 = _config_box_plot(df,
                        y='time_to_check',
                        title='Distribution of Time Taken per Claim by Configuration',
                        y_label='Time to Check Claim (seconds)')
fig3.show(config=config)

# Visualization 4: Distribution of Number of Evidence Pieces Retrieved
fig4 = _config_box_plot(df,
                        y='num_predicted_evidence',
                        title='Distribution of Number of Evidence Pieces Retrieved by Configuration',
                        y_label='Number of Predicted Evidence IDs')
fig4.show(config=config)

# Visualization 5: Summary Table of Key Average Metrics
# Select and rename columns for clarity in the table, rounding numeric values to 3 decimals
df_table = (
    df_agg[['system_config', 'avg_strict_score', 'avg_label_accuracy', 'avg_f1',
            'avg_time_to_check', 'avg_total_llm_tokens', 'avg_num_predicted_evidence']]
    .rename(columns={
        'system_config': 'Configuration',
        'avg_strict_score': 'Avg. Strict Score',
        'avg_label_accuracy': 'Avg. Label Acc.',
        'avg_f1': 'Avg. Evidence F1',
        'avg_time_to_check': 'Avg. Time (s)',
        'avg_total_llm_tokens': 'Avg. GPT_sentEx Tokens',
        'avg_num_predicted_evidence': 'Avg. Evidence Count'
    })
    .round(3)
)

fig5 = ff.create_table(df_table, index=False)
# Figure Factory tables take their title through the layout
fig5.update_layout(
    title_text='Summary Table of Average Metrics by Configuration',
    title_x=0.5,  # Center title
    margin={'t': 50, 'b': 10}  # Adjust margin for title
    )
fig5.show(config=config)

In [7]:
# Display the rounded per-configuration summary table built in the previous cell.
df_table

Unnamed: 0,Configuration,Avg. Strict Score,Avg. Label Acc.,Avg. Evidence F1,Avg. Time (s),Avg. GPT_sentEx Tokens,Avg. Evidence Count
0,all_base,0.367,0.567,0.455,22.039,1499.467,2.7
1,tuned_GPT-clf,0.367,0.567,0.378,40.014,919.5,2.7
2,tuned_GPT-sBERTn1024-sentEx,0.533,0.8,0.401,42.685,1927.3,2.2
3,tuned_GPT-sentEx,0.433,0.733,0.321,40.745,906.067,1.6
4,tuned_sBERT-n1024,0.467,0.733,0.421,54.198,2363.167,3.167
5,tuned_sBERT-n256,0.333,0.533,0.393,42.812,1853.9,3.233
