# Failed Results Analysis

This notebook analyzes the `failed_results_with_ids.jsonl` file to understand the distribution of different error categories.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Configure seaborn's theme with the "whitegrid" style before any figure in
# this notebook is drawn.
sns.set_theme(style="whitegrid")

In [None]:
# Locate the JSONL file of failed results.
# The relative path assumes the notebook lives in 'notebook/' and the data in 'data/'.
file_path = '../data/failed_results_with_ids.jsonl'

# If the relative path does not exist, fall back to the absolute location.
if not os.path.exists(file_path):
    file_path = '/root/hsin_research/ruledistill-main/data/failed_results_with_ids.jsonl'

print(f"Reading file from: {file_path}")

# Read the JSONL file (lines=True: one JSON record per line).
try:
    df = pd.read_json(file_path, lines=True)
except (OSError, ValueError) as e:
    # OSError covers a missing or unreadable file; ValueError covers content
    # that cannot be parsed as JSON. Loading stays best-effort, but `df` is
    # always bound: without this, later cells would fail with NameError or
    # silently reuse a stale `df` left over from an earlier run.
    print(f"Error loading file: {e}")
    df = pd.DataFrame()
else:
    print(f"Successfully loaded {len(df)} records.")

In [None]:
# Preview the first five records to see which columns the data provides
df.head(n=5)

In [None]:
# Analyze 'error_category': print how often each category occurs and draw the
# counts as a labelled bar chart.
if 'error_category' in df.columns:
    # Count the occurrences of each error category
    error_counts = df['error_category'].value_counts()

    print("Error Category Counts:")
    print(error_counts)

    # Plot the bar chart
    plt.figure(figsize=(12, 6))
    # Passing `palette` without `hue` is deprecated in recent seaborn releases,
    # so the categories are also used as the hue variable. dodge=False keeps
    # one full-width bar per category on seaborn versions that would otherwise
    # make room for every hue level.
    ax = sns.barplot(
        x=error_counts.index,
        y=error_counts.values,
        hue=error_counts.index,
        palette="viridis",
        dodge=False,
    )
    # The hue legend would only repeat the x-axis labels; drop it if seaborn
    # created one.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    # Labeling
    plt.title('Distribution of Error Categories in Failed Results', fontsize=16)
    plt.xlabel('Error Category', fontsize=12)
    plt.ylabel('Number of Questions', fontsize=12)
    plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability

    # Add count labels on top of bars. The vertical gap (1% of the tallest
    # bar) is the same for every bar, so compute it once outside the loop.
    label_offset = max(error_counts.values, default=0) * 0.01
    for i, v in enumerate(error_counts.values):
        ax.text(i, v + label_offset, str(v), ha='center', va='bottom')

    plt.tight_layout()
    plt.show()
else:
    print("Column 'error_category' not found in the dataframe.")