# hallucinations

This notebook collects the analyses I ran on hallucinations: their nature, how frequently they occur, and related properties.

In [None]:
import psycopg
import matplotlib.pyplot as plt
import numpy as np # Import numpy for potential logarithmic scaling

def plot_run_length_distribution(conn: psycopg.Connection):
    """Plot how often each repetition run length occurs, with a log-scaled y-axis.

    Queries the ``repetition_runs`` table through ``conn`` for the number of
    rows per ``run_length``, prints a few summary figures about the result,
    and displays a bar chart of the counts. When the query yields no rows, a
    message is printed and nothing is plotted.

    Args:
        conn: Open psycopg connection used to run the query. It is not
            closed by this function.
    """
    with conn.cursor() as cursor:
        cursor.execute("""
            SELECT run_length, COUNT(*) 
            FROM repetition_runs
            GROUP BY run_length
            ORDER BY run_length;
        """)
        rows = cursor.fetchall()

    # Guard clause: nothing to draw without data.
    if not rows:
        print("No data found to plot.")
        return

    print(f"Number of distinct run lengths: {len(rows)}")

    # Split the (run_length, count) rows into two parallel lists.
    lengths, frequencies = (list(column) for column in zip(*rows))

    # Debugging aid: peek at one fixed position to get a feel for the data.
    probe_index = 35
    if len(lengths) <= probe_index:
        print("Less than 36 data points. Cannot print index 35.")
    else:
        print(f"Run length at index 35: {lengths[probe_index]}")
        print(f"Count at index 35: {frequencies[probe_index]}")

    shortest, longest = min(lengths), max(lengths)
    rarest, most_common = min(frequencies), max(frequencies)
    print(f"Min run_length: {shortest}, Max run_length: {longest}")
    print(f"Min count: {rarest}, Max count: {most_common}")

    # A wide canvas keeps neighbouring bars distinguishable.
    plt.figure(figsize=(12, 6))

    # The y-axis is logarithmic so that a handful of very frequent run
    # lengths do not flatten every other bar. If outliers on the x-axis
    # squeeze the interesting region, an explicit plt.xlim(...) can be added.
    plt.bar(lengths, frequencies, color='skyblue', width=0.8)
    plt.yscale('log')

    plt.title("Distribution of Repetition Run Lengths", fontsize=14)
    plt.xlabel("Run Length", fontsize=12)
    plt.ylabel("Count (log scale)", fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.grid(axis="y", linestyle="--", alpha=0.7)

    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    # Script entry point: open a connection, draw the plot, and always close
    # the connection again, reporting (rather than raising) any failure.
    #
    # NOTE(review): the credentials below are hard-coded in the DSN; consider
    # supplying them via environment variables or ~/.pgpass before sharing
    # this notebook.

    # Explicit sentinel instead of probing locals(): at module/notebook scope
    # locals() is the global namespace, so a `conn` left over from an earlier
    # run would otherwise be mistaken for the connection of this run when
    # psycopg.connect() itself fails.
    conn = None
    try:
        conn = psycopg.connect("dbname=sentence_db user=jonny password=buolamwini host=localhost")
        plot_run_length_distribution(conn)
    except psycopg.Error as e:
        print(f"Database error: {e}")
    except Exception as e:
        # Top-level boundary of the script: report anything unexpected.
        print(f"An unexpected error occurred: {e}")
    finally:
        # Only close a connection that was actually opened in this run.
        if conn is not None:
            conn.close()
            print("Database connection closed.")