# Errors with and without CoT

In [1]:
# General imports.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import platform
import time
import warnings
import math

import networkx as nx

# Report the interpreter and library versions used for this run.
for _label, _version in (
    ("python version     :", platform.python_version()),
    ("numpy version      :", np.__version__),
    ("pandas version     :", pd.__version__),
    ("matplotlib version :", matplotlib.__version__),
    ("seaborn version    :", sns.__version__),
):
    print(_label, _version)

python version     : 3.10.13
numpy version      : 1.26.3
pandas version     : 2.1.4
matplotlib version : 3.8.0
seaborn version    : 0.12.2


## Read data

In [2]:
# Load the error table (one row per measured error).
df = pd.read_csv("df_errors_local.csv")

# DataFrame.info() writes its report directly and returns None, so it must not
# be wrapped in print() -- doing so appends a stray "None" line to the output.
df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66000 entries, 0 to 65999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Quantity   66000 non-null  object 
 1   Error      66000 non-null  float64
 2   Model      66000 non-null  object 
 3   Mediators  66000 non-null  int64  
 4   Distance   66000 non-null  int64  
 5   Type       66000 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 3.0+ MB
None


Unnamed: 0,Quantity,Error,Model,Mediators,Distance,Type
0,XY,11.407407,Phi-3,6,3,Global
1,XC,0.438095,Phi-3,2,1,Local
2,XD,1.191667,Phi-3,3,2,Local
3,CY,1.463768,Phi-3,3,2,Local
4,DY,0.808219,Phi-3,2,1,Local


In [4]:
# Distinct model labels present in the table (np.unique returns them sorted).
np.unique(df["Model"])

array(['GPT-4o', 'GPT-4o CoT', 'Llama 2', 'Llama 3', 'Llama 3.1',
       'Llama 3.1 CoT', 'Llama 3.1 Math', 'Llama 3.1 Math CoT', 'Phi-3',
       'o1', 'o1 CoT'], dtype=object)

In [6]:
# Distinct quantity labels present in the table (np.unique returns them sorted).
np.unique(df["Quantity"])

array(['CD', 'CY', 'DY', 'XC', 'XD', 'XY'], dtype=object)

In [5]:
# One frame per model label; each base model is followed by its CoT variant.

# Llama 3.1
df_llama = df.loc[df["Model"] == "Llama 3.1"]
df_llama_cot = df.loc[df["Model"] == "Llama 3.1 CoT"]

# Llama 3.1 Math
df_llama_math = df.loc[df["Model"] == "Llama 3.1 Math"]
df_llama_math_cot = df.loc[df["Model"] == "Llama 3.1 Math CoT"]

# GPT-4o
df_gpt4o = df.loc[df["Model"] == "GPT-4o"]
df_gpt4o_cot = df.loc[df["Model"] == "GPT-4o CoT"]

# o1
df_o1 = df.loc[df["Model"] == "o1"]
df_o1_cot = df.loc[df["Model"] == "o1 CoT"]

In [7]:
# Quantity labels iterated over by the significance tests below.
quantities = ["CD", "CY", "DY", "XC", "XD", "XY"]

## Define test

In [47]:
def test_errors(df, df_cot, alpha = 0.05, test = "wilcoxon"):
    for q in quantities:
        errors = df[df.Quantity == q].Error.to_numpy()
        errors_cot = df_cot[df_cot.Quantity == q].Error.to_numpy()

        # Use Wilcoxon test.
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html
        if test == "wilcoxon":
            p = stats.wilcoxon(errors, errors_cot).pvalue
            if p <= alpha:
                print(q, ": Errors from CoT are significantly different.")
            else: 
                print(q, ": Paired samples come from the same distribution.")

        # Use t-test (stricter assumptions).
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html
        elif test == "t-test":
            # Test null hypothesis that a sample comes from a normal distribution.
            p = stats.normaltest(errors).pvalue
            p_cot = stats.normaltest(errors_cot).pvalue
            if p <= alpha:
                warnings.warn("Errors for {} may not be normally distributed: {} <= {}".format(q,p,alpha))
            if p_cot <= alpha:
                warnings.warn("Errors for {} with CoT may not be normally distributed: {} <= {}".format(q,p_cot,alpha))
            p = stats.ttest_ind(errors, errors_cot).pvalue
            if p <= alpha:
                print(q, ": Errors from CoT are significantly different.")
            else: 
                print(q, ": Paired samples come from the same distribution.")

## Test Llama 3.1

In [48]:
# Llama 3.1 vs. Llama 3.1 CoT, Wilcoxon test per quantity.
test_errors(df_llama, df_llama_cot, test="wilcoxon")

CD : Errors from CoT are significantly different.
CY : Errors from CoT are significantly different.
DY : Errors from CoT are significantly different.
XC : Errors from CoT are significantly different.
XD : Errors from CoT are significantly different.
XY : Errors from CoT are significantly different.


In [49]:
# Llama 3.1 vs. Llama 3.1 CoT, t-test per quantity.
test_errors(df_llama, df_llama_cot, test="t-test")

CD : Errors from CoT are significantly different.
CY : Errors from CoT are significantly different.
DY : Errors from CoT are significantly different.
XC : Errors from CoT are significantly different.
XD : Errors from CoT are significantly different.
XY : Errors from CoT are significantly different.




## Test Llama 3.1 Math

In [50]:
# Llama 3.1 Math vs. Llama 3.1 Math CoT, Wilcoxon test per quantity.
test_errors(df_llama_math, df_llama_math_cot, test="wilcoxon")

CD : Errors from CoT are significantly different.
CY : Errors from CoT are significantly different.
DY : Errors from CoT are significantly different.
XC : Errors from CoT are significantly different.
XD : Errors from CoT are significantly different.
XY : Errors from CoT are significantly different.


In [51]:
# Llama 3.1 Math vs. Llama 3.1 Math CoT, t-test per quantity.
test_errors(df_llama_math, df_llama_math_cot, test="t-test")

CD : Errors from CoT are significantly different.
CY : Errors from CoT are significantly different.
DY : Errors from CoT are significantly different.
XC : Errors from CoT are significantly different.
XD : Errors from CoT are significantly different.
XY : Errors from CoT are significantly different.


  b2 = skew(a, axis)
  b2 = kurtosis(a, axis, fisher=False)
  res = hypotest_fun_out(*samples, **kwds)


## Test GPT-4o

In [52]:
# GPT-4o vs. GPT-4o CoT, Wilcoxon test per quantity.
test_errors(df_gpt4o, df_gpt4o_cot, test="wilcoxon")

CD : Errors from CoT are significantly different.
CY : Errors from CoT are significantly different.
DY : Errors from CoT are significantly different.
XC : Errors from CoT are significantly different.
XD : Errors from CoT are significantly different.
XY : Errors from CoT are significantly different.


In [53]:
# GPT-4o vs. GPT-4o CoT, t-test per quantity.
test_errors(df_gpt4o, df_gpt4o_cot, test="t-test")

CD : Errors from CoT are significantly different.
CY : Errors from CoT are significantly different.
DY : Errors from CoT are significantly different.
XC : Errors from CoT are significantly different.
XD : Errors from CoT are significantly different.
XY : Errors from CoT are significantly different.




## Test o1

In [54]:
# o1 vs. o1 CoT, Wilcoxon test per quantity.
test_errors(df_o1, df_o1_cot, test="wilcoxon")

CD : Errors from CoT are significantly different.
CY : Errors from CoT are significantly different.
DY : Errors from CoT are significantly different.
XC : Errors from CoT are significantly different.
XD : Errors from CoT are significantly different.


ValueError: zero_method 'wilcox' and 'pratt' do not work if x - y is zero for all elements.

In [55]:
# o1 vs. o1 CoT, t-test per quantity.
test_errors(df_o1, df_o1_cot, test="t-test")

CD : Errors from CoT are significantly different.
CY : Errors from CoT are significantly different.
DY : Errors from CoT are significantly different.
XC : Errors from CoT are significantly different.
XD : Errors from CoT are significantly different.
XY : Paired samples come from the same distribution.


