# Merge and read csv_files

In [1]:
import pandas as pd
import glob
import os

# Folder containing the CSV files to merge (relative to this notebook)
folder_path = "../../csv_files"

# Find all CSV files in the folder. glob returns matches in arbitrary,
# OS-dependent order, so sort them to keep the concatenated row order
# (and therefore the integer index below) reproducible between runs.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
if not csv_files:
    # Fail with an actionable message instead of pandas'
    # "No objects to concatenate" ValueError.
    raise FileNotFoundError(
        f"No CSV files found in {os.path.abspath(folder_path)!r}"
    )

# Read and concatenate all CSVs into one DataFrame with a fresh 0..n-1 index
df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)

# Keep only rows where Experiment == "SR_chunking".
# .copy() makes the subset an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
df = df[df["Experiment"] == "SR_chunking"].copy()

# Show the first rows of the filtered DataFrame
print(df.head())

    Name   Experiment  Trial  \
100  liv  SR_chunking      1   
101  liv  SR_chunking      2   
102  liv  SR_chunking      3   
103  liv  SR_chunking      4   
104  liv  SR_chunking      5   

                                              Sequence  \
100  air per run won out two top law due guy win cu...   
101  and hey cut but get yes see out all one put so...   
102  age and are any cup yet its men see for day go...   
103  lot day the but tax dog son try how two for sa...   
104  die did big its put had ask was key low law fu...   

                              Recall  
100          air per run won out too  
101                  mom run got but  
102  age and are any cup yet its men  
103          lot day the but tax dog  
104          die did big its put hat  


In [2]:
def _split_words(text):
    """Return the whitespace-separated words of ``text``.

    A missing value (NaN/None, e.g. an empty CSV cell) yields an empty list
    instead of raising ``AttributeError`` on ``float.split``.
    """
    return [] if pd.isna(text) else str(text).split()


def score_recall(sequence, recall):
    """Score one trial by comparing recalled words with the presented ones.

    Words are compared position by position; positions beyond the shorter of
    the two word lists are ignored (``zip`` truncation).

    Parameters
    ----------
    sequence : str
        The presented words, separated by whitespace.
    recall : str
        The recalled words, separated by whitespace (may be missing).

    Returns
    -------
    points : int
        Number of correct words before the first mismatch.
    fails : list[tuple[str, str]]
        Every mismatching ``(presented, recalled)`` pair, including those
        after the first mismatch.
    """
    points = 0
    fails = []
    mistake_found = False

    for s, r in zip(_split_words(sequence), _split_words(recall)):
        if s != r:
            fails.append((s, r))
            mistake_found = True  # lock the points counter
        elif not mistake_found:
            points += 1

    return points, fails


scores = [score_recall(seq, rec) for seq, rec in zip(df["Sequence"], df["Recall"])]

# Build both columns in one step on a new frame: re-running the cell is
# idempotent and no frame that might be a slice is mutated in place.
# Explicit Series keep "points" integer-typed and "fails" a column of lists.
df = df.assign(
    points=pd.Series([p for p, _ in scores], index=df.index, dtype="int64"),
    fails=pd.Series([f for _, f in scores], index=df.index, dtype=object),
)

df.head(10)

Unnamed: 0,Name,Experiment,Trial,Sequence,Recall,points,fails
100,liv,SR_chunking,1,air per run won out two top law due guy win cu...,air per run won out too,5,"[(two, too)]"
101,liv,SR_chunking,2,and hey cut but get yes see out all one put so...,mom run got but,0,"[(and, mom), (hey, run), (cut, got)]"
102,liv,SR_chunking,3,age and are any cup yet its men see for day go...,age and are any cup yet its men,8,[]
103,liv,SR_chunking,4,lot day the but tax dog son try how two for sa...,lot day the but tax dog,6,[]
104,liv,SR_chunking,5,die did big its put had ask was key low law fu...,die did big its put hat,5,"[(had, hat)]"
105,liv,SR_chunking,6,eat all who ago six two car new bit man guy le...,est all who ago six two car,0,"[(eat, est)]"
106,liv,SR_chunking,7,few cut bed law any add bit job own age one to...,few cut bed law any add bit,7,[]
107,liv,SR_chunking,8,him red way non pay hot guy bed hit ask law wh...,him red way non bet cut,4,"[(pay, bet), (hot, cut)]"
108,liv,SR_chunking,9,her led pay was hey tax lot can air ask yet pe...,her led pay was hey tax cut,6,"[(lot, cut)]"
109,liv,SR_chunking,10,oil lot man has eat add saw guy act put for ye...,oil let man has eat,1,"[(lot, let)]"


## Points analysis

In [4]:
import scipy.stats as stats
import numpy as np
# Per-trial scores scaled by 3.
# NOTE(review): presumably 3 = letters per word, turning words into letters — confirm.
points = 3 * df["points"].to_numpy()

points_mean = points.mean()
print(points_mean)

# 95% confidence interval for the mean, read off a one-sample t-test result.
# The test is run against the sample mean itself, so only the interval
# (not the statistic or p-value) is of interest here.
ttest_result = stats.ttest_1samp(points, popmean=points_mean)
ttest_result.confidence_interval(confidence_level=0.95)

10.8


ConfidenceInterval(low=9.068816647218586, high=12.531183352781415)

In [5]:
# Raw per-trial scores (no scaling).
points = df["points"].to_numpy()

points_mean = points.mean()
print(points_mean)

# 95% confidence interval for the mean, read off a one-sample t-test result.
# The test is run against the sample mean itself, so only the interval
# (not the statistic or p-value) is of interest here.
ttest_result = stats.ttest_1samp(points, popmean=points_mean)
ttest_result.confidence_interval(confidence_level=0.95)

3.6


ConfidenceInterval(low=3.022938882406195, high=4.177061117593805)