# 1. step: Prerequisites

The following steps are required for the scripts to run properly.

Please run them by pressing <b>SHIFT + ENTER</b>.

## 1.1. Import Libraries

Scripts use the following libraries.

In [17]:
import numpy as np
import pandas as pd
#import matplotlib as mp
#import matplotlib.pyplot as plt
#from matplotlib.patches import Polygon
import os
import re
import math
from datetime import datetime
from zipfile import ZipFile

## 1.2. Set constans (manually)

Set the dir variable which is the directory where CSV files are located.

If you would like to use different directory for unzip file from drive, please set dir_raw variable as well.

In [18]:
filename = "data0421"
triplet_file = 'triplet_type.csv'
csv_filename = "all_20230421.csv"

In [19]:
zip_directory = filename + "/"
print(zip_directory)
zip_file = filename + ".zip"
print(zip_file)

data0421/
data0421.zip


# Unzip

<b>uzip function</b> extracts the file.

<b>Input:</b> 
- zip_file (type: string): the path to the file
- dir_raw (type: string): where the zip file is unzipped

In [20]:
def unzip(zip_file,zip_directory):
    with ZipFile(zip_file, 'r') as zip:
        # printing all the contents of the zip file
        zip.printdir()

        # extracting all the files
        print('Extracting all the files now...')
        zip.extractall(zip_directory)
        print('Done!')

In [21]:
#unzip(zip_file,zip_directory)

Check the directory and remove the files!

In [22]:
dir_list = list(filter(lambda name: ".csv" in name, os.listdir(zip_directory)))

In [23]:
dir_list

['1011.csv',
 '2027_NeMO-asrt_mixed_fish_v4_2023-04-01_10h37.49.391.csv',
 '1028_NeMO-asrt_mixed_fish_v4_2023-04-05_17h24.51.297.csv',
 '1029_NeMO-asrt_mixed_fish_v4_2023-04-05_18h32.40.656.csv',
 '1031_NeMO-asrt_mixed_fish_v4_2023-04-12_18h15.50.164 (1).csv',
 '2018_NeMO-asrt_mixed_fish_v4_2023-02-20_15h57.55.766.csv',
 '2014_asrt_mixed_fish_v2_2023-01-28_10h44.21.487.csv',
 '2013_asrt_mixed_fish_v2_2023-01-30_12h20.28.788.csv']

# 2. step: Data cleaning

## 2.1 Read and concatenate data file and triplet file

In [24]:
df_triplet_type = pd.read_csv(triplet_file,sep=';',engine='python')
#df_triplet_type = df_triplet_type.dropna()
#df_triplet_type = df_triplet_type.reset_index(drop=True)
#print(df_triplet_type.to_string())
print(df_triplet_type)

      block  position  type  trial_type
0         1      -365     0          50
1         1      -117     0          50
2         1       127     0          50
3         1       369     0          50
4         1       369     0          50
...     ...       ...   ...         ...
1695     20       127     1          11
1696     20      -117     2          12
1697     20       369     1          11
1698     20       369     2          20
1699     20      -365     1          11

[1700 rows x 4 columns]


In [25]:
def dataclean(filename):
    print("-------------------------")
    print("Data cleaning started...")
    print
    # filter columns
    df = pd.read_csv(zip_directory + filename)
    df = df[["stimulus.thisIndex","mouse.time","mouse.clicked_name","position_x","mouse.leftButton"]]
    df = df.dropna()
    df = df.reset_index(drop=True)
    
    # print(df)
    
    # adding triplets to the files
    result = pd.concat([df, df_triplet_type],axis=1)
    result = result.dropna()
    
    #print(result[["stimulus.thisIndex","block","position_x","position","trial_type"]].to_string())
    print("Column names: ", df.columns.values)
    print("Number of rows: ", len(result))
    
    print("Data cleaning finished...")
    
    return result

In [26]:
def accuracy(result):
    print("-------------------------")
    print("Accuracy cleaning started...")
    
    result['mouse.leftButton_len'] = result['mouse.leftButton'].str.len()
    
    indexMoreTrial = result[(result['mouse.leftButton_len'] > 3)].index
    
    print("Number of rows: ", len(result))
    print("Number of deleted rows: ", len(indexMoreTrial))
          
    result.drop(indexMoreTrial , inplace=True)

    indexTrials = result[(result['trial_type'] == 50)].index
    result.drop(indexTrials , inplace=True)
    result = result.reset_index(drop=True)
    
    print("Remaining rows: ", len(result))
    
    print("Accuracy cleaning finished.")
    
    return result

In [27]:
def RTtoFloat(result):
    print("-------------------------")
    print("RT type conversation started...")
    result['mouse.time'] = result['mouse.time'].str[1:-1]
    result['mouse.time'] = result['mouse.time'].astype(float)
    
    print("Remaining rows: ", result['mouse.time'].dtype)
    
    print("RT type conversation finished.")
    
    return result

In [28]:
def elim_outliars(result):
    print("-------------------------")
    print("Eliminating outliars started...")
    dataset = result['mouse.time']
    q1,q3 = np.percentile(dataset,[25,75])
    print("Q1 = ", q1)
    print("Q3 = ", q3)
    
    iqr = q3-q1
    print("IQR = ", iqr)
    
    #find higher and lower fence
    lower_fence = q1 - (1.5 * iqr)
    higher_fence = q3 + (1.5 * iqr)
    print("Lower fence: ", lower_fence)
    print("Higher fence: ", higher_fence)
    
    indexOutliers = result[(result["mouse.time"] > higher_fence) | (result["mouse.time"] < lower_fence)].index
    #indexOutliers = result[(result["mouse.time"] > higher_fence)].index
    print("Number of rows: ", len(result))
    print("Number of deleted rows: ", len(indexOutliers))
    
    result.drop(indexOutliers , inplace=True)

    result = result.reset_index(drop=True)
    
    print("Remaining rows: ", len(result))
    print("Eliminating outliars finished.")
    
    return result

# 3. step: Analysis

In [29]:
def medianRT(result,triplet):
    print("-------------------------")
    print("Median RT analysis started...")
    print("Selected triplet types: ", triplet)
    
    # H – High Probabilty Triplets: 12
    # L – Low Probabilty Triplets: 20
    # P – Pattern Triplets: 11
    # HP – High and Pattern: 12 and 11
    

    arr_median = []
    
    max_block_value = result["block"].max()
    
    print("Number of blocks: ", max_block_value)
    
    for i in range(1,max_block_value+1): 
        print("Block number: ", i)
        
        result_block=result.loc[result["block"] == i]
        
        if triplet == "H":
            result_triplets = result_block.loc[result_block["trial_type"] == 12]
        elif triplet == "L":
            result_triplets = result_block.loc[result_block["trial_type"] == 20]
        elif triplet == "P":
            result_triplets = result_block.loc[result_block["trial_type"] == 11]
        elif triplet == "HP":
            result_triplets = result_block.loc[(result_block["trial_type"] == 12) | 
                                               (result_block["trial_type"] == 11)]
        else:
            print("Error: Triplet type is not valid.")
    
        median = result_triplets["mouse.time"].median()
        print("Median: ", median)
        
    
        arr_median.append(median)
        
        print("Array of medians by blocks: ", arr_median)
    
    print("Median RT analysis finished.")
    print("-------------------------")
        
    return arr_median


# 4.step: Data table creation

In [30]:
dir_list_len = len(dir_list)
print(dir_list_len)

8


In [40]:
df_randHigh = pd.DataFrame()
df_randLow = pd.DataFrame()
df_pattern = pd.DataFrame()
df_high = pd.DataFrame()
arr_participant = []

df = pd.DataFrame()

for file in dir_list:
    print(file)
    result = dataclean(file)
    result = accuracy(result)
    result = RTtoFloat(result)
    result = elim_outliars(result)
    
    participant = file[:4] 
    arr_participant.append(participant)
    
    
    arr_randHigh = medianRT(result,"H")
    new_row = pd.Series(arr_randHigh)
    df_randHigh = df_randHigh.append(new_row, ignore_index=True)
    
    arr_randLow = medianRT(result,"L")
    new_row = pd.Series(arr_randLow)
    df_randLow = df_randLow.append(new_row, ignore_index=True)
    
    arr_pattern = medianRT(result,"P")
    new_row = pd.Series(arr_pattern)
    df_pattern = df_pattern.append(new_row, ignore_index=True)
    
    arr_high = medianRT(result,"HP")
    new_row = pd.Series(arr_high)
    df_high = df_high.append(new_row, ignore_index=True)
    
    print(" ")
    
participant_series = pd.Series(arr_participant)

# rename the Series column to 'participant'
participant_series = participant_series.rename('participant')
    
df = pd.concat([participant_series, df_participant, df_randHigh, df_randLow, df_pattern, df_high], axis=1)

print(df)
    
#df.to_csv("all.csv")

1011.csv
-------------------------
Data cleaning started...
Column names:  ['stimulus.thisIndex' 'mouse.time' 'mouse.clicked_name' 'position_x'
 'mouse.leftButton']
Number of rows:  935
Data cleaning finished...
-------------------------
Accuracy cleaning started...
Number of rows:  935
Number of deleted rows:  72
Remaining rows:  790
Accuracy cleaning finished.
-------------------------
RT type conversation started...
Remaining rows:  float64
RT type conversation finished.
-------------------------
Eliminating outliars started...
Q1 =  0.7775000000002166
Q3 =  1.01274999999994
IQR =  0.2352499999997235
Lower fence:  0.4246250000006313
Higher fence:  1.3656249999995254
Number of rows:  790
Number of deleted rows:  106
Remaining rows:  684
Eliminating outliars finished.
-------------------------
Median RT analysis started...
Selected triplet types:  H
Number of blocks:  11
Block number:  1
Median:  0.9525000000000006
Array of medians by blocks:  [0.9525000000000006]
Block number:  2
Med

Column names:  ['stimulus.thisIndex' 'mouse.time' 'mouse.clicked_name' 'position_x'
 'mouse.leftButton']
Number of rows:  1020
Data cleaning finished...
-------------------------
Accuracy cleaning started...
Number of rows:  1020
Number of deleted rows:  58
Remaining rows:  879
Accuracy cleaning finished.
-------------------------
RT type conversation started...
Remaining rows:  float64
RT type conversation finished.
-------------------------
Eliminating outliars started...
Q1 =  0.7830000000000155
Q3 =  1.0669999999998936
IQR =  0.2839999999998781
Lower fence:  0.35700000000019827
Higher fence:  1.4929999999997108
Number of rows:  879
Number of deleted rows:  60
Remaining rows:  819
Eliminating outliars finished.
-------------------------
Median RT analysis started...
Selected triplet types:  H
Number of blocks:  12
Block number:  1
Median:  0.8474999999999966
Array of medians by blocks:  [0.8474999999999966]
Block number:  2
Median:  0.8279999999999745
Array of medians by blocks:  [0

In [41]:
columns = df.columns.tolist()

# create a list to store the new column names
new_columns = []

# initialize the prefix variable to be used for the next set of columns
prefix = ""

# loop through each column name
for col in columns:
    if col == 0:
        # if the column name is 0, change the prefix accordingly
        if prefix == "":
            prefix = "randHigh_"
        elif prefix == "randHigh_":
            prefix = "randLow_"
        elif prefix == "randLow_":
            prefix = "Pattern_"
        elif prefix == "Pattern_":
            prefix = "High_"
    # add the prefix and column number to the new column names list
    new_columns.append(prefix + str(col))

# create an empty dataframe with the new column names
df.columns = new_columns

df = df.fillna(0)

print(df)

  participant  randHigh_0  randHigh_1  randHigh_2  randHigh_3  randHigh_4  \
0        1011      0.9525      0.8790       0.893      0.8570      0.8250   
1        2027      1.9035      1.7120       0.000      0.0000      0.0000   
2        1028      1.2165      0.9260       1.125      0.9085      0.8915   
3        1029      0.9590      1.0160       1.042      0.9590      0.0000   
4        1031      0.8475      0.8280       1.013      0.9330      0.9670   
5        2018      1.1180      1.1675       1.117      0.8410      0.0000   
6        2014      1.2240      1.0250       0.975      0.9080      0.0000   
7        2013      1.3450      1.1240       0.000      0.0000      0.0000   

   randHigh_5  randHigh_6  randHigh_7  randHigh_8  ...  High_2  High_3  \
0       0.907      0.9580      0.9245       0.826  ...  0.8845  0.8420   
1       0.000      0.0000      0.0000       0.000  ...  0.0000  0.0000   
2       0.792      0.8835      1.0930       0.958  ...  1.0010  0.8990   
3       0.

In [42]:
#df.drop(df.index[0], inplace=True)
#print(df)
#df.drop(df.index[2], inplace=True)
#print(df)
#df.to_csv("all.csv")

In [43]:
df.to_csv(csv_filename)

## Archive

In [27]:
def statistical_diff(result):
    print("Statistic learning analysis started...")
    arr_exp_stat = []

    arr_randhigh = []
    arr_randlow = []
    
    max_block_value = result["block"].max()
    
    print("Number of blocks: ", max_block_value)
    
    for i in range(1,max_block_value): 
        print("Block number: ", i)
        
        result_block=result.loc[result["block"] == i]
        #print(result_block.shape)
        result_h = result_block.loc[result_block["trial_type"] == 12]
        #print(result_h)
        result_l = result_block.loc[result_block["trial_type"] == 20]
        #print(result_l.shape)

        m_h = result_h["mouse.time"].median()
        print("High Prob Median: ", m_h)
        
        m_l = result_l["mouse.time"].median()
        print("Low Prob Median: ",m_l)
    
        print("Diff Low-High: ", m_l-m_h)
    
        arr_randhigh.append(m_h)
        arr_randlow.append(m_l)

        arr_exp_stat.append(m_l-m_h)
        
        print("Array of diff: ", arr_exp_stat)
             
    df_exp_stat = pd.DataFrame(arr_exp_stat, columns = ["Stat_diff_Low-High"])
    
    print("Statistic learning analysis finished.")
        
    return df_exp_stat


In [28]:
def sequential_diff(result):
    print("Sequential learning analysis started...")
    arr_exp_stat = []
    arr_exp_seq = []

    arr_pattern = []
    arr_randhigh = []
    
    max_block_value = result["block"].max()
    print("Number of blocks: ", max_block_value)
    
    for i in range(1,max_block_value): 
        print("Block number: ", i)
        
        #print(i)
        result_block=result.loc[result["block"] == i]
        #print(result_block.shape)
        result_p = result_block.loc[result_block["trial_type"] == 11]
        #print(result_p.shape)
        result_h = result_block.loc[result_block["trial_type"] == 12]
        #print(result_h)

        m_p = result_p["mouse.time"].median()
        print("Pattern Median: ", m_p)
        
        #print(result_p[["mouse.time","block","trial_type"]])
        #print(result_p["mouse.time"].median())
        
        m_h = result_h["mouse.time"].median()
        print("High Prob Median: ", m_h)
    
        #print(m_h-m_p)
        print("Diff High-Pattern: ", m_h-m_p)
    
        arr_pattern.append(m_p)
        arr_randhigh.append(m_h)

        arr_exp_seq.append(m_h-m_p)
        print("Array of diff: ", arr_exp_seq)
        
    df_exp_seq = pd.DataFrame(arr_exp_seq, columns = ["Seq_diff_High-Pattern"])
    
    print("Sequential learning analysis finished.")
        
    return df_exp_seq


In [29]:
def implicit_diff(result):
    print("Implicit learning analysis started...")
    
    arr_imp = []
    #arr_high =[]
    
    max_block_value = result["block"].max()
    print("Number of blocks: ", max_block_value)

    for i in range(1,max_block_value): 
        print("Block number: ", i)
        
        #print(i)
        result_block=result.loc[result["block"] == i]
        #print(result_block.shape)
        #result_p = result_block.loc[result_block["trial_type"] == 11 | ]
        #print(result_p.shape)
        result_h = result_block.loc[(result_block["trial_type"] == 12) | (result_block["trial_type"] == 11)]
        #print(result_h)
        result_l = result_block.loc[result_block["trial_type"] == 20]
        #print(result_l.shape)

        #m_p = result_p["mouse.time"].median()
        #print(result_p[["mouse.time","block","trial_type"]])
        #print(result_p["mouse.time"].median())
        #print(m_p)
        m_hp = result_h["mouse.time"].median()
        print("High-Pattern Median: ", m_h)
        #print(m_h)
        
        m_l = result_l["mouse.time"].median()
        print("Low Median: ", m_l)
        #print(m_l)

        #print(m_l-m_h)
        print("Diff Low-High-Pattern: ", m_l-m_hp)

        #arr_exp_stat.append(m_h-m_p)
        arr_imp.append(m_l-m_hp)
        print("Array of diff: ", arr_imp)
        
        #arr_high.append(m_h)


    #print(arr)
    df_imp = pd.DataFrame(arr_imp, columns = ["Imp_diff_HighPattern-Low"])
    
    print("Implicit learning analysis finished.")
    
    return df_imp
