# pandas

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## 1. Data Loading and Preprocessing

### Load CSV files

In [None]:
# Column labels to assign, since the CSV is read without a header row
column_names = [
    'Block',
    'Trial',
    'Response_Level',
    'Condition',
    'Correct',
    'Reaction_Time',
]

# Load the session 1 file for participant 1 into a DataFrame
session1_df = pd.read_csv(
    'data/Study3_P1NAVON1.csv',
    names=column_names,
    header=None,
)

### Display the first few rows of the dataset

In [None]:
# Print a caption, then show the first few rows as the cell output
print("Session 1 data:")
session1_df.head()

In [None]:
# Show the Python type of the object returned by pd.read_csv
type(session1_df)

### Check data info

In [None]:
# Print a structural summary of the DataFrame
session1_df.info()

## 2. Data Exploration and Visualization

### Summary statistics

In [None]:
# Show descriptive statistics for the DataFrame's columns
session1_df.describe()

### Visualize reaction time distribution

In [None]:
# Density-normalised histogram of the session 1 reaction times
plt.figure(figsize=(5, 3))
# The label gives plt.legend() an entry to show; without any labelled artist
# the legend call only produces a warning and an empty legend.
plt.hist(session1_df['Reaction_Time'], bins=20, density=True, label='Session 1')
plt.title('Distribution of Reaction Times (Session 1)')
plt.xlabel('Reaction Time (seconds)')
plt.ylabel('Density')
plt.legend()
plt.grid(axis='y', alpha=0.5)  # faint horizontal grid lines only
plt.show()

## 3. Data Manipulation and Transformation

### Recode numbers into new categorical columns

In [None]:
# Translate the numeric response-level codes into readable labels
response_level_labels = {1: 'Local', 2: 'Global'}
session1_df['Response_Level_Catg'] = session1_df['Response_Level'].map(
    response_level_labels
)

# Show the first rows, now including the new categorical column
session1_df.head()

In [None]:
# Translate the numeric condition codes into readable labels.
# NOTE(review): only codes 0 and 2 are mapped here, while the response level
# uses codes 1 and 2; values absent from the mapping end up as missing in the
# new column - confirm against the task's coding scheme.
condition_labels = {0: 'Congruent', 2: 'Incongruent'}
session1_df['Condition_Catg'] = session1_df['Condition'].map(condition_labels)

# Show the first rows, now including the new categorical column
session1_df.head()

### Outliers - remove reaction times more than 3 standard deviations from the mean

In [None]:
# Mean and standard deviation of all reaction times; the next cells use them
# to define the +/- 3 standard deviation outlier cutoffs
reaction_times = session1_df['Reaction_Time']
mean_rt = reaction_times.mean()
std_rt = reaction_times.std()
print(mean_rt, std_rt)

In [None]:
# Boolean mask: True where the reaction time lies within mean +/- 3 SD
# (both bounds inclusive)
idx = session1_df['Reaction_Time'].between(
    mean_rt - 3*std_rt,
    mean_rt + 3*std_rt,
)
idx

In [None]:
# Keep only the rows flagged as inliers by the boolean mask
session1_df_clean = session1_df.loc[idx]

# Report how many rows there were before and after filtering
print("Original dataset size:", session1_df.shape[0])
print("Cleaned dataset size:", session1_df_clean.shape[0])

In [None]:
import matplotlib.pyplot as plt

# Side-by-side histograms of the reaction times before and after outlier removal
plt.figure(figsize=(6, 3))

# Both panels share the x-range of the unfiltered data so they are comparable
rt_min = session1_df['Reaction_Time'].min()
rt_max = session1_df['Reaction_Time'].max()

# Left panel: original reaction times
plt.subplot(1, 2, 1)
plt.hist(session1_df['Reaction_Time'], bins=30, label='Original')
plt.title('Original Reaction Times')
plt.xlabel('Reaction Time (seconds)')
plt.ylabel('Frequency')
plt.xlim(rt_min, rt_max)

# Right panel: reaction times after filtering
plt.subplot(1, 2, 2)
plt.hist(session1_df_clean['Reaction_Time'], bins=30, label='Cleaned', color='orange')
plt.title('Cleaned Reaction Times')
plt.xlabel('Reaction Time (seconds)')
plt.ylabel('Frequency')
plt.xlim(rt_min, rt_max)

plt.tight_layout()
plt.show()


## 4. Statistical Analysis

### Calculate mean reaction time and accuracy for each condition

In [None]:
# Group the cleaned trials once by condition label and reuse the grouping
by_condition = session1_df_clean.groupby('Condition_Catg')

# Mean reaction time per condition
mean_rt_by_condition = by_condition['Reaction_Time'].mean()
print("Mean Reaction Time by Condition:")
print(mean_rt_by_condition)

# Mean of the 'Correct' column per condition (presumably coded 0/1, so the
# mean is the proportion correct - TODO confirm)
accuracy_by_condition = by_condition['Correct'].mean()
print("\nAccuracy by Condition:")
print(accuracy_by_condition)

### Calculate mean RT and accuracy for each response level

In [None]:
# Group the cleaned trials once by response-level label and reuse the grouping
by_response_level = session1_df_clean.groupby('Response_Level_Catg')

# Mean reaction time per response level
mean_rt_by_response = by_response_level['Reaction_Time'].mean()
print("Mean Reaction Time by Response level:")
print(mean_rt_by_response)

# Mean of the 'Correct' column per response level (presumably coded 0/1, so
# the mean is the proportion correct - TODO confirm)
accuracy_by_response = by_response_level['Correct'].mean()
print("\nAccuracy by Response level:")
print(accuracy_by_response)

In [None]:
# Per-condition summaries of the cleaned trials: mean reaction time and mean
# of the 'Correct' column
condition_groups = session1_df_clean.groupby('Condition_Catg')
mean_rt_by_condition = condition_groups['Reaction_Time'].mean()
accuracy_by_condition = condition_groups['Correct'].mean()

# Report the reaction times first, then the accuracy
print("Mean Reaction Time by Condition:")
print(mean_rt_by_condition)
print("\nAccuracy by Condition:")
print(accuracy_by_condition)

### Group by Response level X Condition

In [None]:
# Mean reaction time for every combination of response level and condition
mean_rt_by_responseXcondition = session1_df_clean.groupby(
    ['Response_Level_Catg', 'Condition_Catg']
)['Reaction_Time'].mean()
# Caption names both grouping factors (it previously said only "by Condition")
print("Mean Reaction Time by Response Level X Condition:")
mean_rt_by_responseXcondition

#### Have both RT and accuracy in the resulting aggregated dataframe

In [None]:
# Mean reaction time and mean of 'Correct' for every combination of response
# level and condition, returned together as one aggregated DataFrame
mean_rt_by_responseXcondition = session1_df_clean.groupby(
    ['Response_Level_Catg', 'Condition_Catg']
)[['Reaction_Time', 'Correct']].mean()
# Caption names both measures and both grouping factors (it previously said
# only "Mean Reaction Time by Condition")
print("Mean Reaction Time and Accuracy by Response Level X Condition:")
mean_rt_by_responseXcondition

#### Plot the RT per condition with error bars

In [None]:
# Mean and standard deviation of the reaction time per response level x
# condition cell; reset_index() turns the group keys back into ordinary columns
mean_std_rt_by_responseXcondition = (
    session1_df_clean
    .groupby(['Response_Level_Catg', 'Condition_Catg'])['Reaction_Time']
    .agg(['mean', 'std'])
    .reset_index()
)
mean_std_rt_by_responseXcondition

In [None]:
# Build one combined label per row ("<response level> <condition>") to use as
# the category axis of the bar plot
mean_std_rt_by_responseXcondition['Response_X_Condition'] = (
    mean_std_rt_by_responseXcondition['Response_Level_Catg']
    + ' '
    + mean_std_rt_by_responseXcondition['Condition_Catg']
)
mean_std_rt_by_responseXcondition

In [None]:
# Bar chart of the mean reaction time per combined label, with the standard
# deviation as error bars; the string arguments are column names looked up
# in the DataFrame passed as `data`
plt.bar(
    x='Response_X_Condition',
    height='mean',
    yerr='std',
    data=mean_std_rt_by_responseXcondition,
)
plt.xticks(ha='right', rotation=45)
plt.show()

## 5. Merging Datasets

In [None]:
# List the entries of the local data folder using pathlib
import pathlib  # needed here: this cell runs before the later cell that imports pathlib

cwd = pathlib.Path.cwd()  # Current working directory
data_dir = cwd / 'data'  # Path object for better file system interaction

# Sorted so the listing order is stable across runs and file systems
for file in sorted(data_dir.iterdir()):
    print(file.name)

#### Load together into a dataframe

In [None]:
# Import libraries
import pathlib 
import re  # regular expression

# Get CSV files list; sorted so the rows are concatenated in a reproducible
# order (glob order is not guaranteed)
csv_files = sorted(data_dir.glob('*.csv'))  # List of Path objects for CSV files
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files found in {data_dir}")

# Define column names (assuming all CSVs have same structure)
column_names = ['Block', 'Trial', 'Response_Level', 'Condition', 'Correct', 'Reaction_Time']

# The participant number is the run of digits between 'P' and 'N' in the file
# name; compiled once outside the loop
participant_pattern = re.compile(r'P(\d+)N')

# Read each CSV file into a DataFrame and add a Participant column
df_list = []
for file in csv_files:
    participant_df = pd.read_csv(file, header=None, names=column_names)

    # Participant is None when the file name does not match the pattern
    match = participant_pattern.search(file.name)
    participant_number = int(match.group(1)) if match else None

    # Add participant number as the first column
    participant_df.insert(0, 'Participant', participant_number)
    df_list.append(participant_df)

# Concatenate all DataFrames into one, renumbering the rows from 0
df = pd.concat(df_list, ignore_index=True)

In [None]:
# Display the combined DataFrame built from all CSV files
df

In [None]:
# Show descriptive statistics for the combined DataFrame's columns
df.describe()

#### Aggregate to a subject level data set

In [None]:
# One row per participant x response level x condition, holding the mean and
# standard deviation of both 'Correct' and 'Reaction_Time'
group_vars = ['Participant', 'Response_Level', 'Condition']
subj_lvl_df = (
    df.groupby(group_vars)[['Correct', 'Reaction_Time']]
    .agg(['mean', 'std'])
    .reset_index()
)
subj_lvl_df