In [None]:
def transform_participant_data(participant_data):
    """Flatten one participant's raw record into a single wide row.

    Reads ``participant_data["questionData"]`` (a mapping whose values each
    carry ``caseFormat``, ``isCorrect`` and ``timeTaken``) and files every
    answer under the ``camel`` or ``kebab`` group according to the prefix of
    ``caseFormat``. The last character of ``caseFormat`` is parsed as the
    1-based question number. Entries whose ``caseFormat`` starts with
    neither prefix are ignored.

    Returns a dict with ``participantID``, ``experience`` (taken from the
    first ``demographicsAnswers`` entry) and, per case format, five
    ``<format><n>_score`` values (1 or 0), five ``<format><n>_time`` values
    and ``<format>_average``. The average divides by the fixed number of
    slots (5), so questions without an answer contribute 0; it is 0 when
    every recorded time is falsy.
    """
    case_styles = ("camel", "kebab")
    slot_count = 5

    # One fixed-size list of scores and of times per case format,
    # pre-filled with 0 for questions that never show up in the record.
    scores = {style: [0] * slot_count for style in case_styles}
    times = {style: [0] * slot_count for style in case_styles}

    for answer in participant_data["questionData"].values():
        case_format = answer["caseFormat"]
        correct_flag = int(bool(answer["isCorrect"]))
        elapsed = answer["timeTaken"]

        for style in case_styles:
            if case_format.startswith(style):
                # Trailing character is the 1-based question number.
                slot = int(case_format[-1]) - 1
                scores[style][slot] = correct_flag
                times[style][slot] = elapsed
                break

    averages = {}
    for style in case_styles:
        durations = times[style]
        if any(durations):
            averages[style] = sum(durations) / len(durations)
        else:
            averages[style] = 0

    row = {
        "participantID": participant_data["participantID"],
        "experience": participant_data["demographicsAnswers"][0]["experience"],
    }
    # Column order per format: all scores, then all times, then the average.
    for style in case_styles:
        for number, value in enumerate(scores[style], start=1):
            row[f"{style}{number}_score"] = value
        for number, value in enumerate(times[style], start=1):
            row[f"{style}{number}_time"] = value
        row[f"{style}_average"] = averages[style]

    return row


In [None]:
import pandas as pd
import os
import json

# Collect one flattened row per participant JSON file found in ./data/.
all_participants_data_transformed = []

directory_path = './data/'

# os.listdir() returns entries in arbitrary order; sorting makes the row
# order of the exported CSV reproducible across machines and runs.
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(directory_path, filename)

    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Each file is passed to the transform as a single participant record.
    all_participants_data_transformed.append(transform_participant_data(data))

df = pd.DataFrame(all_participants_data_transformed)

# CSV file path
csv_file_path = "./output.csv"
df.to_csv(csv_file_path, index=False)

print(f"Data has been saved to {csv_file_path}")

# Also echo the complete table as plain text.
csv_data_string = df.to_string(index=False)

print(csv_data_string)

In [None]:
# Load 'output.csv' (resolved against the current working directory) into
# `df`, replacing any `df` already in the kernel, and print the whole table.
# NOTE(review): presumably this is the file written by the export cell as
# "./output.csv" - confirm the working directory is the same.
file_path = 'output.csv'

df = pd.read_csv(file_path)
print(df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Reshape the wide per-participant table into long form: one row per
# (participant, measurement column) pair.
melted_df = pd.melt(df, id_vars=['participantID', 'experience'], var_name='question_id', value_name='value')

# Split names such as 'camel3_time' into the case format ('camel') and the
# question number ('3'); names without a letters+digits run (the
# '*_average' columns) yield NaN. The pattern is a raw string so that '\d'
# is not treated as an (invalid) string escape.
melted_df[['question_type', 'question_number']] = melted_df['question_id'].str.extract(r'([a-zA-Z]+)(\d+)', expand=True)

melted_df['question_number'] = pd.to_numeric(melted_df['question_number'])

# List of columns to exclude
exclude_columns = ['camel1_score', 'camel2_score', 'camel3_score', 'camel4_score', 'camel5_score',
                   'kebab1_score', 'kebab2_score', 'kebab3_score', 'kebab4_score', 'kebab5_score',
                   'camel_average', 'kebab_average']

# Filter columns to include only time-related columns
time_df = melted_df[melted_df['question_type'].isin(['camel', 'kebab']) & ~melted_df['question_id'].isin(exclude_columns)]

# Plot: Average time to answer questions vs. question id
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='question_id', y='value', data=time_df, hue='question_type', errorbar=None)
plt.title('Average Time to Answer by Question IDs')
plt.xlabel('Question IDs')
plt.ylabel('Average Time to Answer in milliseconds')

# Translate the hue levels actually present in the legend instead of
# assuming a fixed order and count of entries.
case_names = {'camel': 'camelCase', 'kebab': 'kebab-case'}
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=[case_names.get(label, label) for label in labels],
          title='Question Case Format')

# Derive the tick labels from the plotted categories (in plotting order) so
# they cannot drift from the data, and pin the tick positions before
# assigning labels.
tick_labels = [question_id.replace('_time', '') for question_id in time_df['question_id'].unique()]
ax.set_xticks(range(len(tick_labels)))
ax.set_xticklabels(tick_labels)

plt.show()

In [None]:
# Reshape the wide per-participant table into long form: one row per
# (participant, measurement column) pair.
melted_df = pd.melt(df, id_vars=['participantID', 'experience'], var_name='question_id', value_name='value')

# Split names such as 'camel3_score' into case format and question number.
# Raw string: '\d' must reach the regex engine, not the string parser.
melted_df[['question_type', 'question_id_number']] = melted_df['question_id'].str.extract(r'([a-zA-Z]+)(\d+)', expand=True)

melted_df['question_id_number'] = pd.to_numeric(melted_df['question_id_number'], errors='coerce')

# Keep only the correctness (0/1) measurements.
score_df = melted_df[melted_df['question_id'].str.contains('_score')]

# Group by question_id and calculate the average accuracy as a percentage
avg_accuracy_df = score_df.groupby(['question_type', 'question_id'])['value'].mean() * 100
avg_accuracy_df = avg_accuracy_df.reset_index()

# Sort rows by question id in ascending order
avg_accuracy_df = avg_accuracy_df.sort_values(by=['question_id'])

# Plot: Average Accuracy vs. question id
plt.figure(figsize=(12, 6))
ax = sns.scatterplot(x='question_id', y='value', data=avg_accuracy_df, hue='question_type')
plt.title('Average Accuracy Score for Different Question Types and IDs')
plt.xlabel('Question IDs')
plt.ylabel('Average Accuracy (%)')

# Translate the hue levels actually present in the legend instead of
# assuming a fixed order and count of entries.
case_names = {'camel': 'camelCase', 'kebab': 'kebab-case'}
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=[case_names.get(label, label) for label in labels],
          title='Question Case Format')

# Derive tick labels from the plotted rows (same order as the x categories)
# and pin the tick positions before assigning labels.
tick_labels = [question_id.replace('_score', '') for question_id in avg_accuracy_df['question_id']]
ax.set_xticks(range(len(tick_labels)))
ax.set_xticklabels(tick_labels)

plt.show()

In [None]:
# Reshape the wide per-participant table into long form: one row per
# (participant, measurement column) pair.
melted_df = pd.melt(df, id_vars=['participantID', 'experience'], var_name='question_id', value_name='value')

# Split names such as 'camel3_score' into case format and question number.
# Raw string: '\d' must reach the regex engine, not the string parser.
melted_df[['question_type', 'question_id_number']] = melted_df['question_id'].str.extract(r'([a-zA-Z]+)(\d+)', expand=True)

melted_df['question_id_number'] = pd.to_numeric(melted_df['question_id_number'], errors='coerce')

# Keep only the correctness (0/1) measurements.
score_df = melted_df[melted_df['question_id'].str.contains('_score')]

# Mean and standard deviation of correctness per question, as percentages.
avg_accuracy_df = score_df.groupby(['question_type', 'question_id'])['value'].agg(['mean', 'std']) * 100
avg_accuracy_df = avg_accuracy_df.reset_index()

avg_accuracy_df = avg_accuracy_df.sort_values(by=['question_id'])

plt.figure(figsize=(12, 6))
ax = sns.scatterplot(x='question_id', y='mean', data=avg_accuracy_df, hue='question_type', marker='o')

# Draw symmetric error bars of one standard deviation around each mean.
plt.errorbar(x=avg_accuracy_df['question_id'], y=avg_accuracy_df['mean'],
             yerr=avg_accuracy_df['std'], fmt='none', color='gray', capsize=5)

plt.title('Distribution of Accuracy by Question IDs')
plt.xlabel('Question IDs')
plt.ylabel('Accuracy (%)')

# Translate the hue levels actually present in the legend instead of
# assuming a fixed order and count of entries.
case_names = {'camel': 'camelCase', 'kebab': 'kebab-case'}
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=[case_names.get(label, label) for label in labels],
          title='Question Case Format')

# Derive tick labels from the plotted rows (same order as the x categories)
# and pin the tick positions before assigning labels.
tick_labels = [question_id.replace('_score', '') for question_id in avg_accuracy_df['question_id']]
ax.set_xticks(range(len(tick_labels)))
ax.set_xticklabels(tick_labels)

plt.show()

In [None]:
# Filter columns to include only score-related columns
accuracy_columns = [col for col in df.columns if '_score' in col]

accuracy_melted_df = pd.melt(df, id_vars=['participantID', 'experience'], value_vars=accuracy_columns,
                             var_name='question_id', value_name='accuracy')

# Separate question type and number. Raw string: '\d' must reach the regex
# engine, not the string parser.
accuracy_melted_df[['question_type', 'question_id_number']] = accuracy_melted_df['question_id'].str.extract(r'([a-zA-Z]+)(\d+)', expand=True)

# Convert question_id_number to numeric
accuracy_melted_df['question_id_number'] = pd.to_numeric(accuracy_melted_df['question_id_number'], errors='coerce')

# Group by question_type and calculate the overall average accuracy as a percentage with standard deviation
overall_avg_accuracy_df = accuracy_melted_df.groupby('question_type')['accuracy'].agg(['mean', 'std']) * 100
overall_avg_accuracy_df = overall_avg_accuracy_df.reset_index()

# Plot: Two-column average accuracy by case vs. case
plt.figure(figsize=(10, 6))
# errorbar=None matches the other barplot call in this notebook (the `ci`
# keyword is deprecated); the error bars are drawn manually just below.
ax = sns.barplot(x='question_type', y='mean', data=overall_avg_accuracy_df, errorbar=None)
plt.errorbar(x=overall_avg_accuracy_df['question_type'], y=overall_avg_accuracy_df['mean'],
             yerr=overall_avg_accuracy_df['std'], fmt='none', color='gray', capsize=5)

plt.title('Average Accuracy by Case (camelCase vs. kebab-case)')
plt.xlabel('Question Case Format')
plt.ylabel('Average Accuracy (%)')

# Label each bar from the category it actually shows, and pin the tick
# positions before assigning labels.
case_names = {'camel': 'camelCase', 'kebab': 'kebab-case'}
tick_labels = [case_names.get(case, case) for case in overall_avg_accuracy_df['question_type']]
ax.set_xticks(range(len(tick_labels)))
ax.set_xticklabels(tick_labels)
plt.show()

In [None]:
# Expects `df` to hold one row per participant with '*_score' and '*_time'
# columns.

# Filter columns to include only score-related / time-related columns
score_columns = [col for col in df.columns if '_score' in col]
duration_columns = [col for col in df.columns if '_time' in col]
accuracy_columns = score_columns

# Compute the per-participant means on the ORIGINAL rows first, so each
# value stays attached to its own participantID and experience. (Grouping
# sorts the rows by participantID; assigning row means from `df` to the
# grouped frame afterwards would pair them by position, i.e. with the wrong
# participant whenever `df` is not already sorted.)
per_participant = df.assign(
    mean_duration=df[duration_columns].mean(axis=1),
    mean_accuracy=df[accuracy_columns].mean(axis=1) * 100,
)

# Group by participant and experience, averaging scores, duration and accuracy
value_columns = score_columns + ['mean_duration', 'mean_accuracy']
grouped_df = per_participant.groupby(['participantID', 'experience'])[value_columns].mean()

# Reset the index to make participantID and experience regular columns
grouped_df = grouped_df.reset_index()

# Print the grouped DataFrame to check the result
print("Grouped DataFrame:")
print(grouped_df.head())

# Create a scatter plot
ax = sns.scatterplot(x='mean_duration', y='mean_accuracy', hue='experience', data=grouped_df)
plt.title('Scatter Plot of Duration vs Accuracy')
plt.xlabel('Mean Duration (in milliseconds)')
plt.ylabel('Mean Accuracy (%)')

# Map each experience code to its display name by value rather than by
# position: the legend order follows the data, so a fixed label list could
# attach the wrong name to a colour.
experience_names = {
    'no-experience': 'No experience',
    'more-three': '3+ years',
    'bachelor-inf': 'Bachelor INF student',
}
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=[experience_names.get(label, label) for label in labels],
          title='Participant experience')

plt.show()

In [None]:
# Hard-coded per-participant table used for the summary statistics below.
# Note that this rebinds `df`, replacing whatever frame earlier cells left
# in the kernel.
df = pd.DataFrame({
    'participantID': ['24e6c294', 'f016b9fd', '1f29dc05', 'b58ce07a', '9d8a6f15',
                      '0562b141', 'd3d7734f', '97842511', '93de7372', '8792e96d'],
    'experience': ['bachelor-inf', 'no-experience', 'more-three', 'bachelor-inf', 'more-three',
                   'no-experience', 'bachelor-inf', 'bachelor-inf', 'more-three', 'more-three'],
    'camel1_score': [1, 1, 0, 1, 1, 1, 1, 0, 1, 1],
    'camel2_score': [0, 1, 0, 0, 1, 0, 0, 1, 1, 1],
    'camel3_score': [1, 1, 1, 1, 1, 0, 1, 1, 1, 1],
    'camel4_score': [1, 0, 1, 1, 1, 1, 1, 1, 1, 1],
    'camel5_score': [1, 1, 1, 0, 1, 1, 1, 1, 1, 1],
    'mean_duration': [1676.1, 7248.9, 3145.2, 3053.2, 8474.7, 4707.1, 6152.0, 3050.9, 2966.4, 2330.4],
    'mean_accuracy': [90, 90, 80, 80, 100, 70, 90, 90, 100, 90],
    'kebab1_score': [1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
    'kebab2_score': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'kebab3_score': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'kebab4_score': [1, 1, 1, 1, 1, 1, 0, 1, 1, 1],
    'kebab5_score': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'kebab1_time': [2897.6, 7056.1, 4537.4, 2242.7, 4122.8, 3600.2, 6216.1, 2239.6, 2599.6, 2122.4],
    'kebab2_time': [4573.2, 13175.3, 5895.6, 3250.7, 9900.7, 10381.6, 10638.2, 7156.8, 4915.8, 4712.6],
    'kebab3_time': [3991.4, 8182.6, 6474.8, 3295.0, 4999.5, 7525.4, 4173.6, 6140.4, 4483.5, 4195.3],
    'kebab4_time': [4187.9, 15760.3, 6025.434562, 2613.4, 4995.1, 4267.2, 27065.0, 3467.7, 2399.6, 2792.4],
    'kebab5_time': [8365.5, 11522.7, 3901.6, 3467.1, 5381.1, 4111.0, 8177.1, 2569.5, 3583.1, 2143.3],
    'kebab_average': [4803.12, 11139.4, 5366.966912, 2973.78, 5879.840004, 5977.08, 11254.0, 4314.8, 3596.32, 3193.2]
})

# The two per-participant measures every statistic below is computed on.
stat_columns = ['mean_duration', 'mean_accuracy']

# One group per experience level.
groups = df.groupby('experience')

# For each group: print the describe() table (count, mean, std, min,
# quartiles, max), the mean and the standard deviation, and keep the same
# three results as text for the CSV export.
summary_data = []

for name, group in groups:
    measures = group[stat_columns]
    summary = measures.describe(percentiles=[.25, .5, .75])
    mean = measures.mean()
    std_dev = measures.std()

    print(f"\nSummary for {name}:")
    print(summary)
    print(f"\nMean for {name}:")
    print(mean)
    print(f"\nStandard Deviation for {name}:")
    print(std_dev)

    summary_data.append({
        'Experience': name,
        'Summary': summary.to_string(),
        'Mean': mean.to_string(),
        'Standard_Deviation': std_dev.to_string()
    })

# One row per experience level; the statistics are stored as rendered text.
summary_df = pd.DataFrame(summary_data)

summary_df.to_csv('summary_statistics.csv', index=False)

In [None]:
from scipy.stats import ttest_ind

# Mean answer durations of the two experience groups being compared.
# Requires `df` to carry the 'experience' and 'mean_duration' columns.
group1 = df.loc[df['experience'] == 'bachelor-inf', 'mean_duration']
group2 = df.loc[df['experience'] == 'no-experience', 'mean_duration']

# Independent two-sample t-test; NaN durations are left out of the test.
t_statistic, p_value = ttest_ind(group1, group2, nan_policy='omit')

print(f'T-statistic: {t_statistic}')
print(f'P-value: {p_value}')

# Check the significance level
alpha = 0.05
verdict = (
    'Reject the null hypothesis. There is a significant difference between the groups.'
    if p_value < alpha
    else 'Fail to reject the null hypothesis. There is no significant difference between the groups.'
)
print(verdict)

In [None]:
from scipy.stats import ttest_ind
import scipy.stats as stats

# Rebind `df` to the contents of 'output.csv' (resolved against the current
# working directory), replacing whatever frame `df` held before this cell.
file_path = 'output.csv'

df = pd.read_csv(file_path)

# Display the DataFrame
# print(df)

import pandas as pd
from scipy.stats import ttest_rel

# Separate the data for camelCase and kebab-case: for each case format,
# the five score columns followed by the five time columns.
measure_suffixes = ('score', 'time')
question_numbers = range(1, 6)

camel_data = df[[f'camel{number}_{suffix}' for suffix in measure_suffixes for number in question_numbers]]

kebab_data = df[[f'kebab{number}_{suffix}' for suffix in measure_suffixes for number in question_numbers]]


def perform_t_tests(camel, kebab, variable, paired=False):
    """Run a t-test between two samples and print the result.

    Args:
        camel: Values measured for the camelCase condition.
        kebab: Values measured for the kebab-case condition.
        variable: Label printed in the report header.
        paired: If False (the default, and the previous behaviour), use
            ``stats.ttest_ind``, which treats the two samples as
            independent. If True, use ``stats.ttest_rel``, the
            related-samples test, which pairs the values by position and
            therefore needs samples of equal length.

    Returns:
        None. The statistic and p-value are printed.
    """
    run_test = stats.ttest_rel if paired else stats.ttest_ind
    statistic, p_value = run_test(camel, kebab)
    print(f"\nT-Test for {variable}:")
    print(f"  Statistic: {statistic}")
    print(f"  P-Value: {p_value}\n")

# Compare every camelCase column with the kebab-case column of the same
# name (e.g. 'camel1_time' vs 'kebab1_time'). A missing counterpart surfaces
# as a KeyError naming the absent column; the former 'mean_' fallback could
# never produce an existing column and has been dropped.
for column in camel_data.columns:
    kebab_column = column.replace('camel', 'kebab')

    perform_t_tests(camel_data[column], kebab_data[kebab_column], column)
