In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

In [None]:
# Read the semicolon-delimited file 'delphi.csv' into `df` and preview its first row.
df = pd.read_csv('delphi.csv', sep=';')
df.head(1)

In [None]:
# Number of missing (NaN) entries in each column of df.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

## Standard Deviation and Mean per Thesis

In [None]:
# Descriptive statistics of the plausibility rating for each of the 20 theses.
# Every entry has the form [thesis label, mean, standard deviation]; the
# standard deviation is pandas' default sample estimate (ddof=1).
mean_std_results = []
for thesis_no in range(1, 21):
    plaus_col = f'Thesis {thesis_no}/Plausibility'
    # The float cast is written back to df, so the column stays float afterwards.
    df[plaus_col] = df[plaus_col].astype(float)
    ratings = df[plaus_col]
    mean_std_results.append([f'Thesis {thesis_no}', ratings.mean(), ratings.std()])

## Visualize Scenario Source Responses per Thesis

In [None]:
# Labels for the 20 theses, in the same order as the count lists below.
thesis_names = [f'Thesis {i}' for i in range(1, 21)]

# Per-thesis response counts, index-aligned with thesis_names:
# answers of 'Human-Made Scenario', 'Machine-Made Scenario', and no answer.
human_made = []
machine_made = []
unknown = []

for i in range(1, 21):
    thesis_col = f'Thesis {i}/Scenario Source'
    # Replace missing answers with 'Unknown' by assigning the filled column
    # back to df. Calling fillna(..., inplace=True) on df[thesis_col] operates
    # on an intermediate object: it warns on recent pandas and leaves df
    # untouched under Copy-on-Write, which would make every unknown count 0.
    df[thesis_col] = df[thesis_col].fillna('Unknown')
    sources = df[thesis_col]

    # A boolean mask sums to the number of matching rows; a label that never
    # occurs simply yields 0, so no exception handling is needed (and a
    # missing column now fails loudly instead of being counted as 0).
    machine_made.append(int((sources == 'Machine-Made Scenario').sum()))
    human_made.append(int((sources == 'Human-Made Scenario').sum()))
    unknown.append(int((sources == 'Unknown').sum()))

In [None]:
# Horizontal stacked bar chart: one bar per thesis, split into human-made,
# machine-made and unanswered responses.
sns.set(style="white", font='monospace')
fig, ax = plt.subplots(figsize=(12, 8), dpi=300)

# Each segment starts where the preceding segments end.
unknown_offsets = [h + m for h, m in zip(human_made, machine_made)]
ax.barh(thesis_names, human_made, color='#000000')
ax.barh(thesis_names, machine_made, left=human_made, color='darkgrey')
ax.barh(thesis_names, unknown, left=unknown_offsets, color='#d1d1d1')

# Leave 10 units of headroom to the right of the longest bar.
bar_totals = [h + m + u for h, m, u in zip(human_made, machine_made, unknown)]
ax.set_xlim(0, max(bar_totals) + 10)
ax.set_xlabel('Number of Responses', fontsize=12)
ax.set_title('Human-Made or Machine-Made Scenario', fontsize=16)
# Legend labels are matched to the three bar groups in drawing order.
ax.legend(['Human-Made', 'GPT-Made', 'No Answer'])
sns.despine(trim=False)
plt.show()

## Spearman Correlation Coefficient

In [None]:
from scipy.stats import spearmanr

# Numeric coding of the scenario-source answer for the rank correlation.
# Human-made is coded 1 and machine-made 0, so a positive coefficient means
# higher plausibility ratings go together with "human-made" answers.
# Values that are already 1/0 pass through unchanged; every other value
# (e.g. 'Unknown' or NaN) has no code, becomes NaN and is dropped below.
# The previous replace({1: ..., 0: ...}) mapped in the opposite direction
# (numbers to label strings), so spearmanr was handed label strings instead
# of numeric codes.
source_codes = {
    'Human-Made Scenario': 1,
    'Machine-Made Scenario': 0,
    1: 1,
    0: 0,
}

# One entry per thesis: [thesis label, Spearman correlation coefficient].
correlation_results = []
for i in range(1, 21):
    source_col = f"Thesis {i}/Scenario Source"
    plausibility_col = f"Thesis {i}/Plausibility"

    # Keep only respondents with both a codable source answer and a rating.
    pairs = pd.DataFrame({
        'source': df[source_col].map(source_codes),
        'plausibility': df[plausibility_col],
    }).dropna()

    # The coefficient is undefined when either variable is constant (this
    # also covers empty or single-row data); record NaN explicitly.
    if pairs['source'].nunique() < 2 or pairs['plausibility'].nunique() < 2:
        corr_coef = np.nan
    else:
        corr_coef, _ = spearmanr(pairs['source'], pairs['plausibility'])

    correlation_results.append([f'Thesis {i}', corr_coef])

In [None]:
# Build one table per result list and join them on the thesis label, giving
# mean, standard deviation and correlation coefficient side by side.
mean_std_df = pd.DataFrame(mean_std_results, columns=['Thesis', 'Mean', 'StdDev'])
correlation_df = pd.DataFrame(correlation_results, columns=['Thesis', 'Correlation Coefficient'])
result_df = mean_std_df.merge(correlation_df, on='Thesis')
# CSV export is currently disabled; uncomment the next line to write the table to disk.
#result_df.to_csv('delphi_stat_results.csv', index=False)