## QABot dialog analysis

Why do people drop off?

In [1]:
# import
import json
import numpy as np
from datetime import datetime
from collections import defaultdict
DATEFORMAT = "%m/%d/%Y, %I:%M:%S %p"
CONDITIONS = ['llm-chatbot', 'llm-qa-bot', 'reading', 'teacher-qa-bot']

In [2]:
# load data
file = "pilot_09_06.json"
with open("./prolific_logs/" + file) as f:
    logs = json.load(f)
logs = [logs["logs"][l] for l in logs["logs"]]

In [4]:
def compile_dialog(log):
    messages = []
    for m in log["chatLog"]["current"]["main"]:
        time = datetime.strptime(m["date"], DATEFORMAT)
        messages.append((time, m["text"]))
    messages.sort()
    messages = [m[1] for m in messages]
    return messages

# helper function
def get_avg_std_err(vals):
    mean = np.mean(vals)
    std_dev = np.std(vals, ddof=1)  # Using ddof=1 for sample standard deviation
    
    # Calculate the standard error using the formula: standard deviation / sqrt(sample size)
    standard_error = std_dev / np.sqrt(len(vals))
    return mean, standard_error

### Compile dialog

In [5]:
# only consider completed sessions
print("Before filtering")
print("all logs:", len(logs))
for c in CONDITIONS:
    print(c, len([l for l in logs if l["condition"] == c]))

completed = [l for l in logs if l["completedSurvey"] == True]
drop = [l for l in logs if l["completedSurvey"] == False]

Before filtering
all logs: 101
llm-chatbot 26
llm-qa-bot 25
reading 25
teacher-qa-bot 25


#### When did QA condition drop?

In [8]:
drop_qa_llm = [l for l in drop if l["condition"] == c]


In [13]:
for k in drop_qa_llm[0]:
    print(k)

completedSurvey
condition
consent
initializationTimes
isMobile
lessonModalTimes
localCreationTime
modalTimes
optOut
scrollLogs
state
surveyToken
tabSwitches
timestamps
userAgent
uuid


In [12]:
for k in drop_qa_llm:
    if "enterMain" in k["timestamps"]:
        

{'creation': '9/6/2023, 4:18:55 PM', 'enterMain': '9/6/2023, 4:19:13 PM', 'enterWelcome': '9/6/2023, 4:19:04 PM'}
{'creation': '9/6/2023, 6:11:26 PM', 'enterKnowledge': '9/6/2023, 6:16:31 PM', 'enterMain': '9/6/2023, 6:11:34 PM', 'enterWelcome': '9/6/2023, 6:11:31 PM'}
{'creation': '9/6/2023, 6:57:12 PM', 'enterMain': '9/6/2023, 6:57:30 PM', 'enterWelcome': '9/6/2023, 6:57:28 PM'}
{'creation': '9/6/2023, 2:06:58 PM', 'enterMain': '9/6/2023, 2:07:19 PM', 'enterWelcome': '9/6/2023, 2:07:10 PM'}
{'creation': '9/6/2023, 6:11:31 PM', 'enterMain': '9/6/2023, 6:13:55 PM', 'enterWelcome': '9/6/2023, 6:13:03 PM'}
{'creation': '9/6/2023, 2:53:21 PM', 'enterMain': '9/6/2023, 2:54:31 PM', 'enterWelcome': '9/6/2023, 2:54:08 PM'}
{'creation': '9/6/2023, 2:16:43\u202fPM'}
{'creation': '9/6/2023, 6:11:36 PM', 'enterMain': '9/6/2023, 6:12:55 PM', 'enterWelcome': '9/6/2023, 6:12:27 PM'}
{'creation': '9/6/2023, 2:27:38 PM', 'enterMain': '9/6/2023, 2:28:52 PM', 'enterWelcome': '9/6/2023, 2:28:14 PM'}
{'cr

In [7]:
len(drop)

41

In [None]:
print("\nAfter filtering")
print("completed logs:", len(logs))
for c in CONDITIONS:
    print(c, len([l for l in logs if l["condition"] == c]))

#### Initial filters

In [None]:
# how many actions for drop-off students
for c in CONDITIONS:
    group = [l for l in logs if l["condition"] == c]
    for log in group:
        print(len(compile_dialog(log)))

In [None]:
# TODO: check exam tab switches on test pages
# check other statistics that Meng computed befre
# NUMBER OF INTERACTIONS
# TOTAL WORDS WRITTEN
# TIME ON LESSON

In [None]:

def num_messages(log):
    if log["condition"] == "treatment":
        return sum([e["isUser"] for e in log["chatLog"]["current"]["main"]])
    else:
        return "-"

def num_help(log):
    if log["condition"] == "treatment":
        if "help" in log["chatLog"]["current"]:
            return len(log["chatLog"]["current"]["help"])
        else:
            return 0
    else:
        return "-"

def test_score(log):
    score = 0
    for quest in log["knowledgeAnswers"]:
        qlog = log["knowledgeAnswers"][quest]
        score += (qlog["answer"] == qlog["solution"])
    return score

def time_total(log):
    learning = datetime.strptime(log["timestamps"]["enterMain"], DATEFORMAT)
    completion = datetime.strptime(log["timestamps"]["completion"], DATEFORMAT)
    return round((completion - learning).total_seconds() / 60.0, 1)

def time_on_learning(log):
    learning = datetime.strptime(log["timestamps"]["enterMain"], DATEFORMAT)
    knowledge = datetime.strptime(log["timestamps"]["enterKnowledge"], DATEFORMAT)
    return round((knowledge - learning).total_seconds() / 60.0, 1)

def time_on_exam(log):
    knowledge = datetime.strptime(log["timestamps"]["enterKnowledge"], DATEFORMAT)
    survey = datetime.strptime(log["timestamps"]["enterSurvey"], DATEFORMAT)
    return round((survey - knowledge).total_seconds() / 60.0, 1)

def total_switches(log):
    switches = 0
    learning = datetime.strptime(log["timestamps"]["enterMain"], DATEFORMAT)
    if not "tabSwitches" in log:
        return 0
    for switch in log["tabSwitches"]:
        if switch[0] == "exit":
            time = datetime.strptime(switch[1], DATEFORMAT)
            if (learning < time):
                switches += 1
    return switches



### Compile results

In [None]:
# performance df
perf_df = pd.DataFrame()
perf_df["surveyToken"] = [l["surveyToken"] for l in rlogs]
perf_df["condition"] = [l["condition"] for l in rlogs]
perf_df["completed_survey"] = [completed_survey(l) for l in rlogs]
perf_df["time_learning"] = [time_on_learning(l) for l in rlogs]
perf_df["num_messages"] = [num_messages(l) for l in rlogs]
perf_df["num_help"] = [num_help(l) for l in rlogs]
perf_df["test_score"] = [test_score(l) for l in rlogs]
perf_df["total_switches"] = [total_switches(l) for l in rlogs]
perf_df["exam_switches"] = [exam_switches(l) for l in rlogs]
perf_df["time_exam"] = [time_on_exam(l) for l in rlogs]
perf_df["time_total"] = [time_total(l) for l in rlogs]
perf_df["creation_time"] = [l["timestamps"]["creation"] for l in rlogs]
perf_df["completion_time"] = [l["timestamps"]["completion"] for l in rlogs]


perf_df = perf_df.sort_values("condition")
perf_df.to_csv("./tmp/perf_tmp.csv", index=False)
perf_df

In [None]:
# basic survey responses (Q1-4 + Q18)
base_df = pd.DataFrame()
base_df["surveyToken"] = [l["surveyToken"] for l in rlogs]
base_df["condition"] = [l["condition"] for l in rlogs]

# basic answers
for q in ["q1", "q2", "q3", "q4"]:
    qtext = BeautifulSoup(rlogs[0]["surveyAnswers"][q]["question"], "html").text
    base_df[qtext] = [l["surveyAnswers"][q]["answer"] for l in rlogs]

# get open response comments
feedback = []
qtext = BeautifulSoup(rlogs[0]["surveyAnswers"]["q18"]["question"], "html").text
for log in rlogs:
    if type(log["surveyAnswers"]["q18"]) != str:
        feedback.append(log["surveyAnswers"]["q18"]["answer"])
    else:
        feedback.append("-")
base_df[qtext] = feedback

base_df = base_df.sort_values("condition")
base_df.to_csv("./tmp/base_tmp.csv", index=False)
base_df

In [None]:
# treatment survey responses (Q5-17)
treatment_df = pd.DataFrame()
tlogs = [l for l in rlogs if l["condition"] == "treatment"]
treatment_df["surveyToken"] = [l["surveyToken"] for l in tlogs]
treatment_df["condition"] = [l["condition"] for l in tlogs]

for q in ["q" + str(i) for i in range(5, 18)]:
    qtext = BeautifulSoup(tlogs[0]["surveyAnswers"][q]["question"], "html").text
    treatment_df[qtext] = [l["surveyAnswers"][q]["answer"] for l in tlogs]

treatment_df = treatment_df.sort_values("condition")
treatment_df.to_csv("./tmp/treatment_tmp.csv", index=False)
treatment_df

In [None]:
# demographics survey responses
demo_df = pd.DataFrame()
demo_df["surveyToken"] = [l["surveyToken"] for l in rlogs]
demo_df["condition"] = [l["condition"] for l in rlogs]

for q in ["q1", "q2", "q3", "q4"]:
    qtext = BeautifulSoup(tlogs[0]["demographicsAnswers"][q]["question"], "html").text
    demo_df[qtext] = [l["demographicsAnswers"][q]["answer"] for l in rlogs]

demo_df = demo_df.sort_values("condition")
demo_df.to_csv("./tmp/demo_tmp.csv", index=False)
demo_df

### Print Dialogs

In [None]:
if False:  # set to True
    for log in rlogs:
        if log["condition"] == "treatment":
            print(log["surveyToken"])
            print("---------------------------------")
            compile_dialog(log)
            print("")

# P-value between control group and tretement group for Q1 to Q4

In [None]:
# basic survey responses (Q1-4 + Q18)
base_df = pd.DataFrame()
base_df["surveyToken"] = [l["surveyToken"] for l in rlogs]
base_df["condition"] = [l["condition"] for l in rlogs]

# basic answers
for q in ["q1", "q2", "q3", "q4"]:
    qtext = BeautifulSoup(rlogs[0]["surveyAnswers"][q]["question"], "html").text
    base_df[qtext] = [l["surveyAnswers"][q]["answer"] for l in rlogs]

base_df

In [None]:
# change "Strongly Agree" in base_df to number 7
base_df = base_df.replace("Strongly Agree", 5)
base_df = base_df.replace("Strongly Disagree", 1)
base_df = base_df.replace("Disagree", 2)
base_df = base_df.replace("Agree", 4)
base_df = base_df.replace("Neutral", 3)


In [None]:
# calculate the p-value for each question between the two conditions using mann-whitney u test
from scipy.stats import mannwhitneyu
p_q1_q4 = []
for q in base_df.columns[2:]:
    print(q)
    p_q1_q4.append(mannwhitneyu(base_df[base_df["condition"] == "control"][q], base_df[base_df["condition"] == "treatment"][q]).pvalue)
    print(mannwhitneyu(base_df[base_df["condition"] == "control"][q], base_df[base_df["condition"] == "treatment"][q]))
p_q1_q4


In [None]:
# plot a bar chart of the average score and std for each question in each condition
import matplotlib.pyplot as plt
import numpy as np

# get the average score for each question in each condition
control_avg = []
treatment_avg = []
for q in base_df.columns[2:]:
    control_avg.append(base_df[base_df["condition"] == "control"][q].mean())
    treatment_avg.append(base_df[base_df["condition"] == "treatment"][q].mean())

print(control_avg)
print(treatment_avg)

# get the std for each question in each condition
control_std = []
treatment_std = []
for q in base_df.columns[2:]:
    control_std.append(base_df[base_df["condition"] == "control"][q].std())
    treatment_std.append(base_df[base_df["condition"] == "treatment"][q].std())

# plot the bar chart
x = np.arange(len(base_df.columns[2:]))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(10, 9))
rects1 = ax.bar(x - width / 2, control_avg, width, label="Control", yerr=control_std)
rects2 = ax.bar(x + width / 2, treatment_avg, width, label="Treatment", yerr=treatment_std)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("Average Score")
ax.set_title("Average Score by Question and Condition")
ax.set_xticks(x)

# rotate the x-axis labels
ax.set_xticklabels(base_df.columns[2:], rotation=30, ha="right")
ax.legend()

# add the p-values to the plot
# for i, v in enumerate(p_q1_q4):
#     ax.text(i - width / 2, control_avg[i] + 0.1, str(round(v, 3)), color="blue", fontweight="bold")
#     ax.text(i + width / 2, treatment_avg[i] + 0.1, str(round(v, 3)), color="blue", fontweight="bold")
for i in range(len(p_q1_q4)):
    if p_q1_q4[i] < 0.05:
        ax.text(x[i] - width/2, max(control_avg[i], treatment_avg[i]) + 0.5, "*", fontsize=20)

fig.tight_layout()

plt.show() 


In [None]:
treatment_df = pd.DataFrame()
tlogs = [l for l in rlogs if l["condition"] == "treatment"]
treatment_df["surveyToken"] = [l["surveyToken"] for l in tlogs]
treatment_df["condition"] = [l["condition"] for l in tlogs]

for q in ["q" + str(i) for i in range(5, 18)]:
    qtext = BeautifulSoup(tlogs[0]["surveyAnswers"][q]["question"], "html").text
    treatment_df[qtext] = [l["surveyAnswers"][q]["answer"] for l in tlogs]
    
treatment_df

In [None]:
# change "Strongly Agree" in treatment_df to number 7
treatment_df = treatment_df.replace("Strongly Agree", 5)
treatment_df = treatment_df.replace("Strongly Disagree", 1)
treatment_df = treatment_df.replace("Disagree", 2)
treatment_df = treatment_df.replace("Agree", 4)
treatment_df = treatment_df.replace("Neutral", 3)


In [None]:
# plot a bar chart of the average score and std for each question
import matplotlib.pyplot as plt
import numpy as np

# get the average score for each question
treatment_avg = []
for q in treatment_df.columns[2:]:
    treatment_avg.append(treatment_df[q].mean())

print(treatment_avg)

# get the std for each question
treatment_std = []
for q in treatment_df.columns[2:]:
    treatment_std.append(treatment_df[q].std())

# plot the bar chart
x = np.arange(len(treatment_df.columns[2:]))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(10, 8))
rects1 = ax.bar(x, treatment_avg, width, label="Treatment", yerr=treatment_std)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("Average Score")
ax.set_title("Average Score by Question")
ax.set_xticks(x)

# rotate the x-axis labels
ax.set_xticklabels(treatment_df.columns[2:], rotation=30, ha="right")
ax.legend()

fig.tight_layout()

plt.show()

In [None]:
# performance df
perf_df = pd.DataFrame()
perf_df["surveyToken"] = [l["surveyToken"] for l in rlogs]
perf_df["condition"] = [l["condition"] for l in rlogs]
perf_df["completed_survey"] = [completed_survey(l) for l in rlogs]
perf_df["time_learning"] = [time_on_learning(l) for l in rlogs]
perf_df["num_messages"] = [num_messages(l) for l in rlogs]
perf_df["num_help"] = [num_help(l) for l in rlogs]
perf_df["test_score"] = [test_score(l) for l in rlogs]
perf_df["total_switches"] = [total_switches(l) for l in rlogs]
perf_df["exam_switches"] = [exam_switches(l) for l in rlogs]
perf_df["time_exam"] = [time_on_exam(l) for l in rlogs]
perf_df["time_total"] = [time_total(l) for l in rlogs]
perf_df["creation_time"] = [l["timestamps"]["creation"] for l in rlogs]
perf_df["completion_time"] = [l["timestamps"]["completion"] for l in rlogs]

perf_df

In [None]:
#replace '-' with 0
perf_df = perf_df.replace("-", 0)
perf_df

In [None]:
# calculate the p-value for each column between the two conditions using t-test except columns 'creation_time' and 'completion_time'
from scipy.stats import ttest_ind
p_q5_q18 = []
for q in perf_df.columns[3:-2]:
    print(q)
    print(perf_df[perf_df["condition"] == "control"][q])
    print(perf_df[perf_df["condition"] == "treatment"][q])
    p_q5_q18.append(ttest_ind(perf_df[perf_df["condition"] == "control"][q], perf_df[perf_df["condition"] == "treatment"][q])[1])
    print(ttest_ind(perf_df[perf_df["condition"] == "control"][q], perf_df[perf_df["condition"] == "treatment"][q]))
p_q5_q18

In [None]:
# plot a bar chart of the average score and std for each question in each condition
import matplotlib.pyplot as plt
import numpy as np

# get the average score for each question in each condition from columns 'q5' to 'q18'
control_avg = []
for q in perf_df.columns[3:-2]:
    control_avg.append(perf_df[perf_df["condition"] == "control"][q].mean())

treatment_avg = []
for q in perf_df.columns[3:-2]:
    treatment_avg.append(perf_df[perf_df["condition"] == "treatment"][q].mean())

print(control_avg)
print(treatment_avg)

# get the std for each question in each condition from columns 'q5' to 'q18'
control_std = []
for q in perf_df.columns[3:-2]:
    control_std.append(perf_df[perf_df["condition"] == "control"][q].std())

treatment_std = []
for q in perf_df.columns[3:-2]:
    treatment_std.append(perf_df[perf_df["condition"] == "treatment"][q].std())

# plot the bar chart
x = np.arange(len(perf_df.columns[3:-2]))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(10, 8))
rects1 = ax.bar(x - width/2, control_avg, width, label="Control", yerr=control_std)
rects2 = ax.bar(x + width/2, treatment_avg, width, label="Treatment", yerr=treatment_std)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("Average Score")
ax.set_title("Average Score by Question")
ax.set_xticks(x)

# rotate the x-axis labels
ax.set_xticklabels(perf_df.columns[3:-2], rotation=30, ha="right")
ax.legend()

# add p-value to the plot
for i in range(len(p_q5_q18)):
    if p_q5_q18[i] < 0.05:
        ax.text(x[i] - width/2, max(control_avg[i], treatment_avg[i]) + 0.5, "*", fontsize=20)


fig.tight_layout()

plt.show()
