In [None]:
import os
import pandas as pd
from datetime import datetime

# Speaker marker tokens. L_TOKEN is stripped from listener utterances before
# they are compared with predictions further down in this notebook.
L_TOKEN = "<|listener|>"
C_TOKEN = "<|client|>"

# Directory holding the per-session log CSVs. The default is the original
# author's local backup; set the CHAT_LOG_PATH environment variable to run the
# notebook against a copy stored elsewhere. Give the path without a trailing
# slash: the session index is written next to it as `log_path + ".csv"`.
log_path = os.environ.get(
    "CHAT_LOG_PATH",
    "/Users/shanglinghsu/backup_logs/flask_outputs_20221202",
)

# Timestamp layouts accepted by f_optional_strptime: without and with a
# fractional-seconds part.
DATETIME_FORMAT = "%Y-%m-%d_%H-%M-%S"
DATE_MICROSEC_FORMAT = "%Y-%m-%d_%H-%M-%S.%f"

# Column lists for the three log files. Only DIALOG_COLUMNS is applied when
# the logs are loaded; PRED_COLUMNS and CLICK_COLUMNS are presumably the
# schemas of the *_pred.csv and *_click.csv files -- TODO confirm.
DIALOG_COLUMNS = ['user_id', 'is_listener', 'utterance', 'time']
PRED_COLUMNS = ['code', 'score', 'last_utterance_index', 'pred_index', 'text', 'time']
CLICK_COLUMNS = ['last_utterance_index', 'pred_index', 'time']

# Sessions whose filename timestamp is earlier than this are left out of the
# session index.
# NOTE(review): the original inline note read "2022-11-13 13:28:50" while the
# value below is 13:28:51 -- confirm which second is intended.
EXP_START = datetime(year=2022, month=11, day=13, hour=13, minute=28, second=51)

def f_optional_strptime(dtstr):
    """Parse a timestamp string whose fractional-seconds part is optional.

    Args:
        dtstr: Timestamp text. When it contains a ".", it is parsed with
            DATE_MICROSEC_FORMAT; otherwise with DATETIME_FORMAT.

    Returns:
        The value produced by ``datetime.strptime`` for the chosen layout.

    Raises:
        ValueError: Propagated from ``datetime.strptime`` when the text does
            not match the chosen layout.
    """
    if "." in dtstr:
        pattern = DATE_MICROSEC_FORMAT
    else:
        pattern = DATETIME_FORMAT
    return datetime.strptime(dtstr, pattern)

def parse_df_time(df):
    """Convert the ``time`` column of ``df`` from text to parsed timestamps.

    The frame is modified in place and nothing is returned.

    Args:
        df: DataFrame with a ``time`` column whose values are accepted by
            ``f_optional_strptime``.
    """
    parsed_times = df["time"].apply(f_optional_strptime)
    df["time"] = parsed_times

In [None]:
# Build the session index: one row per chat found in the log directory.
# File names are split on "_"; the first two fields are parsed as the session
# timestamp and the third is taken as the chat id. The three fields joined
# together form the prefix shared by that session's log files.
data = {"chat_id": [], "datetime": [], "filename_prefix": [], "num_utterances": []}
filenames = os.listdir(log_path)
unique_datetime = set()
skipped_files = []
for filename in filenames:
    name_parts = filename.split("_")

    # Files that do not follow the naming scheme (for example OS metadata
    # files) are ignored instead of aborting the whole scan.
    if len(name_parts) < 3:
        skipped_files.append(filename)
        continue
    try:
        # Deliberately not named `datetime`: that would overwrite the imported
        # `datetime` class in the kernel namespace and break re-running the
        # configuration cell.
        session_start = f_optional_strptime("_".join(name_parts[:2]))
    except ValueError:
        skipped_files.append(filename)
        continue

    # Several files share one timestamp; handle each timestamp once, and drop
    # anything recorded before the experiment started.
    if session_start in unique_datetime or session_start < EXP_START:
        continue
    unique_datetime.add(session_start)

    chat_id = name_parts[2]
    filename_prefix = "_".join(name_parts[:3])

    fname = "{}_dialog.csv".format(filename_prefix)
    dialog_df = pd.read_csv(os.path.join(log_path, fname), header=0)

    data["chat_id"].append(chat_id)
    data["datetime"].append(session_start)
    data["filename_prefix"].append(filename_prefix)
    data["num_utterances"].append(len(dialog_df["utterance"]))

# Chronological order with a fresh 0..n-1 index, so row 0 is the earliest chat.
sessions = (
    pd.DataFrame(data=data)
    .sort_values("datetime")
    .reset_index(drop=True)
)
# The index is saved beside the log directory, not inside it.
sessions.to_csv(log_path + ".csv", index=False)

if skipped_files:
    print("Skipped files that do not match the log naming scheme:", len(skipped_files))
print("Number of chats:", len(sessions))

In [None]:
# Spread of the number of utterances per session.
utterance_summary = sessions["num_utterances"].describe()
utterance_summary.loc[["min", "max", "mean", "std"]]

In [None]:
def load_logs(path_prefix):
    """Read the dialog, click and prediction logs of one session.

    Args:
        path_prefix: Path of the session's files without the
            ``_dialog.csv`` / ``_click.csv`` / ``_pred.csv`` suffix.

    Returns:
        Tuple ``(dialog, click, pred)`` of DataFrames. ``dialog`` is
        restricted to DIALOG_COLUMNS; ``click`` and ``pred`` keep every
        column found in their files. Any frame that has a ``time`` column
        gets it parsed in place by ``parse_df_time``.
    """
    dialog_log = pd.read_csv(path_prefix + "_dialog.csv", header=0)
    dialog_log = dialog_log[DIALOG_COLUMNS]
    click_log = pd.read_csv(path_prefix + "_click.csv", header=0)
    pred_log = pd.read_csv(path_prefix + "_pred.csv", header=0)

    logs = (dialog_log, click_log, pred_log)
    for log in logs:
        if "time" not in log.columns:
            continue
        parse_df_time(log)
    return logs

def compute_response_times(dialog, mask):
    """Time elapsed since the previous row, for the rows selected by ``mask``.

    Each row's gap is measured against the row directly before it, whoever
    produced that row; the first row has no predecessor and yields a missing
    value.

    Args:
        dialog: DataFrame with a ``time`` column.
        mask: Boolean selector aligned with ``dialog`` choosing which gaps
            to return.

    Returns:
        Series of differences ``time - previous time`` for the selected rows.
    """
    timestamps = dialog.time
    previous_timestamps = timestamps.shift(1)
    gaps = timestamps - previous_timestamps
    return gaps[mask]

def compute_basic_stats(dialog):
    """Summarise one dialog: overall span, utterance counts, response times.

    Args:
        dialog: DataFrame with a ``time`` column and an ``is_listener``
            column that compares equal to True / False.

    Returns:
        dict with the keys "span" (latest minus earliest time),
        "# L utterances" / "# M utterances" (rows where ``is_listener`` is
        True / False), and the mean and median response times for each of
        those two groups as computed by ``compute_response_times``.
    """
    listener_mask = dialog.is_listener == True
    other_mask = dialog.is_listener == False

    listener_gaps = compute_response_times(dialog, listener_mask)
    other_gaps = compute_response_times(dialog, other_mask)

    stats = {}
    stats["span"] = dialog.time.max() - dialog.time.min()
    stats["# L utterances"] = sum(listener_mask)
    stats["# M utterances"] = sum(other_mask)
    stats["avg_l_response_time"] = listener_gaps.mean()
    stats["med_l_response_time"] = listener_gaps.median()
    stats["avg_m_response_time"] = other_gaps.mean()
    stats["med_m_response_time"] = other_gaps.median()
    return stats

In [None]:
# Load the session in row 0 of the index as a worked example and show its
# summary statistics.
prefix = sessions.loc[0, "filename_prefix"]
session_path = os.path.join(log_path, prefix)
dialog, click, pred = load_logs(session_path)
compute_basic_stats(dialog)

# Comparing predictions with actual responses
- Text similarity
- Strategy similarity

In [None]:
import editdistance
def compute_text_similarity(truth, predictions):
    """Score each predicted text against the true utterance.

    Args:
        truth: The utterance that was actually sent.
        predictions: Iterable of predicted texts to compare with ``truth``.

    Returns:
        dict with one entry, "Edit Distance": a list holding
        ``editdistance.eval(truth, p)`` for every ``p`` in ``predictions``,
        in the same order.
    """
    distances = [editdistance.eval(truth, candidate) for candidate in predictions]
    return {"Edit Distance": distances}

In [None]:
# Worked example: compare one listener utterance with the predictions that
# were generated right before it.
index = 9

# Look the utterance up among listener rows only; the lookup fails with a
# KeyError if row `index` was not written by the listener.
listener_rows = dialog[dialog["is_listener"]]
drow = listener_rows.loc[index]
utterance_true = drow["utterance"].replace(L_TOKEN, "")

# Predictions recorded against the utterance directly preceding `index`.
made_for_previous = pred["last_utterance_index"] == index - 1
last_prows = pred[made_for_previous]
codes_pred = last_prows["code"].tolist()
utterance_preds = last_prows["text"].tolist()

compute_text_similarity(utterance_true, utterance_preds)