In [1]:
import numpy as np

def load_glove_embeddings(glove_file_path, embedding_dim=None):
    """Load GloVe vectors from a text file into a ``{token: vector}`` dict.

    Each non-blank line is expected to hold a token followed by its
    space-separated float components.

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe text file.
        embedding_dim: Optional number of components per vector. When given,
            each line is split from the right so that a token which itself
            contains spaces is kept intact, and lines that do not carry
            exactly ``embedding_dim`` components are skipped. When ``None``
            (default) lines are split on any whitespace.

    Returns:
        dict mapping each token to a 1-D ``float32`` numpy array.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line has no token; indexing values[0] would raise IndexError.
            if not line.strip():
                continue

            if embedding_dim is None:
                values = line.split()
            else:
                # Right-anchored split: the last `embedding_dim` fields are the
                # vector, everything before them is the token.
                values = line.rstrip().rsplit(' ', embedding_dim)
                if len(values) != embedding_dim + 1:
                    print(f"Skipping line with unexpected field count: {line[:50]}...")
                    continue

            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric component, e.g. a multi-word token split on whitespace.
                print(f"Skipping line due to ValueError: {line[:50]}...")
                continue
            embeddings_index[word] = coefs
    return embeddings_index

# Example: Load 300-dimensional GloVe embeddings
# The path is relative to the notebook's working directory. Judging by the
# file name this is presumably the 42B-token release -- TODO confirm.
glove_file_path = './glove.42B.300d.txt'
# `embeddings_index` is the notebook-level lookup passed to compute_embeddings() in later cells.
embeddings_index = load_glove_embeddings(glove_file_path)


In [2]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download NLTK stopwords and tokenizer
import nltk
# Fetch the tokenizer model first, then the stopword lists.
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stopword set, reading the corpus only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise a text and return its non-stopword tokens.

    The text is lowercased, every run of non-word characters (``\\W+``) is
    replaced by a single space, the result is tokenised with NLTK's
    ``word_tokenize`` and English stopwords are removed. Note that ``\\W``
    does not match the underscore, so underscores survive the clean-up.

    Args:
        text: Input string.

    Returns:
        list[str]: Remaining tokens, in their original order.
    """
    # Lowercase and collapse non-word characters into spaces
    text = re.sub(r'\W+', ' ', text.lower())
    # The stopword set is loop-invariant across calls, so it is cached rather
    # than rebuilt from the corpus for every conversation.
    stop_words = _english_stop_words()
    return [word for word in word_tokenize(text) if word not in stop_words]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lexin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lexin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def get_average_glove_embedding(tokens, embeddings_index, embedding_dim=50):
    """Average the embedding vectors of the tokens found in ``embeddings_index``.

    Args:
        tokens: Sequence of tokens (may be empty or ``None``).
        embeddings_index: Mapping from token to its embedding vector.
        embedding_dim: Length of the zero vector returned when no token has
            an embedding. It should match the dimensionality of the vectors
            in ``embeddings_index``; this is not checked here.

    Returns:
        The element-wise mean of the matching vectors, or
        ``np.zeros(embedding_dim)`` when there is nothing to average.
    """
    found_vectors = []
    for token in tokens or ():
        if token in embeddings_index:
            found_vectors.append(embeddings_index[token])

    if found_vectors:
        return np.mean(found_vectors, axis=0)
    # No tokens at all, or none of them is in the vocabulary.
    return np.zeros(embedding_dim)

def compute_embeddings(conversations, embeddings_index, embedding_dim=50):
    """Turn each conversation into one averaged embedding vector.

    Every conversation is tokenised with ``preprocess_text`` and reduced with
    ``get_average_glove_embedding``.

    Args:
        conversations: Iterable of conversation strings.
        embeddings_index: Mapping from token to its embedding vector.
        embedding_dim: Forwarded to ``get_average_glove_embedding`` as the
            length of its zero-vector fallback.

    Returns:
        ``np.ndarray`` built from the per-conversation vectors, one row per
        conversation.
    """
    return np.array([
        get_average_glove_embedding(preprocess_text(conversation), embeddings_index, embedding_dim)
        for conversation in conversations
    ])


# OUM

In [4]:
import torch
import json

# Load the data
def load_data_oum(label='after'):
    """Collect dialogue transcripts and their 'good reasons' scores.

    Reads ``wizards_dialogues.json``, ``argubot_final_exp.json`` and
    ``models_dialogues.json`` from the current working directory, filters the
    dialogues and questions as described in the inline comments, and builds
    one transcript string per retained dialogue.

    Args:
        label: Only consulted for the file whose name contains "final": when
            it equals ``'oum'`` every question from that file is skipped. The
            value appended to the labels is always the 'after' score, because
            the delta branch is disabled with ``if False and ...``.

    Returns:
        ``(final_convs, final_labels)``: transcript strings and the matching
        'after' scores as floats. The engagement/chat-content features are
        collected but not returned.

    Raises:
        AssertionError: if the number of transcripts and labels differ.
    """
    final_convs = []
    final_labels = []
    final_experience_features = []
    # NOTE(review): `wizards_data` and `moral_foundations` are never read in this function.
    wizards_data = []
    moral_foundations = ["care", "fairness", "liberty", "loyalty", "authority", "sanctity", "none"]
    input_files = {"wizards": "wizards_dialogues.json", "final_argubot": "argubot_final_exp.json",
                   "models_dialogues": "models_dialogues.json"}
    # Per-dataset bookkeeping keyed by dialogue id; local to this function and not returned.
    dials_with_scores = {"wizards": {}, "final_argubot": {}, "models_dialogues": {}}


    for key in input_files:
        input_file = input_files[key]
        with open(input_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        for d in data:
            # Drop any dialogue containing a message produced by 'wikibot' or 'controlbot'.
            is_wiki = False
            for m in d["messages"]:
                if 'model' in m and (m['model'] == 'wikibot' or m['model'] == 'controlbot'):
                    is_wiki = True
                    break
            if is_wiki:
                continue
            # Derive the participant's stance ('yes' / 'no' / 'none') from whichever
            # of the four survey questions is present; a later question overrides
            # an earlier one.
            yes_no = 'none'
            k = 'Did you vote for (Leave) or against (Remain) Brexit in the 2016 UK referendum?'
            if k in d['participant_info']:
                if d['participant_info'][k].lower() == 'against (remain)':
                    yes_no = 'no'
                elif d['participant_info'][k].lower() == 'for (leave)':
                    yes_no = 'yes'
                else:
                    yes_no = 'none'

            k = 'In the referendum on whether the UK should remain a member of the EU (BREXIT), how did you vote?'
            if k in d['participant_info']:
                if d['participant_info'][k].lower() == 'remain (against brexit)':
                    yes_no = 'no'
                elif d['participant_info'][k].lower() == 'leave (for brexit)':
                    yes_no = 'yes'
                else:
                    yes_no = 'none'
            # The two questions below have no else-branch: any answer other than
            # yes/no leaves the current stance untouched.
            k = 'Have you had at least one dose of an approved Covid-19 vaccine?'
            if k in d['participant_info']:
                if d['participant_info'][k].lower() == 'yes':
                    yes_no = 'yes'
                elif d['participant_info'][k].lower() == 'no':
                    yes_no = 'no'
            k = 'Are you a vegan?'
            if k in d['participant_info']:
                if d['participant_info'][k].lower() == 'yes':
                    yes_no = 'yes'
                elif d['participant_info'][k].lower() == 'no':
                    yes_no = 'no'

            # Dialogues without a usable stance are discarded.
            if yes_no == 'none':
                continue

            if 'Questions' in d['participant_info']:
                for q in d['participant_info']['Questions']:
                    # A score of -1 is treated as missing. The "final" file only
                    # needs an 'after' score; the other files need both scores.
                    if "final" in input_file:
                        if label == 'oum':
                            continue
                        if d['participant_info']['Questions'][q]['after'] == -1:
                            continue
                    elif d['participant_info']['Questions'][q]['before'] == -1 or d['participant_info']['Questions'][q]['after'] == -1:
                        continue
                    if 'good reasons' in q.lower():
                        # Stance filters: each `continue` skips this question entirely.
                        # Presumably this keeps only the question about the side the
                        # participant does not hold -- TODO confirm against the survey wording.
                        if d['topic'] != 'brexit' and 'not' in q.lower() and yes_no == 'no':
                            continue
                        if d['topic'] != 'brexit' and 'not' not in q.lower() and yes_no == 'yes':
                            continue
                        if 'leave' in q.lower() and yes_no == 'yes':
                            continue
                        if 'remain' in q.lower() and yes_no == 'no':
                            continue
                        # Build the transcript only the first time this dialogue id is seen.
                        if d["_id"] not in dials_with_scores[key]:
                            text = ''
                            dials_with_scores[key][d["_id"]] = {"topic": d["topic"], "dataset": key}
                            for message in d['messages']:
                                # Admin messages and messages without 'modified_argument' are left out.
                                if message['role'] == 'admin' or 'modified_argument' not in message:
                                    continue

                                text = text + '\n\n' + '<' + message['role'] + '>' + '\n' + message['modified_argument']
                            dials_with_scores[key][d["_id"]]['text'] = text.strip()
                            final_convs.append(text.strip())

                    # Only reached for 'good reasons' questions that passed every filter above.
                    if 'good reasons' in q.lower():
                        # `False and ...` makes the first branch unreachable, so the
                        # label is always the 'after' score.
                        if False and label == 'oum': 
                            final_labels.append(float(d['participant_info']['Questions'][q]['after']) - float(d['participant_info']['Questions'][q]['before']))
                        else:
                            final_labels.append(float(d['participant_info']['Questions'][q]['after']))
                        # Score change (after - before); not computed for the "final" file.
                        oum = d['participant_info']['Questions'][q]['after'] - d['participant_info']['Questions'][q]['before'] if "final" not in input_file else None
                        dials_with_scores[key][d["_id"]]["good_reasons"] = {"oum": oum, "after": d['participant_info']['Questions'][q]['after']}
                        if 'before' in d['participant_info']['Questions'][q] and d['participant_info']['Questions'][q]['before'] != -1:
                            dials_with_scores[key][d["_id"]]["good_reasons"]['before'] = d['participant_info']['Questions'][q]['before']
                        else:
                            dials_with_scores[key][d["_id"]]["good_reasons"]['before'] = None

                        # Collected for every label, but not part of the return value.
                        engagement_features = list(d['participant_info']['Engagement'].values())
                        chat_content_features = list(d['participant_info']['Chat_Content'].values())
                        final_experience_features.append(engagement_features + chat_content_features)

    # NOTE(review): a transcript is appended once per dialogue id while a label is
    # appended for every qualifying question, so a dialogue with two qualifying
    # 'good reasons' questions would trip this assertion.
    assert len(final_convs) == len(final_labels)
    return final_convs, final_labels #, final_experience_features
# Called with the default label='after', so `labels` holds the 'after' scores.
conversations, labels = load_data_oum()

In [5]:
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
import numpy as np

# Feature matrix (averaged 300-d GloVe vectors) and regression targets
X = compute_embeddings(conversations, embeddings_index, embedding_dim=300)
y = np.array(labels)

# Ridge regression with the default-strength L2 penalty
ridge_reg = Ridge(alpha=1.0)

mae_scores, spearman_corr_scores = [], []

# One shuffled 7-fold cross-validation run per seed
for seed in (42, 123, 456):
    kf = KFold(n_splits=7, shuffle=True, random_state=seed)

    # Out-of-fold prediction for every sample
    y_pred = cross_val_predict(ridge_reg, X, y, cv=kf)

    mae_scores.append(mean_absolute_error(y, y_pred))
    # spearmanr returns (correlation, p-value); only the correlation is kept
    spearman_corr_scores.append(spearmanr(y, y_pred)[0])

# Aggregate across seeds
mean_mae, std_mae = np.mean(mae_scores), np.std(mae_scores)
mean_spearman_corr, std_spearman_corr = np.mean(spearman_corr_scores), np.std(spearman_corr_scores)

# Report as MEAN(SD)
print(f'Mean Absolute Error (MAE): {mean_mae:.3f}({std_mae:.3f})')
print(f'Spearman Correlation: {mean_spearman_corr:.3f}({std_spearman_corr:.3f})')

Mean Absolute Error (MAE): 1.431(0.011)
Spearman Correlation: 0.393(0.017)


## Robustness analysis at topic-level

In [16]:
import pandas as pd

def topic_categorisation(conv):
    """Assign a topic to a conversation by case-insensitive keyword search.

    Keywords are checked in a fixed order ('vegan', then 'covid', then
    'brexit') and the first one found decides the topic.

    Args:
        conv: Conversation text.

    Returns:
        'veganism', 'covid', 'brexit', or 'other' when no keyword occurs.
    """
    lowered = conv.lower()
    keyword_to_topic = (
        ('vegan', 'veganism'),
        ('covid', 'covid'),
        ('brexit', 'brexit'),
    )
    for keyword, topic in keyword_to_topic:
        if keyword in lowered:
            return topic
    return 'other'


# Feature matrix (averaged 300-d GloVe vectors) and regression targets
X = compute_embeddings(conversations, embeddings_index, embedding_dim=300)
y = np.array(labels)

# Ridge regression with the default-strength L2 penalty
ridge_reg = Ridge(alpha=1.0)

for seed in (42, 123, 456):
    print('seed=', seed)
    # Shuffled 7-fold cross-validation, out-of-fold prediction for every sample
    kf = KFold(n_splits=7, shuffle=True, random_state=seed)
    y_pred = cross_val_predict(ridge_reg, X, y, cv=kf)

    # One row per conversation, tagged with its keyword-derived topic
    df = pd.DataFrame({'Conversations': conversations, 'Labels': labels, 'Predictions': y_pred})
    df['topic'] = df['Conversations'].map(topic_categorisation)

    # Correlation over all conversations
    overall_corr = spearmanr(df['Labels'], df['Predictions'])[0]
    print(f"Overall Spearman correlation: {overall_corr:.3f}")

    # Per-topic metrics; sort=False keeps topics in order of first appearance
    topic_corrs, topic_maes = {}, {}
    for topic, topic_df in df.groupby('topic', sort=False):
        topic_corrs[topic] = spearmanr(topic_df['Labels'], topic_df['Predictions'])[0]
        topic_maes[topic] = mean_absolute_error(topic_df['Labels'], topic_df['Predictions'])

    # All correlations first, then all MAEs
    for topic in topic_corrs:
        print(f"Spearman correlation for topic '{topic}': {topic_corrs[topic]:.3f}")
    for topic in topic_maes:
        print(f"MAE for topic '{topic}': {topic_maes[topic]:.3f}")

seed= 42
Overall Spearman correlation: 0.415
Spearman correlation for topic 'covid': 0.109
Spearman correlation for topic 'brexit': -0.032
Spearman correlation for topic 'veganism': 0.152
MAE for topic 'covid': 1.587
MAE for topic 'brexit': 1.688
MAE for topic 'veganism': 1.067
seed= 123
Overall Spearman correlation: 0.373
Spearman correlation for topic 'covid': -0.012
Spearman correlation for topic 'brexit': -0.065
Spearman correlation for topic 'veganism': 0.075
MAE for topic 'covid': 1.628
MAE for topic 'brexit': 1.705
MAE for topic 'veganism': 1.087
seed= 456
Overall Spearman correlation: 0.389
Spearman correlation for topic 'covid': 0.033
Spearman correlation for topic 'brexit': -0.010
Spearman correlation for topic 'veganism': 0.094
MAE for topic 'covid': 1.603
MAE for topic 'brexit': 1.695
MAE for topic 'veganism': 1.083


# Wikitac

In [83]:
import json
import pandas as pd
import numpy as np
from collections import Counter

def load_data_wikitac(path='./wikitactics.json'):
    """Load disputes from a JSON file and flatten each into text.

    Every dispute in the file is loaded; nothing is filtered on the
    escalation label.

    Args:
        path: Location of the JSON file. Defaults to ``./wikitactics.json``
            relative to the working directory.

    Returns:
        ``(conversations, utterances_cleaned, labels)`` where, per dispute,
        ``conversations`` holds the utterances each prefixed by a
        ``<user_id=...>`` line, ``utterances_cleaned`` holds the same
        utterance texts without the speaker prefix, and ``labels`` holds the
        dispute's ``escalation_label``.
    """
    # Explicit UTF-8: the platform default encoding is locale-dependent and may
    # fail on, or mis-decode, non-ASCII text.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    conversations = []
    utterances_cleaned = []
    labels = []
    for dispute in data:
        turns = dispute['utterances']
        # Speaker-tagged transcript
        conversations.append(
            ''.join(f"<user_id={turn['username']}>\n{turn['text']}\n\n" for turn in turns)
        )
        # Same texts without the speaker tag
        utterances_cleaned.append(''.join(turn['text'] + '\n\n' for turn in turns))
        labels.append(dispute['escalation_label'])

    return conversations, utterances_cleaned, labels

# Rebinds the notebook-level `conversations` and `labels` to the data from wikitactics.json.
conversations, utterances, labels = load_data_wikitac()

In [84]:
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np

# Feature matrix (averaged 300-d GloVe vectors) and binary targets
X = compute_embeddings(conversations, embeddings_index, embedding_dim=300)
y = np.array(labels)

# Logistic regression with a raised iteration cap
logistic_reg = LogisticRegression(max_iter=1000)

auroc_scores, aupr_scores = [], []

# One shuffled 7-fold cross-validation run per seed
for seed in (42, 123, 456):
    kf = KFold(n_splits=7, shuffle=True, random_state=seed)

    # Out-of-fold class probabilities; the second column is used as the score
    fold_probabilities = cross_val_predict(logistic_reg, X, y, cv=kf, method='predict_proba')
    y_pred_proba = fold_probabilities[:, 1]

    auroc_scores.append(roc_auc_score(y, y_pred_proba))
    aupr_scores.append(average_precision_score(y, y_pred_proba))

# Aggregate across seeds
mean_auroc, std_auroc = np.mean(auroc_scores), np.std(auroc_scores)
mean_aupr, std_aupr = np.mean(aupr_scores), np.std(aupr_scores)

# Report as MEAN(SD)
print(f'Area Under ROC Curve (AUROC): {mean_auroc:.3f}({std_auroc:.3f})')
print(f'Area Under Precision-Recall Curve (AUPR): {mean_aupr:.3f}({std_aupr:.3f})')


Area Under ROC Curve (AUROC): 0.590(0.021)
Area Under Precision-Recall Curve (AUPR): 0.573(0.021)


# AFD

In [85]:
def load_data_afd(path='afd_1000_randomised_dialogues.json'):
    """Load conversations, utterances and inverted binary labels from JSON.

    Args:
        path: Location of the JSON file, which must contain the keys
            ``'conversations'``, ``'utterances'`` and ``'labels'``. Defaults
            to ``afd_1000_randomised_dialogues.json`` in the working directory.

    Returns:
        ``(conversations, utterances, labels)``. ``labels`` is a list of ints
        in which a stored label of 0 becomes 1 and any other value becomes 0.
    """
    # Explicit UTF-8: the platform default encoding is locale-dependent and may
    # fail on, or mis-decode, non-ASCII text.
    with open(path, 'r', encoding='utf-8') as json_file:
        data_dict = json.load(json_file)

    conversations = data_dict['conversations']
    utterances = data_dict['utterances']
    # Invert the stored labels so that the original 0 class is scored as positive.
    labels = [1 if raw_label == 0 else 0 for raw_label in data_dict['labels']]
    return conversations, utterances, labels

# Rebinds the notebook-level `conversations`, `utterances` and `labels` to the AFD data.
conversations, utterances, labels = load_data_afd()

In [86]:
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np

# Feature matrix (averaged 300-d GloVe vectors) and binary targets
X = compute_embeddings(conversations, embeddings_index, embedding_dim=300)
y = np.array(labels)

# Logistic regression with a raised iteration cap
logistic_reg = LogisticRegression(max_iter=1000)

auroc_scores, aupr_scores = [], []

# One shuffled 7-fold cross-validation run per seed
for seed in (42, 123, 456):
    kf = KFold(n_splits=7, shuffle=True, random_state=seed)

    # Out-of-fold class probabilities; the second column is used as the score
    fold_probabilities = cross_val_predict(logistic_reg, X, y, cv=kf, method='predict_proba')
    y_pred_proba = fold_probabilities[:, 1]

    auroc_scores.append(roc_auc_score(y, y_pred_proba))
    aupr_scores.append(average_precision_score(y, y_pred_proba))

# Aggregate across seeds
mean_auroc, std_auroc = np.mean(auroc_scores), np.std(auroc_scores)
mean_aupr, std_aupr = np.mean(aupr_scores), np.std(aupr_scores)

# Report as MEAN(SD)
print(f'Area Under ROC Curve (AUROC): {mean_auroc:.3f}({std_auroc:.3f})')
print(f'Area Under Precision-Recall Curve (AUPR): {mean_aupr:.3f}({std_aupr:.3f})')


Area Under ROC Curve (AUROC): 0.826(0.001)
Area Under Precision-Recall Curve (AUPR): 0.565(0.003)
