# 0. Example Usage

In [4]:
"""Constructiveness analysis on StreetCrowd: message-level features.

This is a reimplementation of the features from our paper,

Conversational markers of constructive discussions. Vlad Niculae and
Cristian Danescu-Niculescu-Mizil. In: Proc. of NAACL 2016.
https://vene.ro/constructive/

See the `test` function for a usage example.
"""

# Author: Vlad Niculae <vlad@vene.ro>
# License: Simplified BSD
import nltk
import re
import os
from collections import defaultdict

from stopwords import stopwords as mallet_stopwords


class Lexicon(object):
    """Word matching code for lexicons.

    Since lexicons may contain multi-word phrases ("I agree") and lexicons may
    overlap, we don't tokenize, use string matching instead.

    Parameters
    ----------
    wordlists, dict:
        maps a category name to an iterable of words or phrases.  Entries
        are matched literally (regex metacharacters are escaped) and
        case-insensitively; blank entries are ignored.
    """
    def __init__(self, wordlists):
        self.wordlists = wordlists
        self.regex = {cat: self.wordlist_to_re(wordlist)
                      for cat, wordlist in wordlists.items()}

    def wordlist_to_re(self, wordlist):
        """Compile a wordlist into a single regex.

        The entries become the alternatives of one group enclosed in word
        boundaries, in their original order.
        """
        # Escape every entry: an unescaped "." (or "(", "+", ...) in a lexicon
        # file would otherwise behave as a regex operator.  Blank entries are
        # dropped because an empty alternative matches at every word boundary.
        entries = [re.escape(w.lower()) for w in wordlist if w.strip()]
        if not entries:
            # Nothing to look for: use a pattern that can never match.
            return re.compile(r'(?!)')
        return re.compile(r'\b(?:{})\b'.format("|".join(entries)))

    def count_words(self, text, return_match=False):
        """Returns a dict {category_name: sum 1[w in category]}

        Words are double-counted if they occur in more
        than one lexicon.

        If `return_match` is true, returns a tuple (count, match) where
        `match` maps each category name to the list of matched strings.
        """
        text_ = text.lower()
        match = {cat: reg.findall(text_) for cat, reg in self.regex.items()}
        count = {cat: len(m) for cat, m in match.items()}

        if return_match:
            return count, match
        return count

# Pronoun lexicons are spelled out inline.
lexicons = {
    'pron_me': ['i', "i'd", "i'll", "i'm", "i've", 'id', 'im', 'ive',
                'me', 'mine', 'my', 'myself'],
    'pron_we': ["let's", 'lets', 'our', 'ours', 'ourselves', 'us',
                'we', "we'd", "we'll", "we're", "we've", 'weve'],
    'pron_you': ["y'all", 'yall', 'you', "you'd", "you'll", "you're",
                 "you've", 'youd', 'youll', 'your', 'youre', 'yours',
                 'youve'],
    'pron_3rd': ['he', "he'd", "he's", 'hed', 'her', 'hers', 'herself',
                 'hes', 'him', 'himself', 'his', 'she', "she'd",
                 "she'll", "she's", 'shes', 'their', 'them', 'themselves',
                 'they', "they'd", "they'll", "they've", 'theyd', 'theyll',
                 'theyve', "they're", "theyre"]
}

# The remaining lexicons are read from text files in the `lexicons`
# directory (relative to the working directory), one entry per line.
# Every line is stripped and lower-cased.
for _category, _filename in (('geo', 'my_geo.txt'),
                             ('meta', 'my_meta.txt'),
                             ('certain', 'my_certain.txt'),
                             ('hedge', 'my_hedges.txt')):
    with open(os.path.join('lexicons', _filename)) as f:
        lexicons[_category] = [line.strip().lower() for line in f]


# Module-level matcher shared by the feature extraction code.
lex_matcher = Lexicon(lexicons)


def get_content_tagged(words, tags):
    """Return the content words of a tagged message.

    `words` and `tags` are whitespace-separated strings that are paired up
    position by position (extra items on either side are ignored).  A word is
    kept, lower-cased, when its tag is one of N, ^, S, Z, A, T or V.
    """
    # NOTE(review): these single-character tags look like the coarse tagset
    # of the CMU ARK Twitter tagger, not Penn Treebank tags -- confirm
    # against whatever tagger produces `tags`.
    content_tags = {"N", "^", "S", "Z", "A", "T", "V"}
    kept = []
    for word, tag in zip(words.lower().split(), tags.split()):
        if tag in content_tags:
            kept.append(word)
    return kept


def message_features(reasons, stopwords=mallet_stopwords):
    """Compute message-level features from a chat.

    Parameters
    ----------
    reasons, iterable:
        sequence of tuples (user, tokens, tags).
        In StreetCrowd, users individually can leave an explanation for their
        solo game decision.  These reasons populate the chat when the team
        meets, but they have no intrinsic order, so they can only introduce
        but not adopt idea words.  Also, more than one reason can introduce
        an idea, because they are written independently.
        `tokens` and `tags` are whitespace-separated strings; `user` is not
        used by the features computed here.

    stopwords, container:
        content words found in this container (tested with `in`) are
        discarded.

    Returns
    -------
    reason_features, list of dict:
        one dict per reason, with keys in this order: 'n_words', one count
        per category of the module-level `lex_matcher`, 'n_introduced',
        'n_introduced_w_certain' and 'n_introduced_w_hedge'.  The last two
        require `lex_matcher` to provide 'certain' and 'hedge' categories.
    """
    reason_features = []

    for _user, tokens, tags in reasons:
        # all content words of a reason count as newly introduced ideas here
        content_words = [w for w in get_content_tagged(tokens, tags)
                         if w not in stopwords]

        # length statistics
        features = {'n_words': len(tokens.split())}

        # lexicon counts, computed on the raw message string
        features.update(lex_matcher.count_words(tokens))

        # introduced words, alone and weighted by certainty / hedging counts
        n_introduced = len(content_words)
        features['n_introduced'] = n_introduced
        features['n_introduced_w_certain'] = n_introduced * features['certain']
        features['n_introduced_w_hedge'] = n_introduced * features['hedge']

        reason_features.append(features)

    return reason_features



In [5]:
from nltk.corpus import stopwords
# Example usage 
my_utterance = "This is a sample sentence to analyze. i am sure s."
pos_tags = nltk.pos_tag(my_utterance.split())
tokens = my_utterance
tags = " ".join(tag for _, tag in pos_tags)

test_reasons = [('user1', tokens, tags)]  # one (user, tokens, tags) tuple per message
# message_features tests stopword membership with `in`, so it needs the
# actual word collection rather than the nltk corpus reader object.
reason_feat = message_features(test_reasons, set(stopwords.words('english')))
print(reason_feat)  # a list with one feature dict per message

[{'n_words': 11, 'pron_me': 1, 'pron_we': 0, 'pron_you': 0, 'pron_3rd': 0, 'geo': 0, 'meta': 0, 'certain': 1, 'hedge': 0, 'n_introduced': 0, 'n_introduced_w_certain': 0, 'n_introduced_w_hedge': 0}]


# 1. Deriving collaboration markers

In [6]:
import collections
from nltk.corpus import stopwords
import json

# Load the data
# Participant-info questions used to derive a yes/no stance, checked in this
# order.  Each entry is (question, {lower-cased answer: stance}, reset), where
# `reset` tells whether an unrecognised answer resets the stance to 'none'
# (the two Brexit questions) or leaves the current stance untouched.
_STANCE_QUESTIONS = (
    ('Did you vote for (Leave) or against (Remain) Brexit in the 2016 UK referendum?',
     {'against (remain)': 'no', 'for (leave)': 'yes'}, True),
    ('In the referendum on whether the UK should remain a member of the EU (BREXIT), how did you vote?',
     {'remain (against brexit)': 'no', 'leave (for brexit)': 'yes'}, True),
    ('Have you had at least one dose of an approved Covid-19 vaccine?',
     {'yes': 'yes', 'no': 'no'}, False),
    ('Are you a vegan?',
     {'yes': 'yes', 'no': 'no'}, False),
)


def _participant_stance(participant_info):
    """Return 'yes', 'no' or 'none' from the participant's stance answers."""
    stance = 'none'
    for question, answers, reset_on_other in _STANCE_QUESTIONS:
        if question not in participant_info:
            continue
        answer = participant_info[question].lower()
        if answer in answers:
            stance = answers[answer]
        elif reset_on_other:
            stance = 'none'
    return stance


def _conversation_text(messages):
    """Join the non-admin messages that carry a 'modified_argument' into one
    string of '<role>' header lines followed by the argument text."""
    return ''.join(
        '\n\n' + '<' + message['role'] + '>' + '\n' + message['modified_argument']
        for message in messages
        if message['role'] != 'admin' and 'modified_argument' in message
    ).strip()


def load_data(label='after'):
    """Load the dialogue files and return conversations with their labels.

    Reads the three JSON files listed in `input_files`.  A dialogue is
    skipped when any of its messages comes from the 'wikibot' or
    'controlbot' model, or when no yes/no stance can be derived from its
    'participant_info'.  For the remaining dialogues, each "good reasons"
    question that passes the score and stance filters contributes its
    'after' score as a label, and the dialogue text is added the first time
    such a question is found.

    Parameters
    ----------
    label, str:
        when equal to 'oum', files whose path contains "final" contribute
        nothing.  The returned labels are the 'after' scores regardless of
        this value.

    Returns
    -------
    final_convs, list of str:
        conversation texts.
    final_labels, list of float:
        the 'after' scores, aligned with `final_convs`.
    """
    final_convs = []
    final_labels = []
    input_files = {"wizards": "../../wizards_dialogues.json", "final_argubot": "../../argubot_final_exp.json",
                   "models_dialogues": "../../models_dialogues.json"}

    for key, input_file in input_files.items():
        with open(input_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        is_final = "final" in input_file
        # ids of the dialogues of this file whose text was already collected
        seen_ids = set()

        for d in data:
            if any('model' in m and m['model'] in ('wikibot', 'controlbot')
                   for m in d["messages"]):
                continue

            participant_info = d['participant_info']
            yes_no = _participant_stance(participant_info)
            if yes_no == 'none' or 'Questions' not in participant_info:
                continue

            for q, scores in participant_info['Questions'].items():
                # -1 marks a missing score; "final" files only need 'after'
                if is_final:
                    if label == 'oum' or scores['after'] == -1:
                        continue
                elif scores['before'] == -1 or scores['after'] == -1:
                    continue

                q_lower = q.lower()
                if 'good reasons' not in q_lower:
                    continue

                # keep only one of the paired questions, chosen by stance
                not_brexit = d['topic'] != 'brexit'
                if not_brexit and 'not' in q_lower and yes_no == 'no':
                    continue
                if not_brexit and 'not' not in q_lower and yes_no == 'yes':
                    continue
                if 'leave' in q_lower and yes_no == 'yes':
                    continue
                if 'remain' in q_lower and yes_no == 'no':
                    continue

                if d["_id"] not in seen_ids:
                    seen_ids.add(d["_id"])
                    final_convs.append(_conversation_text(d['messages']))

                final_labels.append(float(scores['after']))

    # one label per conversation is expected; more than one kept question
    # for the same dialogue would break the alignment
    assert len(final_convs) == len(final_labels)
    return final_convs, final_labels

def get_utterances(text):
    """
    Extract the utterances of a conversation.

    `text` is scanned line by line.  The first line that follows a speaker
    tag line ('<participant>', '<woz>' or '<chatbot>', surrounding
    whitespace ignored) and is not itself a speaker tag is kept; every other
    line is dropped.  The kept lines are returned as a single string,
    separated by blank lines ('\n\n').
    """
    speaker_tags = {'<participant>', '<woz>', '<chatbot>'}
    kept = []
    expecting_utterance = False
    for raw_line in text.split('\n'):
        if raw_line.strip() in speaker_tags:
            # the utterance is expected on the next non-tag line
            expecting_utterance = True
            continue
        if expecting_utterance:
            kept.append(raw_line)
            expecting_utterance = False
    return '\n\n'.join(kept)

def _penn_to_coarse_tag(tag):
    """Map a Penn Treebank tag to the single-character tag that
    `get_content_tagged` filters on; other tags are returned unchanged."""
    if tag in ('NNP', 'NNPS'):
        return '^'  # proper noun
    for prefix, coarse in (('NN', 'N'), ('JJ', 'A'), ('VB', 'V')):
        if tag.startswith(prefix):
            return coarse
    if tag == 'RP':
        return 'T'  # verb particle
    return tag


def deriving_collaboration_markers(utterance):
    """Compute the message-level features of a single utterance.

    The utterance is split on whitespace, POS-tagged with `nltk.pos_tag` and
    passed to `message_features` as a one-message chat.

    Returns
    -------
    list containing one feature dict (see `message_features`).
    """
    pos_tags = nltk.pos_tag(utterance.split())
    # nltk.pos_tag produces Penn Treebank tags, while get_content_tagged only
    # recognises single-character tags (N, ^, S, Z, A, T, V).  Without this
    # translation no word is ever treated as a content word.
    tags = " ".join(_penn_to_coarse_tag(tag) for _, tag in pos_tags)

    # message_features tests stopword membership with `in`, which needs the
    # word collection itself, not the nltk corpus reader object.
    english_stopwords = set(stopwords.words('english'))

    return message_features([('user1', utterance, tags)], english_stopwords)

# Load the conversations with their labels, then reduce every conversation
# to its utterance lines.
conversations, labels = load_data(label='after')
utterances = list(map(get_utterances, conversations))

In [7]:
# For every conversation, collect one list of feature values per utterance
# (utterances are separated by blank lines in `utterances`).
annotations = collections.defaultdict(list)
for conv in utterances:
    annotations_ci = []
    for utterance in conv.split('\n\n'):
        feature_row = deriving_collaboration_markers(utterance)[0]
        annotations_ci.append(list(feature_row.values()))
    annotations['utterances'].append(annotations_ci)

annotations_utts = annotations['utterances']

In [12]:
from collections import defaultdict
import numpy as np
import pandas as pd

def _linear_slope(values):
    """Return the slope of the least-squares line through `values` plotted
    against their positions 0..n-1.

    With fewer than two points no line is determined and np.polyfit is
    degenerate, so the slope is reported as 0.0.
    """
    if len(values) < 2:
        return 0.0
    slope, _intercept = np.polyfit(np.arange(len(values)), values, 1)
    return slope


# Feature names, in the order of the value lists stored in annotations_utts.
# They are taken from a probe call on an empty utterance so that this cell
# does not depend on a loop variable left over from another cell.
markers = list(deriving_collaboration_markers("")[0].keys())
colab_features_mean = defaultdict(list)
colab_features_grad = defaultdict(list)

for ann_ci in annotations_utts:
    for i, marker in enumerate(markers):
        # values of this marker across the utterances of one conversation
        marker_values_ci = [utt_values[i] for utt_values in ann_ci]

        # MEAN
        colab_features_mean[marker].append(np.mean(marker_values_ci))

        # GRADIENT (SLOPE) of a degree-1 polynomial fit
        colab_features_grad[marker].append(_linear_slope(marker_values_ci))

a = pd.DataFrame(colab_features_mean).add_suffix('_mean')
b = pd.DataFrame(colab_features_grad).add_suffix('_grad')
colab_feat_df = pd.concat([a, b], axis=1)
colab_feat_df.to_csv("../542_conversations_collaboration_markers_without_pnp.csv", index=False)