# 0. Example Usage

In [13]:
"""Constructiveness analysis on StreetCrowd: message-level features.

This is a reimplementation of the features from our paper,

Conversational markers of constructive discussions. Vlad Niculae and
Cristian Danescu-Niculescu-Mizil. In: Proc. of NAACL 2016.
https://vene.ro/constructive/

See the `test` function for a usage example.
"""

# Author: Vlad Niculae <vlad@vene.ro>
# License: Simplified BSD
import nltk
import re
import os
from collections import defaultdict

from stopwords import stopwords as mallet_stopwords


class Lexicon(object):
    """Word matching code for lexicons.

    Since lexicons may contain multi-word phrases ("I agree") and lexicons may
    overlap, we don't tokenize, use string matching instead.
    """
    def __init__(self, wordlists):
        self.wordlists = wordlists
        self.regex = {cat: self.wordlist_to_re(wordlist)
                      for cat, wordlist in wordlists.items()}

    def wordlist_to_re(self, wordlist):
        return re.compile(r'\b(?:{})\b'.format("|".join(wordlist).lower()))

    def count_words(self, text, return_match=False):
        """Returns a dict {category_name: sum 1[w in category]}

        Words are double-counted if they occur in more
        than one lexicon.
        """
        text_ = text.lower()
        match = {cat: reg.findall(text_) for cat, reg in self.regex.items()}
        count = {cat: len(m) for cat, m in match.items()}

        if return_match:
            return count, match
        else:
            return count

lexicons = {
    'pron_me': ['i', "i'd", "i'll", "i'm", "i've", 'id', 'im', 'ive',
                'me', 'mine', 'my', 'myself'],
    'pron_we': ["let's", 'lets', 'our', 'ours', 'ourselves', 'us',
                'we', "we'd", "we'll", "we're", "we've", 'weve'],
    'pron_you': ["y'all", 'yall', 'you', "you'd", "you'll", "you're",
                 "you've", 'youd', 'youll', 'your', 'youre', 'yours',
                 'youve'],
    'pron_3rd': ['he', "he'd", "he's", 'hed', 'her', 'hers', 'herself',
                 'hes', 'him', 'himself', 'his', 'she', "she'd",
                 "she'll", "she's", 'shes', 'their', 'them', 'themselves',
                 'they', "they'd", "they'll", "they've", 'theyd', 'theyll',
                 'theyve', "they're", "theyre"]
}

with open(os.path.join('lexicons', 'my_geo.txt')) as f:
    lexicons['geo'] = [line.strip().lower() for line in f]

with open(os.path.join('lexicons', 'my_meta.txt')) as f:
    lexicons['meta'] = [line.strip().lower() for line in f]

with open(os.path.join('lexicons', 'my_certain.txt')) as f:
    lexicons['certain'] = [line.strip().lower() for line in f]

with open(os.path.join('lexicons', 'my_hedges.txt')) as f:
    lexicons['hedge'] = [line.strip().lower() for line in f]


lex_matcher = Lexicon(lexicons)


def get_content_tagged(words, tags):
    """Return content words based on tag"""
    return [w for w, tag in zip(words.lower().split(), tags.split())
            if tag in ("N", "^", "S", "Z", "A", "T", "V")]


def message_features(reasons, stopwords=mallet_stopwords):
    """Compute message-level features from a chat.

    Parameters
    ----------
    reasons, iterable:
        sequence of tuples (user, tokens, tags).
        In StreetCrowd, users individually can leave an explanation for their
        solo game decision.  These reasons populate the chat when the team
        meets, but they have no intrinsic order, so they can only introduce
        but not adopt idea words.  Also, more than one reason can introduce
        an idea, because they are written independently. 

    """
    seen_words = set()
    introduced = defaultdict(set)
    where_introduced = defaultdict(list)
    repeated = defaultdict(set)

    reason_features = []

    for k, (user, tokens, tags) in enumerate(reasons):
        features = {}
        content_words = [w for w in get_content_tagged(tokens, tags)
                         if w not in stopwords]

        # all new content words are new ideas here
        introduced[user].update(content_words)
        seen_words.update(content_words)
        for w in content_words:
            where_introduced[w].append(('reason', k))

        # length statistics
        features['n_words'] = len(tokens.split())
        lex_counts = lex_matcher.count_words(tokens)
        features.update(lex_counts)

        # fillers
        features['n_introduced'] = len(content_words)  # Modified for single utterance
        features['n_introduced_w_certain'] = features['n_introduced'] * features['certain']
        features['n_introduced_w_hedge'] = features['n_introduced'] * features['hedge']

        reason_features.append(features)

    return reason_features 



In [14]:
from nltk.corpus import stopwords
# Example usage 
my_utterance = "This is a sample sentence to analyze. i am sure s."
pos_tags = nltk.pos_tag(my_utterance.split())
tokens = my_utterance
tags = " ".join([tag for (_, tag) in pos_tags])

test_reasons = [('user1', tokens, tags)]  # Create the 'reason' structure
reason_feat = message_features(test_reasons, stopwords) 
print(reason_feat)  # You might not need msg_feat

[{'n_words': 11, 'pron_me': 1, 'pron_we': 0, 'pron_you': 0, 'pron_3rd': 0, 'geo': 0, 'meta': 0, 'certain': 1, 'hedge': 0, 'n_introduced': 0, 'n_introduced_w_certain': 0, 'n_introduced_w_hedge': 0}]


# 1. Deriving collaboration markers

In [15]:
import collections
from nltk.corpus import stopwords
import json
import pandas as pd
import numpy as np
from collections import Counter

def load_data_wikitac():
    with open('../../wikitactics.json') as f:
        data = json.load(f)

    conversations = []
    utterances_cleaned = []
    labels = []

    for dispute in data:
        users = list()
        conversation = ''
        utt_cleaned = ''
        for utterance in dispute['utterances']:
            username = utterance['username']
            text = utterance['text']
            conversation += f"<user_id={username}>\n{text}\n\n"
            utt_cleaned += text + '\n\n'
        conversations.append(conversation)
        utterances_cleaned.append(utt_cleaned)
        labels.append(dispute['escalation_label'])

    return conversations, utterances_cleaned, labels


def deriving_collaboration_markers(utterance):
    pos_tags = nltk.pos_tag(utterance.split())
    tokens = utterance
    tags = " ".join([tag for (_, tag) in pos_tags])

    test_reasons = [('user1', tokens, tags)]  # Create the 'reason' structure
    reason_feat = message_features(test_reasons, stopwords) 
    return reason_feat

conversations, utterances, labels = load_data_wikitac()

In [22]:
annotations_utts = []
for i, conv in enumerate(utterances):
    print(i)
    annotations_ci = []
    
    for utterance in conv.split('\n\n'):
        markers = list(deriving_collaboration_markers(utterance)[0].values())
        annotations_ci.append(markers)

    annotations_utts.append(annotations_ci)

annotations_utts

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212


[[[91, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [34, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [25, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
  [31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [29, 1, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0],
  [75, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
  [13, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [243, 2, 0, 2, 3, 2, 0, 0, 7, 0, 0, 0],
  [125, 2, 1, 0, 2, 3, 0, 1, 1, 0, 0, 0],
  [25, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
  [18, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0],
  [42, 2, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0],
  [26, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [19, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
  [41, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
  [34, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0],
  [44, 1, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0],
  [42, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
  [56, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
  [75, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
  [46, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 [[2, 0, 0, 0, 0

In [23]:
from collections import defaultdict
import numpy as np
import pandas as pd

markers = list(deriving_collaboration_markers(utterance)[0].keys())
colab_features_mean = defaultdict(list)
colab_features_grad = defaultdict(list)

for ann_ci in annotations_utts:
    for i, marker in enumerate(markers):
        marker_values_ci = [ann_ci[j][i] for j in range(len(ann_ci))]

        # MEAN
        colab_features_mean[marker].append(np.mean(marker_values_ci))
        
        # GRADIENT (SLOPE)
        x = np.arange(len(marker_values_ci))
        y = marker_values_ci
        try:
            slope, intercept = np.polyfit(x, y, 1) # Fitting a linear regression (polynomial of degree 1) to the data
        except Exception as e:
            print(e)
            slope = 0 
        
        colab_features_grad[marker].append(slope)        

a = pd.DataFrame(colab_features_mean).add_suffix('_mean')
b = pd.DataFrame(colab_features_grad).add_suffix('_grad')
colab_feat_df = pd.concat([a, b], axis=1)
# colab_feat_df.to_csv("../213_wikitac_collaboration_markers.csv", index=False)
colab_feat_df

Unnamed: 0,n_words_mean,pron_me_mean,pron_we_mean,pron_you_mean,pron_3rd_mean,geo_mean,meta_mean,certain_mean,hedge_mean,n_introduced_mean,...,pron_we_grad,pron_you_grad,pron_3rd_grad,geo_grad,meta_grad,certain_grad,hedge_grad,n_introduced_grad,n_introduced_w_certain_grad,n_introduced_w_hedge_grad
0,47.333333,0.666667,0.333333,0.333333,0.375000,0.666667,0.000000,0.041667,0.791667,0.0,...,-0.008696,0.001739,0.003043,6.956522e-03,0.000000,-0.002174,-0.010870,0.0,0.0,0.0
1,67.888889,1.444444,0.111111,2.666667,0.666667,1.666667,0.000000,0.111111,0.555556,0.0,...,0.016667,0.483333,0.216667,-1.554623e-16,0.000000,0.050000,0.066667,0.0,0.0,0.0
2,49.931034,1.241379,0.448276,0.413793,0.551724,0.448276,0.000000,0.137931,1.172414,0.0,...,0.010837,0.004433,-0.015764,2.512315e-02,0.000000,0.005419,-0.028079,0.0,0.0,0.0
3,147.733333,2.933333,0.866667,3.600000,1.000000,1.933333,0.000000,0.200000,4.133333,0.0,...,0.042857,0.146429,-0.010714,-8.571429e-02,0.000000,0.035714,0.160714,0.0,0.0,0.0
4,30.727273,0.454545,0.000000,0.727273,1.454545,0.454545,0.181818,0.090909,0.454545,0.0,...,0.000000,-0.145455,-0.163636,-1.000000e-01,0.045455,0.009091,-0.127273,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,124.333333,3.250000,0.250000,0.666667,2.500000,3.333333,0.083333,0.333333,3.333333,0.0,...,-0.010490,0.104895,0.251748,2.587413e-01,0.003497,0.034965,-0.118881,0.0,0.0,0.0
209,95.880000,2.840000,0.480000,1.640000,2.600000,0.640000,0.040000,0.560000,2.720000,0.0,...,-0.040000,0.101538,-0.138462,-5.076923e-02,-0.004615,-0.042308,-0.058462,0.0,0.0,0.0
210,65.090909,0.954545,0.272727,1.409091,0.363636,1.954545,0.045455,0.090909,1.727273,0.0,...,0.011293,0.051383,0.018069,-8.074534e-02,0.001694,-0.021457,-0.044043,0.0,0.0,0.0
211,67.608696,1.043478,0.826087,0.826087,0.826087,2.826087,0.043478,0.260870,1.130435,0.0,...,0.035573,0.078063,0.038538,4.249012e-02,0.006917,-0.010870,0.039526,0.0,0.0,0.0
