In [1]:
from tqdm import tqdm
from convokit import Corpus, User, Utterance

## Creating Users

In [2]:
data_dir = "./data/FullData/full_info.csv"

In [3]:
anno_dir = "./data/AnnotatedData/300_info.xlsx"

The full PersuasionForGood dataset contains two files:
    full_info.csv
    full_dialog.csv
The metadata for each of the users are contained in the full_info.csv file. 
The first five columns of each line include information about the dialogue that the user is a part of. Because a user can be involved in multiple dialogues, each dialogue is a key within the user metadata which retreives the role, donation amount and number of turn associated with the user within a specific dialogue. The rest of the entries correspond to the 23 dimensional feature vector associated with each user. 

Additionally, the dictionary diag_to_user is used so that the users can be searched with just a dialogue ID and role. This is because the full_dialog.csv file only has the dialogue id and not the user_ids of the participating users. 

In [7]:
from collections import defaultdict
user_meta = defaultdict(dict)
first = True
headers = None
diag_to_user = {}
diag_meta = {}
roles = ["persuader","persuadee"]
with open(data_dir, "r", encoding='utf-8', errors='ignore') as f:
    for line in f:          
        info = line.split(",")
        if first:
            #Getting the headers of the personality vector
            headers = info[5:]
            first = False
        else:
            diag_to_user[(info[0],info[2])] = info[1]
            if info[0] in diag_meta:
                diag_meta[info[0]][roles[int(info[2])]] = info[1]
            else:
                diag_meta[info[0]] = {"donation_amt": info[3],
                                      "num_turns": info[4],
                                      "intended_amt": None}
                diag_meta[info[0]][roles[int(info[2])]] = info[1]
            if info[1] not in user_meta:
                for i, heading in enumerate(headers):
                    user_meta[info[1]][heading] = info[i+5]

In [9]:
diag_meta

{'20180904-045349_715_live': {'donation_amt': '0',
  'intended_amt': None,
  'num_turns': '11',
  'persuadee': 'A25L985XCNESXE',
  'persuader': 'A3A07QA5U733HQ'},
 '20180904-154250_98_live': {'donation_amt': '0',
  'intended_amt': None,
  'num_turns': '10',
  'persuadee': 'AG3ISFZMFGDQ9',
  'persuader': 'A3GGA28ZY5CYTH'},
 '20180904-024226_703_live': {'donation_amt': '0.05',
  'intended_amt': None,
  'num_turns': '10',
  'persuadee': 'A2JQOU78XJA6VP',
  'persuader': 'A22WWSTT8TU7G1'},
 '20180904-100019_870_live': {'donation_amt': '0',
  'intended_amt': None,
  'num_turns': '11',
  'persuadee': 'A302K8B1H9ISJA',
  'persuader': 'A29NICQTS9B05U'},
 '20180904-001208_706_live': {'donation_amt': '0',
  'intended_amt': None,
  'num_turns': '11',
  'persuadee': 'A1RGD548VIVY96',
  'persuader': 'A3LQ2F30FKC0CZ'},
 '20180904-073734_888_live': {'donation_amt': '1',
  'intended_amt': None,
  'num_turns': '10',
  'persuadee': 'A2ZHSCEKPMUX9Y',
  'persuader': 'A37UAEDJV33QQ5'},
 '20180904-133536_2_l

In [10]:
import pandas as pd
anno_info = pd.read_excel(anno_dir) 
anno_info

Unnamed: 0,B2,B3,B4,B5,B6,B7
0,20180717-200206_41_live,A272X64FOZFYLB,0,,0.5,10.0
1,20180717-200206_41_live,A3JLE2LJ5I17E2,1,0.5,0.5,10.0
2,20180719-120436_413_live,AU7A3QNJF3O00,0,,0.0,10.0
3,20180719-120436_413_live,A5NE8TWS8ZV7B,1,,0.0,10.0
4,20180719-122534_38_live,A125KW9P18V5Z1,0,,0.0,11.0
...,...,...,...,...,...,...
595,20180904-043300_214_live,A3P2LT53J1GUBG,1,,0.0,10.0
596,20180904-045349_715_live,A3A07QA5U733HQ,0,,0.0,11.0
597,20180904-045349_715_live,A25L985XCNESXE,1,0.2,0.0,11.0
598,20180904-073734_888_live,A37UAEDJV33QQ5,0,,1.0,


In [12]:
for line in anno_info.values:          
    if line[2] == 1:
        diag_meta[info[0]]["intended_amt"] = line[3]

In [13]:
corpus_users = {k: User(name = k, meta = v) for k,v in user_meta.items()}

In [16]:
print("number of users in the data = {0}".format(len(corpus_users)))

number of users in the data = 1285


## Creating Utterances

Utterances can be found in the full_dialog.csv, with 20932 lines total. The raw data has four columns: <br>
"Unit" is the text of the utterance.  <br>
"Turn" is the turn of the utterance; for example, the first back and forth between persuader and persuadee would both have "Turn" = 0.  <br>
"B2" is the conversation ID. <br>
"B4" is the role of the user in this utterance, persuader(0) or persuadee (1). <br>

As per the utterance object, we give each utterance a unique id and the unique user ID is found using a dictionary which takes in the conversation ID and role of user. The metadata for each utterance includes both the turn and role  of user. 

In [18]:
import pandas as pd
utterance_data = pd.read_csv("./data/FullData/full_dialog.csv") 
utterance_data

Unnamed: 0.1,Unnamed: 0,Unit,Turn,B4,B2
0,0,Good morning. How are you doing today?,0,0,20180904-045349_715_live
1,1,Hi. I am doing good. How about you?,0,1,20180904-045349_715_live
2,2,I'm doing pretty good for a Tuesday morning.,1,0,20180904-045349_715_live
3,3,"Haha. Same here, but it really feels like a Mo...",1,1,20180904-045349_715_live
4,4,Ugh yes it does!,2,0,20180904-045349_715_live
...,...,...,...,...,...
20927,15,My small change won't do a thing for those kid...,7,1,20180717-200206_41_live
20928,16,Well with our current president it's not likel...,8,0,20180717-200206_41_live
20929,17,"If it makes you happy, go ahead and take .50",8,1,20180717-200206_41_live
20930,18,"Oh thank you so much, you have no idea how muc...",9,0,20180717-200206_41_live


In [19]:
len(diag_to_user)

2034

In [20]:
utterance_corpus = {}
convo_order = utterance_data["Unnamed: 0"]
dia_to_id = {}

for i in list(range(len(utterance_data))):
    utterance_data_line = utterance_data.loc[i, :]
    B2 = utterance_data_line["B2"]
    B4 = utterance_data_line["B4"]
    Text = utterance_data_line["Unit"]
    meta = {"role": B4, "turn_index": utterance_data_line["Turn"], "dialogue_id" : B2, "er_label_1": None, "ee_label_1": None, "er_label_2": None, "er_label_2": None,"neg": None, "neu": None, "pos": None}
    
    if utterance_data_line["Unnamed: 0"] == 0:
        ReplyTo = None
    else:  
        ReplyTo = i - 1
    if utterance_data_line["Unnamed: 0"] == 0:
        Root = i
    else:
        utter_id = list(range(0, i+1))
        convo_order_sub = convo_order.loc[0:i]
        df = pd.DataFrame({"utter_id": utter_id, "convo_order_sub":convo_order_sub})
        df2 = df[df["convo_order_sub"] == 0]
        Root = df2['utter_id'].max()
    utterance_corpus[i] = Utterance(i, corpus_users[diag_to_user[(B2,str(B4))]], Root, ReplyTo, None, Text, meta=meta)
    dia_to_id[(B2,B4,utterance_data_line["Turn"])] = i

In [47]:
utterance_corpus[0].meta

{'dialogue_id': '20180904-045349_715_live',
 'ee_label_1': None,
 'er_label_1': 'greeting',
 'er_label_2': None,
 'neg': 0.0,
 'neu': 1.0,
 'pos': 0.0,
 'role': 0,
 'turn_index': 0}

In [22]:
import numpy as np

In [24]:
anno_utterances = pd.read_excel("./data/AnnotatedData/300_dialog.xlsx") 
anno_utterances.values[pd.isna(anno_utterances.values)] = None

In [25]:
for line in anno_utterances.values:
    if not pd.isna(line[5]):
        utterance_corpus[dia_to_id[(line[1],line[2],line[3])]].meta["er_label_1"] = line[5]
    if not pd.isna(line[6]):
        utterance_corpus[dia_to_id[(line[1],line[2],line[3])]].meta["e2_label_1"] = line[6]
    if not pd.isna(line[7]):
        utterance_corpus[dia_to_id[(line[1],line[2],line[3])]].meta["er_label_2"] = line[7]
    if not pd.isna(line[8]):
        utterance_corpus[dia_to_id[(line[1],line[2],line[3])]].meta["ee_label_2"] = line[8]
    if not pd.isna(line[9]):
        utterance_corpus[dia_to_id[(line[1],line[2],line[3])]].meta["neg"] = line[9]
    if not pd.isna(line[10]):
        utterance_corpus[dia_to_id[(line[1],line[2],line[3])]].meta["neu"] = line[10]
    if not pd.isna(line[11]):
        utterance_corpus[dia_to_id[(line[1],line[2],line[3])]].meta["pos"] = line[11]
        

##  Creating corpus from utterances

In [26]:
utterance_list = [utterance for k, utterance in utterance_corpus.items()]

In [27]:
persuasion_corpus = Corpus(utterances=utterance_list, version=1)

In [28]:
print("number of conversations in the dataset = {}".format(len(persuasion_corpus.get_conversation_ids())))

number of conversations in the dataset = 1017


In [29]:
convo_ids = persuasion_corpus.get_conversation_ids()
for i, convo_idx in enumerate(convo_ids[0:5]):
    print("sample conversation {}:".format(i))
    print(persuasion_corpus.get_conversation(convo_idx).get_utterance_ids())

sample conversation 0:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
sample conversation 1:
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
sample conversation 2:
[41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]
sample conversation 3:
[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81]
sample conversation 4:
[82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]


In [32]:
for i in range(len(persuasion_corpus.conversations)):
    

{}

In [56]:
for conversation in persuasion_corpus.conversations.values():
    conversation.meta = diag_meta[conversation.get_utterance(conversation._utterance_ids[0]).meta['dialogue_id']]

In [58]:
persuasion_corpus.dump("persuasion_corpus","./")

In [22]:
from convokit import Parser

In [23]:
annotator = Parser()

In [25]:
movie_corpus = annotator.fit_transform(persuasion_corpus)

In [27]:
persuasion_corpus = movie_corpus

In [32]:
from convokit import PolitenessStrategies

In [33]:
ps = PolitenessStrategies()
persuasion_corpus = ps.transform(persuasion_corpus)

In [34]:
persuasion_corpus.get

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_collect_user_data',
 '_merge_utterances',
 '_update_corpus_user_data',
 'add_meta',
 'add_utterances',
 'all_users',
 'conversations',
 'dump',
 'dump_helper_bin',
 'filter_utterances_by',
 'get_conversation',
 'get_conversation_ids',
 'get_meta',
 'get_user',
 'get_usernames',
 'get_utterance',
 'get_utterance_ids',
 'iter_conversations',
 'iter_users',
 'iter_utterances',
 'iterate_by',
 'merge',
 'meta',
 'meta_index',
 'original_corpus_path',
 'pairwise_exchanges',
 'print_summary_stats',
 'speaking_pairs',
 'update_users_data',
 'utterance_threads',
 'utterances',
 'version']

In [72]:
from collections import Counter
seen = Counter()
counts = Counter()
donations = {}
for utterance in persuasion_corpus.utterances.values():
    if utterance.meta['role'] == 0:
        seen[utterance.meta['dialogue_id']] += sum(utterance.meta['politeness_strategies'].values())
        counts[utterance.meta['dialogue_id']] +=1
        donations[utterance.meta['dialogue_id']] = utterance.user.meta[utterance.meta['dialogue_id']]['donation_amt']

In [73]:
for k, v in seen.items():
    seen[k] = v/counts[k]

In [67]:
import scipy

In [74]:
scipy.stats.pearsonr(np.array(list(seen.values())),[float(i) for i in donations.values()])

(-0.14033209472628147, 7.055949627164124e-06)