In [3]:
from tqdm import tqdm
import os
os.chdir('/Users/grace/Cornell-Conversational-Analysis-Toolkit/')
from convokit import Corpus, User, Utterance

## Creating Users

In [7]:
data_dir = "/Users/grace/PersuasionForGoodData/full_info.csv"

In [12]:
anno_dir = "/Users/grace/PersuasionForGoodData/300_info.xlsx"

The full PersuasionForGood dataset contains two files:  <br>
    full_info.csv <br>
    full_dialog.csv <br>
The metadata for each of the users are contained in the full_info.csv file. 
The first five columns of each line include information about the dialogue that the user is a part of. Because a user can be involved in multiple dialogues, each dialogue is a key within the user metadata which retreives the role, donation amount and number of turn associated with the user within a specific dialogue. The rest of the entries correspond to the 23 dimensional feature vector associated with each user. 

Additionally, the dictionary diag_to_user is used so that the users can be searched with just a dialogue ID and role. This is because the full_dialog.csv file only has the dialogue id and not the user_ids of the participating users. 

In [13]:
from collections import defaultdict
user_meta = defaultdict(dict)
first = True
headers = None
diag_to_user = {}
diag_meta = {}
roles = ["persuader","persuadee"]
with open(data_dir, "r", encoding='utf-8', errors='ignore') as f:
    for line in f:          
        info = line.split(",")
        if first:
            #Getting the headers of the personality vector
            headers = info[5:]
            first = False
        else:
            diag_to_user[(info[0],info[2])] = info[1]
            if info[0] in diag_meta:
                diag_meta[info[0]][roles[int(info[2])]] = info[1]
            else:
                diag_meta[info[0]] = {"donation_amt": info[3],
                                      "num_turns": info[4],
                                      "intended_amt": None}
                diag_meta[info[0]][roles[int(info[2])]] = info[1]
            if info[1] not in user_meta:
                for i, heading in enumerate(headers):
                    user_meta[info[1]][heading] = info[i+5]

In [14]:
diag_meta

{'20180904-045349_715_live': {'donation_amt': '0',
  'num_turns': '11',
  'intended_amt': None,
  'persuader': 'A3A07QA5U733HQ',
  'persuadee': 'A25L985XCNESXE'},
 '20180904-154250_98_live': {'donation_amt': '0',
  'num_turns': '10',
  'intended_amt': None,
  'persuader': 'A3GGA28ZY5CYTH',
  'persuadee': 'AG3ISFZMFGDQ9'},
 '20180904-024226_703_live': {'donation_amt': '0.05',
  'num_turns': '10',
  'intended_amt': None,
  'persuader': 'A22WWSTT8TU7G1',
  'persuadee': 'A2JQOU78XJA6VP'},
 '20180904-100019_870_live': {'donation_amt': '0',
  'num_turns': '11',
  'intended_amt': None,
  'persuader': 'A29NICQTS9B05U',
  'persuadee': 'A302K8B1H9ISJA'},
 '20180904-001208_706_live': {'donation_amt': '0',
  'num_turns': '11',
  'intended_amt': None,
  'persuader': 'A3LQ2F30FKC0CZ',
  'persuadee': 'A1RGD548VIVY96'},
 '20180904-073734_888_live': {'donation_amt': '1',
  'num_turns': '10',
  'intended_amt': None,
  'persuader': 'A37UAEDJV33QQ5',
  'persuadee': 'A2ZHSCEKPMUX9Y'},
 '20180904-133536_2_l

In [15]:
import pandas as pd
anno_info = pd.read_excel(anno_dir) 
anno_info

Unnamed: 0,B2,B3,B4,B5,B6,B7
0,20180717-200206_41_live,A272X64FOZFYLB,0,,0.50,10.0
1,20180717-200206_41_live,A3JLE2LJ5I17E2,1,0.50,0.50,10.0
2,20180719-120436_413_live,AU7A3QNJF3O00,0,,0.00,10.0
3,20180719-120436_413_live,A5NE8TWS8ZV7B,1,,0.00,10.0
4,20180719-122534_38_live,A125KW9P18V5Z1,0,,0.00,11.0
5,20180719-122534_38_live,A2T007HZK66WM,1,5.00,0.50,11.0
6,20180719-165941_192_live,A5NE8TWS8ZV7B,0,,0.00,10.0
7,20180719-165941_192_live,A290OV59Q76QC8,1,50.00,0.00,10.0
8,20180719-175233_833_live,A5NE8TWS8ZV7B,0,,0.00,10.0
9,20180719-175233_833_live,A2GV9WSNSPX53,1,,0.05,10.0


In [16]:
for line in anno_info.values:          
    if line[2] == 1:
        diag_meta[info[0]]["intended_amt"] = line[3]

In [17]:
corpus_users = {k: User(name = k, meta = v) for k,v in user_meta.items()}

In [18]:
print("number of users in the data = {0}".format(len(corpus_users)))

number of users in the data = 1285


## Creating Utterances

Utterances can be found in the full_dialog.csv, with 20932 lines total. The raw data has four columns: <br>
"Unit" is the text of the utterance.  <br>
"Turn" is the turn of the utterance; for example, the first back and forth between persuader and persuadee would both have "Turn" = 0.  <br>
"B2" is the conversation ID. <br>
"B4" is the role of the user in this utterance, persuader(0) or persuadee (1). <br>

As per the utterance object, we give each utterance a unique id and the unique user ID is found using a dictionary which takes in the conversation ID and role of user. The metadata for each utterance includes both the turn and role  of user. 

In [20]:
import pandas as pd
utterance_data = pd.read_csv("/Users/grace/PersuasionForGoodData/full_dialog.csv") 
utterance_data

Unnamed: 0.1,Unnamed: 0,Unit,Turn,B4,B2
0,0,Good morning. How are you doing today?,0,0,20180904-045349_715_live
1,1,Hi. I am doing good. How about you?,0,1,20180904-045349_715_live
2,2,I'm doing pretty good for a Tuesday morning.,1,0,20180904-045349_715_live
3,3,"Haha. Same here, but it really feels like a Mo...",1,1,20180904-045349_715_live
4,4,Ugh yes it does!,2,0,20180904-045349_715_live
5,5,I can not believe how warm it is already.,2,1,20180904-045349_715_live
6,6,Where are you from?,3,0,20180904-045349_715_live
7,7,I am from the Midwest. What about you?,3,1,20180904-045349_715_live
8,8,I'm from the South East. It's always warm here.,4,0,20180904-045349_715_live
9,9,"Oh, yep. You are definitely in for warm weathe...",4,1,20180904-045349_715_live


In [21]:
len(diag_to_user)

2034

In [22]:
utterance_corpus = {}
convo_order = utterance_data["Unnamed: 0"]
dia_to_id = {}

for i in list(range(len(utterance_data))):
    utterance_data_line = utterance_data.loc[i, :]
    B2 = utterance_data_line["B2"]
    B4 = utterance_data_line["B4"]
    Text = utterance_data_line["Unit"]
    meta = {"role": B4, "turn_index": utterance_data_line["Turn"], "dialogue_id" : B2, "er_label_1": None, "ee_label_1": None, "er_label_2": None, "er_label_2": None,"neg": None, "neu": None, "pos": None}
    
    if utterance_data_line["Unnamed: 0"] == 0:
        ReplyTo = None
    else:  
        ReplyTo = i - 1
    if utterance_data_line["Unnamed: 0"] == 0:
        Root = i
    else:
        utter_id = list(range(0, i+1))
        convo_order_sub = convo_order.loc[0:i]
        df = pd.DataFrame({"utter_id": utter_id, "convo_order_sub":convo_order_sub})
        df2 = df[df["convo_order_sub"] == 0]
        Root = df2['utter_id'].max()
    utterance_corpus[i] = Utterance(i, corpus_users[diag_to_user[(B2,str(B4))]], Root, ReplyTo, None, Text, meta=meta)
    dia_to_id[(B2,B4,utterance_data_line["Turn"])] = i

In [23]:
utterance_corpus[0].meta

{'role': 0,
 'turn_index': 0,
 'dialogue_id': '20180904-045349_715_live',
 'er_label_1': None,
 'ee_label_1': None,
 'er_label_2': None,
 'neg': None,
 'neu': None,
 'pos': None}

In [24]:
import numpy as np

In [31]:
anno_utterances = pd.read_excel("/Users/grace/PersuasionForGoodData/300_dialog.xlsx") 
anno_utterances.values[pd.isna(anno_utterances.values)] = None

In [32]:
anno_utterances

Unnamed: 0,B2,B4,Turn,Unit,er_label_1,ee_label_1,er_label_2,ee_label_2,neg,neu,pos
0,20180719-210146_172_live,0,0,Hello.,greeting,,,,0.000,1.000,0.000
1,20180719-210146_172_live,0,0,How are you?,greeting,,,,0.000,1.000,0.000
2,20180719-210146_172_live,1,0,"I'm good, how are you doing?",,greeting,,,0.000,0.633,0.367
3,20180719-210146_172_live,0,1,Very well.,greeting,,,,0.000,0.295,0.705
4,20180719-210146_172_live,0,1,I'm just up organizing info for my charity.,other,,,,0.000,0.714,0.286
5,20180719-210146_172_live,0,1,Are you involved with charities?,task-related-inquiry,,,,0.000,0.556,0.444
6,20180719-210146_172_live,1,1,Yes!,,positive-to-inquiry,,,0.000,0.000,1.000
7,20180719-210146_172_live,1,1,I work with children who have terminal illnesses.,,positive-to-inquiry,,,0.348,0.652,0.000
8,20180719-210146_172_live,1,1,What charity are you involved in?,,task-related-inquiry,,,0.000,0.641,0.359
9,20180719-210146_172_live,0,2,That's great!,acknowledgement,,,,0.000,0.185,0.815


In [33]:
for line in anno_utterances.values:
    if not pd.isna(line[4]):
        utterance_corpus[dia_to_id[(line[0],line[1],line[2])]].meta["er_label_1"] = line[4]
    if not pd.isna(line[5]):
        utterance_corpus[dia_to_id[(line[0],line[1],line[2])]].meta["e2_label_1"] = line[5]
    if not pd.isna(line[6]):
        utterance_corpus[dia_to_id[(line[0],line[1],line[2])]].meta["er_label_2"] = line[6]
    if not pd.isna(line[7]):
        utterance_corpus[dia_to_id[(line[0],line[1],line[2])]].meta["ee_label_2"] = line[7]
    if not pd.isna(line[8]):
        utterance_corpus[dia_to_id[(line[0],line[1],line[2])]].meta["neg"] = line[8]
    if not pd.isna(line[9]):
        utterance_corpus[dia_to_id[(line[0],line[1],line[2])]].meta["neu"] = line[9]
    if not pd.isna(line[10]):
        utterance_corpus[dia_to_id[(line[0],line[1],line[2])]].meta["pos"] = line[10]
        

##  Creating corpus from utterances

In [34]:
utterance_list = [utterance for k, utterance in utterance_corpus.items()]

In [36]:
persuasion_corpus = Corpus(utterances=utterance_list, version=1)

In [37]:
print("number of conversations in the dataset = {}".format(len(persuasion_corpus.get_conversation_ids())))

number of conversations in the dataset = 1017


In [38]:
convo_ids = persuasion_corpus.get_conversation_ids()
for i, convo_idx in enumerate(convo_ids[0:5]):
    print("sample conversation {}:".format(i))
    print(persuasion_corpus.get_conversation(convo_idx).get_utterance_ids())

sample conversation 0:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
sample conversation 1:
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
sample conversation 2:
[41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]
sample conversation 3:
[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81]
sample conversation 4:
[82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]


In [40]:
for conversation in persuasion_corpus.conversations.values():
    conversation.meta = diag_meta[conversation.get_utterance(conversation._utterance_ids[0]).meta['dialogue_id']]

In [42]:
#persuasion_corpus.dump("persuasion_corpus","./")

In [43]:
persuasion_corpus

<convokit.model.corpus.Corpus at 0x1a1c354080>

## D1 Summary Stats

Summary statistics (number of dialogues and participants) as well as replicating some stats from Table 2 of paper to demonstrate integrity of dataset.

In [50]:
#Dialogues in Table 2, matches
len(persuasion_corpus.get_conversation_ids())

1017

In [56]:
#Participants in Table 2, matches
len(persuasion_corpus.all_users)

1285

In [95]:
donations = []
turns = []
allconvoids = persuasion_corpus.get_conversation_ids()
for i in list(range(0, len(allconvoids))):
    convo_id = allconvoids[i]
    temp = persuasion_corpus.get_conversation(convo_id).meta["donation_amt"]
    convo_turn = persuasion_corpus.get_conversation(convo_id).meta["num_turns"]
    donations.append(float(temp))
    turns.append(float(convo_turn))

In [86]:
len(donations) #should be same as number of dialogues, matches

1017

#### Mean/Median/Min/Max of donation amounts and number of turns per conversation in full dataset

In [96]:
import numpy as np
print(np.mean(donations), np.median(donations), min(donations), max(donations)) #average donation, doesn't match
print(np.mean(turns), np.median(turns), min(turns), max(turns)) #average turns, matches paper

2.429390363815142 0.0 0.0 500.0
10.420845624385448 10.0 4.0 15.0


In [137]:
words = []
wordscount = []
allwords = []
allutterids = persuasion_corpus.get_utterance_ids()
for i in list(range(0, len(allutterids))):
    utter_id = allutterids[i]
    text = persuasion_corpus.get_utterance(utter_id).text
    words.append(text) #gives text of each utterance
    #print(text.split())
    wordscount.append(len(text.split())) #gives word count of each utterance
    allwords = allwords + text.split() #adds utterance words to a list of all words

In [128]:
print(np.mean(wordscount), np.median(wordscount), min(wordscount), max(wordscount)) #avg words per utterance, does not match

16.804844257596024 13.0 1 100


In [139]:
len(allwords) 

351759

In [141]:
len(np.unique(allwords)) #total unique tokens, does not match

17162

In [144]:
persuasion_corpus.get_utterance(2).meta["role"]

0

In [145]:
persuadee_words = []
persuader_words = []
for i in list(range(0, len(allutterids))):
    utter_id = allutterids[i]
    role = persuasion_corpus.get_utterance(utter_id).meta["role"]
    text = persuasion_corpus.get_utterance(utter_id).text
    if role == 1:
        persuadee_words.append(len(text.split()))
    if role == 0:
        persuader_words.append(len(text.split()))

In [146]:
#average words per utterance for persuadee, does not match
print(np.mean(persuadee_words), np.median(persuadee_words), min(persuadee_words), max(persuadee_words))
#average words per utterance for persuader, does not match
print(np.mean(persuader_words), np.median(persuader_words), min(persuader_words), max(persuader_words))



13.470286488579172 10.0 1 98
20.05509433962264 16.0 1 100


## Our statistics vs. Table 2 in Paper


| Statistic     | In Paper        | Ours  | 
| :------------- |:-------------:| :-----:| 
| Dialogues  | 1017 | 1017 | 
| Participants | 1285  |  1285 |
| Avg Donation | \$0.35 | $2.43 |
| Avg Turns Per Dialogue  | 10.43 | 10.43 |
| Avg Words Per Utterance  | 19.36 | 16.80 |
| Total Unique Tokens (words?) | 8141  |  17162 |
| Avg Words Per Utterance (Persuadee)  | 15.65 | 13.47 |
| Avg Words Per Utterance (Persuader)  | 22.96 | 20.06 |

