In [1]:
import os
import pandas as pd
import numpy as np

from convokit import Corpus, Speaker, Utterance


`ROOT_DIR` is a directory containing all files used in the original dataset, found [here](https://gitlab.com/ucdavisnlp/persuasionforgood/tree/master/data). Replace it with your own directory.

In [2]:
# Folder holding the raw dataset files; point this at your local copy.
ROOT_DIR = "<YOUR DIRECTORY>"

Converting user-level information. We'll assume that each user, across all the conversations they've participated in, will have the same survey information.

In [3]:
# Load full_info.csv (user-level information) from the data folder.
user_df = pd.read_csv(
    os.path.join(ROOT_DIR, 'full_info.csv'),
)

In [4]:
# Drop the '.x' fragment from every column name.
user_df.columns = [name.replace('.x', '') for name in user_df.columns]

In [5]:
# Every column whose name does not begin with 'B' is treated as a survey column.
survey_cols = list(filter(lambda name: not name.startswith('B'), user_df.columns))

In [6]:
# Give the leading five columns descriptive names; survey columns keep theirs.
# assumes the non-survey ('B...') columns are exactly the first five — TODO confirm
user_df.columns = ['dialogue_id', 'speaker', 'role', 'donation', 'n_turns', *survey_cols]

In [7]:
# Preview the renamed user table.
# NOTE(review): the saved output shows the id column as `user`, while the code
# above names it `speaker`; the output looks stale.
user_df.head()

Unnamed: 0,dialogue_id,user,role,donation,n_turns,extrovert,agreeable,conscientious,neurotic,open,...,intuitive,age,sex,race,edu,marital,employment,income,religion,ideology
0,20180904-045349_715_live,A3A07QA5U733HQ,0,0.0,11,3.2,3.2,3.6,1.6,3.6,...,2.0,34.0,Male,White,Less than four-year college,Unmarried,Employed for wages,5.0,Other religion,Liberal
1,20180904-045349_715_live,A25L985XCNESXE,1,0.0,11,3.2,4.0,3.8,2.0,3.2,...,4.0,50.0,Female,White,Less than four-year college,Married,Employed for wages,10.0,Protestant,Conservative
2,20180904-154250_98_live,A3GGA28ZY5CYTH,0,0.0,10,3.6,3.8,4.0,4.0,3.6,...,4.0,25.0,Female,White,Four-year college,Married,Other,6.0,Atheist,Moderate
3,20180904-154250_98_live,AG3ISFZMFGDQ9,1,2.0,10,3.0,3.6,3.2,3.0,3.0,...,2.0,30.0,Male,White,Less than four-year college,Unmarried,Employed for wages,2.0,Atheist,Liberal
4,20180904-024226_703_live,A22WWSTT8TU7G1,0,0.05,10,1.0,5.0,2.8,1.0,4.6,...,1.0,36.0,Female,White,Less than four-year college,Unmarried,Employed for wages,1.0,Catholic,Moderate


In [8]:
# Map each speaker id to a dict of survey answers, using the first row seen per speaker.
user_meta_dict = (
    user_df
    .drop_duplicates(subset='speaker')
    .set_index('speaker')
    .loc[:, survey_cols]
    .to_dict(orient='index')
)

In [9]:
# One convokit Speaker per user id, carrying that user's survey answers as metadata.
corpus_speakers = {
    speaker_id: Speaker(id=speaker_id, meta=survey_answers)
    for speaker_id, survey_answers in user_meta_dict.items()
}

we'll also keep track of which users are involved in which dialogue.

In [10]:
# Split users by role (0 = persuader "er", 1 = persuadee "ee"), keyed by dialogue.
# The id column is renamed from 'speaker' to 'user' so that the suffixed join of
# these two frames yields 'user_er' / 'user_ee' -- the column names the later
# cells select. Without the rename the join produces 'speaker_er' / 'speaker_ee'
# and those selections raise KeyError on a fresh run.
er_info = (
    user_df[user_df.role == 0]
    .set_index('dialogue_id')[['speaker', 'donation']]
    .rename(columns={'speaker': 'user'})
)
ee_info = (
    user_df[user_df.role == 1]
    .set_index('dialogue_id')[['speaker', 'donation']]
    .rename(columns={'speaker': 'user'})
)

In [11]:
# Persuader and persuadee info side by side per dialogue; clashing column
# names get the '_er' / '_ee' suffixes.
convo_info = er_info.join(other=ee_info, how='left', lsuffix='_er', rsuffix='_ee')

In [12]:
# Preview the per-dialogue persuader/persuadee table.
convo_info.head()

Unnamed: 0_level_0,user_er,donation_er,user_ee,donation_ee
dialogue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20180904-045349_715_live,A3A07QA5U733HQ,0.0,A25L985XCNESXE,0.0
20180904-154250_98_live,A3GGA28ZY5CYTH,0.0,AG3ISFZMFGDQ9,2.0
20180904-024226_703_live,A22WWSTT8TU7G1,0.05,A2JQOU78XJA6VP,0.05
20180904-100019_870_live,A29NICQTS9B05U,0.0,A302K8B1H9ISJA,0.0
20180904-001208_706_live,A3LQ2F30FKC0CZ,0.0,A1RGD548VIVY96,1.0


Converting utterances.

In [13]:
# Load full_dialog.csv (the utterance table) from the data folder.
utt_df = pd.read_csv(
    os.path.join(ROOT_DIR, 'full_dialog.csv'),
)

In [14]:
# Replace the raw headers with descriptive names (assignment is positional).
utt_df.columns = [
    'turn_id',
    'text',
    'user_turn_id',
    'role',
    'dialogue_id',
]

In [15]:
# Preview the renamed utterance table.
utt_df.head()

Unnamed: 0,turn_id,text,user_turn_id,role,dialogue_id
0,0,Good morning. How are you doing today?,0,0,20180904-045349_715_live
1,1,Hi. I am doing good. How about you?,0,1,20180904-045349_715_live
2,2,I'm doing pretty good for a Tuesday morning.,1,0,20180904-045349_715_live
3,3,"Haha. Same here, but it really feels like a Mo...",1,1,20180904-045349_715_live
4,4,Ugh yes it does!,2,0,20180904-045349_715_live


Here, we get the user_ids of the author of each utterance.

In [16]:
# Attach the persuader and persuadee ids of each utterance's dialogue.
utt_df = utt_df.join(
    convo_info.loc[:, ['user_er', 'user_ee']],
    on='dialogue_id',
    how='left',
)

In [17]:
# Per row, take column 0 (user_er) or column 1 (user_ee) of the id pair,
# using the row's role value as the column index.
utt_df['speaker'] = utt_df.loc[:, ['user_er', 'user_ee']].values[
    np.arange(utt_df.shape[0]), utt_df['role'].values
]

Since ConvoKit conversation ids use the id of the root utterance (i.e., the first utterance of the conversation), we'll keep track of the mapping from dialogue_ids to ConvoKit's conversation ids.

In [18]:
# Use the row index as the utterance id.
utt_df = utt_df.assign(id=utt_df.index)

In [19]:
# The smallest utterance id within a dialogue serves as that dialogue's conversation id.
dialogue_to_convo_id = utt_df.groupby('dialogue_id')['id'].agg('min')

In [20]:
# Show the lowest conversation ids; each is the smallest utterance id in its dialogue.
dialogue_to_convo_id.sort_values().head()

dialogue_id
20180904-045349_715_live     0
20180904-154250_98_live     21
20180904-024226_703_live    41
20180904-100019_870_live    61
20180904-001208_706_live    82
Name: id, dtype: int64

Add the reply-to link and the conversation id (the id of the root utterance) for each utterance.

In [21]:
# Each utterance provisionally replies to the row directly above it.
utt_df['reply_to'] = utt_df['id'].shift(periods=1)

In [22]:
# Rows with turn_id 0 open a dialogue, so they reply to nothing.
utt_df.loc[utt_df['turn_id'].eq(0), 'reply_to'] = np.nan

In [23]:
# Look up each utterance's conversation id from its dialogue id.
utt_df = utt_df.assign(
    conversation_id=utt_df['dialogue_id'].map(dialogue_to_convo_id)
)

In [24]:
# Preview the utterances with the new reply_to and conversation_id columns.
utt_df.head()

Unnamed: 0,turn_id,text,user_turn_id,role,dialogue_id,user_er,user_ee,user,id,reply_to,root
0,0,Good morning. How are you doing today?,0,0,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A3A07QA5U733HQ,0,,0
1,1,Hi. I am doing good. How about you?,0,1,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A25L985XCNESXE,1,0.0,0
2,2,I'm doing pretty good for a Tuesday morning.,1,0,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A3A07QA5U733HQ,2,1.0,0
3,3,"Haha. Same here, but it really feels like a Mo...",1,1,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A25L985XCNESXE,3,2.0,0
4,4,Ugh yes it does!,2,0,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A3A07QA5U733HQ,4,3.0,0


In [25]:
# NOTE(review): same preview as the previous cell; likely a leftover duplicate.
utt_df.head()

Unnamed: 0,turn_id,text,user_turn_id,role,dialogue_id,user_er,user_ee,user,id,reply_to,root
0,0,Good morning. How are you doing today?,0,0,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A3A07QA5U733HQ,0,,0
1,1,Hi. I am doing good. How about you?,0,1,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A25L985XCNESXE,1,0.0,0
2,2,I'm doing pretty good for a Tuesday morning.,1,0,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A3A07QA5U733HQ,2,1.0,0
3,3,"Haha. Same here, but it really feels like a Mo...",1,1,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A25L985XCNESXE,3,2.0,0
4,4,Ugh yes it does!,2,0,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A3A07QA5U733HQ,4,3.0,0


Keep track of intended donations. Note that these are only available for persuadees (`role==1`) in the 300 manually-annotated dialogues.

In [26]:
# Load 300_info.xlsx (donation info for the annotated dialogues).
intended_df = pd.read_excel(
    os.path.join(ROOT_DIR, '300_info.xlsx'),
)

In [27]:
# Replace the raw headers with descriptive names (assignment is positional).
intended_df.columns = [
    'dialogue_id',
    'speaker',
    'role',
    'intended',
    'actual',
    'n_turns',
]

In [28]:
# Keep only the persuadee rows (role 1) and key them by dialogue.
intended_df = intended_df.loc[intended_df['role'] == 1].set_index('dialogue_id')

In [29]:
# Preview the persuadee donation table for the annotated dialogues.
intended_df.head()

Unnamed: 0_level_0,user,role,intended,actual,n_turns
dialogue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20180717-200206_41_live,A3JLE2LJ5I17E2,1,0.5,0.5,10.0
20180719-120436_413_live,A5NE8TWS8ZV7B,1,,0.0,10.0
20180719-122534_38_live,A2T007HZK66WM,1,5.0,0.5,11.0
20180719-165941_192_live,A290OV59Q76QC8,1,50.0,0.0,10.0
20180719-175233_833_live,A2GV9WSNSPX53,1,,0.05,10.0


In [30]:
# Add the persuadee's intended donation to each dialogue (NaN where unavailable).
convo_info = convo_info.join(intended_df['intended'], how='left')

In [31]:
# Flag dialogues that belong to the annotated subset, i.e. those present in
# intended_df (indexed by dialogue_id). Membership is used rather than
# `intended.notnull()` because annotated dialogues can have a missing intended
# donation; those would otherwise be mislabelled as not annotated.
convo_info['is_annotated'] = convo_info.index.isin(intended_df.index)

In [32]:
# Preview the dialogue table with the intended donation and annotation flag.
convo_info.head()

Unnamed: 0_level_0,user_er,donation_er,user_ee,donation_ee,intended,is_annotated
dialogue_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20180904-045349_715_live,A3A07QA5U733HQ,0.0,A25L985XCNESXE,0.0,0.2,True
20180904-154250_98_live,A3GGA28ZY5CYTH,0.0,AG3ISFZMFGDQ9,2.0,,False
20180904-024226_703_live,A22WWSTT8TU7G1,0.05,A2JQOU78XJA6VP,0.05,0.05,True
20180904-100019_870_live,A29NICQTS9B05U,0.0,A302K8B1H9ISJA,0.0,,False
20180904-001208_706_live,A3LQ2F30FKC0CZ,0.0,A1RGD548VIVY96,1.0,,False


Reads the annotated subset. Note that annotations are per sentence. We'll store these as a list of annotations; we'll also keep around the sentence-tokenized text for these annotated utterances to facilitate corresponding the annotations with the texts.

In [33]:
# Load the sentence-level annotations; the first spreadsheet column becomes the index.
annot_df = pd.read_excel(
    os.path.join(ROOT_DIR, 'data_AnnotatedData_300_dialog.xlsx'),
    index_col=0,
)

In [34]:
# Replace the raw headers with descriptive names (assignment is positional).
annot_df.columns = [
    'dialogue_id', 'role', 'user_turn_id', 'text',
    'er_label_1', 'ee_label_1',
    'er_label_2', 'ee_label_2',
    'neg', 'neu', 'pos',
]

In [35]:
# Composite key "<dialogue_id>__<role>__<user_turn_id>" shared by all sentences of one utterance.
annot_df['turn_id'] = (
    annot_df['dialogue_id']
    + '__' + annot_df['role'].map(str)
    + '__' + annot_df['user_turn_id'].map(str)
)

In [36]:
# Collect the per-sentence labels and sentiment scores of each utterance into lists.
agg_annots = (
    annot_df
    .groupby('turn_id')[[
        'er_label_1', 'er_label_2', 'ee_label_1', 'ee_label_2',
        'neg', 'neu', 'pos',
    ]]
    .aggregate(list)
)

In [37]:
# Join the sentences of each utterance into one string, separated by ' <s> '.
agg_sents = (
    annot_df
    .groupby('turn_id')['text']
    .agg(lambda sentences: ' <s> '.join(sentences))
    .rename('text_by_sent')
)

In [38]:
# Number of annotated sentences per utterance key.
sents_per_utt = annot_df['turn_id'].value_counts().rename('n_sents')

Join the annotations to the utterances. 

In [39]:
# Build the same "<dialogue_id>__<role>__<user_turn_id>" key on the utterance table.
# Note: this overwrites the original turn_id column.
utt_df['turn_id'] = (
    utt_df['dialogue_id']
    + '__' + utt_df['role'].map(str)
    + '__' + utt_df['user_turn_id'].map(str)
)

In [40]:
# Attach the label lists, sentence-split text and sentence counts to each
# utterance; utterances without annotations get NaN in these columns.
utt_df = (
    utt_df
    .join(agg_annots, on='turn_id')
    .join(agg_sents, on='turn_id')
    .join(sents_per_utt, on='turn_id')
)

In [41]:
# For each row keep the label column that matches the speaker's role:
# column 0 (er_*) when role is 0, column 1 (ee_*) when role is 1.
utt_df['label_1'] = utt_df.loc[:, ['er_label_1', 'ee_label_1']].values[
    np.arange(utt_df.shape[0]), utt_df['role'].values
]
utt_df['label_2'] = utt_df.loc[:, ['er_label_2', 'ee_label_2']].values[
    np.arange(utt_df.shape[0]), utt_df['role'].values
]

In [42]:
# Preview the utterance table with the joined annotation columns.
utt_df.head()

Unnamed: 0,turn_id,text,user_turn_id,role,dialogue_id,user_er,user_ee,user,id,reply_to,...,er_label_2,ee_label_1,ee_label_2,neg,neu,pos,text_by_sent,n_sents,label_1,label_2
0,20180904-045349_715_live__0__0,Good morning. How are you doing today?,0,0,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A3A07QA5U733HQ,0,,...,"[nan, nan]","[nan, nan]","[nan, nan]","[0.0, 0.0]","[0.256, 1.0]","[0.7440000000000001, 0.0]",Good morning. <s> How are you doing today?,2.0,"[greeting, greeting]","[nan, nan]"
1,20180904-045349_715_live__1__0,Hi. I am doing good. How about you?,0,1,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A25L985XCNESXE,1,0.0,...,"[nan, nan, nan]","[greeting, greeting, greeting]","[nan, nan, nan]","[0.0, 0.0, 0.0]","[1.0, 0.408, 1.0]","[0.0, 0.5920000000000001, 0.0]",Hi. <s> I am doing good. <s> How about you?,3.0,"[greeting, greeting, greeting]","[nan, nan, nan]"
2,20180904-045349_715_live__0__1,I'm doing pretty good for a Tuesday morning.,1,0,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A3A07QA5U733HQ,2,1.0,...,[nan],[nan],[nan],[0.0],[0.45],[0.55],I'm doing pretty good for a Tuesday morning.,1.0,[greeting],[nan]
3,20180904-045349_715_live__1__1,"Haha. Same here, but it really feels like a Mo...",1,1,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A25L985XCNESXE,3,2.0,...,"[nan, nan]","[other, off-task]","[nan, nan]","[0.0, 0.0]","[0.0, 0.6559999999999999]","[1.0, 0.344]","Haha. <s> Same here, but it really feels like ...",2.0,"[other, off-task]","[nan, nan]"
4,20180904-045349_715_live__0__2,Ugh yes it does!,2,0,20180904-045349_715_live,A3A07QA5U733HQ,A25L985XCNESXE,A3A07QA5U733HQ,4,3.0,...,[nan],[nan],[nan],[0.397],[0.257],[0.347],Ugh yes it does!,1.0,[off-task],[nan]


We're now ready to construct a new Corpus object given the utterances. Note a gotcha (to be fixed or documented in a future release) -- to facilitate reading the corpus back into memory later, utterance and hence conversation IDs must be stored as strings.

In [66]:
# Helpers for float columns that contain NaN: they turn a possibly-missing
# numeric value into a clean Python value (or None) before it is handed to convokit.
def safe_convert_str(x):
    """Return `x` as the string of its integer value, or None if `x` is missing.

    Parameters
    ----------
    x : number, NaN or None
        Value to convert; non-missing values are truncated with `int()`.

    Returns
    -------
    str or None
    """
    if pd.isna(x):
        return None
    return str(int(x))


def safe_convert_int(x):
    """Return `x` as a Python int, or None if `x` is missing.

    Defined here because the utterance-building cell calls it; previously the
    name only existed in leftover kernel state.

    Parameters
    ----------
    x : number, NaN or None
        Value to convert; non-missing values are truncated with `int()`.

    Returns
    -------
    int or None
    """
    if pd.isna(x):
        return None
    return int(x)

In [67]:
# Turn every row of utt_df into a convokit Utterance.
# Utterance id, conversation id and reply_to are all passed as strings so the
# corpus can be read back from disk later.
utterances = []
for utt_id, row in utt_df.iterrows():
    utterance = Utterance(
        id=str(utt_id),
        speaker=corpus_speakers[row.speaker],
        # The root-utterance id is stored in the 'conversation_id' column
        # (utt_df has no 'root' column, so `row.root` raised AttributeError).
        conversation_id=str(row['conversation_id']),
        # NaN reply_to (first utterance of a dialogue) becomes None.
        reply_to=safe_convert_str(row.reply_to),
        text=row.text,
        meta={
            'user_turn_id': row.user_turn_id,
            'role': row.role,
            'text_by_sent': row.text_by_sent,
            'n_sents': safe_convert_int(row.n_sents),
            'label_1': row.label_1,
            'label_2': row.label_2,
            'sentiment': {'neg': row.neg, 'neu': row.neu, 'pos': row.pos},
        },
    )
    utterances.append(utterance)

In [68]:
# Spot-check one utterance; per the saved output this one carries annotations.
utterances[13]

Utterance({'id': '13', 'speaker': Speaker([('id', 'A25L985XCNESXE')]), 'conversation_id': '0', 'reply_to': '12', 'timestamp': None, 'text': 'Yes, I am sure you get a lot of storms.', 'meta': {'user_turn_id': 6, 'role': 1, 'text_by_sent': 'Yes, I am sure you get a lot of storms.', 'n_sents': 1, 'label_1': ['off-task'], 'label_2': [nan], 'sentiment': {'neg': [0.0], 'neu': [0.545], 'pos': [0.455]}}})

In [69]:
# Spot-check another utterance; per the saved output this one has no annotations (NaN / None fields).
utterances[1993]

Utterance({'id': '1993', 'speaker': Speaker([('id', 'A3VYD9XUBY7RVO')]), 'conversation_id': '1980', 'reply_to': '1992', 'timestamp': None, 'text': "That's great! Although charity is a good cause, I want to feel like my time is being valued so I want to leave $1 for myself", 'meta': {'user_turn_id': 6, 'role': 1, 'text_by_sent': nan, 'n_sents': None, 'label_1': nan, 'label_2': nan, 'sentiment': {'neg': nan, 'neu': nan, 'pos': nan}}})

In [70]:
# Assemble the corpus from the utterance list.
corpus = Corpus(utterances=utterances, version=1)

In [72]:
# Record a human-readable corpus name in the corpus-level metadata.
corpus.meta['name'] = "Persuasion For Good Corpus"

In [73]:
# Print speaker / utterance / conversation counts as a quick sanity check.
corpus.print_summary_stats()

Number of Speakers: 1285
Number of Utterances: 20932
Number of Conversations: 1017


update convo-level metadata

In [51]:
# Attach each dialogue's conversation id (both sides are indexed by dialogue_id).
convo_info = convo_info.join(
    dialogue_to_convo_id.rename('convo_id'),
    how='left',
)

In [52]:
# Keep the dialogue id as a regular column, then re-key the table by conversation id.
convo_info = (
    convo_info
    .assign(dialogue_id=convo_info.index)
    .set_index('convo_id')
)

In [53]:
# {conversation id -> {column name -> value}} for direct lookup when filling conversation metadata.
convo_info_dict = convo_info.to_dict(orient='index')

In [75]:
# Copy the per-dialogue info onto every conversation. Corpus conversation ids
# are strings, so each is cast back to int for the dictionary lookup.
for convo_id in corpus.get_conversation_ids():
    corpus.get_conversation(convo_id).meta = convo_info_dict[int(convo_id)]

finally, write to disk.

In [76]:
# Write the corpus to disk under the name 'persuasionforgood_corpus'.
# NOTE(review): base_path is os.path.dirname(ROOT_DIR), which is the parent of
# ROOT_DIR when ROOT_DIR has no trailing slash but ROOT_DIR itself when it ends
# with one — confirm which location is intended.
corpus.dump('persuasionforgood_corpus', base_path=os.path.dirname(ROOT_DIR))