In [49]:
from tqdm import tqdm
from convokit import Corpus, User, Utterance

## Creating Users

In [50]:
data_dir = "./data/FullData/full_info.csv"

The full PersuasionForGood dataset contains two files:
    full_info.csv
    full_dialog.csv
The metadata for each of the users are contained in the full_info.csv file. 
The first five columns of each line include information about the dialogue that the user is a part of. Because a user can be involved in multiple dialogues, each dialogue is a key within the user metadata which retreives the role, donation amount and number of turn associated with the user within a specific dialogue. The rest of the entries correspond to the 23 dimensional feature vector associated with each user. 

Additionally, the dictionary diag_to_user is used so that the users can be searched with just a dialogue ID and role. This is because the full_dialog.csv file only has the dialogue id and not the user_ids of the participating users. 

In [58]:
user_meta = {}
first = True
headers = None
diag_to_user = {}
with open(data_dir, "r", encoding='utf-8', errors='ignore') as f:
    for line in f:          
        info = line.split(",")
        if first:
            #Getting the headers of the personality vector
            headers = info[5:]
            first = False
        else:
            diag_to_user[(info[0],info[2])] = info[1]
            if info[1] in user_meta:
                user_meta[info[1]][info[0]] = {"role": info[2],
                                       "donation_amt": info[3],
                                       "num_turns": info[4]}
            else:
                user_meta[info[1]] = {info[0]: {"role": info[2],
                                       "donation_amt": info[3],
                                       "num_turns": info[4]}}
                for i, heading in enumerate(headers):
                    user_meta[info[1]][heading] = info[i+5]
        

In [59]:
corpus_users = {k: User(name = k, meta = v) for k,v in user_meta.items()}

In [60]:
print("number of users in the data = {0}".format(len(corpus_users)))

number of users in the data = 1285


In [84]:
headers

['extrovert.x',
 'agreeable.x',
 'conscientious.x',
 'neurotic.x',
 'open.x',
 'care.x',
 'fairness.x',
 'loyalty.x',
 'authority.x',
 'purity.x',
 'freedom.x',
 'conform.x',
 'tradition.x',
 'benevolence.x',
 'universalism.x',
 'self_direction.x',
 'stimulation.x',
 'hedonism.x',
 'achievement.x',
 'power.x',
 'security.x',
 'rational.x',
 'intuitive.x',
 'age.x',
 'sex.x',
 'race.x',
 'edu.x',
 'marital.x',
 'employment.x',
 'income.x',
 'religion.x',
 'ideology.x\n']

## Creating Utterances

Utterances can be found in the full_dialog.csv, with 20932 lines total. The raw data has four columns: <br>
"Unit" is the text of the utterance.  <br>
"Turn" is the turn of the utterance; for example, the first back and forth between persuader and persuadee would both have "Turn" = 0.  <br>
"B2" is the conversation ID. <br>
"B4" is the role of the user in this utterance, persuader(0) or persuadee (1). <br>

As per the utterance object, we give each utterance a unique id and the unique user ID is found using a dictionary which takes in the conversation ID and role of user. The metadata for each utterance includes both the turn and role  of user. 

In [61]:
import pandas as pd
utterance_data = pd.read_csv("./data/FullData/full_dialog.csv") 
utterance_data

Unnamed: 0.1,Unnamed: 0,Unit,Turn,B4,B2
0,0,Good morning. How are you doing today?,0,0,20180904-045349_715_live
1,1,Hi. I am doing good. How about you?,0,1,20180904-045349_715_live
2,2,I'm doing pretty good for a Tuesday morning.,1,0,20180904-045349_715_live
3,3,"Haha. Same here, but it really feels like a Mo...",1,1,20180904-045349_715_live
4,4,Ugh yes it does!,2,0,20180904-045349_715_live
...,...,...,...,...,...
20927,15,My small change won't do a thing for those kid...,7,1,20180717-200206_41_live
20928,16,Well with our current president it's not likel...,8,0,20180717-200206_41_live
20929,17,"If it makes you happy, go ahead and take .50",8,1,20180717-200206_41_live
20930,18,"Oh thank you so much, you have no idea how muc...",9,0,20180717-200206_41_live


In [62]:
len(diag_to_user)

2034

In [85]:
utterance_corpus = {}
convo_order = utterance_data["Unnamed: 0"]

for i in list(range(len(utterance_data))):
    utterance_data_line = utterance_data.loc[i, :]
    B2 = utterance_data_line["B2"]
    B4 = utterance_data_line["B4"]
    Text = utterance_data_line["Unit"]
    meta = {"role": str(B4), "num_turns": utterance_data_line["Turn"], "dialogue_id" : B2}
    
    if utterance_data_line["Unnamed: 0"] == 0:
        ReplyTo = None
    else:  
        ReplyTo = i - 1
    if utterance_data_line["Unnamed: 0"] == 0:
        Root = i
    else:
        utter_id = list(range(0, i+1))
        convo_order_sub = convo_order.loc[0:i]
        df = pd.DataFrame({"utter_id": utter_id, "convo_order_sub":convo_order_sub})
        df2 = df[df["convo_order_sub"] == 0]
        Root = df2['utter_id'].max()
    utterance_corpus[i] = Utterance(i, corpus_users[diag_to_user[(B2,str(B4))]], Root, ReplyTo, None, Text, meta=meta)

In [89]:
utterance_corpus[0].meta

{'dialogue_id': '20180904-045349_715_live', 'num_turns': 0, 'role': '0'}

In [65]:
len(utterance_corpus)

20932

##  Creating corpus from utterances

In [94]:
utterance_list = [utterance for k, utterance in utterance_corpus.items()]

In [95]:
persuasion_corpus = Corpus(utterances=utterance_list, version=1)

In [96]:
print("number of conversations in the dataset = {}".format(len(persuasion_corpus.get_conversation_ids())))

number of conversations in the dataset = 1017


In [97]:
convo_ids = persuasion_corpus.get_conversation_ids()
for i, convo_idx in enumerate(convo_ids[0:5]):
    print("sample conversation {}:".format(i))
    print(persuasion_corpus.get_conversation(convo_idx).get_utterance_ids())

sample conversation 0:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
sample conversation 1:
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
sample conversation 2:
[41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]
sample conversation 3:
[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81]
sample conversation 4:
[82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]


In [98]:
persuasion_corpus.dump("persuasion_corpus","./")