In [1]:
import os
import convokit
from convokit import Corpus, Parser, PolitenessStrategies, download
import timeit
import re
from numpy import mean
from scipy import stats

In [None]:
# Fetch the 'winning-args-corpus' dataset through convokit's download helper and load it as a Corpus.
corpus = Corpus(download('winning-args-corpus'))

In [3]:
# Ids of every utterance in the full corpus; the cells below iterate over these.
utterance_ids = corpus.get_utterance_ids()

In [4]:
# Total number of utterance ids in the full corpus.
len(utterance_ids)

293297

In [5]:
# Tally the labelled utterances in the full corpus:
#   s counts utterances whose meta['success'] equals 1 (reported as successful),
#   u counts utterances whose meta['success'] equals 0 (reported as unsuccessful).
# Utterances carrying any other value are not counted.
s = 0
u = 0
for utt_id in utterance_ids:
    label = corpus.get_utterance(utt_id).meta['success']
    if label == 0:
        u += 1
    if label == 1:
        s += 1

In [6]:
# Report the two tallies (s = success label 1, u = success label 0).
print('number of successful comments: ', s, sep='')
print('number of unsuccessful comments: ', u, sep='')

number of successful comments: 12420
number of unsuccessful comments: 7294


In [7]:
# Collect every pair id attached to any utterance (duplicates included),
# then report how many distinct pair ids exist in the full corpus.
z = []
for iD in utterance_ids:
    z.extend(corpus.get_utterance(iD).meta['pair_ids'])
print('the number of unique pair_ids is: ' + str(len(set(z))))

the number of unique pair_ids is: 4263


Since this corpus is too large to parse on my laptop, we select only the utterance ids for the groups of utterances that we are interested in. The following code finds the utterance ids for OP's comments (including the original post) and the challengers' comments (for both successful and unsuccessful arguments). Every other comment in the thread is excluded.

Note: this subset of data is still larger than the 'pair_data.json' in the data provided by the changemyview paper (see readme for citation) because I have also matched the OP replies to the challenger's comments.

In [8]:
#we want the original post made by op, the challenger's comments and all of OP's responses to the challengers
#opPost, challengerComments and opReplies hold the utterance ids for those three groups respectively.
#challengerPos/challengerNeg and opReplyPos/opReplyNeg split the last two groups by meta['success']
#(1 -> Pos, 0 -> Neg). Non-root utterances whose 'success' is neither 0 nor 1 are left out of every list.
#Everything is gathered in one pass with a single lookup per utterance; list order follows utterance_ids.

opPost=[]
challengerComments=[]
opReplies=[]
#subset challenger and op replies for later use (into successful and unsuccessful arguments)
challengerPos=[]
challengerNeg=[]
#these are OP's replies to successful and unsuccessful challengers
opReplyPos=[]
opReplyNeg=[]

for iD in utterance_ids:
    utt=corpus.get_utterance(iD) #look the utterance up once instead of once per condition

    if utt.id==utt.root:
        #the utterance that starts the thread is the original post; it cannot also be a challenger comment or a reply
        opPost.append(iD)
        continue

    #an utterance belongs to OP when its speaker matches the speaker of the thread's root utterance
    byOp=utt.speaker.id==corpus.get_utterance(utt.root).speaker.id
    success=utt.meta['success']

    if success==1:
        if byOp:
            opReplies.append(iD)
            opReplyPos.append(iD)
        else:
            challengerComments.append(iD)
            challengerPos.append(iD)
    elif success==0:
        if byOp:
            opReplies.append(iD)
            opReplyNeg.append(iD)
        else:
            challengerComments.append(iD)
            challengerNeg.append(iD)

In [9]:
# All utterance ids kept for the subset: original posts, then challenger comments, then OP replies.
subset=opPost+challengerComments+opReplies

In [10]:
#resolve every id in the subset to its utterance object (same order as subset)
utterance_list = [corpus.get_utterance(iD) for iD in subset]

In [11]:
#utterance objects for the challenger comments only, used to count pair ids as a data validation check
#NOTE(review): the earlier note here expected 3456 unique pair ids, but the recorded output of the
#check below shows 4263 -- confirm which figure is the intended reference value
challenger_utterance_list = [corpus.get_utterance(iD) for iD in challengerComments]

In [12]:
# Pair ids across the whole subset: first the total number of (utterance, pair id)
# attachments, then the number of distinct pair ids.
pair_idz = [
    pair_id
    for utt in utterance_list
    for pair_id in utt.meta['pair_ids']
]
print(len(pair_idz))
print(len(set(pair_idz)))

20536
4263


In [13]:
# Same pair id check restricted to challenger comments: total attachments first,
# then the number of distinct pair ids.
pair_idz = [
    pair_id
    for utt in challenger_utterance_list
    for pair_id in utt.meta['pair_ids']
]
print(len(pair_idz))
print(len(set(pair_idz)))

11567
4263


Create the subset corpus that we are interested in (note: the original data from the paper only contained the challenger replies and the original post, nothing else was included -- I collected OP's replies from the 'all' data)

In [14]:
#this subset separates OP comments and challenger utterances from all other comments in every conversation (thread)
#NOTE: this rebinds `corpus` from the full download to the subset. Every cell below works on the subset,
#and re-running an earlier cell after this one will read the subset instead of the full corpus.
corpus = convokit.Corpus(utterances=utterance_list,version=1)

In [15]:
# Speaker / utterance / conversation counts for the subset corpus built above.
corpus.print_summary_stats()

Number of Speakers: 6210
Number of Utterances: 22765
Number of Conversations: 3051


In [16]:
# Number of challenger comment ids selected for the subset.
len(challengerComments)

11020

In [17]:
# Number of OP reply ids selected for the subset.
len(opReplies)

8694

Simple statistics:

In [18]:
print('Note: the averages below are for challenger comments only')

corpus.print_summary_stats()


def _flair_deltas(flair):
    """Return the delta count read from an author's flair text.

    An empty or missing flair counts as 0 deltas. A non-empty flair that
    contains no digits (the "weird cases", e.g. inf) returns None so the
    caller can leave that comment out of the delta statistics.
    """
    if not flair:
        return 0
    match = re.search(r'\d+', flair) #first run of digits; this pattern has not been checked against every flair format
    return int(match.group()) if match else None


def _welch_p_value(sample_a, sample_b):
    """Return the p-value of Welch's t-test, which does not assume equal variances."""
    return stats.ttest_ind(sample_a, sample_b, equal_var=False)[1]


utts = list(corpus.iter_utterances()) #list of all utterance objects in the corpus
succ_length = [] #length of all comments in successful threads
root_succ_length = [] #length of successful root replies
succ_deltas = [] #num deltas given to users commenting in all successful threads
root_succ_deltas = [] #num deltas given to root commenters in successful threads
unsucc_length = [] #length of all comments in unsuccessful threads
root_unsucc_length = [] #length of unsuccessful root replies
unsucc_deltas = [] #num deltas given to users commenting in all unsuccessful threads
root_unsucc_deltas = [] #num deltas given to root commenters in unsuccessful threads

for i in utts:
    #exclude the original post and exclude comments made by op
    if i.root == i.id or i.speaker.id == corpus.get_utterance(i.root).speaker.id:
        continue

    #pick the four target lists once, based on the success label
    if i.meta['success'] == 1:
        lengths, root_lengths = succ_length, root_succ_length
        deltas, root_deltas = succ_deltas, root_succ_deltas
    elif i.meta['success'] == 0:
        lengths, root_lengths = unsucc_length, root_unsucc_length
        deltas, root_deltas = unsucc_deltas, root_unsucc_deltas
    else:
        continue #unlabelled comments contribute to no statistic

    is_root_comment = i.reply_to == i.root #direct reply to the original post
    num_words = len(i.text.split())
    lengths.append(num_words)
    if is_root_comment:
        root_lengths.append(num_words)

    num_deltas = _flair_deltas(i.meta['author_flair_text'])
    if num_deltas is not None: #unparseable flairs are skipped, so delta lists can be shorter than length lists
        deltas.append(num_deltas)
        if is_root_comment:
            root_deltas.append(num_deltas)


#length of comments
print('Average number of words in a succesful comment is ' + str(mean(succ_length)))
print('Average number of words in an unsuccesful comment is ' + str(mean(unsucc_length)))
p_val = _welch_p_value(succ_length, unsucc_length) #Welch's t-test: no reason to assume the variances are the same
print('p-value for number of words is ' + str(p_val))

#length of root comments
print('Average number of words in a succesful root comment is ' + str(mean(root_succ_length)))
print('Average number of words in an unsuccesful rootcomment is ' + str(mean(root_unsucc_length)))
p_val = _welch_p_value(root_succ_length, root_unsucc_length)
print('p-value for number of words in root comments is ' + str(p_val))

#deltas to commenters
print('Average number of deltas assigned to an author of a succesful comment is ' + str(mean(succ_deltas)))
print('Average number of deltas assigned to an author of a unsuccesful comment is ' + str(mean(unsucc_deltas)))
p_val = _welch_p_value(succ_deltas, unsucc_deltas)
print('p-value for number of deltas assigned to commenters is ' + str(p_val))

#deltas to root commenters
print('Average number of deltas assigned to an author of a succesful root comment is ' + str(mean(root_succ_deltas)))
print('Average number of deltas assigned to an author of a unsuccesful root comment is ' + str(mean(root_unsucc_deltas)))
p_val = _welch_p_value(root_succ_deltas, root_unsucc_deltas)
print('p-value for number of deltas assigned to root commenters is ' + str(p_val))

Note: the averages below are for challenger comments only
Number of Speakers: 6210
Number of Utterances: 22765
Number of Conversations: 3051
Average number of words in a succesful comment is 257.3235489220564
Average number of words in an unsuccesful comment is 197.236873747495
p-value for number of words is 5.38614002411782e-57
Average number of words in a succesful root comment is 276.7655660377358
Average number of words in an unsuccesful rootcomment is 214.2785825142266
p-value for number of words in root comments is 7.645306791441279e-44
Average number of deltas assigned to an author of a succesful comment is 13.61011608623549
Average number of deltas assigned to an author of a unsuccesful comment is 8.446492985971943
p-value for number of deltas assigned to commenters is 3.355536404744011e-38
Average number of deltas assigned to an author of a succesful root comment is 12.837028301886793
Average number of deltas assigned to an author of a unsuccesful root comment is 8.18494568028