In [1]:
import a1_dataloader
import dill, json, codecs
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import OrderedDict
from pprint import pprint

# Environment vars
LANG = 'en'
GOLD = False

# Exploring explicit discourse connectives - Auto UD
For each explicit discourse connective example (positive and negative), using the automatically generated UD1.0 annotations obtained from the conversion using gold constituency parses, extract information (out-degree, dependent set, etc) to obtain a view of the prototypical structure for each DC. 

### A. Prepping the data and extracting insights
##### 0. Load and pre-treat data - train and dev set only

In [2]:
for dataset in ['train', 'dev']:
    with open('../../03_data/{}/pdtb_conll_data/ParsePDTB_dict_{}.dill'.format(LANG, dataset), 'rb') as f:
        globals()['ParsePDTB_dict_{}'.format(dataset)] = dill.load(f)

In [3]:
# filter Parse_dict and retain only Parses for sentences with explicit connectives
explicit_conns = list()
for dataset in ['train', 'dev']:
    for i in globals()['ParsePDTB_dict_{}'.format(dataset)]:
        for i2 in range(len(globals()['ParsePDTB_dict_{}'.format(dataset)][i])):
            if globals()['ParsePDTB_dict_{}'.format(dataset)][i][i2].Connectives['Explicit'] != []:
                explicit_conns.append(globals()['ParsePDTB_dict_{}'.format(dataset)][i][i2]) 

In [4]:
# load both the positive and negatives examples of explicit connectives 
for type_ in ['pos', 'neg']:
    for dataset in ['train', 'dev']:
        with open('../..//03_data/{}/explicit_connectives/ExpConn_{}exp_{}.dill'.format(LANG, type_, dataset), 'rb') as f:
            globals()['{}exp_{}'.format(type_, dataset)] = dill.load(f)

In [5]:
# separate between singleton and MWE connectives, to facilitate analysis
# NOTE: using the connheadmapper provided by CoNLL 2016 Shared Task organiser to map MWE explicit DCs to 
# their minimal fixed expression unit - e.g. "Just as soon as" to "as soon as"
expconn_singleton = list()
expconn_mwe = list()
for type_ in ['pos', 'neg']:
    for dataset in ['train', 'dev']:
        for expconn in globals()['{}exp_{}'.format(type_, dataset)]:
            if len(expconn.Connective['MappedText'].split(' ')) == 1: expconn_singleton.append(expconn)
            if len(expconn.Connective['MappedText'].split(' ')) > 1: expconn_mwe.append(expconn)

##### 2. Extract info to obtain insights on prototypical structures

For MWE DCs

In [6]:
EXPCONN_SET = 'mwe'

expconns_list = list()
for expconn in globals()['expconn_{}'.format(EXPCONN_SET)]:
    # create empty dict to store info about a single explicit DC
    expconn_data = OrderedDict() # to stop pandas auto sorting the columns
    expconn_data['docid'], docid = expconn.DocID, expconn.DocID
    expconn_data['sentid'], sentid = expconn.Connective['TokenList'][0][-2], expconn.Connective['TokenList'][0][-2]
    expconn_data['posexp'] = expconn.PosExp
    expconn_data['cstr'] = expconn.Connective['RawText']
    expconn_data['cstr_mapped'] = expconn.Connective['MappedText']

    connidx = [i[-1] for i in expconn.Connective['TokenList']] 
    # this is a list, with one element for singleton, it is zero-indexed on the first word

    # extract the relevant Parse object from the ParsePDTB_dict_
    if 2 <=int(docid.split('_')[-1][0:2])<22: parsejson = 'ParsePDTB_dict_{}'.format('train')
    if 22 <= int(docid.split('_')[-1][0:2]) < 23: parsejson = 'ParsePDTB_dict_{}'.format('dev')

    # work with the relevant Parse object 
    _parse_obj = globals()[parsejson][docid][sentid] 

    expconn_data['rawtext']=_parse_obj.RawText
    if GOLD == True: _dependencies = _parse_obj.DepTree_UD1_Gold     
        # using Gold UD1.0 parses (converted from PTB root s-trees)
    if GOLD == False: _dependencies = _parse_obj.DepTree_UD1_Auto  
        # using Auto UD1.0 parses (parsed from raw text using CoreNLP 3.9.2)
    _coordinates = [(int(i[1].split('-')[-1]), int(i[2].split('-')[-1])) for i in _dependencies] 
    # note that some tokens are mwes, e.g. their entries would be like "Association-College-6", take -1 index to be safe
    _wordsdict = {int(i[2].split('-')[-1]) : '-'.join(i[2].split('-')[:-1]) for i in _dependencies}
    _wordsdict[0] = '_root_'

    ##### 1. create nx DiGraph object #####
    nxgraph = nx.DiGraph(_coordinates)
    # a. add the wordform, UD Gold pos tag to the attribute for each node 
    _nodes = sorted([i for i in nxgraph.nodes()])
    for i in _nodes[1:]:
        nxgraph.node[i]['FORM'] = _wordsdict[i]
        if GOLD == True: nxgraph.node[i]['UDPOS'] = _parse_obj.Words[i-1][1]['PartOfSpeech_UDGold'] 
        if GOLD == False: nxgraph.node[i]['UDPOS'] = _parse_obj.Words[i-1][1]['PartOfSpeech_UDAuto'] 
        # minus 1 because PartOfSpeech_UDGold is zero-indexed
        nxgraph.node[0]['UDPOS'] = 'root'

    # b. add the labels for each edge (the dependency relation)
    for idx in range(len(_coordinates)):
        coordpair = _coordinates[idx]
        nxgraph[coordpair[0]][coordpair[1]]['deprel_incoming'] = _dependencies[coordpair[1]-1][0]
    # note that nxgraph[i] will show the i-th node (in our case the i-th token
    # in the sentence) and it's descendents.
    # minus 1 (in coordpair[1]-1) because PartOfSpeech_UDGold/PartOfSpeech_UDAuto is zero-indexed


    ##### 2. generate statistics #####
    # a. create the necessary subgraphs 
    conn_subgraph = nxgraph.subgraph([i+1 for i in connidx]) 
    # nx object, use .nodes() to get node nums... recall that connidx is zero-indexed. 
    # The nodes in nxgraph are not zero-indexed. i.e. the index of the first word is 1

    __ = [list(nxgraph.predecessors(i)) for i in conn_subgraph]
    __flat = [item for sublist in __ for item in sublist]
    gov_subgraph = nxgraph.subgraph(__flat) # nx object

    __ = [list(nxgraph.successors(i)) for i in conn_subgraph]
    __flat = [item for sublist in __ for item in sublist]
    dep_subgraph = nxgraph.subgraph(__flat) # nx object

    # b1. check if the DC is entirely internally connected [it will be yes for singletons]
    # 'entirely' in the sense that every node has at least one directed, local, path between itself 
    # and one other node of the DC. 
    if EXPCONN_SET == 'singleton': connected = 'yes'    
    else:  
        family = list()
        for eachnode in conn_subgraph:
            othernodes = set(conn_subgraph).difference(set([eachnode]))
            siblings = nxgraph.neighbors(eachnode)
            gov = nxgraph.predecessors(eachnode)
            deps = nxgraph.successors(eachnode)

            [family.extend(i) for i in [siblings, gov, deps]]

        # check that each node is connected with one of its parent
        truth = [i in family for i in othernodes]     
        if False in truth: connected = 'no'
        else: connected = 'yes'

    expconn_data['fullyconnected'] = connected

    # b2. check if the DC shares the same parent [it will be yes for singletons]
    if EXPCONN_SET == 'singleton': connected = 'yes'    
    else:  
        gov_set = set()
        for eachnode in conn_subgraph:
            gov = list(nxgraph.predecessors(eachnode))
            assert len(gov) == 1 # there should only be 1 gov
            [gov_set.add(i) for i in gov] 
        if len(gov_set)==1: sameparent = 'yes'
        else: sameparent = 'no'

    expconn_data['sameparent'] = sameparent

    # c. calculate outdegree
    # think about: what happens if it is an MWE? what is the outdegree? collective?
    __ = nxgraph.out_degree([i+1 for i in connidx]) # example return: DiDegreeView({4: 2, 5: 2, 6: 1})

    outdegree = [i[1] for i in __] # result is a list, 1st elem is the out degree for the 
    # first token in the connective. 
    expconn_data['outdegree'] = outdegree

    # d. return governor set of pos tags [set of 1 for singletons]
    all_neighbours = list()
    all_siblings = list()
    all_govs = list()
    all_deps = list()
    all_deprels = set()
    for eachnode in conn_subgraph:
        
#         neighbours = list(nxgraph.neighbors(eachnode)) 
#         neighbours_pos = [nxgraph.node[i]['UDPOS'] for i in neighbours]
#         all_neighbours.append(neighbours_pos)
# since UD is a directed tree, the neighbours of a node (the nodes it is connected to) 
# are the same as its dependents (because of the directed edge)
        
        gov = list(nxgraph.predecessors(eachnode))
        gov_pos = [nxgraph.node[i]['UDPOS'] for i in gov]
        deprel = nxgraph[gov[0]][eachnode]['deprel_incoming']
        all_govs.append(gov_pos)
        all_deprels.add(deprel)
        
        _siblings = [list(nxgraph.successors(i)) for i in gov]
        siblings = [item for sublist in _siblings for item in sublist]
        siblings_pos = set([nxgraph.node[i]['UDPOS'] for i in siblings])
        all_siblings.append(siblings_pos)


        deps = list(nxgraph.successors(eachnode))
        deps_pos = [nxgraph.node[i]['UDPOS'] for i in deps]
        all_deps.append(deps_pos)

#     expconn_data['neighbours_pos'] = all_neighbours
    expconn_data['siblings_pos'] = all_siblings
    expconn_data['gov_pos'] = all_govs
    expconn_data['deps_pos'] = all_deps
    expconn_data['deprel_incoming'] = all_deprels
    expconn_data['deptree']=_parse_obj.DepTree_UD1_Gold

    # e. return governor set of pos tags [set of 1 for singletons]


    expconns_list.append(expconn_data)
             
             
# generate a pandas dataframe     
expconnstats_df = pd.DataFrame(expconns_list)

In [7]:
expconnstats_df.shape

(1420, 14)

In [8]:
expconnstats_df.head(2)

Unnamed: 0,docid,sentid,posexp,cstr,cstr_mapped,rawtext,fullyconnected,sameparent,outdegree,siblings_pos,gov_pos,deps_pos,deprel_incoming,deptree
0,wsj_0207,12,True,as if,as if,"Then , as if to show that he could play fast a...",yes,no,"[1, 0]","[{SCONJ, PART, VERB}, {SCONJ}]","[[VERB], [SCONJ]]","[[SCONJ], []]","{mwe, mark}","[[advmod, offered-16, Then-1], [punct, offered..."
1,wsj_0208,1,True,In fact,in fact,"In fact , he liberated the U.S. from one of th...",yes,no,"[0, 1]","[{ADP}, {PRON, NUM, PUNCT, NOUN, PROPN}]","[[NOUN], [VERB]]","[[], [ADP]]","{nmod, case}","[[case, fact-2, In-1], [nmod, liberated-5, fac..."


##### 3. Data integrity check 

In [9]:
expconnstats_df.loc[811]

docid                                                       wsj_2276
sentid                                                            30
posexp                                                          True
cstr                                                     for example
cstr_mapped                                              for example
rawtext            Many major institutions , for example , came i...
fullyconnected                                                   yes
sameparent                                                        no
outdegree                                                     [0, 1]
siblings_pos                       [{ADP}, {PUNCT, VERB, NOUN, ADJ}]
gov_pos                                             [[NOUN], [VERB]]
deps_pos                                                 [[], [ADP]]
deprel_incoming                                         {nmod, case}
deptree            [[amod, major-2, Many-1], [amod, institutions-...
Name: 811, dtype: object

In [10]:
ParsePDTB_dict_dev['wsj_2276'][30].RawText

'Many major institutions , for example , came into work yesterday ready to buy some of the blue chips they felt had been sharply undervalued on Friday .'

In [11]:
# check that the outdegree for the connective tokens are correct
from collections import Counter
if GOLD == True: pprint( Counter([i[1] for i in ParsePDTB_dict_dev['wsj_2276'][30].DepTree_UD1_Gold]) )
if GOLD == False: pprint( Counter([i[1] for i in ParsePDTB_dict_dev['wsj_2276'][30].DepTree_UD1_Auto]) )

Counter({'came-8': 9,
         'chips-19': 4,
         'undervalued-25': 4,
         'institutions-3': 2,
         'buy-14': 2,
         'example-6': 1,
         'ROOT-0': 1,
         'work-10': 1,
         'ready-12': 1,
         'some-15': 1,
         'felt-21': 1,
         'Friday-27': 1})


In [12]:
# check that the deprels going into the connective tokens are correct
if GOLD == True: pprint(ParsePDTB_dict_dev['wsj_2276'][30].DepTree_UD1_Gold )
if GOLD == False: pprint( ParsePDTB_dict_dev['wsj_2276'][30].DepTree_UD1_Auto )

[['amod', 'institutions-3', 'Many-1'],
 ['amod', 'institutions-3', 'major-2'],
 ['nsubj', 'came-8', 'institutions-3'],
 ['punct', 'came-8', ',-4'],
 ['case', 'example-6', 'for-5'],
 ['nmod', 'came-8', 'example-6'],
 ['punct', 'came-8', ',-7'],
 ['root', 'ROOT-0', 'came-8'],
 ['case', 'work-10', 'into-9'],
 ['nmod', 'came-8', 'work-10'],
 ['nmod:tmod', 'came-8', 'yesterday-11'],
 ['xcomp', 'came-8', 'ready-12'],
 ['mark', 'buy-14', 'to-13'],
 ['xcomp', 'ready-12', 'buy-14'],
 ['dobj', 'buy-14', 'some-15'],
 ['case', 'chips-19', 'of-16'],
 ['det', 'chips-19', 'the-17'],
 ['amod', 'chips-19', 'blue-18'],
 ['nmod', 'some-15', 'chips-19'],
 ['nsubj', 'felt-21', 'they-20'],
 ['acl:relcl', 'chips-19', 'felt-21'],
 ['aux', 'undervalued-25', 'had-22'],
 ['auxpass', 'undervalued-25', 'been-23'],
 ['advmod', 'undervalued-25', 'sharply-24'],
 ['advcl', 'came-8', 'undervalued-25'],
 ['case', 'Friday-27', 'on-26'],
 ['nmod', 'undervalued-25', 'Friday-27'],
 ['punct', 'came-8', '.-28']]


In [13]:
# check that the governor and dependents pos tag sets are correctly captured
if GOLD == True: print([i[1]['PartOfSpeech_UDGold'] for i in ParsePDTB_dict_dev['wsj_2276'][30].Words])
if GOLD == False: print([i[1]['PartOfSpeech_UDAuto'] for i in ParsePDTB_dict_dev['wsj_2276'][30].Words])

['ADJ', 'ADJ', 'NOUN', 'PUNCT', 'ADP', 'NOUN', 'PUNCT', 'VERB', 'ADP', 'NOUN', 'NOUN', 'ADJ', 'PART', 'VERB', 'DET', 'ADP', 'DET', 'ADJ', 'NOUN', 'PRON', 'VERB', 'AUX', 'AUX', 'ADV', 'VERB', 'ADP', 'PROPN', 'PUNCT']


### B. Analysing the data on DC structures - Positive Examples

##### 1. Inspecting the unusual cases

In [14]:
# number of instances where MWE DCs are not entirely internally connected (i.e. one or more node in the DC is does not have
# a local path to at least one other node in the DC)
condition1 = expconnstats_df['fullyconnected']=='no' # retaining only DCs that are not entirely internally connected
condition2 = expconnstats_df['posexp']==True         # retaining only the positive examples 
mwe_pos_notconnected = expconnstats_df[ condition1 & condition2 ]
mwe_pos_notconnected.shape

(75, 14)

In [15]:
mwe_pos_notconnected.head(2)

Unnamed: 0,docid,sentid,posexp,cstr,cstr_mapped,rawtext,fullyconnected,sameparent,outdegree,siblings_pos,gov_pos,deps_pos,deprel_incoming,deptree
2,wsj_0214,8,True,Now that,now that,Now that the baseball season is officially ove...,no,yes,"[0, 0]","[{SCONJ, ADV, VERB, NOUN}, {SCONJ, ADV, VERB, ...","[[ADV], [ADV]]","[[], []]","{advmod, mark}","[[advmod, over-8, Now-1], [mark, over-8, that-..."
18,wsj_0275,12,True,If then,if then,If the answers to these questions are affirmat...,no,no,"[0, 0]","[{SCONJ, VERB, NOUN}, {ADV, ADJ}]","[[ADJ], [NOUN]]","[[], []]","{advmod, mark}","[[mark, affirmative-8, If-1], [det, answers-3,..."


In [16]:
# 
mwe_pos_notconnected_diffparent = mwe_pos_notconnected[mwe_pos_notconnected['sameparent']=='no']
mwe_pos_notconnected_diffparent.shape

(46, 14)

In [17]:
# if then and neither nor are parallel/disjoint connectives. Naturally, in UD annotation, they are not expected to be 
# entirely internally connected, but attached to their clausal counterparts
mwe_pos_notconnected_diffparent['cstr_mapped'].unique()

array(['if then', 'as soon as', 'now that', 'as long as', 'neither nor'],
      dtype=object)

##### 2. Analysing MWE DCs that are not entirely internally connected - 'as soon as', 'as long as', 
>a. there are 11 instances in the (train and dev set) of such MWEs

>b. OBSERVATION all of such split DCs have incoming deprel sets of {advmod, mark}!

In [18]:
mwe_pos_notconnected_diffparent[mwe_pos_notconnected_diffparent['cstr_mapped'] != 'if then']

Unnamed: 0,docid,sentid,posexp,cstr,cstr_mapped,rawtext,fullyconnected,sameparent,outdegree,siblings_pos,gov_pos,deps_pos,deprel_incoming,deptree
294,wsj_1059,33,True,as soon as,as soon as,"IBM , Armonk , N.Y. , said it wanted to bring ...",no,no,"[0, 2, 0]","[{ADP, ADV, PART, NOUN}, {SCONJ, VERB, PRON}, ...","[[VERB], [AUX], [ADV]]","[[ADV, AUX], [], []]","{advmod, mark}","[[nsubj, said-7, IBM-1], [punct, IBM-1, ,-2], ..."
311,wsj_1116,4,True,as soon as,as soon as,After a wonderfully frivolous early childhood ...,no,no,"[0, 2, 0]","[{SCONJ, PRON, NOUN}, {ADV, VERB}, {PRON, ADV,...","[[VERB], [ADV], [VERB]]","[[], [], [ADV, VERB]]","{advmod, mark}","[[case, childhood-6, After-1], [det, childhood..."
334,wsj_1157,28,True,as soon as,as soon as,But previous sales have often been sales of sh...,no,no,"[0, 2, 0]","[{ADV, PUNCT}, {ADP}, {ADV, VERB}]","[[VERB], [VERB], [ADV]]","[[ADV, VERB], [], []]","{advmod, mark}","[[cc, sales-7, But-1], [amod, sales-3, previou..."
337,wsj_1162,9,True,just as soon as,as soon as,"Thus , optimistic entrepreneurs await a promis...",no,no,"[0, 0, 4, 0]","[{DET, ADV, NOUN, ADJ}, {SCONJ, PROPN, ADP, VE...","[[NOUN], [VERB], [ADV], [ADV]]","[[PUNCT, ADV, ADV, VERB], [], [], []]","{dep, advmod, mark}","[[advmod, await-5, Thus-1], [punct, await-5, ,..."
489,wsj_1554,11,True,now that,now that,Some analysts contend that silver is cheap now...,no,no,"[0, 0]","[{DET, ADV, VERB, NOUN}, {SCONJ, AUX, NOUN}]","[[ADJ], [VERB]]","[[], []]","{advmod, mark}","[[det, analysts-2, Some-1], [nsubj, contend-3,..."
643,wsj_1830,25,True,as long as,as long as,"With nearly 4,000 machines in use , there have...",no,no,"[0, 2, 0]","[{SCONJ, AUX, NOUN}, {ADV, VERB}, {PRON, ADV, ...","[[VERB], [ADV], [NOUN]]","[[], [], [ADV, VERB]]","{advmod, mark}","[[mark, machines-4, With-1], [advmod, 4,000-3,..."
752,wsj_2065,7,True,neither nor,neither nor,The Merc said that as part of the disciplinary...,no,no,"[0, 0]","[{SCONJ, CONJ, PUNCT, NOUN, VERB}, {DET}]","[[VERB], [NOUN]]","[[], []]","{det, cc}","[[det, Merc-2, The-1], [nsubj, said-3, Merc-2]..."
758,wsj_2102,21,True,as long as,as long as,Bush assured Roh that the U.S. would stand by ...,no,no,"[0, 2, 0]","[{SCONJ, PRON, NOUN}, {ADV, VERB}, {SCONJ, ADV...","[[VERB], [ADV], [VERB]]","[[], [], [ADV, VERB]]","{advmod, mark}","[[nsubj, assured-2, Bush-1], [root, ROOT-0, as..."
784,wsj_2162,15,True,as soon as,as soon as,The loan was to have matured in just two to th...,no,no,"[0, 2, 0]","[{SCONJ, AUX, NOUN}, {ADV, VERB}, {ADV, NOUN, ...","[[VERB], [ADV], [VERB]]","[[], [], [ADV, VERB]]","{advmod, mark}","[[det, loan-2, The-1], [nsubj, was-3, loan-2],..."
794,wsj_2222,27,True,as long as,as long as,"The new circuit breakers , if they are to be a...",no,no,"[0, 2, 0]","[{SCONJ, ADV, NOUN}, {SCONJ, PROPN, ADJ}, {ADV...","[[VERB], [VERB], [ADV]]","[[ADV, VERB], [], []]","{advmod, mark}","[[det, breakers-4, The-1], [amod, breakers-4, ..."


In [19]:
# examining the NEGATIVE EXAMPLES for 'as soon as'
# all of them don't have {advmod, mark} as incoming deprels; almost all of these negative examples of 'as soon as' have
# case and advmod as their incoming deprel set
expconnstats_df[(expconnstats_df['cstr_mapped']=='as soon as') & (expconnstats_df['posexp']==False)]

Unnamed: 0,docid,sentid,posexp,cstr,cstr_mapped,rawtext,fullyconnected,sameparent,outdegree,siblings_pos,gov_pos,deps_pos,deprel_incoming,deptree
955,wsj_0661,0,False,as soon as,as soon as,A unit of DPC Acquisition Partners launched a ...,yes,no,"[0, 2, 0]","[{ADV, ADJ}, {PUNCT, ADV, PART, NOUN}, {ADP}]","[[ADV], [VERB], [ADJ]]","[[], [ADV, ADJ], []]","{advmod, case}","[[det, unit-2, A-1], [nsubj, launched-7, unit-..."
978,wsj_0755,27,False,as soon as,as soon as,`` We 're trying to get it on line as soon as ...,yes,no,"[0, 2, 0]","[{ADV, ADJ}, {PRON, ADV, PART, NOUN}, {ADP}]","[[ADV], [VERB], [ADJ]]","[[], [ADV, ADJ], []]","{advmod, case}","[[punct, trying-4, ``-1], [nsubj, trying-4, We..."
1013,wsj_0990,3,False,as soon as,as soon as,Senate Majority Leader George Mitchell -LRB- D...,yes,no,"[0, 3, 0]","[{ADV, NOUN}, {PRON, PUNCT, ADV, NOUN}, {ADP}]","[[ADV], [VERB], [NOUN]]","[[], [ADV, ADV, NOUN], []]","{advmod, case}","[[compound, Mitchell-5, Senate-1], [compound, ..."
1070,wsj_1148,0,False,as soon as,as soon as,Mobil Corp. is preparing to slash the size of ...,yes,no,"[0, 3, 0]","[{ADV, NOUN}, {PUNCT, ADV, PART, NOUN}, {ADP, ...","[[ADV], [VERB], [NOUN]]","[[], [ADV, ADV, NOUN], []]","{advmod, case}","[[compound, Corp.-2, Mobil-1], [nsubj, prepari..."
1096,wsj_1217,19,False,as soon as,as soon as,Mobil is preparing to slash its work force in ...,no,no,"[0, 3, 0]","[{ADP, ADJ}, {ADV, NOUN}, {PUNCT, ADV, PART, N...","[[NOUN], [ADV], [VERB]]","[[], [], [ADV, ADV, NOUN]]","{advmod, case}","[[nsubj, preparing-3, Mobil-1], [aux, preparin..."
1116,wsj_1281,12,False,as soon as,as soon as,"If the ships are n't delivered , however , it ...",yes,no,"[0, 2, 0]","[{ADV, NOUN}, {PRON, ADV, PUNCT, NOUN, AUX, VE...","[[ADV], [VERB], [NOUN]]","[[], [ADV, NOUN], []]","{advmod, case}","[[mark, delivered-6, If-1], [det, ships-3, the..."
1170,wsj_1426,1,False,as soon as,as soon as,"Payment will begin `` as soon as Oct. 25 , '' ...",yes,no,"[0, 2, 0]","[{PROPN, ADV}, {AUX, ADV, NOUN, PUNCT}, {ADP, ...","[[ADV], [VERB], [PROPN]]","[[], [ADV, PROPN], []]","{advmod, case}","[[nsubj, begin-3, Payment-1], [aux, begin-3, w..."
1223,wsj_1598,1,False,as soon as,as soon as,"USACafes , which is nearly half-owned by Sam a...",no,no,"[0, 2, 0]","[{ADP}, {ADV, ADJ}, {ADP, ADV, DET, NOUN}]","[[ADJ], [ADV], [NOUN]]","[[], [], [ADV, ADJ]]","{advmod, case}","[[nsubj, said-15, USACafes-1], [punct, USACafe..."
1261,wsj_1686,6,False,as soon as,as soon as,"In 1986 , Stamford officials thanked Mr. Hoelz...",yes,no,"[0, 2, 0]","[{ADV, ADJ}, {PRON, ADV}, {ADP}]","[[ADV], [VERB], [ADJ]]","[[], [ADV, ADJ], []]","{advmod, case}","[[case, 1986-2, In-1], [nmod, thanked-6, 1986-..."
1332,wsj_1963,5,False,as soon as,as soon as,Now those items will be discussed in a House-S...,no,no,"[0, 2, 0]","[{PRON, AUX, ADV}, {ADP}, {ADV, NOUN}]","[[VERB], [NOUN], [ADV]]","[[ADV, NOUN], [], []]","{advmod, case}","[[advmod, discussed-6, Now-1], [det, items-3, ..."


In [20]:
# examine the 
mwe_pos_notconnected_diffparent.loc[294]['deptree']

[['nsubj', 'said-7', 'IBM-1'],
 ['punct', 'IBM-1', ',-2'],
 ['compound', 'N.Y.-5', 'Armonk-3'],
 ['punct', 'N.Y.-5', ',-4'],
 ['appos', 'IBM-1', 'N.Y.-5'],
 ['punct', 'IBM-1', ',-6'],
 ['root', 'ROOT-0', 'said-7'],
 ['nsubj', 'wanted-9', 'it-8'],
 ['ccomp', 'said-7', 'wanted-9'],
 ['mark', 'bring-11', 'to-10'],
 ['xcomp', 'wanted-9', 'bring-11'],
 ['compound:prt', 'bring-11', 'out-12'],
 ['det', 'mainframes-14', 'the-13'],
 ['dobj', 'bring-11', 'mainframes-14'],
 ['advmod', 'soon-16', 'as-15'],
 ['advmod', 'bring-11', 'soon-16'],
 ['mark', 'could-19', 'as-17'],
 ['nsubj', 'could-19', 'it-18'],
 ['ccomp', 'soon-16', 'could-19'],
 ['mark', 'spark-21', 'to-20'],
 ['advcl', 'bring-11', 'spark-21'],
 ['dep', 'many-23', 'as-22'],
 ['amod', 'sales-24', 'many-23'],
 ['dobj', 'spark-21', 'sales-24'],
 ['case', 'possible-26', 'as-25'],
 ['acl', 'sales-24', 'possible-26'],
 ['case', 'end-29', 'by-27'],
 ['det', 'end-29', 'the-28'],
 ['nmod', 'spark-21', 'end-29'],
 ['case', 'year-32', 'of-30'],
 

##### 2. Inspecting the most frequent MWE DCs

In [21]:
# top 10 most frequent 
pos_expconnstats_df = expconnstats_df[expconnstats_df['posexp']==True]
neg_expconnstats_df = expconnstats_df[expconnstats_df['posexp']==False]

pos_expconnstats_df.groupby('cstr_mapped')['deps_pos'].size().sort_values(ascending=False)[0:11]

cstr_mapped
for example          161
in addition          129
for instance          77
in fact               70
as a result           65
if then               36
on the other hand     28
in turn               27
by contrast           25
so that               22
as long as            21
Name: deps_pos, dtype: int64

##### 2a. findings for 'in addition'
In UD, the explicit MWE DC 'in addition' can be clearly distinguished from the non-connective usage by observing whether (i) 'in' has no sibling, and (ii) 'addition' only has an ADP for a sibling.

In [22]:
# in the case of the DC 'for example', an ADP is always present in its neighbours (nodes that the DC nodes are connected to) 
# although it typically appears for the right (i.e. coming from 'in' in 'in addition'). 
# Another way of seeing it: connective use of 'in addition' correlate to when they are leaf nodes in UD representation
[i for i in pos_expconnstats_df.groupby('cstr')['deps_pos'].get_group('for example') if 'ADP' not in i[1]]

[[['ADP'], []],
 [['ADP'], []],
 [['ADP'], []],
 [['ADP'], []],
 [['ADP'], []],
 [['ADP'], []],
 [['ADP'], []],
 [['ADP'], []],
 [['ADP'], []],
 [['ADP'], []],
 [['ADP'], []],
 [['ADP'], []]]

In [23]:
# 'for example': showing that 
pos_expconnstats_df.groupby('cstr_mapped')['deps_pos'].get_group('in addition')

13     [[], [ADP]]
17     [[], [ADP]]
20     [[], [ADP]]
38     [[], [ADP]]
44     [[], [ADP]]
56     [[], [ADP]]
58     [[], [ADP]]
69     [[], [ADP]]
74     [[], [ADP]]
87     [[], [ADP]]
90     [[], [ADP]]
91     [[], [ADP]]
111    [[], [ADP]]
114    [[], [ADP]]
115    [[], [ADP]]
116    [[], [ADP]]
117    [[], [ADP]]
125    [[], [ADP]]
136    [[], [ADP]]
138    [[], [ADP]]
141    [[], [ADP]]
151    [[], [ADP]]
155    [[], [ADP]]
157    [[], [ADP]]
171    [[], [ADP]]
173    [[], [ADP]]
175    [[], [ADP]]
191    [[], [ADP]]
203    [[], [ADP]]
204    [[], [ADP]]
          ...     
641    [[], [ADP]]
653    [[], [ADP]]
655    [[], [ADP]]
657    [[], [ADP]]
662    [[], [ADP]]
663    [[], [ADP]]
664    [[], [ADP]]
666    [[], [ADP]]
669    [[], [ADP]]
676    [[], [ADP]]
677    [[], [ADP]]
679    [[], [ADP]]
682    [[], [ADP]]
691    [[], [ADP]]
692    [[], [ADP]]
700    [[], [ADP]]
703    [[], [ADP]]
704    [[], [ADP]]
711    [[], [ADP]]
718    [[], [ADP]]
719    [[], [ADP]]
729    [[], 

In [24]:
pos_expconnstats_df.loc[13]['rawtext']

'In addition , Healthcare agreed to make monthly rent and mortgage payments of $ 2.7 million to $ 3 million to HealthVest during the standstill period , to be paid when Healthcare successfully completes asset sales .'

In [25]:
pos_expconnstats_df.loc[13]

docid                                                       wsj_0265
sentid                                                             5
posexp                                                          True
cstr                                                     In addition
cstr_mapped                                              in addition
rawtext            In addition , Healthcare agreed to make monthl...
fullyconnected                                                   yes
sameparent                                                        no
outdegree                                                     [0, 1]
siblings_pos                     [{ADP}, {PROPN, PUNCT, VERB, NOUN}]
gov_pos                                             [[NOUN], [VERB]]
deps_pos                                                 [[], [ADP]]
deprel_incoming                                         {nmod, case}
deptree            [[case, addition-2, In-1], [nmod, agreed-5, ad...
Name: 13, dtype: object

In [26]:
if GOLD == True: print([i[1]['PartOfSpeech_UDGold'] for i in ParsePDTB_dict_train['wsj_0265'][5].Words])
if GOLD == False: print([i[1]['PartOfSpeech_UDAuto'] for i in ParsePDTB_dict_train['wsj_0265'][5].Words])

['ADP', 'NOUN', 'PUNCT', 'PROPN', 'VERB', 'PART', 'VERB', 'ADJ', 'NOUN', 'CONJ', 'NOUN', 'NOUN', 'ADP', 'SYM', 'NUM', 'NUM', 'ADP', 'SYM', 'NUM', 'NUM', 'ADP', 'PROPN', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'PART', 'AUX', 'VERB', 'ADV', 'PROPN', 'ADV', 'VERB', 'NOUN', 'NOUN', 'PUNCT']


In [27]:
pos_expconnstats_df.loc[13]['deptree']

[['case', 'addition-2', 'In-1'],
 ['nmod', 'agreed-5', 'addition-2'],
 ['punct', 'agreed-5', ',-3'],
 ['nsubj', 'agreed-5', 'Healthcare-4'],
 ['root', 'ROOT-0', 'agreed-5'],
 ['mark', 'make-7', 'to-6'],
 ['xcomp', 'agreed-5', 'make-7'],
 ['amod', 'rent-9', 'monthly-8'],
 ['dobj', 'make-7', 'rent-9'],
 ['cc', 'rent-9', 'and-10'],
 ['compound', 'payments-12', 'mortgage-11'],
 ['conj', 'rent-9', 'payments-12'],
 ['case', '$-18', 'of-13'],
 ['dep', '$-18', '$-14'],
 ['compound', '$-18', '2.7-15'],
 ['compound', '$-18', 'million-16'],
 ['dep', '$-18', 'to-17'],
 ['nmod', 'rent-9', '$-18'],
 ['compound', 'million-20', '3-19'],
 ['nummod', '$-18', 'million-20'],
 ['case', 'HealthVest-22', 'to-21'],
 ['nmod', 'rent-9', 'HealthVest-22'],
 ['case', 'period-26', 'during-23'],
 ['det', 'period-26', 'the-24'],
 ['amod', 'period-26', 'standstill-25'],
 ['nmod', 'make-7', 'period-26'],
 ['punct', 'make-7', ',-27'],
 ['mark', 'paid-30', 'to-28'],
 ['auxpass', 'paid-30', 'be-29'],
 ['xcomp', 'make-7', 

In [28]:
# Notably, in the non-connective use of 'in addition', 'addition' is always connected to one other node besides 'in'
# there are two instances where the neighbour node is ['ADP', 'SYM'], which could be interesting to understand why
neg_expconnstats_df.groupby('cstr_mapped')['deps_pos'].get_group('in addition')

828                                   [[], [ADP, VERB]]
848                                   [[], [ADP, NOUN]]
850                                   [[], [ADP, NOUN]]
853                                   [[], [ADP, NOUN]]
861                                   [[], [ADP, VERB]]
874                                   [[], [ADP, VERB]]
886                                    [[], [ADP, SYM]]
902                                   [[], [ADP, VERB]]
909                                   [[], [ADP, VERB]]
910                                   [[], [ADP, NOUN]]
929                           [[], [ADP, PROPN, PUNCT]]
936                                   [[], [ADP, NOUN]]
937                                   [[], [ADP, VERB]]
954                                   [[], [ADP, VERB]]
957                                   [[], [ADP, NOUN]]
968                                   [[], [ADP, NOUN]]
987                                   [[], [ADP, NOUN]]
995                [[], [PRON, VERB, ADP, NOUN, 

In [29]:
neg_expconnstats_df.loc[886]

docid                                                       wsj_0448
sentid                                                             7
posexp                                                         False
cstr                                                     in addition
cstr_mapped                                              in addition
rawtext            In addition to the $ 8 million for Citicorp an...
fullyconnected                                                   yes
sameparent                                                        no
outdegree                                                     [0, 2]
siblings_pos       [{ADP, SYM}, {ADV, PUNCT, NOUN, AUX, SYM, PROP...
gov_pos                                             [[NOUN], [VERB]]
deps_pos                                            [[], [ADP, SYM]]
deprel_incoming                                         {nmod, case}
deptree            [[case, addition-2, In-1], [nmod, owed-17, add...
Name: 886, dtype: object

In [30]:
neg_expconnstats_df.loc[1310]['deptree']

[['case', 'addition-2', 'In-1'],
 ['nmod', 'sell-25', 'addition-2'],
 ['case', '$-5', 'to-3'],
 ['det', '$-5', 'the-4'],
 ['nmod', 'addition-2', '$-5'],
 ['compound', 'billion-7', '15.6-6'],
 ['nummod', '$-5', 'billion-7'],
 ['case', 'bills-10', 'of-8'],
 ['compound', 'bills-10', 'Treasury-9'],
 ['nmod', '$-5', 'bills-10'],
 ['mark', 'sold-13', 'to-11'],
 ['auxpass', 'sold-13', 'be-12'],
 ['acl', '$-5', 'sold-13'],
 ['case', 'auction-20', 'at-14'],
 ['amod', 'week-16', 'next-15'],
 ['nmod:poss', 'auction-20', 'week-16'],
 ['case', 'week-16', "'s-17"],
 ['amod', 'auction-20', 'regular-18'],
 ['compound', 'auction-20', 'Monday-19'],
 ['nmod', 'sold-13', 'auction-20'],
 ['punct', 'sell-25', ',-21'],
 ['det', 'government-23', 'the-22'],
 ['nsubj', 'sell-25', 'government-23'],
 ['aux', 'sell-25', 'will-24'],
 ['root', 'ROOT-0', 'sell-25'],
 ['dobj', 'sell-25', '$-26'],
 ['compound', 'billion-28', '10-27'],
 ['nummod', '$-26', 'billion-28'],
 ['case', 'notes-33', 'of-29'],
 ['amod', 'notes-3