# Playing with MMAX2 coreference data in DiscourseGraphs

In [1]:
import glob
import os

from discoursegraphs.readwrite import MMAXDocumentGraph
from discoursegraphs import get_pointing_chains

# Example document used throughout the notebook. '~' is expanded at runtime
# instead of hard-coding one user's home directory, matching how MAZ_ROOTDIR
# is built further down.
MMAX_FILE = os.path.expanduser(
    '~/repos/pcc-annis-merged/maz176/coreference/maz-1423.mmax')

In [2]:
# Build the document graph for the example file; the cells below inspect its
# nodes and edges.
mdg = MMAXDocumentGraph(MMAX_FILE)

## Extracting coreference chains

In [3]:
# Bare expression: the notebook shows the returned chains (lists of markable
# IDs, see the output below) as this cell's result.
get_pointing_chains(mdg)

[['markable_22',
  'markable_19',
  'markable_17',
  'markable_14',
  'markable_12',
  'markable_11'],
 ['markable_21', 'markable_10', 'markable_8', 'markable_7', 'markable_2']]

In [4]:
for chain in get_pointing_chains(mdg):
    for node_id in chain:
        print node_id, mdg.node[node_id], '\n'

markable_22 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'ne', 'layers': set(['mmax', 'mmax:markable', 'mmax:primmark']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_22', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'sbj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_22:primmark', 'mmax:span': 'word_180', 'mmax:anaphor_antecedent': 'markable_19', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} 

markable_19 {'mmax:dir_speech': 'text_level', 'mmax:np_form': 'ne', 'layers': set(['mmax', 'mmax:markable', 'mmax:primmark']), 'mmax:type': 'anaphoric', 'mmax:id': 'markable_19', 'mmax:anaphor_type': 'anaphor_nominal', 'mmax:grammatical_role': 'sbj', 'mmax:phrase_type': 'np', 'mmax:ambiguity': 'not_ambig', 'label': 'markable_19:primmark', 'mmax:span': 'word_160..word_161', 'mmax:anaphor_antecedent': 'markable_17', 'mmax:referentiality': 'referring', 'mmax:complex_np': 'no'} 

markable_17 {'mmax:dir_speech': 'text_level', 'm

In [5]:
from discoursegraphs.readwrite.mmax2 import spanstring2text

for chain in get_pointing_chains(mdg):
    for node_id in chain:
        print node_id, spanstring2text(mdg, mdg.node[node_id]['mmax:span'])
    print '\n'

markable_22 Wittstock
markable_19 die Dosse-Stadt
markable_17 Wittstock
markable_14 Wittstock
markable_12 in der Region
markable_11 Wittstocker


markable_21 die Halle
markable_10 Die Halle
markable_8 die Halle
markable_7 für den schmucken Veranstaltungsort
markable_2 die neue Wittstocker Stadthalle




In [6]:
for chain in get_pointing_chains(mdg):
    for node_id in chain:
            print mdg.in_edges(node_id, data=True)
            print mdg.out_edges(node_id, data=True)

[]
[('markable_22', 'markable_19', {'layers': set(['mmax', 'mmax:markable', 'mmax:primmark']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_22', 'word_180', {'layers': set(['mmax', 'mmax:markable', 'mmax:primmark']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]
[('markable_22', 'markable_19', {'layers': set(['mmax', 'mmax:markable', 'mmax:primmark']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'})]
[('markable_19', 'markable_17', {'layers': set(['mmax', 'mmax:markable', 'mmax:primmark']), 'edge_type': 'points_to', 'label': 'mmax:antecedent'}), ('markable_19', 'word_160', {'layers': set(['mmax', 'mmax:markable', 'mmax:primmark']), 'edge_type': 'spans', 'label': 'mmax:primmark'}), ('markable_19', 'word_161', {'layers': set(['mmax', 'mmax:markable', 'mmax:primmark']), 'edge_type': 'spans', 'label': 'mmax:primmark'})]
[('markable_19', 'markable_17', {'layers': set(['mmax', 'mmax:markable', 'mmax:primmark']), 'edge_type': 'points_to', 'label': 'mmax:antece

## Handling sentence annotations

In [9]:
# only nodes that represent a sentence are part of the 'mmax:sentence' layer

for node_id, node_attr in mdg_with_sents.nodes(data=True):
    if 'mmax:sentence' in node_attr['layers']:
        print node_id, node_attr

markable_50012 {'mmax:span': 'word_125..word_135', 'mmax:id': 'markable_50012', 'label': 'markable_50012:sentence', 'tokens': ['word_125', 'word_126', 'word_127', 'word_128', 'word_129', 'word_130', 'word_131', 'word_132', 'word_133', 'word_134', 'word_135'], 'layers': set(['mmax', 'mmax:markable', 'mmax:sentence'])}
markable_50013 {'mmax:span': 'word_136..word_155', 'mmax:id': 'markable_50013', 'label': 'markable_50013:sentence', 'tokens': ['word_136', 'word_137', 'word_138', 'word_139', 'word_140', 'word_141', 'word_142', 'word_143', 'word_144', 'word_145', 'word_146', 'word_147', 'word_148', 'word_149', 'word_150', 'word_151', 'word_152', 'word_153', 'word_154', 'word_155'], 'layers': set(['mmax', 'mmax:markable', 'mmax:sentence'])}
markable_50010 {'mmax:span': 'word_108..word_115', 'mmax:id': 'markable_50010', 'label': 'markable_50010:sentence', 'tokens': ['word_108', 'word_109', 'word_110', 'word_111', 'word_112', 'word_113', 'word_114', 'word_115'], 'layers': set(['mmax', 'mmax:m

## Testing whether `ignore_sentence_annotations=True` and `False` produce the same CoNLL output

In [11]:
import os

# Root of the local corpus checkout; '~' is expanded to the current user's home.
MAZ_ROOTDIR = os.path.expanduser('~/repos/pcc-annis-merged/maz176/')
COREF_DIR = os.path.join(MAZ_ROOTDIR, 'coreference')

# Collect the MMAX2 project files with glob instead of capturing `!ls`:
# this is plain Python (no IPython shell magic), is not subject to shell
# word-splitting of the directory name, and yields an empty list when nothing
# matches. Sorting keeps the processing order deterministic.
coref_files = sorted(glob.glob(os.path.join(COREF_DIR, '*.mmax')))

In [17]:
from discoursegraphs.readwrite.conll import write_conll

# Export every document twice -- once ignoring and once keeping the MMAX2
# sentence annotations -- writing '<basename>.true' and '<basename>.false'
# side by side so the two CoNLL files can be compared afterwards.
CONLL_OUTPUT_DIR = '/tmp/dg'

# Create the output directory up front so the export also works on a fresh
# machine. NOTE(review): not verified whether write_conll would create it.
if not os.path.isdir(CONLL_OUTPUT_DIR):
    os.makedirs(CONLL_OUTPUT_DIR)

for coref_file in coref_files:
    # common output path prefix for both variants of this document
    output_stem = os.path.join(CONLL_OUTPUT_DIR, os.path.basename(coref_file))

    mdg_true = MMAXDocumentGraph(coref_file, ignore_sentence_annotations=True)
    mdg_false = MMAXDocumentGraph(coref_file, ignore_sentence_annotations=False)

    write_conll(mdg_true, output_stem + '.true')
    write_conll(mdg_false, output_stem + '.false')

In [None]:
# If ignore_sentence_annotations=False, all sentence annotations are interpreted as coreferences by the CoNLL exporter!