In [121]:
from io import open
from conllu import parse
from collections import defaultdict

# Input treebank (CoNLL-U) and the two name fragments used later to build the
# output file name.
file_name = "tr_imst-ud-train.conllu"
written_file_identification_beginning = "imst"
written_file_identification_ending = "train"

# Read the whole treebank inside a context manager so the file handle is
# closed as soon as the text is in memory (the handle used to stay open for
# the lifetime of the kernel).
with open(file_name, "r", encoding="utf-8") as data_file:
    data = data_file.read()

# One parsed sentence per CoNLL-U sentence block; the tokens are indexed by
# column name ('form', 'lemma', 'upos', 'feats', 'head', 'deprel', ...) below.
sentences = parse(data)

In [122]:
# Dependency relations that are collected as arguments, mapped to the list
# in each sentence record that stores them.
_SLOT_FOR_DEPREL = {'obj': 'Objs',
                    'obl': 'Obls',
                    'xcomp': 'XComps',
                    'ccomp': 'CComps'}

# Only tokens with these UPOS tags get their 'Case' feature recorded.
_CASE_BEARING_UPOS = ('NOUN', 'PROPN')


def _argument_info(token):
    """Describe one argument token as a small dict.

    The result always has 'Form', 'Cat' (the UPOS tag) and 'Head' (id of the
    governing token). 'Case' is added only for NOUN/PROPN tokens whose
    features contain a Case value; a token without features, or without a
    Case feature, simply gets no 'Case' key.
    """
    info = {'Form': token['form'],
            'Cat': token['upos']}
    if token['upos'] in _CASE_BEARING_UPOS:
        # 'feats' may be None when the token has no morphological features.
        feats = token['feats'] or {}
        if 'Case' in feats:
            info['Case'] = feats['Case']
    info['Head'] = token['head']
    return info


# sent_info maps the sentence index (as a string) to the root predicate and
# to every obj / obl / xcomp / ccomp token found anywhere in the sentence.
sent_info = {}

for index, sentence in enumerate(sentences):
    record = {'PRED': None,
              'Objs': [],
              'Obls': [],
              'XComps': [],
              'CComps': []}

    for token in sentence:
        deprel = token['deprel']
        if deprel == 'root':
            # If a sentence has several root tokens the last one wins.
            record['PRED'] = {'Lemma': token['lemma'],
                              'Cat': token['upos'],
                              'Predicate_ID': token['id']}
        elif deprel in _SLOT_FOR_DEPREL:
            # Each relation goes to its own list; ccomp tokens in particular
            # are stored under 'CComps' rather than being mixed into 'XComps'.
            record[_SLOT_FOR_DEPREL[deprel]].append(_argument_info(token))

    sent_info[str(index)] = record

In [123]:
# Argument lists that are copied from sent_info after filtering.
_ARGUMENT_SLOTS = ('Objs', 'Obls', 'XComps', 'CComps')

# sent_info_processed keeps, for every sentence, only the arguments whose
# head is the root predicate itself; arguments attached to other tokens are
# dropped. The predicate entry no longer carries its token id.
sent_info_processed = {}

for key, val in sent_info.items():
    pred = val['PRED']
    if pred is None:
        # No root token was found for this sentence, so there is no
        # predicate to attach arguments to; leave the sentence out instead
        # of failing on None['Predicate_ID'].
        continue

    predicate_id = pred['Predicate_ID']
    processed = {'PRED': {'Lemma': pred['Lemma'],
                          'Cat': pred['Cat']}}
    for slot in _ARGUMENT_SLOTS:
        processed[slot] = [arg for arg in val[slot]
                           if arg['Head'] == predicate_id]

    sent_info_processed[key] = processed
        
        

In [124]:
# predicates maps "<lemma>(V)" to an empty valency frame for every verbal
# root, and "<lemma>(NonVerb)" to None for every other root category.
predicates = {}

for val in sent_info_processed.values():
    pred = val['PRED']
    if pred is None:
        # Nothing to register for a sentence without a root predicate.
        continue

    if pred['Cat'] == 'VERB':
        # A fresh frame per lemma so no two predicates share list objects.
        # The CComp slots are part of the frame so that clausal complements
        # can be appended and converted to sets like the other arguments.
        predicates[pred['Lemma'] + "(V)"] = {'ObjCat': [],
                                             'ObjCase': [],
                                             'OblCat': [],
                                             'OblCase': [],
                                             'XCompCat': [],
                                             'XCompCase': [],
                                             'CCompCat': [],
                                             'CCompCase': []}
    else:
        predicates[pred['Lemma'] + "(NonVerb)"] = None

        

In [125]:
# (sentence list, category field, case field) for each argument type.
_VALENCY_FIELDS = (('Objs', 'ObjCat', 'ObjCase'),
                   ('Obls', 'OblCat', 'OblCase'),
                   ('XComps', 'XCompCat', 'XCompCase'),
                   ('CComps', 'CCompCat', 'CCompCase'))

# Append the category (and the case, when one was recorded) of every argument
# to the frame of its verbal predicate.
#
# All four argument types are read from sent_info_processed, i.e. only
# arguments headed directly by the root predicate are counted. Reading the
# unfiltered sent_info would attribute arguments of embedded clauses to the
# root verb.
for val in sent_info_processed.values():
    pred = val['PRED']
    if pred is None or pred['Cat'] != 'VERB':
        # Only verbal roots have a frame; their key is "<lemma>(V)".
        continue

    frame = predicates.get(pred['Lemma'] + "(V)")
    if frame is None:
        continue

    for slot, cat_field, case_field in _VALENCY_FIELDS:
        for arg in val[slot]:
            # setdefault keeps this working for frames created without the
            # field in question.
            frame.setdefault(cat_field, []).append(arg['Cat'])
            # Arguments without a recorded case still contribute their
            # category; a missing 'Case' must not discard the argument or
            # the remaining arguments of the sentence.
            if 'Case' in arg:
                frame.setdefault(case_field, []).append(arg['Case'])

In [126]:
# Frame fields whose collected values are reduced to their distinct members.
_SET_FIELDS = ('ObjCat', 'ObjCase',
               'OblCat', 'OblCase',
               'XCompCat', 'XCompCase',
               'CCompCat', 'CCompCase')

# Collapse every collected list into a set. Non-verbal predicates are stored
# as None and are skipped explicitly; a field that a frame does not have is
# left alone. No blanket except is used, so genuine errors and kernel
# interrupts are no longer swallowed.
for val in predicates.values():
    if val is None:
        continue
    for field in _SET_FIELDS:
        if field in val:
            val[field] = set(val[field])
    

In [127]:
# Output name is built from the two identification fragments, e.g.
# "imst_valency_info_train.txt".
written_file_name = written_file_identification_beginning + "_valency_info_" + written_file_identification_ending + ".txt"


# Write one "<predicate>:<frame>" line per predicate. The encoding is given
# explicitly (the input is read as UTF-8) so that lemmas with non-ASCII
# characters are written the same way on every platform instead of depending
# on the locale's default encoding.
with open(written_file_name, 'w', encoding="utf-8") as f:
    for key, value in predicates.items():
        f.write('%s:%s\n' % (key, value))