In [332]:
import nltk
from collections import defaultdict

## Preprocessing and data required for conversion of the two tagsets
### 1. Creating AnnCorra bigram counts table
This will be used to diambiguate tags for UD to AnnCorra conversion

1. Preparing data - using AnnCorra tags from 200 tagged sentences

In [360]:
# Preprocessing raw data
def preprocessTrainingData():
    tags = []
    shabd =[]
    file = open("./anntags", "r")
    raw_data = file.read()
    sentences = raw_data.split("<s>")
    for sentence in sentences:
        tags.append("start2")
        tags.append("start1")
        shabd.append("start2")
        shabd.append("start1")
        words_info = sentence.split("\n")
        for word_info in words_info:
            tokens = word_info.split("\t")
            try:
#                 print(tokens[1], tokens[7])
                tags.append(tokens[1])
                shabd.append(tokens[0])
            except:
                print("WARNING: No problem, just an empty line detected in raw data")
    
    return tags, shabd

2. Grenerate a probability/count table with bigrams/trigrams of AnnCorra tags.

In [361]:
def generateCountTable(tags):
    
    tokens = nltk.word_tokenize(tags)
    bgs = nltk.ngrams(tokens,2)
    tgs = nltk.ngrams(tokens,3)
    dict_bgs = dict(nltk.FreqDist(bgs))
    dict_tgs = dict(nltk.FreqDist(tgs))
    dict_bgs = sorted(dict_bgs.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)
    dict_tgs = sorted(dict_tgs.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)
    
    return dict_bgs, dict_tgs

In [362]:
def find_tags():
    
    tags, words = preprocessTrainingData()
    return tags, words

In [363]:
ann_tags, ann_words = find_tags()
bi_pt, tri_pt = generateCountTable(' '.join(ann_tags))

# print(ann_tags)



In [364]:
# bi = dict(bi_pt)
# tri = dict(tri_pt)

In [365]:
# print(tri_pt[1][1])
bi = defaultdict(lambda : defaultdict(int))
tri = defaultdict(lambda : defaultdict(lambda: defaultdict(int)))

for i in range(len(bi_pt)):
    bi[bi_pt[i][0][0]][bi_pt[i][0][1]] = bi_pt[i][1]
    
for i in range(len(tri_pt)):
    tri[tri_pt[i][0][0]][tri_pt[i][0][1]][tri_pt[i][0][2]] = tri_pt[i][1]

In [366]:
# print(bi_pt)

In [367]:
# print(tri['lwg__psp'])
# print(ann_tags)



### 2. Mappings

Using a dataset of 200 sentences maked with both AnnCorra and UD, using linguistic cues and other resources like [TANDON J. et al](https://www.aclweb.org/anthology/W16-1716.pdf)

In [368]:
def ud_train_tags():
    udfile = open('./asgn3_Ud.txt', 'r')
    udinp = udfile.read()
    ud_train_tags = []
    shabd =[]
    sentences = udinp.split("<s>")
    for sentence in sentences:
        ud_train_tags.append("start2")
        ud_train_tags.append("start1")
        shabd.append("start2")
        shabd.append("start1")
        words = sentence.split("\n")
        for word in words:
            tokens = word.split("\t")
            try: 
                if len(tokens[1]) > 1:
    #                 print("this" ,tokens[1])
                    ud_train_tags.append(tokens[1])
                    shabd.append(tokens[0])
            except:
                print("WARNING: No problem, just an empty line detected in raw data")
    
    return ud_train_tags, shabd

In [369]:
ud_tags, ud_words = ud_train_tags()



In [370]:
# # print(ud_tags)
# for i in range(len(ud_tags)):
#     print(ann_words[i])

In [371]:
# for i in range(len(ud_tags)):
#     print(ud_words[i])

In [372]:
def find_map_with_data(ud_tags, ann_tags):
    new_map = {}
#     print(len(ud_tags), len(ann_tags))
    for i,ud in enumerate(ud_tags):
        new_map.setdefault(ud,[])
        if(ann_tags[i] not in new_map[ud]):
            new_map[ud].append(ann_tags[i])
    return new_map

In [373]:
new_map = find_map_with_data(ud_tags, ann_tags)
print(new_map)

{'start2': ['start2'], 'start1': ['start1'], 'obj': ['k2', 'ccof', 'sent-adv', 'pof', 'pof__cn', 'k1s', 'k1', 'rt'], 'nsubj': ['k1', 'k1s', 'mod', 'nmod', 'pof__cn', 'k2', 'nmod__adj', 'rsym', 'nmod__k1inv', 'k4a', 'pof', 'k7', 'ccof'], 'dep': ['lwg__rp', 'lwg__neg', 'main', 'lwg__psp', 'rs', 'mod'], 'advmod': ['lwg__neg', 'jjmod__intf', 'adv', 'k1', 'jjmod', 'k1s', 'k2', 'k7t', 'ccof', 'pof__cn'], 'root': ['main', 'r6', 'k1s', 'k2', 'rt', 'vmod', 'ccof', 'k1', 'lwg__psp', 'k7', 'nmod__k2inv', 'pof__cn', 'nmod__adj', 'rs', 'nmod', 'nmod__relc'], 'aux': ['lwg__vaux', 'main'], 'aux:pass': ['lwg__vaux_cont', 'lwg__vaux'], 'nmod': ['k5', 'r6', 'k7p', 'rsym', 'vmod', 'lwg__psp', 'r6-k2', 'k7t', 'ccof', 'nmod', 'k7', 'ras-k1', 'nmod__emph', 'nmod__adj', 'k1', 'lwg__neg', 'rh', 'k1s', 'pof__cn', 'jjmod', 'k1u', 'k3', 'adv', 'fragof', 'rsp', 'lwg__rp', 'rt'], 'case': ['lwg__psp', 'lwg__rp', 'mod', 'vmod', 'pof__cn', 'nmod__adj'], 'compound': ['nmod', 'pof__cn', 'pof', 'k2', 'k1', 'vmod', 'r6',

Upated the map found using the dataset of 200 sentences with those found in other resources mentioned above.

In [374]:
ud_tagset = ['acl', 'neg', 'dislocated', 'nmod', 'iobj', 'punct', 'vocative', 'advmod', 'dep', 'compound', 'case'
            , 'det', 'dobj', 'amod', 'parataxis', 'ccomp','xcomp', 'aux', 'auxpass', 'nsubj','nsubjpass',
             'nummod','advcl','root','conj','csubj','obj','obl','expl','discourse','cop','mark','appos',
             'clf','fixed','flat','list_','orphan','goeswith','reparandum','cc']

In [389]:
mapping = {'acl' : ['nmod_k1inv', 'nmod_k2inv', 'nmod_relc', 'rs', 'k2g', 'k2s', 'rbmod_relc'], 
           'neg' : ['nmod_neg', 'lwg_neg'], 'dislocated' : ['fragof'], 
           'nmod' : ['k2u', 'jk1', 'k1u', 'k3', 'k3u', 'k2p', 'k4u', 'k5', 'k7', 'k7a', 'k7p', 'k7pu', 'k7t', 'k7tu',
                       'k7u','r6', 'r6-k1', 'r6-k2', 'r6v', 'ras-k1', 'ras-k1u', 'ras-k2', 'ras-k4', 'ras-k4a', 
                       'ras-k7', 'ras-k7p','ras-neg', 'ras-pof', 'ras-r6', 'ras-r6-k2', 'ras-rt','nmod_emph'], 
           'iobj' : ['k4'], 
           'punct' : ['rsym'], 
           'start1' : ['start1'],
           'start2' : ['start2'],
           'vocative' : ['rad'], 
           'nmod:poss' : ['r6'],
           'advmod' : ['rd', 'rsp', 'lwg_intf', 'vmod_adv', 'jjmod_intf', 'jjmod', 'adv', 'rbmod'], 
           'dep' : ['lwg_rp', 'lwg_unk', 'undef'], 
           'compound' : ['pof_cn', 'pof_redup', 'lwg_rdp', 'lwg_vm', 'nmod_pofinv', 'pof', 'pof_inv'], 
           'case' : ['lwg_psp', 'lwg_nst', 'psp_cl', 'lwg_k1'], 
           'det' : ['mod_wq'], 
           'dobj' : ['k2', 'k1s', 'mk1'], 
           'amod' : ['nmod_adj', 'nmod'], 
           'parataxis' : ['vmod'], 'ccomp' : ['k2'], 
           'xcomp' : ['k2'], 
           'aux' : ['lwg_vaux', 'lwg_vaux_cont'], 
           'aux:pass' : ['lwg_vaux'], 
           'nsubj' : ['k1', 'k4a', 'pk1', 'k1s', 'mod', 'nmod', 'k2'], 
           'nsubj:pass' : ['k1'], 
           'nummod' : ['enm'], 
           'advcl' : ['rh', 'rt', 'rtu', 'sent-adv', 'vmod'], 
           'root' : ['root'], 
           'conj' : ['ccof'], 
           'obj' : ['k2', 'ccof', 'sent-adv', 'pof', 'pof__cn', 'k1s', 'k1', 'rt'], 
           'obl' : ['k7t', 'k2p', 'k7', 'rt', 'k5', 'rh', 'vmod', 'sent-adv', 'adv', 'k7p', 'k7a', 
                    'k1', 'ras-k1', 'ccof', 'k3', 'rsym', 'rd', 'k2', 'nmod', 'k1s'], 
           'cop' : ['lwg_vaux', 'root'], 
           'mark' : ['lwg__psp'], 
           'cc' : ['k1']}

### 3. Creating UD bigram counts table
This will be used to diambiguate tags for AnnCorra to UD conversion

In [376]:
def generateUDCountTable(tags):
    tokens = nltk.word_tokenize(tags)
    bgs = nltk.ngrams(tokens,2)
    dict_bgs = dict(nltk.FreqDist(bgs))
    dict_bgs = sorted(dict_bgs.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)
    
    return dict_bgs

In [377]:
bi_ud_old = generateUDCountTable(' '.join(ud_tags))

In [378]:
bi_ud = defaultdict(lambda : defaultdict(int))

for i in range(len(bi_ud_old)):
    bi_ud[bi_ud_old[i][0][0]][bi_ud_old[i][0][1]] = bi_ud_old[i][1]

In [379]:
# print(bi_ud)

## Input
The program takes in a file containing a list of tab separated word and its tag and each sentence is separated by a `</s>` tag.

In [380]:
def preprocessUDInput():
    tags =[]
    shabd =[]
    inputfile = open("./20171099/Udfile.txt", "r")
    inputtext = inputfile.read()
    sentences = inputtext.split("<s>")
    for sentence in sentences:
        tags.append("start2")
        tags.append("start1")
        shabd.append("start2")
        shabd.append("start1")
        words = sentence.split("\n")
        for word in words:
            tokens = word.split("\t")
#             print(tokens)
            try:
    #                 print(tokens[1], tokens[7])
                shabd.append(tokens[0])
                tags.append(tokens[1])
            except:
                print("WARNING: No problem, just an empty line detected in raw data")
                
    return tags, shabd

In [381]:
def preprocessAnnCorraInput():
    tags =[]
    shabd =[]
    inputfile = open("./20171099/annCorrafile.txt", "r")
    inputtext = inputfile.read()
    sentences = inputtext.split("<s>")
    for sentence in sentences:
        tags.append("start2")
        tags.append("start1")
        shabd.append("start2")
        shabd.append("start1")
        words_info = sentence.split("\n")
        for word_info in words_info:
            tokens = word_info.split("\t")
            try:
    #                 print(tokens[1], tokens[7])
                shabd.append(tokens[1])
                tags.append(tokens[7])
            except:
                print("WARNING: No problem, just an empty line detected in raw data")
                
    return tags, shabd

Convert assignment1 and assignment2 submissions to the above mentioned format.  

In [382]:
# inp_tags = ['start2', 'start1','amod', 'nsubj', 'cc', 'amod', 'conj', 'nmod:poss', 'dobj', 'case', 'root', 'aux', 'punct']
# inp_tags = ['start2', 'start1','r6', 'ccof', 'k7p', 'ccof', 'lwg_psp', 'lwg_psp', 'k1', 'pof', 'root', 'lwg_vaux', 'rsym']

MANUALLY CHANGE to select conversion

In [390]:
def findTagsetUsed(inp_tags):
    #MANUALLY CHANGE
    findUDtoAnnTag = False
    if findUDtoAnnTag == True:
        inp_tags, words = preprocessUDInput()

    if findUDtoAnnTag == False:
        inp_tags, words = preprocessAnnCorraInput()
        
    
    return findUDtoAnnTag, inp_tags, words

In [391]:
findUDtoAnnTag, inp_tags, words = findTagsetUsed(inp_tags)



In [392]:
print(inp_tags)

['start2', 'start1', 'start2', 'start1', 'nmod_adj', 'k1', 'r6', 'root', 'rsym', 'start2', 'start1', 'k1', 'nmod_adj', 'jjmod', 'k1s', 'root', 'rsym', 'start2', 'start1', 'k1', 'nmod_adj', 'k1s', 'root', 'rsym', 'start2', 'start1', 'k1', 'pof', 'root', 'lwg_vaux', 'rsym', 'start2', 'start1', 'k1', 'k7t', 'k1s', 'root', 'rsym', 'start2', 'start1', 'nmod_adj', 'k1', 'k1s', 'root', 'lwg_vaux', 'rsym', 'start2', 'start1', 'k1', 'nmod_adj', 'k2', 'root', 'lwg_vaux', 'rsym', 'start2', 'start1', 'k1', 'r6', 'lwg_psp', 'k1s', 'root', 'rsym', 'start2', 'start1', 'k1', 'k2p', 'vmod', 'root', 'lwg_vaux', 'rsym', 'start2', 'start1', 'k1', 'pof', 'root', 'lwg_vaux', 'lwg_vaux_cont', 'rsym', 'start2', 'start1', 'k1', 'nmod_adj', 'nmod_adj', 'k1s', 'root', 'rsym', 'start2', 'start1', 'k1', 'k5', 'lwg_psp', 'nmod_adj', 'k7t', 'root', 'lwg_vaux', 'rsym', 'start2', 'start1', 'nmod_adj', 'k7p', 'lwg_psp', 'pof', 'nmod_k1inv', 'k1', 'root', 'rsym', 'start2', 'start1', 'r6', 'lwg_psp', 'pof', 'root', 'lwg_

## AnnCorra to UD Conversion
Using mappings and UD counts table to find the correct UD tag for the given AnnCorra tag

In [393]:
if findUDtoAnnTag == False:
    ud = []
    dictionary_of_possibilities = {}
    for tag in inp_tags:
        for i,ann_tag in enumerate(list(mapping.values())):
            if tag in ann_tag:
#                 print(tag, list(mapping.keys())[i])
                dictionary_of_possibilities.setdefault(tag,[])
                if(list(mapping.keys())[i] not in dictionary_of_possibilities[tag]):
                    dictionary_of_possibilities[tag].append(list(mapping.keys())[i])

    for i in range(len(inp_tags)):
        try:
            if len(dictionary_of_possibilities[inp_tags[i]]) == 1:
                ud.append(dictionary_of_possibilities[inp_tags[i]][0])
            elif len(dictionary_of_possibilities[inp_tags[i]]) > 1:
                max_score = 0
                for possibility in dictionary_of_possibilities[inp_tags[i]]:
                    score = bi_ud[ud[-1]][possibility]
                    if score != 0: #bigram found
                        if score > max_score:
                            max_score = score
                            ud_tag = possibility

                    else:
                        if len(list(bi[ud[-1]])) != 0:
                            ud_tag = (list(bi[ud[-1]])[0])
                        else:
                            ud_tag = ("case") #most_frequent_tag 

                    ud.append(ud_tag)
                    break;  
        except:
            if len(list(bi[ud[-1]])) != 0:
                ud_tag = (list(bi[ud[-1]])[0])
            else:
                ud_tag = ("case") #most_frequent_tag 

            ud.append(ud_tag)

    print("S.No.\t\tWord\t\tAnnCorra\t\tUD\n")
    for i, ann in enumerate(inp_tags):
        print(i, "\t\t", words[i], "\t\t",ann, "\t\t", ud[i])   

S.No.		Word		AnnCorra		UD

0 		 start2 		 start2 		 start2
1 		 start1 		 start1 		 start1
2 		 start2 		 start2 		 start2
3 		 start1 		 start1 		 start1
4 		 यह 		 nmod_adj 		 amod
5 		 जूते 		 k1 		 nsubj
6 		 उसके 		 r6 		 nmod
7 		 हैं 		 root 		 root
8 		 । 		 rsym 		 punct
9 		 start2 		 start2 		 start2
10 		 start1 		 start1 		 start1
11 		 यह 		 k1 		 nsubj
12 		 40 		 nmod_adj 		 amod
13 		 मीटर 		 jjmod 		 advmod
14 		 ऊँची 		 k1s 		 case
15 		 है 		 root 		 root
16 		 । 		 rsym 		 punct
17 		 start2 		 start2 		 start2
18 		 start1 		 start1 		 start1
19 		 सुनहरी 		 k1 		 nsubj
20 		 एक 		 nmod_adj 		 amod
21 		 जासूस 		 k1s 		 case
22 		 है 		 root 		 root
23 		 । 		 rsym 		 punct
24 		 start2 		 start2 		 start2
25 		 start1 		 start1 		 start1
26 		 राजा 		 k1 		 nsubj
27 		 नतमस्तक 		 pof 		 compound
28 		 हो 		 root 		 root
29 		 गया 		 lwg_vaux 		 aux
30 		 । 		 rsym 		 punct
31 		 start2 		 start2 		 start2
32 		 start1 		 start1 		 start1
33 		 माही 		 k1 		 nsubj

380 		 यह 		 nmod_adj 		 amod
381 		 साईकल 		 k1 		 nsubj
382 		 तुम्हारे 		 rt 		 advcl
383 		 लिए 		 lwg_psp 		 case
384 		 बहुत 		 nmod_adj 		 amod
385 		 छोटी 		 k1s 		 case
386 		 है 		 root 		 root
387 		 । 		 rsym 		 punct
388 		 start2 		 start2 		 start2
389 		 start1 		 start1 		 start1
390 		 यह 		 nmod_adj 		 amod
391 		 साथी 		 k1 		 nsubj
392 		 इमारतें 		 k7p 		 nmod
393 		 1643 		 k7t 		 nmod
394 		 में 		 lwg_psp 		 case
395 		 पुरी 		 pof 		 compound
396 		 हुईं 		 root 		 root
397 		 । 		 rsym 		 punct
398 		 start2 		 start2 		 start2
399 		 start1 		 start1 		 start1
400 		 मुख्य 		 nmod_adj 		 amod
401 		 द्वार 		 k1 		 nsubj
402 		 भी 		 lwg_rp 		 dep
403 		 एक 		 nmod_adj 		 amod
404 		 स्मारक 		 pof_cn 		 compound
405 		 स्वरूप 		 k1 		 nsubj
406 		 है 		 root 		 root
407 		 । 		 rsym 		 punct
408 		 start2 		 start2 		 start2
409 		 start1 		 start1 		 start1
410 		 दोनों 		 k1 		 nsubj
411 		 ही 		 lwg_rp 		 dep
412 		 लाल 		 nmod_adj 		 amod
413 		 बलुआ 		 p

828 		 पर 		 lwg_psp 		 case
829 		 वही 		 nmod_adj 		 amod
830 		 कमलाकार 		 k1 		 nsubj
831 		 आकृति 		 ccof 		 conj
832 		 एवं 		 k1s 		 case
833 		 किरीट 		 pof_cn 		 compound
834 		 कलश 		 ccof 		 conj
835 		 भी 		 lwg_rp 		 dep
836 		 हैं 		 root 		 root
837 		 । 		 rsym 		 punct
838 		 start2 		 start2 		 start2
839 		 start1 		 start1 		 start1
840 		 तब 		 k7t 		 nmod
841 		 इस 		 nmod_adj 		 amod
842 		 किले 		 k7p 		 nmod
843 		 में 		 lwg_psp 		 case
844 		 इब्राहिम 		 r6 		 nmod
845 		 के 		 lwg_psp 		 case
846 		 स्थान 		 k7 		 nmod
847 		 पर 		 lwg_psp 		 case
848 		 बाबर 		 k1 		 nsubj
849 		 आया 		 root 		 root
850 		 । 		 rsym 		 punct
851 		 start2 		 start2 		 start2
852 		 start1 		 start1 		 start1
853 		 रास्ते 		 k7p 		 nmod
854 		 में 		 lwg_psp 		 case
855 		 , 		 rsym 		 punct
856 		 माही 		 k4a 		 nsubj
857 		 को 		 lwg_psp 		 case
858 		 राज 		 k5 		 nmod
859 		 से 		 lwg_psp 		 case
860 		 प्यार 		 pof 		 compound
861 		 हो 		 root 		 root
862 		 जाता 		 l

In [394]:
# print(list(mapping.values())[0])

## UD to AnnCorra Conversion

### Finding AnnCorra tag for a given UD tag 
Using Mappings and counts table with the help of the following rules:
1. If a UD tag has only one equivalent AnnCorra tag in Mappings - assign that tag(no need to use counts table.)
2. If a UD tag has multiple equivalent AnnCorra tags in Mappings - use the counts table to find most likely tag - checking from trigrams/bigrams.
3. If the neither trigrams nor bigrams exist corresponding to the tags in Mappings OR if a UD tag is not present in the table - then select the best possible trigram/bigram.

*Using the trigram table reduced accuracy hence, has been omitted in the following code:*

In [395]:
# print(findUDtoAnnTag)
if findUDtoAnnTag == True:
    ann = []
    for i in range(len(inp_tags)):
        try:
            if len(mapping[inp_tags[i]]) == 1:
                ann.append(mapping[inp_tags[i]][0])

            elif len(mapping[inp_tags[i]]) > 1:
                max_score = 0

                #check bigrams
                for mapped in mapping[inp_tags[i]]:    
                    score = bi[ann[-1]][mapped]
                    if score != 0: #bigram found
                        if score > max_score:
                            max_score = score
                            ann_tag = mapped

                    else:
                        if len(list(tri[ann[-2]][ann[-1]])) != 0:
                            ann_tag = (list(tri[ann[-2]][ann[-1]])[0])
                        elif len(list(bi[ann[-1]])) != 0:
                            ann_tag = (list(bi[ann[-1]])[0])
                        else:
                            ann_tag = ("lwg_psp") #most_frequent_tag 

                    ann.append(ann_tag)
                    break;  
        except:
            if len(list(tri[ann[-2]][ann[-1]])) != 0:
                ann_tag = (list(tri[ann[-2]][ann[-1]])[0])
            elif len(list(bi[ann[-1]])) != 0:
                ann_tag = (list(bi[ann[-1]])[0])
            else:
                ann_tag = ("lwg_psp") #most_frequent_tag 

            ann.append(ann_tag)
    
    print("S.No.\t\tUD\t\tAnnCorra\n")
    for i, ud in enumerate(inp_tags):
        print(i, "\t\t", ud, "\t\t", ann[i])    