# Converter to Standard cdli-conll Data

In [24]:
import pandas as pd
import re

In [7]:
# import pyconll
# data = pyconll.load_from_file("./output/P000001.conll")

In [15]:
def read_conll(filepath):
    """Load a tab-separated CoNLL file into a DataFrame of string columns.

    The first non-blank line of the file is treated as a header and skipped.
    Every following non-blank line is stripped of surrounding whitespace and
    split on tabs: its first seven fields fill ID, WORD, SEGM, POS, MORPH,
    HEAD and EDGE, and its last field fills MISC.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CoNLL file; it is read as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per data line with the columns ID, WORD, SEGM, POS, MORPH,
        HEAD, EDGE and MISC. Values are kept as strings. The frame is empty
        (but still has these columns) when the file holds no data lines.

    Raises
    ------
    IndexError
        If a data line has fewer than seven tab-separated fields.
    """
    columns = ['ID', 'WORD', 'SEGM', 'POS', 'MORPH', 'HEAD', 'EDGE', 'MISC']
    n_leading = len(columns) - 1

    # Read the lines directly instead of going through a fixed-width parser:
    # the file is tab-delimited, and inferring column boundaries from
    # whitespace alignment can cut rows apart.
    with open(filepath, encoding="utf-8") as conll_file:
        lines = [line.strip() for line in conll_file if line.strip()]

    records = []
    for line in lines[1:]:  # lines[0] is the header row
        fields = line.split("\t")
        # MISC is taken from the end of the row, so rows carrying extra
        # fields in the middle still yield their final field as MISC.
        records.append([fields[i] for i in range(n_leading)] + [fields[-1]])

    # Build the frame in one go; assigning columns into an empty frame via
    # iloc depends on the frame being enlarged, which iloc does not do.
    return pd.DataFrame(records, columns=columns)

In [16]:
# Parse the annotated royal-subcorpus file for text P216736 and show the result
royal_conll_path = "./royal_subcorpus_data/P216736.conll"
df = read_conll(royal_conll_path)
df

Unnamed: 0,ID,WORD,SEGM,POS,MORPH,HEAD,EDGE,MISC
0,1,da-da,da-da[3],PN,PN,11,ERG,_
1,2,ensi2,ensi2[ruler],N,N,1,appos,_
2,3,szuruppak{ki},szuruppak{ki}[1],SN,SN.GEN,2,GEN,_
3,4,ha-la-ad-da,ha-la-ad-da[1],PN,PN,1,appos,_
4,5,ensi2,ensi2[ruler],N,N,1,appos,_
5,6,szuruppak{ki},szuruppak{ki}[1],SN,SN.GEN,5,GEN,_
6,7,dumu-ni,dumu[child],N,N.3-SG-H-POSS.ERG,1,appos,_
7,8,ad-us2,ad-us2[plank],N,N.ABS,11,ABS,_
8,9,abul,abul[gate],N,N,11,LOC,_
9,10,{d}sud3-da-ke4,{d}sud3[1],DN,DN.GEN.L3-NH,9,GEN,_


In [5]:
# Move towards the standard cdli-conll column set in a single chain:
# WORD becomes FORM, MORPH becomes XPOSTAG, and EDGE and POS are removed.
df = (
    df
    .rename(columns={"WORD": "FORM", "MORPH": "XPOSTAG"})
    .drop(columns=['EDGE', 'POS'])
)

df

Unnamed: 0,ID,FORM,SEGM,XPOSTAG,HEAD,MISC
0,1,da-da,da-da[3],PN,11,_
1,2,ensi2,ensi2[ruler],N,1,_
2,3,szuruppak{ki},szuruppak{ki}[1],SN.GEN,2,_
3,4,ha-la-ad-da,ha-la-ad-da[1],PN,1,_
4,5,ensi2,ensi2[ruler],N,1,_
5,6,szuruppak{ki},szuruppak{ki}[1],SN.GEN,5,_
6,7,dumu-ni,dumu[child],N.3-SG-H-POSS.ERG,1,_
7,8,ad-us2,ad-us2[plank],N.ABS,11,_
8,9,abul,abul[gate],N,11,_
9,10,{d}sud3-da-ke4,{d}sud3[1],DN.GEN.L3-NH,9,_


In [26]:
def extract_id(filepath):
    """Return the cleaned token IDs from a CoNLL file made by the ATF converter.

    The first two non-blank lines of the file are skipped (presumably the
    text-id line and the column-name line -- confirm against the converter
    output). For every remaining non-blank line, the first tab-separated
    field is taken as the raw ID and reduced to its first character followed
    by each number it contains, joined with dots, e.g. ``o.col1.12.3``
    becomes ``o.1.12.3`` and ``s1.1.1`` becomes ``s.1.1.1``.

    Parameters
    ----------
    filepath : str or path-like
        Location of the converter output; it is read as UTF-8.

    Returns
    -------
    list of str
        One cleaned ID per data line, in file order.
    """
    with open(filepath, encoding="utf-8") as conll_file:
        lines = [line.strip() for line in conll_file if line.strip()]

    cleaned_ids = []
    for line in lines[2:]:
        raw_id = line.split("\t", 1)[0]
        # "[0-9]+" keeps multi-digit numbers whole ("12" stays "12" rather
        # than being split into "1" and "2").
        numbers = re.findall(pattern="[0-9]+", string=raw_id)
        # Collect into a new list: rebinding a loop variable would leave the
        # source list untouched and the raw IDs would be returned.
        cleaned_ids.append(".".join([raw_id[0]] + numbers))
    return cleaned_ids

In [27]:
# Overwrite the ID column with the identifiers taken from the ATF converter output
atf_conll_path = "./output/P216736.conll"
df['ID'] = extract_id(atf_conll_path)
df

Unnamed: 0,ID,WORD,SEGM,POS,MORPH,HEAD,EDGE,MISC
0,a.1.1,da-da,da-da[3],PN,PN,11,ERG,_
1,a.2.1,ensi2,ensi2[ruler],N,N,1,appos,_
2,a.3.1,szuruppak{ki},szuruppak{ki}[1],SN,SN.GEN,2,GEN,_
3,a.4.1,ha-la-ad-da,ha-la-ad-da[1],PN,PN,1,appos,_
4,a.5.1,ensi2,ensi2[ruler],N,N,1,appos,_
5,a.6.1,szuruppak{ki},szuruppak{ki}[1],SN,SN.GEN,5,GEN,_
6,a.7.1,dumu-ni,dumu[child],N,N.3-SG-H-POSS.ERG,1,appos,_
7,a.8.1,ad-us2,ad-us2[plank],N,N.ABS,11,ABS,_
8,a.8.2,abul,abul[gate],N,N,11,LOC,_
9,a.9.1,{d}sud3-da-ke4,{d}sud3[1],DN,DN.GEN.L3-NH,9,GEN,_
