# Converter to Standard cdli-conll Data

In [2]:
import pandas as pd
import re
import numpy as np
import os

In [12]:
# read_conll takes in a subcorpus conll file and outputs a dataframe
def read_conll(filepath):
    """Parse a tab-separated subcorpus CoNLL file into a DataFrame.

    The file is read line by line instead of through ``pd.read_fwf``: the
    fixed-width reader guesses column boundaries from whitespace, which on
    tab-separated rows can spread a line over several columns (or peel the
    ID off into a ``#`` column), so fields could be lost.

    Args:
        filepath: Path of the subcorpus ``.conll`` file.

    Returns:
        A DataFrame of strings with the columns ID, WORD, SEGM, POS, MORPH,
        HEAD, EDGE and MISC, one row per token line. Blank lines and lines
        starting with ``#`` are skipped; a file without any token line
        yields an empty DataFrame with those columns.

    Raises:
        ValueError: If a token line has fewer than seven tab-separated
            fields.
    """
    columns = ['ID', 'WORD', 'SEGM', 'POS', 'MORPH', 'HEAD', 'EDGE', 'MISC']
    leading = len(columns) - 1

    rows = []
    with open(filepath, encoding='utf-8') as conll_file:
        for line_number, raw_line in enumerate(conll_file, start=1):
            line = raw_line.strip(' \r\n')
            # comment lines (e.g. "# global.columns = ...") and blank lines
            # carry no token data, wherever they appear in the file
            if not line.strip() or line.startswith('#'):
                continue
            fields = line.split('\t')
            if len(fields) < leading:
                raise ValueError(
                    '{}: line {} has {} tab-separated fields, expected at least {}'
                    .format(filepath, line_number, len(fields), leading))
            # the first seven fields map onto ID..EDGE; MISC is always the
            # last field of the line, whatever the line's length
            rows.append(fields[:leading] + [fields[-1]])

    return pd.DataFrame(rows, columns=columns)


# extract_id takes an ATF converted file and extracts the ID column as a vector
def extract_id(filepath):
    """Return the normalised ID of every token row in an ATF-converted file.

    The first two non-blank lines of the file are treated as header lines
    and skipped. For each remaining line the first tab-separated field is
    reduced to its first character followed by every run of digits it
    contains, joined with dots (``s1.1.1`` -> ``s.1.1.1``,
    ``o.col1.12.3`` -> ``o.1.12.3``).

    Args:
        filepath: Path of the ``.conll`` file produced by the ATF converter.

    Returns:
        A list of normalised ID strings, one per token row, in file order.
    """
    with open(filepath, encoding='utf-8') as atf_file:
        lines = [line.strip(' \r\n') for line in atf_file if line.strip()]

    id_col = []
    for line in lines[2:]:
        raw_id = line.split('\t')[0]
        # match whole numbers, not single digits, so that multi-digit
        # numbers such as "12" are not split into "1.2"
        numberings = re.findall(r'[0-9]+', raw_id)
        # raw_id[:1] keeps the first character and tolerates an empty field
        id_col.append('.'.join([raw_id[:1]] + numberings))
    return id_col


def do_it_all(subcorpus_filepath):
    """Convert one subcorpus CoNLL file and write ``<text_number>.conll``.

    The IDs are taken from the ATF-converted file of the same text number in
    ``./atf_converted_data/``; the remaining columns come from the subcorpus
    file. The result is written to the current working directory as a
    ``#new_text=<text_number>`` line followed by a tab-separated table with
    the columns ID, FORM, SEGM, XPOSTAG, HEAD, DEPREL and MISC.

    Args:
        subcorpus_filepath: Path of the subcorpus ``.conll`` file; its file
            name without the extension is used as the text number.

    Returns:
        None. When the ATF-converted file is missing, or its row count
        differs from the subcorpus data, a message is printed and nothing is
        written.
    """
    # file name without directory and extension, e.g. "P216736"; unlike a
    # regex on "/" this also works for paths without a directory part
    text_number = os.path.splitext(os.path.basename(subcorpus_filepath))[0]

    # get the ATF converted data's file path
    atf_converted_filepath = './atf_converted_data/' + text_number + '.conll'
    # nothing can be converted without the ATF output, so check for it
    # before parsing the subcorpus file
    if not os.path.isfile(atf_converted_filepath):
        print(text_number + ': This file errored out during the ATF conversion')
        return None

    df = read_conll(subcorpus_filepath)
    full_id = extract_id(atf_converted_filepath)
    if len(full_id) != df.shape[0]:
        print(text_number + ': Rows of Subcorpus Data and ATF converted data do not match')
        return None

    # update the ID column
    df['ID'] = full_id
    # rename the WORD and MORPH column
    df = df.rename(columns={'WORD': 'FORM', 'MORPH': 'XPOSTAG'})
    # drop the EDGE and POS columns
    df = df.drop(columns=['EDGE', 'POS'])
    # add the DEPREL column, filled with the placeholder "_"
    df['DEPREL'] = '_'
    # reorder the columns
    df = df[['ID', 'FORM', 'SEGM', 'XPOSTAG', 'HEAD', 'DEPREL', 'MISC']]

    output_filepath = text_number + '.conll'
    # write the text header first, then append the table below it
    with open(output_filepath, 'w', encoding='utf-8') as out_file:
        out_file.write('#new_text=' + text_number + '\n')
    df.to_csv(output_filepath, header=True, index=False, sep='\t', mode='a')
    return None

In [None]:
# directory that holds the subcorpus .conll files to convert
subcorpus_directory = './royal_subcorpus_data'

# gather the path of every regular file in that directory
filepaths = []
for filename in os.listdir(subcorpus_directory):
    f = os.path.join(subcorpus_directory, filename)
    # sub-directories and other non-file entries are skipped
    if not os.path.isfile(f):
        continue
    filepaths.append(f)

for fp in filepaths:
    

Old method

In [None]:
# read_conll takes in a subcorpus conll file and outputs a dataframe
def read_conll(filepath):
    """Parse a tab-separated subcorpus CoNLL file into a DataFrame.

    The file is read line by line instead of through ``pd.read_fwf``: the
    fixed-width reader guesses column boundaries from whitespace, which on
    tab-separated rows can spread a line over several columns (or peel the
    ID off into a ``#`` column), so fields could be lost.

    Args:
        filepath: Path of the subcorpus ``.conll`` file.

    Returns:
        A DataFrame of strings with the columns ID, WORD, SEGM, POS, MORPH,
        HEAD, EDGE and MISC, one row per token line. Blank lines and lines
        starting with ``#`` are skipped; a file without any token line
        yields an empty DataFrame with those columns.

    Raises:
        ValueError: If a token line has fewer than seven tab-separated
            fields.
    """
    columns = ['ID', 'WORD', 'SEGM', 'POS', 'MORPH', 'HEAD', 'EDGE', 'MISC']
    leading = len(columns) - 1

    rows = []
    with open(filepath, encoding='utf-8') as conll_file:
        for line_number, raw_line in enumerate(conll_file, start=1):
            line = raw_line.strip(' \r\n')
            # comment lines (e.g. "# global.columns = ...") and blank lines
            # carry no token data, wherever they appear in the file
            if not line.strip() or line.startswith('#'):
                continue
            fields = line.split('\t')
            if len(fields) < leading:
                raise ValueError(
                    '{}: line {} has {} tab-separated fields, expected at least {}'
                    .format(filepath, line_number, len(fields), leading))
            # the first seven fields map onto ID..EDGE; MISC is always the
            # last field of the line, whatever the line's length
            rows.append(fields[:leading] + [fields[-1]])

    return pd.DataFrame(rows, columns=columns)


# extract_id takes an ATF converted file and extracts the ID column as a vector
def extract_id(filepath):
    """Return the normalised ID of every token row in an ATF-converted file.

    The first two non-blank lines of the file are treated as header lines
    and skipped. For each remaining line the first tab-separated field is
    reduced to its first character followed by every run of digits it
    contains, joined with dots (``s1.1.1`` -> ``s.1.1.1``,
    ``o.col1.12.3`` -> ``o.1.12.3``).

    Args:
        filepath: Path of the ``.conll`` file produced by the ATF converter.

    Returns:
        A list of normalised ID strings, one per token row, in file order.
    """
    with open(filepath, encoding='utf-8') as atf_file:
        lines = [line.strip(' \r\n') for line in atf_file if line.strip()]

    id_col = []
    for line in lines[2:]:
        raw_id = line.split('\t')[0]
        # match whole numbers, not single digits, so that multi-digit
        # numbers such as "12" are not split into "1.2"
        numberings = re.findall(r'[0-9]+', raw_id)
        # raw_id[:1] keeps the first character and tolerates an empty field
        id_col.append('.'.join([raw_id[:1]] + numberings))
    return id_col


def do_it_all(subcorpus_filepath, atf_converted_filepath):
    """Convert one subcorpus CoNLL file and write ``<text_number>.conll``.

    The IDs are taken from the given ATF-converted file; the remaining
    columns come from the subcorpus file. The result is written to the
    current working directory as a ``#new_text=<text_number>`` line followed
    by a tab-separated table with the columns ID, FORM, SEGM, XPOSTAG, HEAD,
    DEPREL and MISC.

    Args:
        subcorpus_filepath: Path of the subcorpus ``.conll`` file; its file
            name without the extension is used as the text number.
        atf_converted_filepath: Path of the matching ``.conll`` file
            produced by the ATF converter.

    Returns:
        None. When the two files do not have the same number of rows, a
        message is printed and nothing is written.
    """
    df = read_conll(subcorpus_filepath)
    # file name without directory and extension, e.g. "P216736"; unlike a
    # regex on "/" this also works for paths without a directory part
    text_number = os.path.splitext(os.path.basename(subcorpus_filepath))[0]

    full_id = extract_id(atf_converted_filepath)
    if len(full_id) != df.shape[0]:
        print(text_number + ': Rows of Subcorpus Data and ATF converted data do not match')
        return None

    # update the ID column
    df['ID'] = full_id
    # rename the WORD and MORPH column
    df = df.rename(columns={'WORD': 'FORM', 'MORPH': 'XPOSTAG'})
    # drop the EDGE and POS columns
    df = df.drop(columns=['EDGE', 'POS'])
    # add the DEPREL column, filled with the placeholder "_"
    df['DEPREL'] = '_'
    # reorder the columns
    df = df[['ID', 'FORM', 'SEGM', 'XPOSTAG', 'HEAD', 'DEPREL', 'MISC']]

    output_filepath = text_number + '.conll'
    # write the text header first, then append the table below it
    with open(output_filepath, 'w', encoding='utf-8') as out_file:
        out_file.write('#new_text=' + text_number + '\n')
    df.to_csv(output_filepath, header=True, index=False, sep='\t', mode='a')
    return None

## Example 1: P216736

In [11]:
# show how pd.read_fwf loads the raw subcorpus file for P216736
pd.read_fwf('./royal_subcorpus_data/P216736.conll')

Unnamed: 0,# global.columns = ID WORD SEGM POS MORPH HEAD EDGE MISC,Unnamed: 1
0,1\tda-da\tda-da[3]\tPN\tPN\t11\tERG\t_,
1,2\tensi2\tensi2[ruler]\tN\tN\t1\tappos\t_,
2,3\tszuruppak{ki}\tszuruppak{ki}[1]\tSN\tSN.GEN...,
3,4\tha-la-ad-da\tha-la-ad-da[1]\tPN\tPN\t1\tapp...,
4,5\tensi2\tensi2[ruler]\tN\tN\t1\tappos\t_,
5,6\tszuruppak{ki}\tszuruppak{ki}[1]\tSN\tSN.GEN...,
6,7\tdumu-ni\tdumu[child]\tN\tN.3-SG-H-POSS.ERG\...,
7,8\tad-us2\tad-us2[plank]\tN\tN.ABS\t11\tABS\t_,
8,9\tabul\tabul[gate]\tN\tN\t11\tLOC\t_,
9,10\t{d}sud3-da-ke4\t{d}sud3[1]\tDN\tDN.GEN.L3-...,


In [12]:
# convert P216736 with the two-argument do_it_all, taking the IDs from ./output/P216736.conll
do_it_all('./royal_subcorpus_data/P216736.conll', './output/P216736.conll')

In [6]:
# parse the P430122 subcorpus file and display the resulting DataFrame
d = read_conll('./royal_subcorpus_data/P430122.conll')
d

Unnamed: 0,ID,WORD,SEGM,POS,MORPH,HEAD,EDGE,MISC
0,1,{d}szu-{d}suen,{d}szu-{d}suen[1],RN,RN,0,root,_
1,2,lugal,lugal[king],N,N,1,appos,_
2,3,kal-ga,kalag[strong],V,NF.V.PT,2,amod,_
3,4,lugal,lugal[king],N,N,1,appos,_
4,5,uri5{ki}-ma,urim5{ki}[1],SN,SN.GEN,4,GEN,_
5,6,lugal,lugal[king],N,N,1,appos,_
6,7,an,_,_,_,6,_,_
7,8,ub-da,an-ub-da[quarter],N,N,6,GEN,_
8,9,limmu2-ba,limmu2[four],NU,NF.V.3-SG-NH-POSS.GEN.ABS,8,nummod,_
9,10,a-ha-am-wa-qar,a-ha-am-wa-qar[1],PN,PN,11,amod,_


In [13]:
# convert P430122; the message printed below reports a row-count mismatch with the ATF-converted file
do_it_all('./royal_subcorpus_data/P430122.conll', './atf_converted_data/P430122.conll')

P430122: Rows of Subcorpus Data and ATF converted data do not match


In [10]:
# convert P429949 using its ATF-converted counterpart
do_it_all('./royal_subcorpus_data/P429949.conll', './atf_converted_data/P429949.conll')