In [1]:
import os 
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
#os.getcwd()

## Parsing SweLL-gold corpus 

In [2]:
def parse_xml(doc):
    """Extract sentences and correction labels from a SweLL XML file.

    Parameters
    ----------
    doc : str or file object
        Path (or open file) handed to ``xml.etree.ElementTree.parse``.

    Returns
    -------
    sentences : list
        One entry per ``<sentence>`` element, in document order. A sentence
        with at least one labelled token is stored as a single-item dict
        ``{sentence_text: [labels, ...]}``; a sentence without labels is
        stored as the plain ``sentence_text`` string.
    label : list of str
        Every ``correction_label`` value found, flattened across sentences.

    Notes
    -----
    ``sentence_text`` is built from the kept tokens, each followed by one
    space (so it ends with a space). Tokens whose text contains ``'␤'`` or
    ``'BT'`` are left out of the text, but their labels are still recorded.
    """
    import xml.etree.ElementTree as ET

    root = ET.parse(doc).getroot()
    sentences = []  # per-sentence entries (dict or str, see docstring)
    label = []      # all error correction labels, flattened

    for sentence_node in root.iter('sentence'):
        kept_words = []
        label_list = []
        for token in sentence_node.iter('w'):
            corr_label = token.get('correction_label')
            if corr_label:
                label.append(corr_label)
                label_list.append(corr_label)

            # An empty <w/> element has text None; the substring checks
            # below would raise TypeError on it, so skip such tokens.
            word = token.text
            if word is None:
                continue
            if '␤' not in word and 'BT' not in word:
                kept_words.append(word)

        sentence = ''.join(word + ' ' for word in kept_words)
        if label_list:
            sentences.append({sentence: label_list})
        else:
            sentences.append(sentence)

    return sentences, label

In [3]:
# Original side of the corpus: sentences that contain errors
src_file = './SweLL_release_v1/SweLL_release_v1/SweLL_Gold/SweLL_Gold/swellOriginal/sourceSweLL.xml'
# Normalized side of the corpus: grammatically clean sentences
tgt_file = './SweLL_release_v1/SweLL_release_v1/SweLL_Gold/SweLL_Gold/swellTarget/targetSweLL.xml'

# Parse source first, then target; each call yields a (sentences, labels) pair
(src_sents, src_label), (tgt_sents, tgt_label) = (
    parse_xml(path) for path in (src_file, tgt_file)
)

In [4]:
# Expected number of parsed entries per side (source, target); the two sides differ in size
assert (len(src_sents), len(tgt_sents)) == (7807, 8137)

In [5]:
def sentence_label_division(doc):
    """Split parsed corpus entries into a sentence list and a label list.

    Parameters
    ----------
    doc : iterable
        Entries in the shape produced by ``parse_xml``: either a dict mapping
        a sentence to its list of labels, or a plain sentence string.

    Returns
    -------
    sent : list
        Sentences taken from the dict entries first (in input order),
        followed by the non-dict entries (in input order).
    lab : list
        Label lists of the dict entries only. ``lab[k]`` belongs to
        ``sent[k]``; ``lab`` is shorter than ``sent`` whenever ``doc``
        contains non-dict entries.
    """
    labelled_sents = []
    lab = []
    clean = []

    for entry in doc:
        # isinstance (rather than an exact type comparison) also accepts
        # dict subclasses such as OrderedDict.
        if isinstance(entry, dict):
            # Only the first key/value pair of a dict entry is used.
            sentence, tags = list(entry.items())[0]
            labelled_sents.append(sentence)
            lab.append(tags)
        else:
            clean.append(entry)

    return labelled_sents + clean, lab

In [6]:
# Start the parallel corpus as a data frame holding the target side:
# one column with the sentence text, one with its list of labels.
# tgt_lab only covers the labelled sentences, which come first in tgt_sent.
tgt_sent, tgt_lab = sentence_label_division(tgt_sents)
data = pd.DataFrame([tgt_sent, tgt_lab], index=['tgt', 'tgt_tag']).T

src_sent, src_lab = sentence_label_division(src_sents)
target = data['tgt'].tolist()

## Size mismatch between erroneous sentences and clean sentences
Because the original (source) and normalized (target) files contain different numbers of sentences, the two sides cannot simply be paired by position. For each target sentence we therefore compare it with every source sentence, count the whitespace-separated tokens that are identical at the same position, and choose the source sentence with the highest count.

In [7]:
def match_string(s1, s2):
    """Count the positions at which two sequences hold equal elements.

    The sequences are walked in parallel with ``zip``, so the comparison
    stops at the end of the shorter one.
    """
    return sum(1 for left, right in zip(s1, s2) if left == right)

In [8]:
# For every target sentence, pick the source sentence with the most tokens
# that are identical at the same position (first best candidate wins on ties).

# Tokenize each source sentence once up front instead of re-splitting it
# for every target sentence inside the nested loop.
src_tokens = [sentence.split() for sentence in src_sent]

source = []
for tgt_sentence in target:
    tgt_tokens = tgt_sentence.split()  # loop-invariant for the inner loop
    best_count = 0
    # NOTE(review): if no source sentence shares a token with the target,
    # best_index stays -1 and the LAST source sentence is selected below.
    # Confirm this fallback is intended.
    best_index = -1
    for j, candidate_tokens in enumerate(src_tokens):
        count = match_string(tgt_tokens, candidate_tokens)
        if count > best_count:
            best_count = count
            best_index = j
    source.append(src_sent[best_index])

In [None]:
# Add the matched source sentences next to their targets (src is appended as the last column)
data = data.assign(src=source, tgt=target)

In [None]:
# Preview the first rows of the frame
data.head(5)

In [None]:
# Keep only the rows that carry correction labels.
# notna() treats both None and NaN as missing, so no prior fillna is needed;
# an in-place fillna through attribute access is a chained assignment and
# is not guaranteed to modify `data`.
data = data[data['tgt_tag'].notna()]

In [None]:
# Create the output folder first so the export does not fail when it is missing
os.makedirs('./term2/data', exist_ok=True)
data.to_csv('./term2/data/swell_parallel_corpus.csv', index=False)

In [None]:
# Sentence-length statistics (in whitespace-separated tokens) for the SweLL gold corpus

src_len = list(map(lambda sent: len(sent.split()), data['src']))
tgt_len = list(map(lambda sent: len(sent.split()), data['tgt']))

# (message template, value) pairs, printed in this order
length_report = (
    ('src text minimum length : {}', np.min(src_len)),
    ('src text maximum length : {}', np.max(src_len)),
    ('src text average length : {}', np.mean(src_len)),
    ('tgt text minimum length : {}', np.min(tgt_len)),
    ('tgt text maximum length : {}', np.max(tgt_len)),
    ('tgt text average length: {}', np.mean(tgt_len)),
)
for template, value in length_report:
    print(template.format(value))

In [None]:
# train/validation/test ratio 70%/10%/20%
# A fixed random_state makes the shuffle (and therefore the split) reproducible
# across kernel restarts.
shuffled = data.sample(frac=1, random_state=42)
train_end = int(.7 * len(shuffled))
valid_end = int(.8 * len(shuffled))

# Positional slicing instead of np.split: np.split on a DataFrame goes through
# DataFrame.swapaxes, which is deprecated in recent pandas releases.
# .copy() gives each split its own data so later column edits stay local.
train = shuffled.iloc[:train_end].copy()
valid = shuffled.iloc[train_end:valid_end].copy()
test = shuffled.iloc[valid_end:].copy()

In [None]:
# Remove the label column from each split, in place
for split_frame in (test, train, valid):
    del split_frame['tgt_tag']

In [None]:
# save files for model training

# Create the output folder first so the export does not fail when it is missing
os.makedirs('term2/data', exist_ok=True)

# Select the columns by name. A `header=[...]` list only relabels columns by
# position, and the frame stores 'tgt' before 'src', so positional relabelling
# would write the clean sentences under 'src' and the erroneous ones under 'tgt'.
for split_frame, path in (
    (train, 'term2/data/train.tsv'),
    (valid, 'term2/data/valid.tsv'),
    (test, 'term2/data/test.tsv'),
):
    split_frame.to_csv(path, sep='\t', index=False, columns=['src', 'tgt'])