In [1]:
import json
import os.path
import sys
import time
import timeit
import xml.etree.ElementTree as ET
from collections import Counter

import pandas as pd

In [2]:
# 1. Processing SweLL data for analysis

In [3]:
# Filter the data down to essays that contain at least one 'S-Msubj' edge.
#
# `result` maps essay id -> {'source': ..., 'target': ..., 'edges': [...]},
# where 'edges' holds only the edges whose labels include 'S-Msubj'.

data_src = "./SweLL_release_v1/SweLL_Gold/SweLL_Gold/swellData.json"
correction_data_src = "SweLL_correctionTags_502essays.json"

result = {}
# Read both files as UTF-8 explicitly: without `encoding`, open() falls back to
# the locale's preferred encoding, which is not UTF-8 on every platform.
with open(data_src, 'r', encoding='utf-8') as data, \
        open(correction_data_src, 'r', encoding='utf-8') as data2:
    json_data = json.load(data)
    json_data2 = json.load(data2)

# The files are no longer needed once parsed, so the filtering runs outside the
# `with` block.
for essay_id in json_data:
    # Each value in the correction file is itself a JSON-encoded string.
    corrections = json.loads(json_data2[essay_id])
    # keep only the edges where label 'S-Msubj' exists
    essay_corrs = [
        corr for corr in corrections["edges"].values()
        if "S-Msubj" in corr["labels"]
    ]
    if essay_corrs:
        result[essay_id] = {'source':corrections["source"], 'target':corrections["target"], 'edges':essay_corrs}

In [4]:
# define functions for constructing sentences 

# A token ends a sentence when its stripped text is a substring of this string
# (i.e. ".", "␤" or ".␤"; note that an empty/whitespace-only text also matches).
end_tokens = ".␤"

def get_target_sentence(essay_data, edge_id):
    """Build the target sentence that contains the token with id ``edge_id``.

    The sentence starts right after the closest preceding sentence terminator
    (or at the first token) and runs up to and including the next terminator
    (or the last token).

    Args:
        essay_data: dict whose 'target' entry is a list of token dicts, each
            providing at least the keys 'id' and 'text'.
        edge_id: id of the target token to look up.

    Returns:
        A tuple ``(sentence, token_text, start_index)`` where ``sentence`` is
        the concatenated token texts, ``token_text`` is the text of the token
        that was looked up, and ``start_index`` is the position of the
        sentence's first token in ``essay_data['target']``.

    Raises:
        ValueError: if no target token has the id ``edge_id``.
    """
    tokens = essay_data['target']

    def is_sentence_end(token):
        return end_tokens.find(token['text'].strip()) >= 0

    for token_index, item in enumerate(tokens):
        if item['id'] == edge_id:
            break
    else:
        raise ValueError(f"no target token with id {edge_id!r}")

    # Walk back to the sentence start. The explicit lower bound matters: without
    # it a negative index silently wraps around to the end of the essay.
    start = token_index
    while start > 0 and not is_sentence_end(tokens[start - 1]):
        start -= 1

    # Walk forward to the terminator (inclusive). Starting from the token itself
    # guarantees it is part of the sentence even when it is a terminator.
    end = token_index
    while end < len(tokens) - 1 and not is_sentence_end(tokens[end]):
        end += 1

    target_sentence = [item['text'] for item in tokens[start:end + 1]]
    return "".join(target_sentence), tokens[token_index]['text'], start

def get_source_sentence(essay_data, edge_id, start_token_id):
    """Build the source sentence around the source token ``s<start_token_id>``.

    The sentence starts right after the closest preceding sentence terminator
    (or at the first token) and runs up to and including the next terminator
    (or the last token). A token is a terminator when its stripped text is a
    substring of the module-level ``end_tokens``.

    Args:
        essay_data: dict whose 'source' entry is a list of token dicts, each
            providing at least the keys 'id' and 'text'.
        edge_id: unused; kept so existing callers keep working.
        start_token_id: number that is prefixed with 's' to form the id of the
            source token to start from.

    Returns:
        The concatenated token texts of the sentence, or ``None`` when no
        source token has the id ``s<start_token_id>``.
    """
    tokens = essay_data['source']
    wanted_id = f's{start_token_id}'

    def is_sentence_end(token):
        return end_tokens.find(token['text'].strip()) >= 0

    start_index = next(
        (pos for pos, item in enumerate(tokens) if item['id'] == wanted_id),
        None,
    )
    if start_index is None:
        return None

    # Extend backwards until the previous sentence's terminator.
    first = start_index
    while first > 0 and not is_sentence_end(tokens[first - 1]):
        first -= 1

    # Extend forwards up to and including the terminator. Starting at the
    # start token itself keeps it in the output even if it is a terminator.
    last = start_index
    while last < len(tokens) - 1 and not is_sentence_end(tokens[last]):
        last += 1

    return "".join(item['text'] for item in tokens[first:last + 1])


In [5]:
# Pair each corrected (target) sentence with its learner (source) sentence.
# NOTE: the pairing is approximate; sometimes the source sentence does not
# correspond to the actual source.

missing_tokens = []
target_source_pairs = []

for key, essay in result.items():
    for edge in essay['edges']:
        # the edge id without its "e-" prefix is used as the target token id
        token_ref = edge['id'].replace("e-", "")
        target_sentence, missing_token, start_token_id = get_target_sentence(essay, token_ref)
        missing_tokens.append(missing_token)
        source_sentence = get_source_sentence(essay, token_ref, start_token_id)
        pair = (target_sentence, source_sentence, missing_token.strip(), key)
        target_source_pairs.append(pair)

In [None]:
# Analyse how/when/why S-Msubj errors occur, i.e. when the subject is missing.

# Frequency table of the tokens that were missing, most frequent first
# (ties keep the order in which the tokens were first seen).
token_counts = sorted(
    Counter(missing_tokens).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
token_df = pd.DataFrame(data=token_counts, columns=['token', 'count'])
token_df

In [7]:
# define function for printing a table of target-source pairs, 
# optional parameter to view sentences based on given token

def show_sent_pairs(missing_token=None):
    """Return the target/source sentence pairs as a DataFrame.

    Reads the module-level ``target_source_pairs`` list, whose entries are
    ``(target_sentence, source_sentence, missing_token, essay_key)`` tuples.

    As a side effect the pandas display options are changed so that all rows,
    all columns and the full cell contents are shown.

    Args:
        missing_token: when given, only pairs whose missing token equals this
            value are included; ``None`` includes every pair.

    Returns:
        A DataFrame with the columns 'target', 'source' and 'key'.
    """
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option('display.max_colwidth', None)

    selected = [
        pair for pair in target_source_pairs
        if missing_token is None or pair[2] == missing_token
    ]

    # The previous `df.style.set_properties(...)` call was dropped: its result
    # was discarded, so it never affected the returned frame. For left-aligned
    # output, style the return value instead:
    #   show_sent_pairs(...).style.set_properties(**{'text-align': 'left'})
    return pd.DataFrame(data={
        'target': [pair[0] for pair in selected],
        'source': [pair[1] for pair in selected],
        'key': [pair[3] for pair in selected],
    })

In [None]:
# Show only the pairs where the missing token is 'jag'.
show_sent_pairs(missing_token='jag')

In [9]:
# 2. Sparv-tagging
# modified from the given script

In [10]:
import json

# Base URL of the Sparv 2 web API.
sparv_url = "https://ws.spraakbanken.gu.se/ws/sparv/v2/"

# Analysis settings sent along with every request, serialised as a JSON string.
# Requested here: dependency attributes (ref, dephead, deprel), lexical
# attributes (pos, msd, lemma) and readability metrics (lix, ovix, nk).
# See https://ws.spraakbanken.gu.se/ws/sparv/v2/#settings for all options.
sparv_settings = json.dumps(dict(
    corpus="Korpusnamn",
    lang="sv",
    textmode="plain",
    positional_attributes=dict(
        dependency_attributes=["ref", "dephead", "deprel"],
        lexical_attributes=["pos", "msd", "lemma"],
    ),
    text_attributes=dict(
        readability_metrics=["lix", "ovix", "nk"],
    ),
))

# Request parameter template; the request helpers build their own copy with
# the actual text filled in.
query_parameters = dict(text=None, settings=sparv_settings)


#-------------------------------------------------------------------
# Example using Python's built-in urllib.request
# https://docs.python.org/3/library/urllib.request.html
#-------------------------------------------------------------------

import urllib.request
import urllib.parse

def sparv_urllib_req(text):
    """Send ``text`` to the Sparv API using urllib and return the response body.

    The text and the module-level ``sparv_settings`` are URL-encoded and passed
    as request data to ``sparv_url``.

    Args:
        text: the text to annotate.

    Returns:
        The response body decoded as UTF-8.
    """
    query_parameters = {"text": text, "settings": sparv_settings}
    enc_query = urllib.parse.urlencode(query_parameters).encode("UTF-8")
    # Use the response as a context manager so the connection is closed even
    # if reading or decoding fails.
    with urllib.request.urlopen(sparv_url, data=enc_query) as resp:
        return resp.read().decode("UTF-8")


#-------------------------------------------------------------------
# Example using the more intuitive but third-party Requests library
# https://docs.python-requests.org/en/master/
#-------------------------------------------------------------------

import requests

def sparv_req(text, timeout=None):
    """Send ``text`` to the Sparv API using Requests and return the response body.

    The text and the module-level ``sparv_settings`` are sent as query
    parameters of a GET request to ``sparv_url``.

    Args:
        text: the text to annotate.
        timeout: optional timeout in seconds passed on to ``requests.get``;
            the default ``None`` waits indefinitely, as before.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    query_parameters = {"text": text, "settings": sparv_settings}
    response = requests.get(sparv_url, params=query_parameters, timeout=timeout)
    # Fail here on 4xx/5xx responses instead of returning an error page that
    # would later be treated as an annotation result.
    response.raise_for_status()
    return response.text

In [11]:
# define function for parsing xml response

def get_sentence_pos(tagged_sentence):
    """Extract the 'deprel' and 'msd' attributes from an XML response string.

    Args:
        tagged_sentence: XML document as a string.

    Returns:
        A list with one ``{'deprel': ..., 'msd': ...}`` dict per matched
        element, in document order.
    """
    document = ET.fromstring(tagged_sentence)
    # NOTE(review): the trailing "/" appears to make ElementTree select every
    # child of each sentence element (like "/*") - confirm.
    annotations = []
    for word in document.findall("./corpus/text/paragraph/sentence/"):
        attributes = word.attrib
        annotations.append({'deprel': attributes['deprel'], 'msd': attributes['msd']})
    return annotations

In [160]:
# 3. Corrupt data - final script in corrupt.py

In [None]:
# Load the sentences that will be corrupted: every child of the XML root is
# treated as a sentence, and the texts of its children are joined with spaces.
tree = ET.parse('leo_coctaill.xml')
root = tree.getroot()
clean_data = []
for sent in root:
    clean_data.append(" ".join(word.text for word in sent))

In [None]:
def find_orig_sent(modified_sent, sentences=None):
    """Print the first sentence that contains ``modified_sent``.

    The comparison is case-insensitive. The index and the text of the first
    match are printed; nothing is printed when there is no match.

    Args:
        modified_sent: the (sub)string to search for.
        sentences: the sentences to search; defaults to the notebook's
            ``clean_data`` list. (The previous version read an undefined name
            ``clean_sentences``.)
    """
    if sentences is None:
        sentences = clean_data
    needle = modified_sent.lower()
    for i, sent in enumerate(sentences):
        if needle in sent.lower():
            print(i, sent)
            break

In [None]:
subj_tags = ['SS', 'SP', 'ES', 'FS'] # labels from https://cl.lingfil.uu.se/~nivre/swedish_treebank/dep.html
corrupt_sentences = [] # NOTE: not all sentences will necessarily be corrupt 

start_time = timeit.default_timer()
start_i = 0

# Resume support: every output line is "<sentence index>\t<corrupted sentence>",
# so processing continues after the index found on the last line.
# NOTE(review): the processing loop sits inside this `if`, so nothing runs
# unless 'corrupted_extra.txt' already exists (an empty file is enough).
if os.path.isfile('corrupted_extra.txt'):
    with open('corrupted_extra.txt', 'r') as f:
        file_lines = f.readlines()
        if len(file_lines) > 0:
            start_i = int(file_lines[-1].split("\t")[0]) + 1

    with open('corrupted_extra.txt', 'a+') as corrupted_data:
        for i, sentence in enumerate(clean_data[start_i:], start_i):
            elapsed = timeit.default_timer() - start_time

            print(f"\rprocessing sentence {i} / {len(clean_data)} time elapsed: {elapsed}", end="")

            try:
                tagged = sparv_req(sentence)
                prevelapsed = elapsed
            except Exception:
                # Catch Exception rather than everything, so that Ctrl-C
                # (KeyboardInterrupt) can still stop the run. The sentence is
                # skipped after a pause; it is not retried.
                print("Unexpected error:", sys.exc_info()[0])
                time.sleep(60)
                continue
            tagged_dict = get_sentence_pos(tagged)

            # NOTE: only one corruption is applied per sentence

            # find subjects in sentence (tokens whose msd contains 'SUB')
            sent_subjs = [pos for pos, item in enumerate(tagged_dict) if 'SUB' in item['msd']]
            sentence_split = sentence.split()
            # NOTE(review): positions come from the tagger output but are
            # applied to the whitespace split; the bounds checks below only
            # guard against the two having different lengths - confirm that
            # the tokenisations otherwise line up.

            # case: subject has already been mentioned
            # NOTE: not sure if this is the best way to check if subjects are same
            if (len(sent_subjs) == 2
                    and sent_subjs[1] < len(sentence_split)
                    and sentence_split[sent_subjs[0]].lower() == sentence_split[sent_subjs[1]].lower()):
                del sentence_split[sent_subjs[1]] # remove the second occurrence of the subject
                corrupted_data.write(f'{i}\t{" ".join(sentence_split)}\n')
                continue

            # case: vad som
            # simply check for "vad som" - deprel label of som seems to be Unclassifiable grammatical function
            if "vad som" in sentence:
                # Pick the "som" that directly follows "vad", so that an
                # earlier "som" (e.g. in "Människor som gör vad som helst") is
                # left alone. A dedicated loop variable is used here: reusing
                # `i` overwrote the sentence index written to the output file.
                som_index = next(
                    (pos for pos in range(1, len(sentence_split))
                     if sentence_split[pos] == "som" and sentence_split[pos - 1].lower() == "vad"),
                    None,
                )
                if som_index is not None:
                    del sentence_split[som_index]
                    corrupted_data.write(f'{i}\t{" ".join(sentence_split)}\n')
                    continue

            # case: drop pronoun subject
            # SS(PN) MS .... -> MS ....
            for sub in sent_subjs:
                if ('PN' in tagged_dict[sub]['msd']
                        and (sub + 1) < len(tagged_dict)
                        and 'VB' in tagged_dict[sub + 1]['msd']
                        and sub < len(sentence_split)):
                    del sentence_split[sub]
                    # capitalise new first token; only the first letter is
                    # changed (str.capitalize() would lowercase the rest)
                    if sub == 0 and sentence_split:
                        sentence_split[0] = sentence_split[0][:1].upper() + sentence_split[0][1:]
                    corrupted_data.write(f'{i}\t{" ".join(sentence_split)}\n')
                    break