# Data Preprocessing

This jupyter notebook helps to preprocess the data so that it can be used by the flask app.

Input: Scientific publication

Steps:

1. Create directories
1. Manually add paper and delete the part which should not be annotated.
1. Automatically split up the paper into paragraphs of at least 200 words (never splitting in the middle of a sentence) and save the result as a JSON file.

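Output: one JSON file per paper (`<paper name>_ToBeAnnotated.json`) containing the paragraphs of roughly 200 words each, together with their tokens and character offsets.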

## 1. Create directories and add paper.

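Every batch of HITs lives in its own directory (the code below uses the name 'batch1_argumentComponents'). Inside this directory there is another directory called 'entire_paper', which holds the complete paper. The following cells create both directories in the current working directory.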

In [None]:
# import necessary modules
import os
import json
import glob
import ntpath
from nltk.tokenize import sent_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize import TreebankWordTokenizer

In [None]:
# Name of the batch directory; change this to start a new batch of HITs.
batch_directory_name = 'batch1_argumentComponents'
# All paths are built relative to the directory the notebook is executed in.
current_path = os.getcwd()
# <current working directory>/<batch directory>/entire_paper is where the complete paper has to be placed.
entire_paper_path = os.path.join(current_path, batch_directory_name, "entire_paper")

In [None]:
# Create the 'entire_paper' directory including all missing parent directories.
# The cell must stay re-runnable (Restart & Run All): an already existing directory is
# reported as such instead of being treated as a failure.
if os.path.isdir(entire_paper_path):
    print("The directory %s already exists" % entire_paper_path)
else:
    try:
        # exist_ok=True avoids an error if the directory appears between the check above and this call
        os.makedirs(entire_paper_path, exist_ok=True)
    except OSError as error:
        # e.g. missing permissions, or the path exists but is a regular file
        print("Creation of the directory %s failed: %s" % (entire_paper_path, error))
    else:
        print("Successfully created the directory %s " % entire_paper_path)


## 2. Specify which part of the paper should be annotated.
Add the entire paper as a txt file to the 'entire_paper' folder created above (e.g. batch1_argumentComponents/entire_paper).
Define which spans should not be annotated by the MTurk workers (for example: title, authors, abstract, keywords, references). Add the indices of the start and the end character of each span as a tuple to the 'excluded_spans' list.

In order to know the exact character indices, you can copy the text (always starting from the beginning of the paper up to the end of the span) and paste it into the 'to_be_excluded_span' string. The cell then prints the length of the pasted string, which corresponds to the end index of the span with regard to the entire paper (the index of the first character after the span, i.e. the end index is exclusive).

In [None]:
# Paste the text (from the very beginning of the paper) between the triple quotes.
to_be_excluded_span = """<paste text here to count characters>"""

# The length of the string is its number of characters; print it as one line without separators.
print("The pasted text span consistes of ", len(to_be_excluded_span), " characters.", sep="")

In [None]:
# Define which spans should be excluded from the txt file. For every span to be excluded, add a
# (start, end) tuple of character indices to the list. The spans are later used as slices of the
# paper text (text[start:end]), i.e. the end index is exclusive.
# Make sure that the spans are sorted, not overlapping and not out of bounds with regard to the entire paper length.

excluded_spans = [
    (0, 2739)
]
# Example of a further span that could be added to the list above:
#   (56252, 62243)


## 3. Automatically split up the paper into paragraphs of at least 200 words and save them as a JSON file.

In [None]:
def get_tokens_from_string(offset, text, paragraph_nr_of_characters=None):
    """Tokenize ``text`` and record the character span of every token.

    Args:
        offset: index of the first character of ``text`` with regard to the entire paper.
            It is added to every token's start and end index.
        text: the text to be tokenized.
        paragraph_nr_of_characters: not used by this function. It is kept (and is now
            optional) so that existing callers keep working.

    Returns:
        A dict which maps the token id (int, counted from 0 in the order of appearance)
        to a dict with the keys "id", "token", "start" and "end". "start" and "end" are
        ``offset`` plus the span of the token inside ``text``; the token text is
        ``text[start:end]`` of the local span, so "end" is exclusive.
    """
    # End indices of all sentences. A set is used because membership is tested once per token below.
    sentences_ends = {end for _, end in PunktSentenceTokenizer().span_tokenize(text)}

    # The word tokenizer can leave the full stop of a sentence attached to the last word
    # (original author's note: it "does not discard the punctuation"), so it is split off manually.
    token_spans = []
    for start, end in TreebankWordTokenizer().span_tokenize(text):
        ends_sentence_with_period = end in sentences_ends and text[end - 1] == "."
        if ends_sentence_with_period and (end - start > 1):
            # the token is longer than the full stop itself: one token for the word, one for the "."
            token_spans.append((start, end - 1))
            token_spans.append((end - 1, end))
        else:
            token_spans.append((start, end))

    # build one dict per token; the ids follow the order of the tokens in the text
    return {
        token_id: {
            "id": token_id,
            "token": text[start:end],
            "start": offset + start,
            "end": offset + end,
        }
        for token_id, (start, end) in enumerate(token_spans)
    }


def _get_spans_to_annotate(excluded_spans, text_length):
    """Return the (start, end) character spans which lie before, between and after the excluded spans."""
    spans_to_annotate = []
    end_index_previous = 0
    for start_index, end_index in excluded_spans:
        if end_index_previous < start_index:
            spans_to_annotate.append((end_index_previous, start_index))
        end_index_previous = end_index
    # everything after the last excluded span (or the whole text if nothing is excluded)
    if end_index_previous < text_length:
        spans_to_annotate.append((end_index_previous, text_length))
    return spans_to_annotate


def _build_paragraph_entry(text, paragraph_start, paragraph_end, nr_of_words, nr_of_sentences):
    """Create the dict describing the paragraph text[paragraph_start:paragraph_end], including its tokens."""
    nr_of_characters = paragraph_end - paragraph_start
    tokens = get_tokens_from_string(paragraph_start, text[paragraph_start:paragraph_end], nr_of_characters)
    return {
        "start": paragraph_start,
        "end": paragraph_end,
        "nr_of_characters": nr_of_characters,
        "nr_of_words": nr_of_words,
        "nr_of_tokens": len(tokens),
        "nr_of_sentences": nr_of_sentences,
        "tokens": tokens
    }


def split_text_in_paragraphs(entire_paper_name, text, excluded_spans, min_paragraph_size):
    """Split a text into paragraphs of at least ``min_paragraph_size`` words without splitting a sentence.

    Only the parts of ``text`` outside of ``excluded_spans`` are split. Sentences are collected
    until the paragraph contains at least ``min_paragraph_size`` words; the paragraph is then closed
    at the end of the current sentence. Sentences which are left over at the end of a
    not-excluded span form a last, shorter paragraph, so a paragraph never crosses an excluded span.

    Words are counted with str.split() (whitespace separated). Alternatively the nltk function
    word_tokenize() could be used, but it also returns punctuation as separate fragments and would
    therefore count more "words" than split().

    Args:
        entire_paper_name: name of the paper, stored in the result.
        text: the entire paper as a string.
        excluded_spans: list of (start, end) character index tuples which must not be annotated.
            They are expected to be sorted and not overlapping; they are used as slices, so the
            end index is exclusive.
        min_paragraph_size: minimum number of words of a paragraph.

    Returns:
        A dict with statistics about the entire paper ("entire_paper_name",
        "entire_paper_nr_of_sentences", "entire_paper_nr_of_words", "entire_paper_nr_of_characters",
        "entire_paper_nr_of_characters_excluded"), the given "excluded_spans" and "text_to_annotate",
        which maps "paragraph_<n>" to a dict with the keys "start", "end", "nr_of_characters",
        "nr_of_words", "nr_of_tokens", "nr_of_sentences" and "tokens". "start" and "end" are character
        indices with regard to the entire paper.
    """
    paragraphs = {
        "entire_paper_name": entire_paper_name,
        "entire_paper_nr_of_sentences": len(list(PunktSentenceTokenizer().span_tokenize(text))),
        "entire_paper_nr_of_words": len(text.split()),
        "entire_paper_nr_of_characters": len(text),
        "entire_paper_nr_of_characters_excluded": sum((end - start) for start, end in excluded_spans),
        "excluded_spans": excluded_spans,
        "text_to_annotate": {}
    }

    paragraph_counter = 0

    # loop through all not excluded text spans
    for span_start, span_end in _get_spans_to_annotate(excluded_spans, len(text)):
        # state of the paragraph which is currently being collected; it never crosses a span border
        paragraph_start = None
        paragraph_end = None
        paragraph_nr_of_words = 0
        paragraph_nr_of_sentences = 0

        # the sentences are detected inside the span only, their indices are shifted back to paper indices
        for local_start, local_end in PunktSentenceTokenizer().span_tokenize(text[span_start:span_end]):
            sentence_start = span_start + local_start
            sentence_end = span_start + local_end

            # set the start character of the paragraph if it is not set yet
            if paragraph_start is None:
                paragraph_start = sentence_start
            paragraph_end = sentence_end
            paragraph_nr_of_sentences += 1
            paragraph_nr_of_words += len(text[sentence_start:sentence_end].split())

            if paragraph_nr_of_words >= min_paragraph_size:
                # the paragraph is long enough: store it and start a new one with the next sentence
                paragraphs["text_to_annotate"]["paragraph_" + str(paragraph_counter)] = _build_paragraph_entry(
                    text, paragraph_start, paragraph_end, paragraph_nr_of_words, paragraph_nr_of_sentences)
                paragraph_counter += 1
                paragraph_start = None
                paragraph_end = None
                paragraph_nr_of_words = 0
                paragraph_nr_of_sentences = 0

        if paragraph_nr_of_words > 0:
            # one or more sentences are left because together they contain less than
            # min_paragraph_size words: add them as the last paragraph of this span
            paragraphs["text_to_annotate"]["paragraph_" + str(paragraph_counter)] = _build_paragraph_entry(
                text, paragraph_start, paragraph_end, paragraph_nr_of_words, paragraph_nr_of_sentences)
            paragraph_counter += 1

    return paragraphs


# Minimum number of words of a paragraph (a paragraph is closed at the end of the sentence with which this number is reached).
min_paragraph_size = 200

# get list of all txt files in new entire_paper_path
directory_content = glob.glob(os.path.join(entire_paper_path, '*.txt'))
if len(directory_content) == 0:
    print("There is no txt file in the directory %s . Please add the paper (in the txt format) to the directory, as expalined in section 2." % entire_paper_path)
else:
    # loop through all papers in the directory
    for paper_path in directory_content:
        paper_name = os.path.splitext(ntpath.basename(paper_path))[0]
        # Only keep the file open while reading it.
        # NOTE(review): no encoding is passed, so the platform default is used - confirm that the papers are stored in that encoding.
        with open(paper_path) as paper:
            text = paper.read()

        paragraphs = split_text_in_paragraphs(paper_name, text, excluded_spans, min_paragraph_size)

        # per-paragraph numbers, used for the statistics printed below
        paragraph_dicts = list(paragraphs["text_to_annotate"].values())
        nr_of_paragraphs = len(paragraph_dicts)
        characters_per_paragraph = [paragraph_dict["nr_of_characters"] for paragraph_dict in paragraph_dicts]
        words_per_paragraph = [paragraph_dict["nr_of_words"] for paragraph_dict in paragraph_dicts]
        sentences_per_paragraph = [paragraph_dict["nr_of_sentences"] for paragraph_dict in paragraph_dicts]

        # None if there is no paragraph at all
        shortes_paragraph_len = min(words_per_paragraph, default=None)
        longest_paragraph_len = max(words_per_paragraph, default=None)
        paragraph_with_least_sentences = min(sentences_per_paragraph, default=None)
        paragraph_with_most_sentences = max(sentences_per_paragraph, default=None)

        new_filename = paper_name + "_ToBeAnnotated" + ".json"
        new_filepath = os.path.join(current_path, batch_directory_name, new_filename)
        with open(new_filepath, 'w') as new_json_file:
            json.dump(paragraphs, new_json_file, indent=2)

        print("The text contains %s characters in total." % paragraphs["entire_paper_nr_of_characters"])
        print("The text contains %s words in total." % paragraphs["entire_paper_nr_of_words"])
        print("The text contains %s sentences in total." % paragraphs["entire_paper_nr_of_sentences"])
        print("The text was split up in %s paragraphs." % nr_of_paragraphs)
        if nr_of_paragraphs > 0:
            # The averages are computed from the paragraphs themselves (not from the totals of the
            # entire paper, which also include the excluded spans) and are always divided by the
            # number of paragraphs.
            print("Paragraphs contain %s characters on average." % (sum(characters_per_paragraph)/nr_of_paragraphs))
            print("Paragraphs contain %s words on average." % (sum(words_per_paragraph)/nr_of_paragraphs))
            print("Paragraphs contain %s sentences on average." % (sum(sentences_per_paragraph)/nr_of_paragraphs))
            print("The shortest paragraph contains %s words." % shortes_paragraph_len)
            print("The longest paragraph contains %s words." % longest_paragraph_len)
            print("The paragraph with the least sentences contains %s sentences." % paragraph_with_least_sentences)
            print("The paragraph with the moset sentences contains %s sentences." % paragraph_with_most_sentences)
        else:
            # avoids a division by zero, e.g. for an empty file or if the excluded spans cover the whole paper
            print("No paragraph statistics are available because there is no text left to annotate.")
        print("The preprocessing result containing all %s paragraphs was saved as a json file (filename=%s) in the folder %s." % (nr_of_paragraphs, new_filename, batch_directory_name))