In [190]:
import json
import glob
import pandas as pd
import re
import numpy as np
import ast
import os
from os import listdir
from os.path import isfile, join
import matplotlib
import matplotlib.pyplot as plt
from sentence_splitter import SentenceSplitter, split_text_into_sentences
from transformers import PegasusForConditionalGeneration, PegasusTokenizerFast, PegasusTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import textwrap
from PIL import Image, ImageDraw, ImageFont
from imgaug import augmenters as iaa
import imgaug as ia
import cv2

In [2]:
# The function below will sort the files in numberical order 
numbers = re.compile(r'(\d+)')
def numericalSort(value):
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts

In [3]:
etd_path = sorted(glob.glob('/home/mchou001/etds_json/*.json'), key = numericalSort)

In [33]:
class Preprocessor:
    def __init__(self, json_files):
        self.json_files = json_files
        #self.dedication_ = dedication_
        #self.acknowledgement_ = acknowledgement_
        #self.general_abs = general_abs
    
    ## the function will return parsed lines from json files
    def json_text_parser(self):
        counter = 0
        list_line = []
        for filename in self.json_files:
            with open(filename, encoding='utf-8', mode='r') as files:
                data = json.loads(files.read())
                counter = counter + 1
                _text = []
                for i in range(len(data)):
                    text = data[i].get("Line_and_BB")
                    _text.append(text)
                for i, data in enumerate(_text):
                    line_text = [text['Line'].strip('\n') for text in data]
                    list_line.append(line_text)
        
        return list_line
    
    ## the function will return parsed bounding box information from json files
    def json_bbox_parser(self):
        counter = 0
        list_bbox = []
        for filename in self.json_files:
            with open(filename, encoding='utf-8', mode='r') as files:
                data = json.loads(files.read())
                counter = counter + 1
                _text = []
                for i in range(len(data)):
                    text = data[i].get("Line_and_BB")
                    _text.append(text)
                for i, data in enumerate(_text):
                    bbox_text = [bbox['Bounding Box'] for bbox in data]
                    list_bbox.append(bbox_text)
        
        return list_bbox
    
    '''
    We are using 'ast' module for preprocessing task of the class labels.
    The ast module helps Python applications to process trees of the Python abstract syntax grammar.
    We utilized the ast module to grammatically identify the sentences and applied 'sentence splitter' module 
    on paragraphs to get a list of sentences.
    The preprocessing functions for each classes returns a list.
    '''
    
    def preprocess_dedication(self, dedication_):
        self.dedication_ = dedication_
        text_dedication = ast.literal_eval(self.dedication_)
        text_dedication = (" ").join(text_dedication)
        splitter = SentenceSplitter(language='en')
        dedication_list = splitter.split(text_dedication)
        return dedication_list    
    
    def preprocess_ack(self, acknowledgement_):
        self.acknowledgement_ = acknowledgement_
        text_ack = ast.literal_eval(self.acknowledgement_)
        text_ack = (" ").join(text_ack)
        splitter = SentenceSplitter(language='en')
        ack_list = splitter.split(text_ack)
        return ack_list
    
    def preprocess_gabs(self, general_abs):
        self.general_abs = general_abs
        splitter = SentenceSplitter(language='en')
        gabs_list = splitter.split(self.general_abs)
        return gabs_list
        

In [21]:
## loading the paraphrasing model

#references: https://arxiv.org/abs/1912.08777

model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

In [185]:
class Augmentation:
    
    def get_paraphrased_sentences(self, input_text, num_return_sequences):
        self.input_text = input_text
        self.num_return_sequences = num_return_sequences        
        batch = tokenizer.prepare_seq2seq_batch([self.input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
        translated = model.generate(**batch,max_length=60,num_beams=10, num_return_sequences=self.num_return_sequences, temperature=1.5)
        paraphrased_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        return paraphrased_text       
    
    def paraphrased_text(self, class_text_list):
        self.class_text_list = class_text_list
        phrases_text = []
        for ele in self.class_text_list:
            phrase = self.get_paraphrased_sentences(ele, 1)
            phrases_text.append(phrase)
        
        para_phrases = [' '.join(x) for x in phrases_text]
        paraphrase_ = [' '.join(x for x in para_phrases)] ## combine  the splitted lists into a paragraph
        paraphrase_text = str(paraphrase_).strip('[]').strip("'")
        paraphrase_strip = paraphrase_text.strip('""')
        return paraphrase_strip
    
    def wrap_text(self, text):
        self.text = text
        new_phrase = textwrap.wrap(self.text, width=90)
        string = ''
        for ele in new_phrase[0:]:
            string = string + ele + '\n'
        return string       

In [186]:
file = 'Label-Dedication/03.png'
W,H = (2360, 3200)
def text_on_img(text, size):
    font = ImageFont.truetype('NimbusMonoPS-Bold.otf', size)
    image = Image.new(mode = "RGB", size = (W, H), color = "white")
    draw = ImageDraw.Draw(image)
    w, h = draw.textsize(text)
    ## Adjust the texual position in a page and draw the text on a image
    draw.text(((W-w)/10,(H-h)/6), text, font=font, fill=(0,0,0), spacing=60) 
    image.save(file)

    return image

In [194]:
if __name__ == "__main__":
    
    parser = Preprocessor(etd_path)
    
    etd_lines = parser.json_text_parser()
    etd_bbox = parser.json_bbox_parser()
    labels_ = pd.read_csv("labels.csv", encoding = 'utf-8')
    etd_label = labels_['labels']
    res_list = [list(item) for item in list(zip(etd_lines, etd_bbox, etd_label))]
    dataframe = pd.DataFrame(res_list, columns = ['text', 'bbox', 'class'])
    dataframe.to_csv('ETD_aug.csv', index = False)
    df1 = pd.read_csv('ETD_aug.csv')
    
    df1.set_index("class", inplace = True)
    
    augmentation = Augmentation()
    
    ## Dedication ###
    label_dedication = df1.loc["Label-Dedication"]
    phrases_dedication = label_dedication['text']
    dedication = parser.preprocess_dedication(phrases_dedication[2])  ## change the index values which corresponds to different ETD samples
    dedication_paraphrase = augmentation.paraphrased_text(dedication)
    dedication_text_wrap = augmentation.wrap_text(dedication_paraphrase)
    dedication_title = "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tDedication\n" + dedication_text_wrap
    
    ## Acknowledgement ##
    label_ack = df1.loc["Label-Acknowledgement"]
    phrases_ack = label_ack['text']
    ack = parser.preprocess_ack(phrases_ack[0]) ## change the index values which corresponds to different ETD samples
    ack_paraphrase = augmentation.paraphrased_text(ack)
    ack_text_wrap = augmentation.wrap_text(ack_paraphrase)
    ack_title = "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tACKNOWLEDGEMENTS\n" + ack_text_wrap
    #print(ack_title)
       
    ## General Abstract ##
    label_gabs = df1.loc["Label-GeneralAbstract"]
    phrase_gabs = label_gabs['text']
    text0 = ast.literal_eval(phrase_gabs[0])
    text1 = ast.literal_eval(phrase_gabs[1])
    text2 = ast.literal_eval(phrase_gabs[2])
    text3 = ast.literal_eval(phrase_gabs[3])
    phrases_gabs = (" ").join(text0) + (" ").join(text1) + (" ").join(text2) + (" ").join(text3)
    gabs = parser.preprocess_gabs(phrases_gabs)
    gabs_paraphrase = augmentation.paraphrased_text(gabs)
    gabs_text_wrap = augmentation.wrap_text(gabs_paraphrase)
    gabs_title = "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tSUMMARY\n" + gabs_text_wrap
    text_on_img(dedication_title,43)
    
    '''For now we need to change the directory '''
    ##image augmentation
    seq = iaa.Sequential([
        iaa.Affine(rotate=(-5, 5)),
        iaa.AdditiveGaussianNoise(scale=(10, 60)),
        iaa.SaltAndPepper(p=0.1),
        iaa.GaussianBlur(sigma=0.5),
        iaa.LinearContrast(alpha=1),
        iaa.PerspectiveTransform(scale=0.025, keep_size=True)
    ], random_order = True)
    
    img_path_dedication = sorted(glob.glob('/home/mchou001/Label-Dedication/*.png'), key = numericalSort)
    for n, images in enumerate(img_path_dedication[0:3]):
        aug_image = seq(image=cv2.imread(images))
        cv2.imwrite('/home/mchou001/Label-Dedication/aug_images/aug{}.png'.format(n),aug_image)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

