In [176]:
import os
import json

In [177]:
def read_metadata(path):
    """Load the JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly (the standard JSON encoding, and
    the same encoding ``read_text`` uses) so the result does not depend on the
    platform's default locale encoding.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON value, or ``None`` when the file does not exist or
        does not contain valid JSON. In both failure cases a message is
        printed instead of raising.
    """
    try:
        with open(path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"File not found: {path}")
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {path}")
    # Best-effort contract: callers must check for None.
    return None

In [178]:
def read_text(path):
    """Read the whole UTF-8 text file at ``path``.

    Args:
        path: Location of the text file.

    Returns:
        The file content as a single string, or ``None`` when the file does
        not exist (a message is printed in that case).

    Note:
        Only a missing file is handled here. Reading plain text never parses
        JSON, so there is no JSON decoding error to catch; any other error
        (for example invalid UTF-8) propagates to the caller.
    """
    try:
        with open(path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        print(f"File not found: {path}")
    # Best-effort contract: callers must check for None.
    return None

In [179]:
def swap_adjacent_paragraphs(text):
    """Build one two-paragraph text per adjacent paragraph pair, order swapped.

    ``text`` is split on ``'\\n'``; for every neighbouring pair
    ``(current, following)`` the result contains ``following + '\\n' + current``.

    Args:
        text: Newline-separated paragraphs.

    Returns:
        A list with ``len(paragraphs) - 1`` strings (empty when there is only
        one paragraph).
    """
    lines = text.split('\n')
    # zip() against the list shifted by one walks every adjacent pair.
    return [
        following + '\n' + current
        for current, following in zip(lines, lines[1:])
    ]

In [180]:
def jump_same_author_paragraphs(text, metadata):
    """Pair non-adjacent paragraphs joined by an unbroken run of zero changes.

    For every index ``start`` with ``changes[start] == 0``, the run of
    consecutive zeros that follows is scanned; each further zero at ``end``
    yields ``paragraphs[start] + '\\n' + paragraphs[end + 1]``.

    Args:
        text: Newline-separated paragraphs.
        metadata: Mapping with a ``'changes'`` sequence. Presumably
            ``changes[k] == 0`` means paragraphs ``k`` and ``k + 1`` share an
            author -- confirm against the dataset description.

    Returns:
        A list of two-paragraph strings; empty when ``changes`` has at most
        one entry.
    """
    paragraphs = text.split('\n')
    flags = metadata['changes']
    total = len(flags)

    if total <= 1:
        return []

    collected = []
    for start in range(total):
        if flags[start] != 0:
            continue
        for end in range(start + 1, total):
            # The first non-zero flag ends the same-author run.
            if flags[end] != 0:
                break
            collected.append(paragraphs[start] + '\n' + paragraphs[end + 1])

    return collected

In [190]:
def combine_all_different_author_paragraphs(text, metadata):
    """Pair every two non-adjacent paragraphs when each has its own author.

    Pairs are only produced when there are more than two paragraphs and
    ``metadata['authors']`` is at least the paragraph count.

    Args:
        text: Newline-separated paragraphs.
        metadata: Mapping with an ``'authors'`` value that is compared
            numerically with the paragraph count (presumably the number of
            distinct authors -- confirm against the dataset description).

    Returns:
        A list of ``paragraphs[i] + '\\n' + paragraphs[j]`` for all
        ``j >= i + 2``, or an empty list when the preconditions fail.
    """
    paragraphs = text.split('\n')
    author_count = metadata['authors']
    total = len(paragraphs)

    if total <= 2:
        return []
    if author_count < total:
        return []

    # Starting at first + 2 skips the immediate neighbour.
    return [
        paragraphs[first] + '\n' + paragraphs[second]
        for first in range(total)
        for second in range(first + 2, total)
    ]

In [182]:
# Source directories holding the original training problems, one per difficulty.
EASY_DIR = './data/easy/train/'
MEDIUM_DIR = './data/medium/train/'
HARD_DIR = './data/hard/train/'

# Kept in the same order as AUGMENTED_DIRS: the two lists are zipped together.
DIFFICULTY_DIRS = [
    EASY_DIR,
    MEDIUM_DIR,
    HARD_DIR,
]

# Output directories for the generated files, one per difficulty.
AUGMENTED_DIRS = [
    './augmented/easy/train/',
    './augmented/medium/train/',
    './augmented/hard/train/',
]

In [183]:
# Next numeric id for generated problem files. Held in a one-element list so
# create_new_file can bump it in place without a `global` declaration.
new_file_counter = [10_000]

In [184]:
def create_new_file(file_dir, text, metadata):
    """Write one generated problem and its truth file into ``file_dir``.

    The files are named ``problem-<n>.txt`` and ``truth-problem-<n>.json``,
    where ``<n>`` is the current value of ``new_file_counter[0]``. The counter
    is incremented before the files are written, so an id is never reused even
    if a write fails.

    Args:
        file_dir: Existing directory to write into. A trailing path separator
            is optional because the paths are built with ``os.path.join``.
        text: Content of the ``.txt`` file (written as UTF-8).
        metadata: JSON-serialisable object for the truth file.
    """
    file_id = new_file_counter[0]
    new_file_counter[0] += 1

    # os.path.join leaves a directory that already ends in a separator
    # untouched, so existing callers get exactly the same paths as before.
    text_file_path = os.path.join(file_dir, f'problem-{file_id}.txt')
    metadata_file_path = os.path.join(file_dir, f'truth-problem-{file_id}.json')

    with open(text_file_path, 'w', encoding='utf-8') as text_file:
        text_file.write(text)

    with open(metadata_file_path, 'w') as metadata_file:
        json.dump(metadata, metadata_file, indent=2)

In [185]:
def create_json_metadata(authors, changes):
    """Bundle ``authors`` and ``changes`` into a truth-file dictionary.

    Args:
        authors: Value stored under the ``'authors'`` key.
        changes: Value stored under the ``'changes'`` key (not copied).

    Returns:
        A new dict with exactly the keys ``'authors'`` and ``'changes'``.
    """
    metadata = dict(authors=authors, changes=changes)
    return metadata

In [188]:
def perform_data_augmentation(max_problems_per_dir=6):
    """Generate augmented two-paragraph problems for every difficulty level.

    For each ``(original_dir, augmented_dir)`` pair taken from
    ``DIFFICULTY_DIRS`` and ``AUGMENTED_DIRS``, every selected problem is
    expanded into three kinds of new files written with ``create_new_file``:

    * adjacent paragraphs in swapped order, labelled with the original change
      flag for that pair;
    * non-adjacent paragraphs from ``jump_same_author_paragraphs``, labelled
      as one author / no change;
    * non-adjacent paragraphs from
      ``combine_all_different_author_paragraphs``, labelled as two authors /
      one change.

    Args:
        max_problems_per_dir: Upper bound on the number of problems processed
            per directory. The default of 6 keeps the quick trial-run
            behaviour; pass ``None`` to process every problem (the full run).

    Notes:
        * Problem files are the directory entries whose name starts with
          ``'p'``, processed in sorted order so runs are reproducible.
        * The truth file is looked up by name as ``truth-<problem stem>.json``
          (the same convention ``create_new_file`` writes) rather than by list
          position, because ``os.listdir`` gives no ordering guarantee.
        * A problem is skipped when either file cannot be read, or when the
          number of adjacent paragraph pairs differs from the number of change
          flags.
    """
    for original_dir, augmented_dir in zip(DIFFICULTY_DIRS, AUGMENTED_DIRS):
        problems = sorted(
            name for name in os.listdir(original_dir) if name.startswith('p')
        )

        os.makedirs(augmented_dir, exist_ok=True)

        # Slicing with None keeps the whole list.
        for problem_name in problems[:max_problems_per_dir]:
            problem_stem = os.path.splitext(problem_name)[0]
            truth_name = f'truth-{problem_stem}.json'

            text = read_text(os.path.join(original_dir, problem_name))
            metadata = read_metadata(os.path.join(original_dir, truth_name))
            # Both readers print a message and return None on failure.
            if text is None or metadata is None:
                continue

            changes = metadata['changes']

            swapped = swap_adjacent_paragraphs(text)
            # Paragraphs and labels are out of sync (e.g. stray blank lines),
            # so no pair can be labelled reliably.
            if len(swapped) != len(changes):
                continue

            for pair_text, change in zip(swapped, changes):
                new_authors = 1 if change == 0 else 2
                create_new_file(
                    augmented_dir,
                    pair_text,
                    create_json_metadata(new_authors, [change]))

            for pair_text in jump_same_author_paragraphs(text, metadata):
                create_new_file(
                    augmented_dir,
                    pair_text,
                    create_json_metadata(1, [0]))

            for pair_text in combine_all_different_author_paragraphs(text, metadata):
                create_new_file(
                    augmented_dir,
                    pair_text,
                    create_json_metadata(2, [1]))
        

In [189]:
# Run the augmentation over every difficulty directory; this writes new
# problem/truth files to disk and advances new_file_counter.
perform_data_augmentation()

./data/easy/train/problem-100.txt
./data/easy/train/problem-100.txt
./data/easy/train/problem-100.txt
./data/easy/train/problem-100.txt
./data/easy/train/problem-100.txt
./data/easy/train/problem-100.txt
