# Generate Greek Training / Validation Dataset

## Combination of Greek Words

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import csv
import random
from collections import Counter

In [2]:
greek_corpus = pd.read_csv('greek.wordlist', sep="\t", header=None, quoting=csv.QUOTE_NONE)
eng_corpus = pd.read_csv('eng.wordlist', sep="\t", header=None, quoting=csv.QUOTE_NONE)
dictionary = pd.read_csv('greek.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)

In [3]:
char_dict = dictionary.iloc[:124]
symbol_dict = dictionary.iloc[124:]
symbol_dict = symbol_dict.reset_index(drop=True)

In [4]:
greek_corpus

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143955,ΚΑΚΟΗΘΟΥΣ
143956,ΗΛΙΑΧΤΙΔΑ
143957,ΠΡΕΣΒΕΥΕΙΣ
143958,ΕΦΑΜΙΛΛΗ


In [5]:
eng_corpus

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
439145,Cycling:
439146,variation).
439147,people.'
439148,Minhtu


In [6]:
dictionary

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
161,+
162,","
163,-
164,.


In [7]:
char_dict

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
119,V
120,W
121,X
122,Y


In [8]:
symbol_dict

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


## Data Cleansing for Greek Dataset

In [9]:
dict_list = list(dictionary[0])

remove_list = []

for i, txt in enumerate(greek_corpus[0]):
    try:
        for j in txt:
            if j in dict_list:
                pass
            else:
                remove_list.append(i)
                continue
    except:
        remove_list.append(i)

In [10]:
greek_corpus_avalible = greek_corpus.drop(index=remove_list)
greek_corpus_avalible = greek_corpus_avalible.reset_index(drop=True)
greek_corpus_avalible

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143828,ΚΑΚΟΗΘΟΥΣ
143829,ΗΛΙΑΧΤΙΔΑ
143830,ΠΡΕΣΒΕΥΕΙΣ
143831,ΕΦΑΜΙΛΛΗ


## Data Cleansing for English Dataset

In [11]:
dict_list = list(dictionary[0])

remove_list = []

for i, txt in enumerate(eng_corpus[0]):
    try:
        for j in txt:
            if j in dict_list:
                pass
            else:
                remove_list.append(i)
                continue
    except:
        remove_list.append(i)

In [12]:
eng_corpus_avalible = eng_corpus.drop(index=remove_list)
eng_corpus_avalible = eng_corpus_avalible.reset_index(drop=True)
eng_corpus_avalible

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
399651,Cycling:
399652,variation).
399653,people.'
399654,Minhtu


# Generate Combined Text

In [23]:
def dataset_generation(n_sample, output_txt, print_only=False):
    
    with open(output_txt, 'w') as z:

        for i in range(n_sample):
            
            text = ''
            
            # token length between 1 - 3
            
            n_token = random.randint(1,3)
            
            for i in range(n_token):
                
                # whether symbol will add to token or not with 0.5 prob
                
                symbol = random.randint(0,1)
                
                # whether greek will be chosen or not with 0.7 prob
                
                lang_select = random.random()
                
                if lang_select > 0.3:
                    append_text = greek_corpus_avalible.iloc[random.randint(0,len(greek_corpus_avalible)-1)][0]
                else:
                    append_text = eng_corpus_avalible.iloc[random.randint(0,len(eng_corpus_avalible)-1)][0]
                
                if symbol:
                    append_symbol = symbol_dict.iloc[random.randint(0,len(symbol_dict)-1)][0]
                    
                    # the symbol will be added to the front or to the end of the token
                    
                    sym_loc_head = random.randint(0,1)
                    
                    if sym_loc_head:
                        append_text = append_symbol + append_text

                if i == 0:
                    text = append_text
                else:
                    text = append_text + ' ' + text
                
            # Pick the random length of token as subset
                
            text_len = random.randint(1, min(24,len(text)))
            string = text[0:text_len]
            
            if_add_dict_char = random.randint(0,1)
            
            # Add dictionay char to dataset in order to make all char inside
            
            if if_add_dict_char:
            
                add_dict_char_loc = random.randint(0,1)

                add_dict_char = dictionary.iloc[random.randint(0,len(dictionary)-1)][0]

                if add_dict_char_loc:
                    append_text = add_dict_char + string
                else:
                    append_text = string + add_dict_char
                    
            else:
                append_text = string
                            
            if not print_only:
                z.write(append_text)
                z.write('\n')
            else:
                print(append_text)
                
def check_dataset_char_in_dict(corpus):

    lst = []

    corpus_string = ''.join(str(x) for x in corpus)
    for letter in corpus_string:
        lst.append(letter)

    wordCount = dict(Counter(lst))

    return set(dictionary[0].append(pd.Series([' ']))) - set(list(wordCount.keys())), wordCount

def check_text_len_dist(corpus):
    text_len = corpus.apply(len)
    lenCount = dict(Counter(text_len))
    
    return lenCount

def sort_dict(input_dict):
    return {k: v for k, v in sorted(input_dict.items(), key=lambda item: item[1])}

## Training Dataset

In [17]:
dataset_generation(80000, 'train_corpus.txt', print_only=False)

In [18]:
train_whole_corpus = pd.read_csv('train_corpus.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
train_check_char_in_dict, train_wordCount = check_dataset_char_in_dict(train_whole_corpus)

Check characters are not in the corpus

In [24]:
train_check_char_in_dict

set()

Word count distribution

In [25]:
sort_dict(train_wordCount)

{'ϑ': 227,
 'Ύ': 229,
 'Ό': 233,
 'ϐ': 233,
 'Ά': 235,
 'Ώ': 236,
 'Έ': 241,
 'ΰ': 243,
 'ϱ': 244,
 'ϰ': 248,
 'Ί': 250,
 'Ή': 256,
 'ϋ': 259,
 'Ϋ': 262,
 'Ϊ': 335,
 'ϊ': 360,
 'X': 383,
 'Z': 386,
 'Q': 405,
 'q': 483,
 'Y': 488,
 'j': 511,
 'J': 622,
 'z': 693,
 'x': 783,
 'V': 791,
 'Ψ': 817,
 'K': 845,
 'ψ': 850,
 '9': 886,
 '2': 888,
 '}': 891,
 '\\': 906,
 'U': 908,
 '~': 914,
 '&': 918,
 '1': 922,
 '=': 925,
 '0': 928,
 '7': 930,
 '+': 931,
 '#': 932,
 '@': 934,
 '`': 941,
 '8': 941,
 '$': 945,
 '3': 945,
 '>': 948,
 '5': 948,
 '{': 951,
 '^': 957,
 '4': 960,
 '%': 976,
 '<': 987,
 '6': 994,
 ']': 996,
 '|': 1001,
 'W': 1020,
 '*': 1021,
 '[': 1046,
 '_': 1049,
 ';': 1091,
 '?': 1135,
 '!': 1183,
 '/': 1189,
 'F': 1257,
 'G': 1301,
 'H': 1318,
 '"': 1432,
 "'": 1466,
 ':': 1468,
 ')': 1474,
 'O': 1535,
 'ζ': 1546,
 'N': 1562,
 'Ζ': 1567,
 'L': 1616,
 'w': 1616,
 'B': 1682,
 'I': 1707,
 'v': 1721,
 'D': 1779,
 'k': 1794,
 'ώ': 1827,
 'Ξ': 1837,
 'ξ': 1869,
 'T': 1949,
 'P': 1961,

Text Length Distribution

In [26]:
sort_dict(check_text_len_dist(train_whole_corpus))

{25: 498,
 23: 1080,
 24: 1095,
 22: 1310,
 21: 1453,
 20: 1586,
 19: 1770,
 18: 1827,
 17: 2099,
 16: 2239,
 15: 2387,
 14: 2599,
 13: 2737,
 12: 3081,
 1: 3261,
 11: 3288,
 10: 3653,
 9: 4168,
 8: 4666,
 7: 5130,
 6: 5534,
 5: 5962,
 3: 6144,
 4: 6169,
 2: 6264}

## Validation Set

In [27]:
dataset_generation(20000, 'validate_corpus.txt', print_only=False)

In [28]:
valid_whole_corpus = pd.read_csv('validate_corpus.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
valid_check_char_in_dict, valid_wordCount = check_dataset_char_in_dict(valid_whole_corpus)

Check characters are not in the corpus

In [29]:
valid_check_char_in_dict

set()

Word Count Distribution

In [30]:
sort_dict(valid_wordCount)

{'ΰ': 46,
 'ϐ': 53,
 'Ί': 57,
 'Ά': 58,
 'ϰ': 63,
 'ϱ': 63,
 'Ώ': 66,
 'ϋ': 70,
 'Ό': 73,
 'Ύ': 74,
 'ϑ': 74,
 'Ή': 75,
 'Έ': 75,
 'ϊ': 79,
 'Ϋ': 81,
 'Z': 86,
 'Q': 89,
 'X': 92,
 'Ϊ': 94,
 'Y': 115,
 'q': 123,
 'j': 125,
 'J': 144,
 'z': 184,
 'ψ': 188,
 '1': 195,
 '2': 199,
 'V': 202,
 'K': 204,
 'x': 207,
 'U': 209,
 '=': 211,
 '^': 213,
 '@': 214,
 '>': 218,
 'Ψ': 218,
 '}': 219,
 '`': 222,
 '+': 225,
 '4': 225,
 '$': 228,
 '8': 233,
 '|': 234,
 '7': 234,
 '%': 238,
 '~': 238,
 '3': 239,
 '*': 239,
 '6': 239,
 '<': 240,
 '{': 240,
 '\\': 243,
 '5': 243,
 '#': 245,
 '9': 248,
 '0': 249,
 '&': 250,
 ']': 262,
 '_': 265,
 'W': 265,
 ';': 271,
 '?': 277,
 '[': 280,
 '!': 282,
 'G': 303,
 '/': 333,
 'H': 343,
 'F': 347,
 ':': 349,
 "'": 357,
 '"': 361,
 'O': 362,
 'Ζ': 385,
 ')': 390,
 'N': 396,
 'w': 403,
 'L': 410,
 'ζ': 416,
 'Ξ': 422,
 'B': 430,
 'I': 433,
 'k': 437,
 'D': 440,
 'ξ': 448,
 'v': 450,
 'T': 468,
 'M': 478,
 'ώ': 489,
 'E': 495,
 'f': 498,
 'P': 502,
 '(': 520,
 'R': 

Text Length Distribution

In [31]:
sort_dict(check_text_len_dist(valid_whole_corpus))

{25: 122,
 24: 244,
 23: 318,
 22: 339,
 21: 340,
 20: 413,
 19: 447,
 18: 448,
 17: 515,
 16: 595,
 15: 609,
 14: 663,
 13: 713,
 12: 754,
 1: 774,
 11: 855,
 10: 914,
 9: 970,
 8: 1122,
 7: 1260,
 6: 1368,
 5: 1502,
 4: 1514,
 3: 1597,
 2: 1604}

# Combine Dataset With Same Tag

Run this part after running `main.py` to generate the ocr images.

In [207]:
def merge_same_tag(input_txt, output_txt):
    dataset = pd.read_csv(input_txt, sep="\t", header=None, names=['path', 'text'], quoting=csv.QUOTE_NONE)
    dataset_merge = pd.DataFrame(
        dataset
        .groupby(dataset['text'])['path']
        .apply(lambda x: '[%s]' % ', '.join('"' + x + '"'))
    )
    dataset_merge = dataset_merge.reset_index(drop=False)
    dataset_merge = dataset_merge[["path", "text"]]
    with open(output_txt, 'w') as z:
        for i in range(len(dataset_merge)):
            path, text = dataset_merge.iloc[i]['path'], dataset_merge.iloc[i]['text']
            append_text = path + '\t' + text
            z.write(append_text)
            z.write('\n')

In [209]:
merge_same_tag("train_pre.txt", "train.txt")