# Generate Greek Training / Validation Dataset

## Combination of Greek Words

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import csv
import random
from collections import Counter

from sklearn.model_selection import train_test_split

In [2]:
greek_corpus = pd.read_csv('greek.wordlist', sep="\t", header=None, quoting=csv.QUOTE_NONE)
eng_corpus = pd.read_csv('eng.wordlist', sep="\t", header=None, quoting=csv.QUOTE_NONE)
dictionary = pd.read_csv('greek_dict.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)

In [3]:
char_dict = dictionary.iloc[:124]
symbol_dict = dictionary.iloc[124:]
symbol_dict = symbol_dict.reset_index(drop=True)

In [4]:
greek_corpus

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143955,ΚΑΚΟΗΘΟΥΣ
143956,ΗΛΙΑΧΤΙΔΑ
143957,ΠΡΕΣΒΕΥΕΙΣ
143958,ΕΦΑΜΙΛΛΗ


In [5]:
eng_corpus

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
439145,Cycling:
439146,variation).
439147,people.'
439148,Minhtu


In [6]:
dictionary

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
161,+
162,","
163,-
164,.


In [7]:
char_dict

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
119,V
120,W
121,X
122,Y


In [8]:
symbol_dict

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


## Data Cleansing for Greek Dataset

In [9]:
dict_list = list(dictionary[0])

remove_list = []

for i, txt in enumerate(greek_corpus[0]):
    try:
        if len(txt) > 25:
            remove_list.append(i)
        else:
            for j in txt:
                if j in dict_list:
                    pass
                else:
                    remove_list.append(i)
                    continue
    except:
        remove_list.append(i)

In [11]:
greek_corpus_avalible = greek_corpus.drop(index=remove_list)
greek_corpus_avalible = greek_corpus_avalible.reset_index(drop=True)
greek_corpus_avalible

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143826,ΚΑΚΟΗΘΟΥΣ
143827,ΗΛΙΑΧΤΙΔΑ
143828,ΠΡΕΣΒΕΥΕΙΣ
143829,ΕΦΑΜΙΛΛΗ


In [12]:
greek_train, greek_valid = train_test_split(greek_corpus_avalible, train_size=0.8)
greek_train = greek_train.reset_index(drop=True)
greek_valid = greek_valid.reset_index(drop=True)

In [13]:
greek_train, greek_valid

(                   0
 0          παχύσαρκο
 1         υποδηλώνει
 2       εξοικειωθούν
 3            ΝΤΙΟΓΚΟ
 4       ΚΙΝΗΤΙΚΟΤΗΤΑ
 ...              ...
 115059  ΕΥΠΡΟΣΔΕΚΤΟΙ
 115060      ΡΙΖΩΜΕΝΟ
 115061  διακεκριμένη
 115062       πολεμοσ
 115063        έτοιμα
 
 [115064 rows x 1 columns],
                    0
 0        ΝΕΟΕΛΛΗΝΙΚΟ
 1      υπογραμμίσεις
 2       απαιτούμενης
 3       επιχειρήματα
 4       προληπτικούς
 ...              ...
 28762     ισραηλινές
 28763        σκόρδας
 28764  συγκρούστηκαν
 28765       ΕΠΟΜΕΝΕΣ
 28766    γκορμπατσώφ
 
 [28767 rows x 1 columns])

## Data Cleansing for English Dataset

In [14]:
dict_list = list(dictionary[0])

remove_list = []

for i, txt in enumerate(eng_corpus[0]):
    try:
        if len(txt) > 25:
            remove_list.append(i)
        else:
            for j in txt:
                if j in dict_list:
                    pass
                else:
                    remove_list.append(i)
                    continue
    except:
        remove_list.append(i)

In [15]:
eng_corpus_avalible = eng_corpus.drop(index=remove_list)
eng_corpus_avalible = eng_corpus_avalible.reset_index(drop=True)
eng_corpus_avalible

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
399651,Cycling:
399652,variation).
399653,people.'
399654,Minhtu


In [16]:
eng_train, eng_valid = train_test_split(eng_corpus_avalible, train_size=0.8)
eng_train = eng_train.reset_index(drop=True)
eng_valid = eng_valid.reset_index(drop=True)

In [17]:
eng_train, eng_valid

(                   0
 0            erasers
 1       Collegeboxes
 2            Assisi,
 3           Snippet:
 4             mater,
 ...              ...
 319719        beaker
 319720        BETTIE
 319721     similar).
 319722         Swank
 319723        REGENT
 
 [319724 rows x 1 columns],
                     0
 0            Salaries
 1               Taleo
 2             Wesley,
 3               RS...
 4           Friction,
 ...               ...
 79927       Paintable
 79928       outcomes?
 79929         [China]
 79930         central
 79931  commissioners.
 
 [79932 rows x 1 columns])

In [49]:
training_target_size = 800000
validation_target_size = 200000

In [51]:
training_aug_size = training_target_size - greek_train.shape[0] - eng_train.shape[0]
validation_aug_size = validation_target_size - greek_valid.shape[0] - eng_valid.shape[0]

In [53]:
dataset_summary = pd.DataFrame(
    {
        "Dataset": ["Greek", "English", "Augmentation", "Total"],
        "Training Dataset": [greek_train.shape[0], eng_train.shape[0], training_aug_size, training_target_size],
        "Validation Dataset": [greek_valid.shape[0], eng_valid.shape[0], validation_aug_size, validation_target_size],
    }
)
dataset_summary

Unnamed: 0,Dataset,Training Dataset,Validation Dataset
0,Greek,115064,28767
1,English,319724,79932
2,Augmentation,365212,91301
3,Total,800000,200000


# Generate Combined Text

In [23]:
def dataset_generation(n_sample, greek_text, eng_text, output_txt, print_only=False):
    
    with open(output_txt, 'w') as z:

        for i in range(n_sample):
            
            text = ''
            
            # token length between 2 - 5
            
            n_token = random.randint(2, 5)
            
            for i in range(n_token):
                
                # whether symbol will add to token or not with 0.5 prob
                
                symbol = random.randint(0,1)
                
                # whether greek will be chosen or not with 0.7 prob
                
                lang_select = random.random()
                
                if lang_select > 0.3:
                    append_text = greek_text.iloc[random.randint(0,len(greek_text)-1)][0]
                else:
                    append_text = eng_text.iloc[random.randint(0,len(eng_text)-1)][0]
                
                if symbol:
                    append_symbol = symbol_dict.iloc[random.randint(0,len(symbol_dict)-1)][0]
                    
                    # the symbol will be added to the front or to the end of the token
                    
                    sym_loc_head = random.randint(0,1)
                    
                    if sym_loc_head:
                        append_text = append_symbol + append_text

                if i == 0:
                    text = append_text
                else:
                    text = append_text + ' ' + text
                
            # Pick the random length of token as subset
                
            text_len = random.randint(1, min(24,len(text)))
            string = text[0:text_len]
            
            if_add_dict_char = random.randint(0,1)
            
            # Add dictionay char to dataset in order to make all char inside
            
            if if_add_dict_char:
            
                add_dict_char_loc = random.randint(0,1)

                add_dict_char = dictionary.iloc[random.randint(0,len(dictionary)-1)][0]

                if add_dict_char_loc:
                    append_text = add_dict_char + string
                else:
                    append_text = string + add_dict_char
                    
            else:
                append_text = string
                
            append_text = append_text[0:min(25, len(text))]
                            
            if not print_only:
                z.write(append_text)
                z.write('\n')
            else:
                print(append_text)
                
        for j in greek_text[0]:
            z.write(j)
            z.write('\n')

        for k in eng_text[0]:
            z.write(k)
            z.write('\n')
                
def check_dataset_char_in_dict(corpus):

    lst = []

    corpus_string = ''.join(str(x) for x in corpus)
    for letter in corpus_string:
        lst.append(letter)

    wordCount = dict(Counter(lst))

    return set(dictionary[0].append(pd.Series([' ']))) - set(list(wordCount.keys())), wordCount

def check_text_len_dist(corpus):
    text_len = corpus.astype(str).str.len()
    lenCount = dict(Counter(text_len))
    
    return lenCount

def sort_dict(input_dict):
    return {k: v for k, v in sorted(input_dict.items(), key=lambda item: item[1])}

## Training Dataset

In [30]:
dataset_generation(training_aug_size, greek_train, eng_train, 'train_corpus.txt', print_only=False)

In [31]:
train_whole_corpus = pd.read_csv('train_corpus.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
train_check_char_in_dict, train_wordCount = check_dataset_char_in_dict(train_whole_corpus)

Check characters are not in the corpus

In [32]:
train_check_char_in_dict

set()

Word count distribution

In [33]:
sort_dict(train_wordCount)

{'Ή': 1053,
 'Έ': 1066,
 'Ώ': 1068,
 'Ί': 1079,
 'ΰ': 1080,
 'ϰ': 1081,
 'Ά': 1110,
 'ϐ': 1112,
 'ϱ': 1118,
 'Ό': 1126,
 'ϑ': 1141,
 'ϋ': 1178,
 'Ύ': 1195,
 'Ϋ': 1219,
 'Ϊ': 1953,
 'ϊ': 1968,
 'Q': 3301,
 'Z': 3591,
 'X': 3815,
 '3': 4802,
 '9': 4852,
 '8': 4865,
 '4': 4867,
 '7': 4886,
 '^': 4897,
 '6': 4899,
 '`': 4936,
 '%': 4936,
 '0': 4939,
 '5': 4944,
 '}': 4945,
 '1': 4957,
 '2': 4982,
 '~': 4985,
 '#': 5135,
 '\\': 5222,
 '$': 5245,
 '@': 5272,
 'q': 5280,
 '{': 5333,
 '+': 5355,
 '=': 5420,
 '&': 5458,
 '<': 5675,
 'ψ': 5763,
 '_': 5932,
 '*': 5943,
 '>': 5970,
 'Ψ': 6005,
 '|': 6049,
 'j': 6217,
 'Y': 6532,
 'J': 7029,
 ']': 7063,
 '[': 7606,
 '/': 9867,
 ';': 9969,
 '?': 10108,
 'V': 10614,
 'z': 10842,
 '!': 10863,
 'K': 11169,
 'ζ': 11528,
 'Ζ': 11636,
 'x': 11880,
 'U': 11960,
 'Ξ': 13023,
 'ξ': 13170,
 'W': 13324,
 'ώ': 15223,
 '"': 15281,
 'F': 17706,
 "'": 17976,
 'G': 18203,
 'H': 19321,
 ')': 20402,
 ':': 20572,
 'β': 20692,
 'Β': 20714,
 'ή': 21171,
 '(': 21377,
 'O

Text Length Distribution

In [34]:
sort_dict(check_text_len_dist(train_whole_corpus))

{25: 5353,
 1: 8455,
 24: 11209,
 23: 11591,
 22: 11981,
 21: 12380,
 20: 13163,
 19: 13622,
 18: 14231,
 17: 14898,
 16: 17485,
 15: 19675,
 2: 20368,
 14: 23165,
 13: 28221,
 3: 34572,
 12: 35972,
 11: 45896,
 4: 46810,
 10: 56910,
 5: 61800,
 9: 67119,
 6: 72685,
 8: 75181,
 7: 77258}

## Validation Set

In [41]:
dataset_generation(validation_aug_size, greek_valid, eng_valid,'validate_corpus.txt', print_only=False)

In [42]:
valid_whole_corpus = pd.read_csv('validate_corpus.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
valid_check_char_in_dict, valid_wordCount = check_dataset_char_in_dict(valid_whole_corpus)

Check characters are not in the corpus

In [43]:
valid_check_char_in_dict

set()

Word Count Distribution

In [44]:
sort_dict(valid_wordCount)

{'ϱ': 240,
 'ϑ': 243,
 'Έ': 249,
 'ϰ': 252,
 'Ώ': 262,
 'Ή': 269,
 'Ό': 270,
 'Ύ': 274,
 'ΰ': 278,
 'Ά': 278,
 'ϐ': 283,
 'Ί': 283,
 'ϋ': 285,
 'Ϋ': 331,
 'ϊ': 467,
 'Ϊ': 537,
 'Q': 771,
 'Z': 897,
 'X': 907,
 '4': 1181,
 '6': 1196,
 '2': 1198,
 '$': 1210,
 '`': 1212,
 '9': 1218,
 '^': 1228,
 '~': 1232,
 '5': 1236,
 '@': 1241,
 '%': 1244,
 '1': 1260,
 '8': 1266,
 'q': 1268,
 '#': 1279,
 '\\': 1280,
 '{': 1283,
 '7': 1291,
 '0': 1291,
 '3': 1292,
 '}': 1294,
 '=': 1305,
 '+': 1317,
 'ψ': 1368,
 'Ψ': 1369,
 '&': 1372,
 '|': 1438,
 '<': 1460,
 '>': 1488,
 '_': 1502,
 '*': 1512,
 'j': 1562,
 'Y': 1584,
 ']': 1722,
 'J': 1835,
 '[': 1872,
 ';': 2437,
 '/': 2440,
 'V': 2636,
 'z': 2646,
 '?': 2661,
 '!': 2747,
 'K': 2837,
 'ζ': 2919,
 'Ζ': 2949,
 'U': 3051,
 'x': 3118,
 'Ξ': 3248,
 'ξ': 3308,
 'W': 3453,
 '"': 3723,
 'ώ': 3956,
 'F': 4451,
 'G': 4478,
 "'": 4537,
 'H': 4831,
 ':': 5089,
 'β': 5129,
 ')': 5134,
 'Β': 5181,
 'ή': 5239,
 '(': 5441,
 'O': 5995,
 'L': 6079,
 'ύ': 6120,
 'φ': 6186

Text Length Distribution

In [45]:
sort_dict(check_text_len_dist(valid_whole_corpus))

{25: 1321,
 1: 2149,
 24: 2733,
 23: 2871,
 22: 3005,
 21: 3132,
 20: 3338,
 19: 3475,
 18: 3574,
 17: 3786,
 16: 4309,
 15: 4933,
 2: 5103,
 14: 5809,
 13: 7068,
 3: 8460,
 12: 8980,
 11: 11522,
 4: 11783,
 10: 14298,
 5: 15438,
 9: 16872,
 6: 17971,
 8: 18807,
 7: 19263}

# Combine Dataset With Same Tag

Run this part after running `main.py` to generate the ocr images.

In [46]:
def merge_same_tag(input_txt, output_txt):
    dataset = pd.read_csv(input_txt, sep="\t", header=None, names=['path', 'text'], quoting=csv.QUOTE_NONE)
    dataset_merge = pd.DataFrame(
        dataset
        .groupby(dataset['text'])['path']
        .apply(lambda x: '[%s]' % ', '.join('"' + x + '"'))
    )
    dataset_merge = dataset_merge.reset_index(drop=False)
    dataset_merge = dataset_merge[["path", "text"]]
    with open(output_txt, 'w') as z:
        for i in range(len(dataset_merge)):
            path, text = dataset_merge.iloc[i]['path'], dataset_merge.iloc[i]['text']
            append_text = path + '\t' + text
            z.write(append_text)
            z.write('\n')

In [209]:
merge_same_tag("train_pre.txt", "train.txt")