# Generate Greek Training / Validation Dataset

## Combination of Greek Words

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import csv
import random
from collections import Counter

from sklearn.model_selection import train_test_split

In [2]:
def check_dataset_char_in_dict(corpus):

    lst = []

    corpus_string = ''.join(str(x) for x in corpus)
    for letter in corpus_string:
        lst.append(letter)

    wordCount = dict(Counter(lst))

    return set(dictionary[0].append(pd.Series([' ']))) - set(list(wordCount.keys())), wordCount

def check_text_len_dist(corpus):
    text_len = corpus.astype(str).str.len()
    lenCount = dict(Counter(text_len))
    
    return lenCount

def sort_dict(input_dict):
    return {k: v for k, v in sorted(input_dict.items(), key=lambda item: item[1])}

In [7]:
greek_corpus = pd.read_csv('greek.wordlist', sep="\t", header=None, quoting=csv.QUOTE_NONE)
eng_corpus = pd.read_csv('eng.wordlist', sep="\t", header=None, quoting=csv.QUOTE_NONE)
dictionary = pd.read_csv('greek_dict.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)

In [8]:
char_dict = dictionary.iloc[:124]
symbol_dict = dictionary.iloc[124:]
symbol_dict = symbol_dict.reset_index(drop=True)

In [9]:
greek_corpus

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143955,ΚΑΚΟΗΘΟΥΣ
143956,ΗΛΙΑΧΤΙΔΑ
143957,ΠΡΕΣΒΕΥΕΙΣ
143958,ΕΦΑΜΙΛΛΗ


In [10]:
eng_corpus

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
439145,Cycling:
439146,variation).
439147,people.'
439148,Minhtu


In [11]:
dictionary

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
161,+
162,","
163,-
164,.


In [12]:
char_dict

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
119,V
120,W
121,X
122,Y


In [13]:
symbol_dict

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


## Data Cleansing for Greek Dataset

In [14]:
dict_list = list(dictionary[0])

remove_list = []

for i, txt in enumerate(greek_corpus[0]):
    try:
        if len(txt) > 25:
            remove_list.append(i)
        else:
            for j in txt:
                if j in dict_list:
                    pass
                else:
                    remove_list.append(i)
                    continue
    except:
        remove_list.append(i)

In [15]:
greek_corpus_avalible = greek_corpus.drop(index=remove_list)
greek_corpus_avalible = greek_corpus_avalible.reset_index(drop=True)
greek_corpus_avalible

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143826,ΚΑΚΟΗΘΟΥΣ
143827,ΗΛΙΑΧΤΙΔΑ
143828,ΠΡΕΣΒΕΥΕΙΣ
143829,ΕΦΑΜΙΛΛΗ


In [16]:
greek_train, greek_valid = train_test_split(greek_corpus_avalible, train_size=0.8)
greek_train = greek_train.reset_index(drop=True)
greek_valid = greek_valid.reset_index(drop=True)

In [17]:
greek_train, greek_valid

(                    0
 0       προσομοιώσεις
 1            ΣΛΑΒΙΚΩΝ
 2            κόκκινης
 3              ΣΗΚΩΣΕ
 4              ΧΑΡΤΕΣ
 ...               ...
 115059         βουτιά
 115060       τεραστία
 115061  εξαφανίστηκαν
 115062           βίδα
 115063     ΕΥΑΙΣΘΗΤΗΣ
 
 [115064 rows x 1 columns],
                    0
 0          ΑΝΑΛΥΣΕΩΝ
 1         ΚΑΤΑΣΤΡΟΦΗ
 2              ΞΕΥΡΩ
 3      ευρεσιτεχνίες
 4          πασιφανές
 ...              ...
 28762      ΑΡΣΑΚΕΙΟΥ
 28763   βαρβαρότητας
 28764    αποθεματικά
 28765    ΚΑΤΑΒΑΛΟΥΜΕ
 28766   ΠΑΡΑΓΩΓΙΚΟΥΣ
 
 [28767 rows x 1 columns])

## Data Cleansing for English Dataset

In [18]:
dict_list = list(dictionary[0])

remove_list = []

for i, txt in enumerate(eng_corpus[0]):
    try:
        if len(txt) > 25:
            remove_list.append(i)
        else:
            for j in txt:
                if j in dict_list:
                    pass
                else:
                    remove_list.append(i)
                    continue
    except:
        remove_list.append(i)

In [19]:
eng_corpus_avalible = eng_corpus.drop(index=remove_list)
eng_corpus_avalible = eng_corpus_avalible.reset_index(drop=True)
eng_corpus_avalible

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
399651,Cycling:
399652,variation).
399653,people.'
399654,Minhtu


In [20]:
eng_train, eng_valid = train_test_split(eng_corpus_avalible, train_size=0.8)
eng_train = eng_train.reset_index(drop=True)
eng_valid = eng_valid.reset_index(drop=True)

In [21]:
eng_train, eng_valid

(              0
 0          BAD!
 1        dentin
 2        LADDER
 3       about?"
 4       indeed!
 ...         ...
 319719  Toffler
 319720    [Ltd.
 319721      wt.
 319722     Meda
 319723   Topic]
 
 [319724 rows x 1 columns],
                    0
 0               sods
 1              Kale,
 2              rage,
 3            Majeure
 4             (knee)
 ...              ...
 79927   Enterprise's
 79928      epidemic.
 79929        Mascots
 79930     Subscribe!
 79931  TechnicalUser
 
 [79932 rows x 1 columns])

In [26]:
dataset_summary = pd.DataFrame(
    {
        "Dataset": ["Greek", "English", "Total"],
        "Training Dataset": [greek_train.shape[0], eng_train.shape[0], greek_train.shape[0] + eng_train.shape[0]],
        "Validation Dataset": [greek_valid.shape[0], eng_valid.shape[0], greek_valid.shape[0] + eng_valid.shape[0]],
    }
)
dataset_summary

Unnamed: 0,Dataset,Training Dataset,Validation Dataset
0,Greek,115064,28767
1,English,319724,79932
2,Total,434788,108699


# Generate Combined Text

In [23]:
def dataset_generation(n_sample, greek_text, eng_text, output_txt, print_only=False):
    
    with open(output_txt, 'w') as z:

        for i in range(n_sample):
            
            text = ''
            
            # token length between 2 - 5
            
            n_token = random.randint(2, 5)
            
            for i in range(n_token):
                
                # whether symbol will add to token or not with 0.5 prob
                
                symbol = random.randint(0,1)
                
                # whether greek will be chosen or not with 0.7 prob
                
                lang_select = random.random()
                
                if lang_select > 0.3:
                    append_text = greek_text.iloc[random.randint(0,len(greek_text)-1)][0]
                else:
                    append_text = eng_text.iloc[random.randint(0,len(eng_text)-1)][0]
                
                if symbol:
                    append_symbol = symbol_dict.iloc[random.randint(0,len(symbol_dict)-1)][0]
                    
                    # the symbol will be added to the front or to the end of the token
                    
                    sym_loc_head = random.randint(0,1)
                    
                    if sym_loc_head:
                        append_text = append_symbol + append_text

                if i == 0:
                    text = append_text
                else:
                    text = append_text + ' ' + text
                
            # Pick the random length of token as subset
                
            text_len = random.randint(1, min(24,len(text)))
            string = text[0:text_len]
            
            if_add_dict_char = random.randint(0,1)
            
            # Add dictionay char to dataset in order to make all char inside
            
            if if_add_dict_char:
            
                add_dict_char_loc = random.randint(0,1)

                add_dict_char = dictionary.iloc[random.randint(0,len(dictionary)-1)][0]

                if add_dict_char_loc:
                    append_text = add_dict_char + string
                else:
                    append_text = string + add_dict_char
                    
            else:
                append_text = string
                
            append_text = append_text[0:min(25, len(text))]
                            
            if not print_only:
                z.write(append_text)
                z.write('\n')
            else:
                print(append_text)
                
        for j in greek_text[0]:
            z.write(j)
            z.write('\n')

        for k in eng_text[0]:
            z.write(k)
            z.write('\n')

## Training Dataset

In [30]:
dataset_generation(training_aug_size, greek_train, eng_train, 'train_corpus.txt', print_only=False)

In [31]:
train_whole_corpus = pd.read_csv('train_corpus.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
train_check_char_in_dict, train_wordCount = check_dataset_char_in_dict(train_whole_corpus)

Check characters are not in the corpus

In [32]:
train_check_char_in_dict

set()

Word count distribution

In [33]:
sort_dict(train_wordCount)

{'Ή': 1053,
 'Έ': 1066,
 'Ώ': 1068,
 'Ί': 1079,
 'ΰ': 1080,
 'ϰ': 1081,
 'Ά': 1110,
 'ϐ': 1112,
 'ϱ': 1118,
 'Ό': 1126,
 'ϑ': 1141,
 'ϋ': 1178,
 'Ύ': 1195,
 'Ϋ': 1219,
 'Ϊ': 1953,
 'ϊ': 1968,
 'Q': 3301,
 'Z': 3591,
 'X': 3815,
 '3': 4802,
 '9': 4852,
 '8': 4865,
 '4': 4867,
 '7': 4886,
 '^': 4897,
 '6': 4899,
 '`': 4936,
 '%': 4936,
 '0': 4939,
 '5': 4944,
 '}': 4945,
 '1': 4957,
 '2': 4982,
 '~': 4985,
 '#': 5135,
 '\\': 5222,
 '$': 5245,
 '@': 5272,
 'q': 5280,
 '{': 5333,
 '+': 5355,
 '=': 5420,
 '&': 5458,
 '<': 5675,
 'ψ': 5763,
 '_': 5932,
 '*': 5943,
 '>': 5970,
 'Ψ': 6005,
 '|': 6049,
 'j': 6217,
 'Y': 6532,
 'J': 7029,
 ']': 7063,
 '[': 7606,
 '/': 9867,
 ';': 9969,
 '?': 10108,
 'V': 10614,
 'z': 10842,
 '!': 10863,
 'K': 11169,
 'ζ': 11528,
 'Ζ': 11636,
 'x': 11880,
 'U': 11960,
 'Ξ': 13023,
 'ξ': 13170,
 'W': 13324,
 'ώ': 15223,
 '"': 15281,
 'F': 17706,
 "'": 17976,
 'G': 18203,
 'H': 19321,
 ')': 20402,
 ':': 20572,
 'β': 20692,
 'Β': 20714,
 'ή': 21171,
 '(': 21377,
 'O

Text Length Distribution

In [34]:
sort_dict(check_text_len_dist(train_whole_corpus))

{25: 5353,
 1: 8455,
 24: 11209,
 23: 11591,
 22: 11981,
 21: 12380,
 20: 13163,
 19: 13622,
 18: 14231,
 17: 14898,
 16: 17485,
 15: 19675,
 2: 20368,
 14: 23165,
 13: 28221,
 3: 34572,
 12: 35972,
 11: 45896,
 4: 46810,
 10: 56910,
 5: 61800,
 9: 67119,
 6: 72685,
 8: 75181,
 7: 77258}

## Validation Set

In [41]:
dataset_generation(validation_aug_size, greek_valid, eng_valid,'validate_corpus.txt', print_only=False)

In [42]:
valid_whole_corpus = pd.read_csv('validate_corpus.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
valid_check_char_in_dict, valid_wordCount = check_dataset_char_in_dict(valid_whole_corpus)

Check characters are not in the corpus

In [43]:
valid_check_char_in_dict

set()

Word Count Distribution

In [44]:
sort_dict(valid_wordCount)

{'ϱ': 240,
 'ϑ': 243,
 'Έ': 249,
 'ϰ': 252,
 'Ώ': 262,
 'Ή': 269,
 'Ό': 270,
 'Ύ': 274,
 'ΰ': 278,
 'Ά': 278,
 'ϐ': 283,
 'Ί': 283,
 'ϋ': 285,
 'Ϋ': 331,
 'ϊ': 467,
 'Ϊ': 537,
 'Q': 771,
 'Z': 897,
 'X': 907,
 '4': 1181,
 '6': 1196,
 '2': 1198,
 '$': 1210,
 '`': 1212,
 '9': 1218,
 '^': 1228,
 '~': 1232,
 '5': 1236,
 '@': 1241,
 '%': 1244,
 '1': 1260,
 '8': 1266,
 'q': 1268,
 '#': 1279,
 '\\': 1280,
 '{': 1283,
 '7': 1291,
 '0': 1291,
 '3': 1292,
 '}': 1294,
 '=': 1305,
 '+': 1317,
 'ψ': 1368,
 'Ψ': 1369,
 '&': 1372,
 '|': 1438,
 '<': 1460,
 '>': 1488,
 '_': 1502,
 '*': 1512,
 'j': 1562,
 'Y': 1584,
 ']': 1722,
 'J': 1835,
 '[': 1872,
 ';': 2437,
 '/': 2440,
 'V': 2636,
 'z': 2646,
 '?': 2661,
 '!': 2747,
 'K': 2837,
 'ζ': 2919,
 'Ζ': 2949,
 'U': 3051,
 'x': 3118,
 'Ξ': 3248,
 'ξ': 3308,
 'W': 3453,
 '"': 3723,
 'ώ': 3956,
 'F': 4451,
 'G': 4478,
 "'": 4537,
 'H': 4831,
 ':': 5089,
 'β': 5129,
 ')': 5134,
 'Β': 5181,
 'ή': 5239,
 '(': 5441,
 'O': 5995,
 'L': 6079,
 'ύ': 6120,
 'φ': 6186

Text Length Distribution

In [45]:
sort_dict(check_text_len_dist(valid_whole_corpus))

{25: 1321,
 1: 2149,
 24: 2733,
 23: 2871,
 22: 3005,
 21: 3132,
 20: 3338,
 19: 3475,
 18: 3574,
 17: 3786,
 16: 4309,
 15: 4933,
 2: 5103,
 14: 5809,
 13: 7068,
 3: 8460,
 12: 8980,
 11: 11522,
 4: 11783,
 10: 14298,
 5: 15438,
 9: 16872,
 6: 17971,
 8: 18807,
 7: 19263}

# Combine Dataset With Same Tag

Run this part after running `main.py` to generate the ocr images.

In [77]:
def merge_same_tag(input_txt, output_txt):
    dataset = pd.read_csv(input_txt, sep="\t", header=None, names=['path', 'text'], quoting=csv.QUOTE_NONE)
    dataset_merge = pd.DataFrame(
        dataset
        .groupby(dataset['text'])['path']
        .apply(lambda x: '[%s]' % ', '.join('"folder/' + x + '"'))
    )
    dataset_merge = dataset_merge.reset_index(drop=False)
    dataset_merge = dataset_merge[["path", "text"]]
    with open(output_txt, 'w') as z:
        for i in range(len(dataset_merge)):
            path, text = dataset_merge.iloc[i]['path'], dataset_merge.iloc[i]['text']
            append_text = path + '\t' + text
            z.write(append_text)
            z.write('\n')

In [78]:
merge_same_tag("train_pre.txt", "train.txt")

## Add Dataset For Robust Training

In [38]:
def dataset_generation_v3(n_sample, greek_text, eng_text, output_txt, print_only=False):
    
    with open(output_txt, 'w') as z:

        for i in range(round(n_sample * 0.7)):
            
            text = ''
            
            # token length between 2 - 4
            
            n_token = random.randint(2, 4)
            
            for i in range(n_token):
                
                # whether symbol will add to token or not with 0.5 prob
                
                symbol = random.randint(0,1)
                
                # whether greek will be chosen or not with 0.1 prob
                
                lang_select = random.random()
                
                if lang_select > 0.1:
                    append_text = greek_text.iloc[random.randint(0,len(greek_text)-1)][0]
                else:
                    append_text = eng_text.iloc[random.randint(0,len(eng_text)-1)][0]
                
                if symbol:
                    append_symbol = symbol_dict.iloc[random.randint(0,len(symbol_dict)-1)][0]
                    
                    # the symbol will be added to the front or to the end of the token
                    
                    sym_loc_head = random.randint(0,1)
                    
                    if sym_loc_head:
                        append_text = append_symbol + append_text

                if i == 0:
                    text = append_text
                else:
                    text = append_text + ' ' + text
                    
            string = text
            
            if_add_dict_char = random.randint(0,1)
            
            # Add dictionay char to dataset in order to make all char inside
            
            if if_add_dict_char:
            
                add_dict_char_loc = random.randint(0,1)

                add_dict_char = dictionary.iloc[random.randint(0,len(dictionary)-1)][0]

                if add_dict_char_loc:
                    append_text = add_dict_char + string
                else:
                    append_text = string + add_dict_char
                    
            else:
                append_text = string
                
            append_text = append_text[0:min(25, len(text))]
                            
            if not print_only:
                z.write(append_text)
                z.write('\n')
            else:
                print(append_text)      

        for i in range(round(n_sample * 0.3)):
            
            text = ''

            # token length between 2 - 4

            n_token = random.randint(2, 4)

            for i in range(n_token):

                # whether symbol will add to token or not with 0.5 prob

                symbol = random.randint(0,1)

                append_text = eng_text.iloc[random.randint(0,len(eng_text)-1)][0]

                if symbol:
                    append_symbol = symbol_dict.iloc[random.randint(0,len(symbol_dict)-1)][0]

                    # the symbol will be added to the front or to the end of the token

                    sym_loc_head = random.randint(0,1)

                    if sym_loc_head:
                        append_text = append_symbol + append_text

                if i == 0:
                    text = append_text
                else:
                    text = append_text + ' ' + text

            string = text

            if_add_dict_char = random.randint(0,1)

            # Add dictionay char to dataset in order to make all char inside

            if if_add_dict_char:

                add_dict_char_loc = random.randint(0,1)

                add_dict_char = dictionary.iloc[random.randint(0,len(dictionary)-1)][0]

                if add_dict_char_loc:
                    append_text = add_dict_char + string
                else:
                    append_text = string + add_dict_char

            else:
                append_text = string

            append_text = append_text[0:min(25, len(text))]

            if not print_only:
                z.write(append_text)
                z.write('\n')
            else:
                print(append_text)

In [46]:
dataset_generation_v3(800000, greek_train, eng_train, 'train_v3_gen.txt', print_only=False)
dataset_generation_v3(200000, greek_valid, eng_valid, 'valid_v3_gen.txt', print_only=False)

In [47]:
whole_corpus = pd.read_csv('train_v3_gen.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
check_char_in_dict, wordCount = check_dataset_char_in_dict(whole_corpus)
check_char_in_dict, sort_dict(wordCount), sort_dict(check_text_len_dist(whole_corpus))

(set(),
 {'Ώ': 1154,
  'Ύ': 1174,
  'Έ': 1174,
  'ϱ': 1191,
  'Ή': 1210,
  'ϰ': 1221,
  'ϑ': 1227,
  'ΰ': 1235,
  'Ό': 1244,
  'Ί': 1245,
  'Ά': 1250,
  'ϐ': 1275,
  'ϋ': 1508,
  'Ϋ': 1538,
  'Ϊ': 3500,
  'ϊ': 3631,
  'Q': 4819,
  'Z': 5334,
  'X': 5515,
  'q': 7732,
  'j': 9536,
  'Y': 9767,
  'J': 10741,
  '1': 13825,
  '5': 13895,
  '^': 13910,
  '4': 13922,
  '2': 13938,
  '7': 13980,
  '6': 13995,
  '3': 14050,
  '`': 14050,
  '8': 14052,
  'Ψ': 14053,
  '}': 14065,
  '0': 14081,
  '9': 14086,
  '%': 14114,
  '~': 14168,
  'ψ': 14212,
  '\\': 14247,
  '#': 14357,
  '@': 14429,
  '$': 14535,
  '{': 14635,
  '&': 14714,
  '+': 14770,
  '=': 14886,
  '<': 15286,
  '>': 15499,
  '|': 15730,
  '_': 15803,
  '*': 15881,
  'V': 16355,
  'K': 16974,
  'z': 17210,
  ']': 17358,
  '[': 18393,
  'x': 18433,
  'U': 18780,
  'W': 20896,
  '/': 21771,
  ';': 21775,
  '?': 21921,
  '!': 23414,
  'G': 28031,
  'F': 28090,
  '"': 29743,
  'Ζ': 30166,
  'ζ': 30259,
  'H': 30279,
  'Ξ': 31882,
  'ξ'

In [5]:
def merge_same_tag(input_txt, output_txt):
    dataset = pd.read_csv(input_txt, sep="\t", header=None, names=['path', 'text'], quoting=csv.QUOTE_NONE)
    dataset_merge = pd.DataFrame(
        dataset
        .groupby(dataset['text'])['path']
        .apply(lambda x: '[%s]' % ', '.join('"' + x + '"'))
    )
    dataset_merge = dataset_merge.reset_index(drop=False)
    dataset_merge = dataset_merge[["path", "text"]]
    with open(output_txt, 'w') as z:
        for i in range(len(dataset_merge)):
            path, text = dataset_merge.iloc[i]['path'], dataset_merge.iloc[i]['text']
            append_text = path + '\t' + text
            z.write(append_text)
            z.write('\n')
            
merge_same_tag("train_pre.txt", "train.txt")