## Combination of Greek Words

In [21]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import random
import pandas as pd
import csv
from collections import Counter

from itertools import islice

def take(n, iterable):
    """Return the first n items of the iterable as a list."""
    return list(islice(iterable, n))

In [3]:
greek_corpus = pd.read_csv('Greek_wordlist.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)

In [11]:
dictionary = pd.read_csv('greek_char.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)
char_dict = dictionary.iloc[:72]
symbol_dict = dictionary.iloc[72:]
symbol_dict = symbol_dict.reset_index(drop=True)

In [12]:
greek_corpus

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143955,ΚΑΚΟΗΘΟΥΣ
143956,ΗΛΙΑΧΤΙΔΑ
143957,ΠΡΕΣΒΕΥΕΙΣ
143958,ΕΦΑΜΙΛΛΗ


In [13]:
dictionary

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
109,+
110,","
111,-
112,.


In [14]:
char_dict

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
67,ΰ
68,ϰ
69,ϱ
70,ϐ


In [15]:
symbol_dict

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


## Data Cleansing for Greek Dataset

In [16]:
dict_list = list(dictionary[0])

remove_list = []

for i, txt in enumerate(greek_corpus[0]):
    try:
        for j in txt:
            if j in dict_list:
                pass
            else:
                remove_list.append(i)
                continue
    except:
        remove_list.append(i)

In [17]:
greek_corpus_avalible = greek_corpus.drop(index=remove_list)
greek_corpus_avalible = greek_corpus_avalible.reset_index(drop=True)
greek_corpus_avalible

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143812,ΚΑΚΟΗΘΟΥΣ
143813,ΗΛΙΑΧΤΙΔΑ
143814,ΠΡΕΣΒΕΥΕΙΣ
143815,ΕΦΑΜΙΛΛΗ


In [18]:
with open('Greek_corpus.txt', 'w') as z:

    for i in greek_corpus_avalible[0]:
        z.write(i)
        z.write('\n')

# Generate Combined Text

In [22]:
def dataset_generation(n_sample, output_txt, print_only=False):
    
    with open(output_txt, 'w') as z:

        for i in range(n_sample):
            
            text = ''
            
            # token length between 1 - 3
            
            n_token = random.randint(1,3)
            
            for i in range(n_token):
                
                # whether symbol will add to token or not with 0.5 prob
                
                symbol = random.randint(0,1)
                
                # whether greek will be chosen or not with 0.7 prob
                
                lang_select = random.random()
                
                append_text = greek_corpus_avalible.iloc[random.randint(0,len(greek_corpus_avalible)-1)][0]
                
                if symbol:
                    append_symbol = symbol_dict.iloc[random.randint(0,len(symbol_dict)-1)][0]
                    
                    # the symbol will be added to the front or to the end of the token
                    
                    sym_loc_head = random.randint(0,1)
                    
                    if sym_loc_head:
                        append_text = append_symbol + append_text

                if i == 0:
                    text = append_text
                else:
                    text = append_text + ' ' + text
                
            # Pick the random length of token as subset
                
            text_len = random.randint(1, min(24,len(text)))
            string = text[0:text_len]
            
            if_add_dict_char = random.randint(0,1)
            
            # Add dictionay char to dataset in order to make all char inside
            
            if if_add_dict_char:
            
                add_dict_char_loc = random.randint(0,1)

                add_dict_char = dictionary.iloc[random.randint(0,len(dictionary)-1)][0]

                if add_dict_char_loc:
                    append_text = add_dict_char + string
                else:
                    append_text = string + add_dict_char
                    
            else:
                append_text = string
                            
            if not print_only:
                z.write(append_text)
                z.write('\n')
            else:
                print(append_text)
                
def check_dataset_char_in_dict(corpus):

    corpus_string = ''.join(str(x) for x in corpus)

    lst = []

    for letter in corpus_string:
        lst.append(letter)

    wordCount = dict(Counter(lst))

    return set(list(wordCount.keys())) - set(dictionary[0].append(pd.Series([' ']))), wordCount

## Training Dataset

In [24]:
dataset_generation(80000, 'train_corpus.txt', print_only=False)

In [25]:
train_whole_corpus = pd.read_csv('train_corpus.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
train_check_char_in_dict, train_wordCount = check_dataset_char_in_dict(train_whole_corpus)

In [26]:
train_check_char_in_dict

set()

In [27]:
take(5, train_wordCount.items())

[('ν', 17928), ('τ', 19981), ('ρ', 18724), ('έ', 5606), ('π', 15739)]

## Validation Set

In [28]:
dataset_generation(20000, 'validate_corpus.txt', print_only=False)

In [29]:
valid_whole_corpus = pd.read_csv('validate_corpus.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
valid_check_char_in_dict, valid_wordCount = check_dataset_char_in_dict(valid_whole_corpus)

In [30]:
valid_check_char_in_dict

set()

In [31]:
take(5, valid_wordCount.items())

[('Μ', 3224), ('Ε', 7382), ('Ο', 7207), ('Ν', 4508), ('Ω', 1791)]

# Data Distribution

In [32]:
train_wordCount

{'ν': 17928,
 'τ': 19981,
 'ρ': 18724,
 'έ': 5606,
 'π': 15739,
 'ε': 24406,
 'α': 32773,
 'ι': 21681,
 'Α': 39459,
 'Υ': 15115,
 'Σ': 22847,
 'Τ': 20182,
 'Ρ': 18917,
 'Λ': 11627,
 'Ι': 27969,
 ' ': 36538,
 'ό': 4868,
 'σ': 16844,
 'ω': 4879,
 'Ε': 30302,
 'Ω': 7119,
 'Η': 11663,
 'Θ': 4766,
 '.': 1027,
 'η': 8345,
 'δ': 7577,
 'ά': 7147,
 'λ': 11459,
 'ο': 23891,
 '$': 996,
 'Ν': 18234,
 'Ο': 28908,
 '}': 1061,
 'Γ': 7202,
 'Μ': 13161,
 'υ': 11274,
 'ϋ': 401,
 'Φ': 4770,
 'χ': 4823,
 'ύ': 4053,
 'Π': 16175,
 'θ': 4607,
 'ζ': 2223,
 'γ': 7226,
 '9': 979,
 'Κ': 15181,
 '[': 1023,
 '8': 1063,
 'φ': 4885,
 'μ': 13155,
 'κ': 15288,
 'ή': 3450,
 '5': 1017,
 '/': 1074,
 'Β': 4051,
 '3': 1079,
 '{': 1073,
 'Δ': 7612,
 '^': 973,
 '(': 1086,
 '!': 1055,
 'ξ': 2515,
 "'": 1055,
 '7': 1040,
 'Ξ': 2562,
 'Χ': 4836,
 ')': 999,
 '_': 1033,
 'ΰ': 358,
 '=': 1085,
 'Ψ': 1191,
 ':': 998,
 '0': 1032,
 '%': 1046,
 'ς': 5829,
 'Ζ': 2159,
 '|': 1075,
 '*': 1056,
 '4': 1052,
 'β': 3961,
 'ώ': 2623,
 ']': 1

In [33]:
valid_wordCount

{'Μ': 3224,
 'Ε': 7382,
 'Ο': 7207,
 'Ν': 4508,
 'Ω': 1791,
 'δ': 1907,
 'ι': 5596,
 ')': 268,
 'Κ': 3873,
 'Τ': 5025,
 'Ι': 6888,
 'Α': 9943,
 'Σ': 5633,
 'Φ': 1185,
 'Β': 1017,
 'Υ': 3795,
 '7': 248,
 'κ': 3901,
 'ο': 5885,
 'υ': 2836,
 'ρ': 4607,
 '^': 232,
 'ν': 4518,
 'ε': 6177,
 'τ': 5106,
 'α': 8093,
 'ά': 1762,
 'χ': 1237,
 ' ': 9068,
 'Γ': 1703,
 '4': 249,
 '(': 262,
 'φ': 1176,
 'π': 3961,
 'ή': 886,
 'σ': 4187,
 'ς': 1403,
 'Χ': 1201,
 'ό': 1224,
 'γ': 1722,
 '=': 263,
 'λ': 2870,
 'ώ': 679,
 'Λ': 2917,
 'Η': 2851,
 'Ρ': 4711,
 '$': 265,
 '5': 242,
 'Π': 4021,
 '!': 251,
 '\\': 226,
 'θ': 1223,
 'Δ': 1883,
 'Ξ': 661,
 ',': 276,
 'ύ': 1042,
 'ω': 1196,
 'ί': 1470,
 'ζ': 534,
 'μ': 3222,
 '|': 242,
 '/': 246,
 'έ': 1357,
 '9': 272,
 '2': 276,
 'η': 2171,
 '"': 260,
 '#': 238,
 'β': 989,
 '*': 266,
 '0': 262,
 '8': 242,
 'Ϊ': 133,
 'ξ': 682,
 'Ψ': 306,
 '-': 285,
 'Θ': 1143,
 '@': 256,
 '[': 240,
 'ϊ': 146,
 '1': 257,
 '?': 272,
 'ψ': 321,
 'Ώ': 81,
 'Ή': 87,
 ']': 262,
 'ϱ': 7

# Check Image Dataset

In [None]:
train_whole_imageset = pd.read_csv('train.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)
train_whole_imageset.columns = ['path', 'text']
check_dataset_char_in_dict(train_whole_imageset['text'])[0]

In [None]:
valid_whole_imageset = pd.read_csv('validation.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)
valid_whole_imageset.columns = ['path', 'text']
check_dataset_char_in_dict(valid_whole_imageset['text'])[0]

In [None]:
check_dataset_char_in_dict(train_whole_imageset['text'])[1]

In [None]:
check_dataset_char_in_dict(valid_whole_imageset['text'])[1]