# Generate Greek Dataset (More Data)

## Combination of Greek Words

In [153]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import csv
import random
from collections import Counter
from itertools import islice

import pandas as pd

def take(n, iterable):
    """Collect at most the first ``n`` elements of ``iterable`` into a new list."""
    head = islice(iterable, n)
    return [element for element in head]

In [84]:
# Load the raw Greek word list: one tab-separated column, no header row, quotes kept literal.
greek_corpus = pd.read_csv(
    'Greek_wordlist.txt',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)

In [47]:
# Load the character dictionary (one entry per row, no header row).
dictionary = pd.read_csv(
    'greek_eng_dict_v2.txt',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)
# The first 124 rows are kept as `char_dict`; the remaining rows become
# `symbol_dict`, re-indexed from zero so it can be sampled by position.
char_dict = dictionary.iloc[:124]
symbol_dict = dictionary.iloc[124:].reset_index(drop=True)

In [24]:
# Display the raw Greek word list as loaded.
greek_corpus

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143955,ΚΑΚΟΗΘΟΥΣ
143956,ΗΛΙΑΧΤΙΔΑ
143957,ΠΡΕΣΒΕΥΕΙΣ
143958,ΕΦΑΜΙΛΛΗ


In [198]:
# Display the full character dictionary.
dictionary

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
161,+
162,","
163,-
164,.


In [48]:
# Display the first 124 dictionary rows.
char_dict

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
119,V
120,W
121,X
122,Y


In [28]:
# Display the dictionary rows from position 124 onward (re-indexed from 0).
symbol_dict

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


## Data Cleansing for Greek Dataset

In [6]:
# Collect the positions of Greek corpus entries that cannot be written using only
# dictionary characters, so they can be dropped in the next step.
dict_list = list(dictionary[0])
# Set membership is O(1); scanning `dict_list` for every character was O(len(dictionary)).
allowed_chars = set(dict_list)

remove_list = []

for i, txt in enumerate(greek_corpus[0]):
    # Non-string entries (for example NaN) cannot be checked character by
    # character and are dropped. Testing the type explicitly replaces the former
    # bare `except:`, which also hid unrelated errors.
    # `any` stops at the first unknown character, so each index is recorded once.
    if not isinstance(txt, str) or any(ch not in allowed_chars for ch in txt):
        remove_list.append(i)

In [7]:
# Drop the rejected rows and renumber the remainder from zero in one chained step.
greek_corpus_avalible = (
    greek_corpus
    .drop(index=remove_list)
    .reset_index(drop=True)
)
greek_corpus_avalible

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143828,ΚΑΚΟΗΘΟΥΣ
143829,ΗΛΙΑΧΤΙΔΑ
143830,ΠΡΕΣΒΕΥΕΙΣ
143831,ΕΦΑΜΙΛΛΗ


In [8]:
# Persist the cleaned Greek words, one per line.
# UTF-8 is requested explicitly: without it `open` uses the platform default
# encoding, which may be unable to encode Greek letters and would not match
# the UTF-8 default that `pd.read_csv` uses when the file is read back.
with open('Greek_corpus_v2.txt', 'w', encoding='utf-8') as z:

    for word in greek_corpus_avalible[0]:
        z.write(word)
        z.write('\n')

## Data Cleansing for English Dataset

In [9]:
# Load the raw English word list: one tab-separated column, no header row, quotes kept literal.
eng_corpus = pd.read_csv(
    'eng.wordlist',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)

In [10]:
# Display the raw English word list as loaded.
eng_corpus

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
439145,Cycling:
439146,variation).
439147,people.'
439148,Minhtu


In [11]:
# Collect the positions of English corpus entries that cannot be written using only
# dictionary characters, so they can be dropped in the next step.
dict_list = list(dictionary[0])
# Set membership is O(1); scanning `dict_list` for every character was O(len(dictionary)).
allowed_chars = set(dict_list)

remove_list = []

for i, txt in enumerate(eng_corpus[0]):
    # Non-string entries (for example NaN) cannot be checked character by
    # character and are dropped. Testing the type explicitly replaces the former
    # bare `except:`, which also hid unrelated errors.
    # `any` stops at the first unknown character, so each index is recorded once.
    if not isinstance(txt, str) or any(ch not in allowed_chars for ch in txt):
        remove_list.append(i)

In [12]:
# Drop the rejected rows and renumber the remainder from zero in one chained step.
eng_corpus_avalible = (
    eng_corpus
    .drop(index=remove_list)
    .reset_index(drop=True)
)
eng_corpus_avalible

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
399651,Cycling:
399652,variation).
399653,people.'
399654,Minhtu


In [13]:
# Persist the cleaned English words, one per line.
# UTF-8 is requested explicitly so the file does not depend on the platform
# default encoding and matches the UTF-8 default of `pd.read_csv`.
with open('English_corpus.txt', 'w', encoding='utf-8') as z:

    for word in eng_corpus_avalible[0]:
        z.write(word)
        z.write('\n')

# Generate Combined Text

In [180]:
def _random_entry(frame):
    """Return the column-``0`` value of one uniformly chosen row of ``frame``."""
    return frame.iloc[random.randint(0, len(frame) - 1)][0]


def _random_token():
    """Draw one word (Greek with probability 0.7, otherwise English), optionally with a symbol."""
    # Whether a symbol is attached to this token (probability 0.5).
    with_symbol = random.randint(0, 1)

    # Greek is chosen with probability 0.7, English with probability 0.3.
    if random.random() > 0.3:
        token = _random_entry(greek_corpus_avalible)
    else:
        token = _random_entry(eng_corpus_avalible)

    if with_symbol:
        symbol = _random_entry(symbol_dict)
        # The symbol goes to the front or to the end of the token. Previously the
        # "end" case was missing, so the drawn symbol was silently discarded.
        if random.randint(0, 1):
            token = symbol + token
        else:
            token = token + symbol

    return token


def _random_sample():
    """Build one sample line: 1-3 tokens, truncated, optionally padded with a dictionary char."""
    # Token count between 1 and 3.
    n_token = random.randint(1, 3)
    tokens = [_random_token() for _ in range(n_token)]

    # Each newly drawn token is placed in front of the ones drawn before it.
    text = ' '.join(reversed(tokens))

    # Keep a random-length prefix of at most 24 characters.
    text_len = random.randint(1, min(24, len(text)))
    sample = text[0:text_len]

    # With probability 0.5 add one dictionary character at the front or the end,
    # so that every dictionary character can show up in the dataset.
    if random.randint(0, 1):
        at_front = random.randint(0, 1)
        dict_char = _random_entry(dictionary)
        if at_front:
            sample = dict_char + sample
        else:
            sample = sample + dict_char

    return sample


def dataset_generation(n_sample, output_txt, print_only=False):
    """Generate ``n_sample`` random text lines mixing Greek words, English words and symbols.

    Each line is built from 1-3 random tokens taken from the module-level
    ``greek_corpus_avalible`` / ``eng_corpus_avalible`` frames, each optionally
    decorated with an entry of ``symbol_dict``. The tokens are joined with single
    spaces, cut to a random prefix of at most 24 characters and, half of the time,
    extended by one random entry of ``dictionary``.

    Parameters
    ----------
    n_sample : int
        Number of lines to generate.
    output_txt : str
        Path of the text file to (over)write, one sample per line, UTF-8 encoded.
        Not touched when ``print_only`` is true.
    print_only : bool, default False
        If true, print the samples instead of writing them to ``output_txt``.

    Returns
    -------
    None
    """
    samples = (_random_sample() for _ in range(n_sample))

    if print_only:
        # Do not open the output file here: opening it in 'w' mode would
        # truncate an existing corpus even though nothing is written.
        for line in samples:
            print(line)
        return

    with open(output_txt, 'w', encoding='utf-8') as z:
        for line in samples:
            z.write(line)
            z.write('\n')
                
def check_dataset_char_in_dict(corpus):
    """Count the characters of ``corpus`` and report those missing from the dictionary.

    Parameters
    ----------
    corpus : iterable
        Text entries; each one is converted with ``str`` before counting.

    Returns
    -------
    tuple(set, dict)
        ``(unknown_chars, wordCount)``. ``unknown_chars`` holds the characters
        that occur in ``corpus`` but are neither in column ``0`` of the
        module-level ``dictionary`` nor a space. ``wordCount`` maps each
        character to its number of occurrences, in order of first appearance.
    """
    corpus_string = ''.join(str(x) for x in corpus)

    # Counter accepts the string directly; no intermediate list of letters is needed.
    wordCount = dict(Counter(corpus_string))

    # Plain set union instead of `Series.append`, which was removed in pandas 2.0.
    allowed_chars = set(dictionary[0]) | {' '}

    return set(wordCount) - allowed_chars, wordCount

## Training Dataset

In [143]:
# Write 80,000 generated lines to the training corpus file.
dataset_generation(
    n_sample=80000,
    output_txt='train_corpus_v2.txt',
    print_only=False,
)

In [181]:
# Read the generated training corpus back (first column only) and check its
# characters against the dictionary.
train_whole_corpus = pd.read_csv(
    'train_corpus_v2.txt',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)[0]
(train_check_char_in_dict,
 train_wordCount) = check_dataset_char_in_dict(train_whole_corpus)

In [182]:
# Characters in the training corpus that are missing from the dictionary (empty set = none).
train_check_char_in_dict

set()

In [183]:
# Peek at the first five (character, count) pairs of the training corpus.
take(5, train_wordCount.items())

[('2', 867), ('W', 970), ('A', 2729), ('T', 2022), ('E', 2114)]

## Validation Set

In [159]:
# Write 20,000 generated lines to the validation corpus file.
dataset_generation(
    n_sample=20000,
    output_txt='validate_corpus_v2.txt',
    print_only=False,
)

In [184]:
# Read the generated validation corpus back (first column only) and check its
# characters against the dictionary.
valid_whole_corpus = pd.read_csv(
    'validate_corpus_v2.txt',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)[0]
(valid_check_char_in_dict,
 valid_wordCount) = check_dataset_char_in_dict(valid_whole_corpus)

In [185]:
# Characters in the validation corpus that are missing from the dictionary (empty set = none).
valid_check_char_in_dict

set()

In [186]:
# Peek at the first five (character, count) pairs of the validation corpus.
take(5, valid_wordCount.items())

[('α', 5693), ('ν', 3264), ('θ', 854), ('ρ', 3320), ('Ξ', 448)]

# Data Distribution

In [164]:
# Full per-character occurrence counts for the generated training corpus.
train_wordCount

{'2': 867,
 'W': 970,
 'A': 2729,
 'T': 2022,
 'E': 2114,
 'R': 2057,
 'C': 2629,
 'α': 23291,
 'κ': 10699,
 'λ': 8271,
 'σ': 12044,
 'ό': 3495,
 'ν': 12922,
 'ρ': 13140,
 'ο': 17226,
 'μ': 9244,
 'd': 4613,
 'e': 15388,
 'c': 5500,
 'i': 11544,
 'v': 1849,
 'έ': 4037,
 'ι': 15452,
 ' ': 37854,
 'Μ': 9364,
 'Ε': 21317,
 'Γ': 5154,
 'Α': 27829,
 'a': 12709,
 's': 9095,
 'u': 5021,
 'm': 4267,
 'β': 2868,
 'B': 1758,
 'Π': 11195,
 'Ρ': 13495,
 'Ν': 12668,
 'Ι': 19666,
 'p': 3952,
 'y': 2375,
 'j': 544,
 'J': 626,
 'ε': 17521,
 'δ': 5377,
 'o': 10534,
 'l': 7434,
 '.': 2971,
 'φ': 3337,
 '(': 1991,
 'Β': 2748,
 'χ': 3392,
 'ά': 4974,
 'π': 11245,
 'υ': 7948,
 'θ': 3457,
 'ύ': 2901,
 '>': 966,
 'τ': 14295,
 'ή': 2460,
 'ς': 4018,
 'Ο': 20254,
 'Κ': 10862,
 'Υ': 10617,
 'n': 9381,
 'r': 10652,
 ',': 2986,
 'Ω': 5146,
 'Τ': 14163,
 'ί': 4438,
 'b': 2514,
 'F': 1267,
 '~': 945,
 'η': 6085,
 'Ό': 256,
 ')': 1453,
 'Δ': 5381,
 'Σ': 15929,
 '3': 899,
 '_': 1016,
 '-': 2107,
 'ω': 3465,
 'Λ': 833

In [165]:
# Full per-character occurrence counts for the generated validation corpus.
valid_wordCount

{'α': 5693,
 'ν': 3264,
 'θ': 854,
 'ρ': 3320,
 'Ξ': 448,
 'σ': 3053,
 'υ': 2032,
 'γ': 1298,
 'κ': 2746,
 'ί': 1069,
 'ε': 4458,
 'ι': 3972,
 ' ': 9253,
 '_': 233,
 'G': 286,
 'a': 3110,
 'l': 1903,
 'i': 2877,
 'b': 659,
 'o': 2696,
 'n': 2403,
 'g': 767,
 'p': 972,
 't': 2304,
 'h': 990,
 'δ': 1317,
 'τ': 3572,
 'μ': 2334,
 'π': 2777,
 'Π': 2838,
 'Ρ': 3365,
 'Ο': 4971,
 'Ε': 5217,
 'Ι': 4902,
 'Λ': 2043,
 'Γ': 1252,
 'Μ': 2281,
 'Ν': 3127,
 'χ': 808,
 'ο': 4264,
 'C': 652,
 'H': 344,
 'E': 568,
 'Σ': 3874,
 'Κ': 2595,
 'Τ': 3549,
 'Ω': 1257,
 'ω': 859,
 '{': 245,
 'λ': 1975,
 'B': 423,
 'c': 1517,
 'm': 1123,
 'u': 1297,
 'y': 630,
 '.': 754,
 'η': 1552,
 'ό': 900,
 '?': 305,
 'Φ': 860,
 'Ζ': 377,
 'φ': 874,
 'έ': 978,
 ')': 385,
 '$': 221,
 'T': 507,
 'R': 498,
 'A': 660,
 '5': 232,
 'M': 476,
 's': 2297,
 'Α': 6976,
 'Δ': 1362,
 '!': 274,
 'Υ': 2608,
 'Η': 2048,
 'd': 1078,
 'ή': 637,
 'ς': 1091,
 '[': 286,
 'ύ': 736,
 '6': 230,
 '&': 252,
 'O': 388,
 ',': 722,
 '%': 232,
 'e': 3

# Check Image Dataset

In [188]:
# Load the training image label file and name its two columns, then list any
# label characters that are missing from the dictionary.
train_whole_imageset = pd.read_csv(
    'train.txt',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)
train_whole_imageset.columns = ['path', 'text']
check_dataset_char_in_dict(train_whole_imageset.loc[:, 'text'])[0]

set()

In [189]:
# Load the validation image label file and name its two columns, then list any
# label characters that are missing from the dictionary.
valid_whole_imageset = pd.read_csv(
    'validation.txt',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)
valid_whole_imageset.columns = ['path', 'text']
check_dataset_char_in_dict(valid_whole_imageset.loc[:, 'text'])[0]

set()

In [190]:
# Per-character occurrence counts for the training image labels (second element of the result).
check_dataset_char_in_dict(train_whole_imageset['text'])[1]

{'8': 875,
 '1': 860,
 'C': 2455,
 'o': 9527,
 'n': 8585,
 's': 8399,
 'u': 4625,
 'l': 6882,
 't': 8237,
 'a': 11672,
 '!': 1060,
 'S': 2929,
 'I': 1707,
 'P': 1894,
 '.': 2641,
 ' ': 27767,
 ':': 1402,
 'Α': 24081,
 'Υ': 9022,
 'Τ': 12221,
 'Ο': 16896,
 'Κ': 9312,
 'Σ': 13429,
 'Ρ': 11742,
 'Ε': 18633,
 'Φ': 3135,
 '|': 901,
 'Π': 10082,
 'Ι': 16840,
 'Β': 2477,
 'Λ': 7169,
 'm': 3833,
 'Θ': 2976,
 'Ω': 4421,
 'Ν': 10632,
 'τ': 12708,
 'η': 5331,
 'ρ': 12007,
 'ο': 15155,
 'υ': 7065,
 'μ': 8023,
 'έ': 3614,
 'ν': 11302,
 'ω': 3098,
 'A': 2582,
 'Ί': 252,
 'e': 14002,
 'r': 9771,
 'c': 5157,
 'h': 3560,
 "'": 1578,
 'Δ': 4860,
 'f': 1868,
 'i': 10727,
 ']': 953,
 'π': 10339,
 'κ': 9688,
 'ι': 13856,
 '_': 921,
 'T': 1874,
 'p': 3701,
 '3': 934,
 'δ': 4827,
 'λ': 7478,
 'α': 20738,
 'ή': 2207,
 'ς': 3281,
 'Μ': 7746,
 'H': 1254,
 'ΰ': 186,
 '0': 760,
 'd': 4127,
 'B': 1722,
 'J': 620,
 '9': 940,
 'Χ': 3054,
 'Y': 514,
 'χ': 3094,
 'σ': 10839,
 '&': 852,
 ',': 2687,
 'D': 1559,
 'F': 11

In [191]:
# Per-character occurrence counts for the validation image labels (second element of the result).
check_dataset_char_in_dict(valid_whole_imageset['text'])[1]

{'ϰ': 63,
 'Λ': 1818,
 'Υ': 2267,
 'Κ': 2334,
 'Α': 6031,
 'ο': 3778,
 '/': 293,
 'ά': 1165,
 'γ': 1190,
 'ν': 2825,
 'η': 1359,
 ' ': 6911,
 '"': 330,
 'Ν': 2693,
 'Ε': 4744,
 'Β': 625,
 'Σ': 3329,
 'Ι': 4204,
 '`': 185,
 'υ': 1839,
 'π': 2705,
 'ε': 4108,
 'ρ': 3205,
 'β': 587,
 'α': 5195,
 'τ': 3247,
 'ι': 3520,
 'κ': 2455,
 'ή': 557,
 'f': 490,
 'r': 2472,
 'a': 2884,
 'P': 443,
 '&': 244,
 '-': 482,
 'Ω': 1079,
 'F': 280,
 'l': 1629,
 'n': 2120,
 'g': 837,
 'ύ': 620,
 'C': 556,
 'p': 899,
 'h': 938,
 'o': 2422,
 'e': 3474,
 'B': 371,
 'Π': 2523,
 'Ρ': 2866,
 'Ο': 4250,
 'Τ': 3015,
 'Θ': 702,
 'Μ': 2062,
 'Ζ': 299,
 'Η': 1743,
 'N': 362,
 'c': 1340,
 't': 2043,
 'i': 2707,
 'u': 1209,
 'y': 464,
 '.': 652,
 'A': 552,
 'E': 452,
 'ϋ': 70,
 'S': 760,
 '^': 233,
 '(': 459,
 'K': 203,
 '\\': 245,
 '0': 219,
 'b': 575,
 'k': 368,
 '{': 182,
 'λ': 1892,
 'μ': 2123,
 'Χ': 688,
 'Δ': 1176,
 'σ': 2839,
 'χ': 815,
 'Φ': 816,
 'x': 178,
 'm': 992,
 ':': 297,
 '*': 225,
 'έ': 907,
 'v': 412,
 