# Generate Greek Training / Validation Dataset

## Combination of Greek Words

In [153]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import csv
from collections import Counter

from itertools import islice

def take(n, iterable):
    """Return the first n items of the iterable as a list."""
    return list(islice(iterable, n))

In [84]:
greek_corpus = pd.read_csv('Greek_wordlist.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)

In [47]:
dictionary = pd.read_csv('greek_eng_dict_v2.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)
char_dict = dictionary.iloc[:124]
symbol_dict = dictionary.iloc[124:]
symbol_dict = symbol_dict.reset_index(drop=True)

In [24]:
greek_corpus

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143955,ΚΑΚΟΗΘΟΥΣ
143956,ΗΛΙΑΧΤΙΔΑ
143957,ΠΡΕΣΒΕΥΕΙΣ
143958,ΕΦΑΜΙΛΛΗ


In [198]:
dictionary

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
161,+
162,","
163,-
164,.


In [48]:
char_dict

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
119,V
120,W
121,X
122,Y


In [28]:
symbol_dict

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


## Data Cleansing for Greek Dataset

In [6]:
dict_list = list(dictionary[0])

remove_list = []

for i, txt in enumerate(greek_corpus[0]):
    try:
        for j in txt:
            if j in dict_list:
                pass
            else:
                remove_list.append(i)
                continue
    except:
        remove_list.append(i)

In [7]:
greek_corpus_avalible = greek_corpus.drop(index=remove_list)
greek_corpus_avalible = greek_corpus_avalible.reset_index(drop=True)
greek_corpus_avalible

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143828,ΚΑΚΟΗΘΟΥΣ
143829,ΗΛΙΑΧΤΙΔΑ
143830,ΠΡΕΣΒΕΥΕΙΣ
143831,ΕΦΑΜΙΛΛΗ


In [8]:
with open('Greek_corpus_v2.txt', 'w') as z:

    for i in greek_corpus_avalible[0]:
        z.write(i)
        z.write('\n')

## Data Cleansing for English Dataset

In [9]:
eng_corpus = pd.read_csv('eng.wordlist', sep="\t", header=None, quoting=csv.QUOTE_NONE)

In [10]:
eng_corpus

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
439145,Cycling:
439146,variation).
439147,people.'
439148,Minhtu


In [11]:
dict_list = list(dictionary[0])

remove_list = []

for i, txt in enumerate(eng_corpus[0]):
    try:
        for j in txt:
            if j in dict_list:
                pass
            else:
                remove_list.append(i)
                continue
    except:
        remove_list.append(i)

In [12]:
eng_corpus_avalible = eng_corpus.drop(index=remove_list)
eng_corpus_avalible = eng_corpus_avalible.reset_index(drop=True)
eng_corpus_avalible

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
399651,Cycling:
399652,variation).
399653,people.'
399654,Minhtu


In [13]:
with open('English_corpus.txt', 'w') as z:

    for i in eng_corpus_avalible[0]:
        z.write(i)
        z.write('\n')

# Generate Combined Text

In [214]:
def dataset_generation(n_sample, output_txt, print_only=False):
    
    with open(output_txt, 'w') as z:

        for i in range(n_sample):
            
            text = ''
            
            # token length between 1 - 3
            
            n_token = random.randint(1,3)
            
            for i in range(n_token):
                
                # whether symbol will add to token or not with 0.5 prob
                
                symbol = random.randint(0,1)
                
                # whether greek will be chosen or not with 0.7 prob
                
                lang_select = random.random()
                
                if lang_select > 0.3:
                    append_text = greek_corpus_avalible.iloc[random.randint(0,len(greek_corpus_avalible)-1)][0]
                else:
                    append_text = eng_corpus_avalible.iloc[random.randint(0,len(eng_corpus_avalible)-1)][0]
                
                if symbol:
                    append_symbol = symbol_dict.iloc[random.randint(0,len(symbol_dict)-1)][0]
                    
                    # the symbol will be added to the front or to the end of the token
                    
                    sym_loc_head = random.randint(0,1)
                    
                    if sym_loc_head:
                        append_text = append_symbol + append_text

                if i == 0:
                    text = append_text
                else:
                    text = append_text + ' ' + text
                
            # Pick the random length of token as subset
                
            text_len = random.randint(1, min(24,len(text)))
            string = text[0:text_len]
            
            if_add_dict_char = random.randint(0,1)
            
            # Add dictionay char to dataset in order to make all char inside
            
            if if_add_dict_char:
            
                add_dict_char_loc = random.randint(0,1)

                add_dict_char = dictionary.iloc[random.randint(0,len(dictionary)-1)][0]

                if add_dict_char_loc:
                    append_text = add_dict_char + string
                else:
                    append_text = string + add_dict_char
                    
            else:
                append_text = string
                            
            if not print_only:
                z.write(append_text)
                z.write('\n')
            else:
                print(append_text)
                
def check_dataset_char_in_dict(corpus):

    corpus_string = ''.join(str(x) for x in corpus)

    lst = []

    for letter in corpus_string:
        lst.append(letter)

    wordCount = dict(Counter(lst))

    return set(list(wordCount.keys())) - set(dictionary[0].append(pd.Series([' ']))), wordCount

def sort_dict(input_dict):
    return {k: v for k, v in sorted(input_dict.items(), key=lambda item: item[1])}

## Training Dataset

In [143]:
dataset_generation(80000, 'train_corpus_v2.txt', print_only=False)

In [181]:
train_whole_corpus = pd.read_csv('train_corpus_v2.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
train_check_char_in_dict, train_wordCount = check_dataset_char_in_dict(train_whole_corpus)

In [182]:
train_check_char_in_dict

set()

In [183]:
take(5, train_wordCount.items())

[('2', 867), ('W', 970), ('A', 2729), ('T', 2022), ('E', 2114)]

## Validation Set

In [159]:
dataset_generation(20000, 'validate_corpus_v2.txt', print_only=False)

In [184]:
valid_whole_corpus = pd.read_csv('validate_corpus_v2.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)[0]
valid_check_char_in_dict, valid_wordCount = check_dataset_char_in_dict(valid_whole_corpus)

In [185]:
valid_check_char_in_dict

set()

In [186]:
take(5, valid_wordCount.items())

[('α', 5693), ('ν', 3264), ('θ', 854), ('ρ', 3320), ('Ξ', 448)]

# Data Distribution

In [215]:
sort_dict(train_wordCount)

{'Ώ': 211,
 'ΰ': 211,
 'ϱ': 223,
 'Ά': 235,
 'Ί': 235,
 'ϑ': 236,
 'Ύ': 239,
 'Έ': 240,
 'ϰ': 248,
 'ϐ': 254,
 'Ό': 256,
 'ϋ': 257,
 'Ή': 262,
 'Ϋ': 267,
 'Ϊ': 340,
 'Z': 368,
 'X': 369,
 'ϊ': 373,
 'Q': 398,
 'q': 491,
 'Y': 536,
 'j': 544,
 'J': 626,
 'z': 731,
 'V': 766,
 'x': 831,
 'K': 840,
 'U': 863,
 'Ψ': 863,
 '2': 867,
 '}': 884,
 '6': 886,
 'ψ': 886,
 '=': 892,
 '0': 897,
 '3': 899,
 '^': 900,
 '&': 902,
 '@': 902,
 '%': 913,
 '4': 915,
 '7': 921,
 '1': 927,
 '5': 931,
 '`': 933,
 '|': 942,
 '~': 945,
 '8': 945,
 '$': 946,
 '*': 948,
 '<': 956,
 '\\': 961,
 '{': 963,
 '>': 966,
 'W': 970,
 '#': 971,
 '+': 986,
 '9': 987,
 '_': 1016,
 ']': 1038,
 '[': 1045,
 '?': 1086,
 ';': 1113,
 '!': 1149,
 '/': 1214,
 'F': 1267,
 'G': 1293,
 'H': 1352,
 '"': 1386,
 ')': 1453,
 'O': 1476,
 ':': 1493,
 'Ζ': 1514,
 'w': 1537,
 "'": 1567,
 'N': 1583,
 'L': 1598,
 'ζ': 1671,
 'D': 1712,
 'k': 1734,
 'B': 1758,
 'I': 1789,
 'ώ': 1839,
 'Ξ': 1847,
 'v': 1849,
 'ξ': 1855,
 'P': 1987,
 '(': 1991,
 

In [216]:
sort_dict(valid_wordCount)

{'Έ': 45,
 'Ί': 50,
 'ϑ': 51,
 'Ό': 51,
 'Ύ': 54,
 'ϐ': 56,
 'Ά': 56,
 'ϱ': 59,
 'Ή': 59,
 'Ώ': 60,
 'Ϋ': 61,
 'ϰ': 62,
 'ϋ': 66,
 'ΰ': 67,
 'Q': 82,
 'ϊ': 83,
 'Ϊ': 86,
 'X': 100,
 'Z': 109,
 'Y': 120,
 'q': 121,
 'j': 140,
 'V': 171,
 'J': 175,
 'z': 175,
 '`': 193,
 'x': 200,
 'U': 201,
 '~': 211,
 'ψ': 214,
 'K': 215,
 'Ψ': 216,
 '7': 218,
 '3': 219,
 '$': 221,
 '2': 225,
 '0': 227,
 '9': 228,
 '#': 229,
 '6': 230,
 '1': 231,
 '5': 232,
 '%': 232,
 '_': 233,
 '|': 235,
 '=': 236,
 '}': 240,
 '8': 240,
 '*': 241,
 '^': 243,
 '{': 245,
 'W': 246,
 '>': 246,
 '\\': 247,
 '&': 252,
 '@': 256,
 '<': 261,
 '+': 262,
 ']': 272,
 '!': 274,
 '4': 275,
 'G': 286,
 '[': 286,
 '/': 289,
 ';': 294,
 '?': 305,
 'F': 340,
 '"': 342,
 'H': 344,
 'L': 351,
 'N': 360,
 ':': 364,
 'w': 372,
 'Ζ': 377,
 ')': 385,
 'O': 388,
 'v': 394,
 "'": 399,
 'I': 402,
 'ζ': 402,
 'D': 411,
 'B': 423,
 'k': 429,
 'Ξ': 448,
 '(': 470,
 'P': 474,
 'M': 476,
 'ξ': 477,
 'R': 498,
 'ώ': 501,
 'T': 507,
 '-': 523,
 'f'

# Check Image Dataset

In [188]:
train_whole_imageset = pd.read_csv('train.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)
train_whole_imageset.columns = ['path', 'text']
check_dataset_char_in_dict(train_whole_imageset['text'])[0]

set()

In [189]:
valid_whole_imageset = pd.read_csv('validation.txt', sep="\t", header=None, quoting=csv.QUOTE_NONE)
valid_whole_imageset.columns = ['path', 'text']
check_dataset_char_in_dict(valid_whole_imageset['text'])[0]

set()

In [218]:
sort_dict(check_dataset_char_in_dict(train_whole_imageset['text'])[1])

{'ϑ': 179,
 'ΰ': 186,
 'Ώ': 204,
 'ϰ': 205,
 'Ύ': 235,
 'Ό': 237,
 'Ά': 246,
 'Έ': 248,
 'Ί': 252,
 'Ή': 255,
 'ϋ': 261,
 'Ϋ': 297,
 'Ϊ': 320,
 'Z': 335,
 'ϊ': 353,
 'X': 354,
 'Q': 389,
 'q': 424,
 'Y': 514,
 'j': 516,
 'J': 620,
 'z': 633,
 'V': 736,
 '0': 760,
 'Ψ': 760,
 'K': 782,
 'U': 782,
 '}': 784,
 'x': 791,
 '`': 803,
 '%': 814,
 'ψ': 816,
 '=': 822,
 '6': 822,
 '4': 829,
 '^': 831,
 '2': 837,
 '~': 843,
 '$': 850,
 '&': 852,
 '1': 860,
 '@': 866,
 '8': 875,
 '<': 885,
 '5': 890,
 '*': 891,
 '>': 894,
 '7': 898,
 '|': 901,
 'W': 916,
 '_': 921,
 '{': 928,
 '3': 934,
 '#': 934,
 '\\': 939,
 '9': 940,
 '+': 942,
 ']': 953,
 '[': 985,
 '?': 1030,
 '!': 1060,
 ';': 1065,
 'F': 1106,
 '/': 1125,
 'G': 1202,
 'H': 1254,
 'Ζ': 1311,
 ')': 1341,
 '"': 1373,
 'N': 1386,
 'O': 1395,
 ':': 1402,
 'w': 1422,
 'L': 1524,
 'ζ': 1542,
 'D': 1559,
 "'": 1578,
 'k': 1600,
 'ώ': 1602,
 'Ξ': 1661,
 'I': 1707,
 'B': 1722,
 'ξ': 1727,
 'v': 1748,
 'M': 1845,
 'f': 1868,
 'T': 1874,
 'P': 1894,
 '

In [220]:
sort_dict(check_dataset_char_in_dict(valid_whole_imageset['text'])[1])

{'ϑ': 51,
 'Ώ': 51,
 'Ό': 60,
 'ΰ': 60,
 'Ή': 61,
 'ϰ': 63,
 'Ά': 65,
 'ϋ': 70,
 'Ϋ': 70,
 'Ί': 74,
 'Έ': 77,
 'Ύ': 78,
 'Z': 91,
 'ϊ': 92,
 'Ϊ': 98,
 'Q': 99,
 'X': 104,
 'q': 110,
 'Y': 120,
 'j': 123,
 'z': 140,
 'V': 145,
 'J': 168,
 'x': 178,
 '{': 182,
 '`': 185,
 '}': 188,
 '3': 189,
 '_': 199,
 'K': 203,
 '4': 206,
 ']': 207,
 '2': 208,
 '7': 209,
 '9': 210,
 'U': 210,
 'W': 211,
 '<': 213,
 '8': 214,
 '#': 215,
 '|': 216,
 '$': 217,
 '%': 218,
 '0': 219,
 '6': 223,
 '5': 224,
 '*': 225,
 'ψ': 227,
 'Ψ': 232,
 '^': 233,
 '?': 235,
 '=': 237,
 '1': 237,
 '+': 239,
 '>': 243,
 '&': 244,
 '\\': 245,
 '[': 246,
 '@': 251,
 '~': 256,
 'G': 273,
 ';': 275,
 'F': 280,
 '/': 293,
 ':': 297,
 'Ζ': 299,
 '!': 305,
 'H': 326,
 '"': 330,
 "'": 341,
 ')': 353,
 'N': 362,
 'O': 365,
 'k': 368,
 'L': 370,
 'B': 371,
 'w': 373,
 'D': 384,
 'ξ': 393,
 'Ξ': 394,
 'ζ': 395,
 'I': 402,
 'v': 412,
 'M': 442,
 'ώ': 442,
 'P': 443,
 'E': 452,
 'T': 454,
 '(': 459,
 'y': 464,
 '-': 482,
 'f': 490,
 'R

# Combine Dataset With Same Tag

In [207]:
def merge_same_tag(input_txt, output_txt):
    dataset = pd.read_csv(input_txt, sep="\t", header=None, quoting=csv.QUOTE_NONE)
    dataset.columns = ['path', 'text']
    dataset_merge = pd.DataFrame(
        dataset
        .groupby(dataset['text'])['path']
        .apply(lambda x: '[%s]' % ', '.join('"' + x + '"'))
    )
    dataset_merge = dataset_merge.reset_index(drop=False)
    dataset_merge = dataset_merge[["path", "text"]]
    with open(output_txt, 'w') as z:
        for i in range(len(dataset_merge)):
            path, text = dataset_merge.iloc[i]['path'], dataset_merge.iloc[i]['text']
            append_text = path + '\t' + text
            z.write(append_text)
            z.write('\n')

In [209]:
merge_same_tag("train.txt", "train_v2.txt")