# Generate Greek Dataset with More Data

## Combination of Greek Words

In [136]:
import pandas as pd
import csv
from collections import Counter

from itertools import islice

def take(n, iterable):
    """Collect at most the first ``n`` elements of ``iterable`` into a list."""
    leading_items = islice(iterable, n)
    return [*leading_items]

In [84]:
# Load the Greek word list as a header-less, tab-separated table with
# quote handling disabled; the words end up in column 0.
greek_corpus = pd.read_csv(
    'Greek_wordlist.txt',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)

In [47]:
# Load the character dictionary as a header-less, tab-separated table with
# quote handling disabled; the entries end up in column 0.
dictionary = pd.read_csv(
    'greek_eng_dict_v2.txt',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)

# The first 124 rows form char_dict; every remaining row forms symbol_dict,
# which is re-indexed so that it starts at 0 again.
char_dict = dictionary.iloc[:124]
symbol_dict = dictionary.iloc[124:].reset_index(drop=True)

In [24]:
# Display the Greek word list loaded from 'Greek_wordlist.txt'.
greek_corpus

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143955,ΚΑΚΟΗΘΟΥΣ
143956,ΗΛΙΑΧΤΙΔΑ
143957,ΠΡΕΣΒΕΥΕΙΣ
143958,ΕΦΑΜΙΛΛΗ


In [25]:
# Display the full dictionary loaded from 'greek_eng_dict_v2.txt'.
dictionary

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
161,+
162,","
163,-
164,.


In [48]:
# Display char_dict, i.e. the first 124 rows of the dictionary.
char_dict

Unnamed: 0,0
0,Α
1,Β
2,Γ
3,Δ
4,Ε
...,...
119,V
120,W
121,X
122,Y


In [28]:
# Display symbol_dict, i.e. the dictionary rows from position 124 onwards.
symbol_dict

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


## Data Cleansing for Greek Dataset

In [6]:
dict_list = list(dictionary[0])
# A set gives constant-time membership tests instead of scanning the whole
# list once per character.
allowed_chars = set(dict_list)

# Row positions of Greek corpus entries that must be dropped.
remove_list = []

for i, txt in enumerate(greek_corpus[0]):
    # Entries that are not strings (for example NaN values produced by
    # pandas) cannot be checked character by character, so they are dropped.
    # A word is also dropped when it contains any character that is not in
    # the dictionary. Each row position is recorded at most once.
    if not isinstance(txt, str) or not allowed_chars.issuperset(txt):
        remove_list.append(i)

In [7]:
# Drop the rejected rows and renumber the remaining ones from 0, then
# display the cleaned Greek corpus.
greek_corpus_avalible = (
    greek_corpus
    .drop(index=remove_list)
    .reset_index(drop=True)
)
greek_corpus_avalible

Unnamed: 0,0
0,και
1,να
2,το
3,του
4,την
...,...
143828,ΚΑΚΟΗΘΟΥΣ
143829,ΗΛΙΑΧΤΙΔΑ
143830,ΠΡΕΣΒΕΥΕΙΣ
143831,ΕΦΑΜΙΛΛΗ


In [8]:
# Write the cleaned Greek corpus, one word per line. The encoding is set
# explicitly because open() otherwise uses a platform-dependent default that
# may be unable to encode Greek characters.
with open('Greek_corpus_v2.txt', 'w', encoding='utf-8') as z:

    for i in greek_corpus_avalible[0]:
        z.write(i)
        z.write('\n')

## Data Cleansing for English Dataset

In [9]:
# Load the English word list as a header-less, tab-separated table with
# quote handling disabled; the words end up in column 0.
eng_corpus = pd.read_csv(
    'eng.wordlist',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
)

In [10]:
# Display the English word list loaded from 'eng.wordlist'.
eng_corpus

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
439145,Cycling:
439146,variation).
439147,people.'
439148,Minhtu


In [11]:
dict_list = list(dictionary[0])
# A set gives constant-time membership tests instead of scanning the whole
# list once per character.
allowed_chars = set(dict_list)

# Row positions of English corpus entries that must be dropped.
remove_list = []

for i, txt in enumerate(eng_corpus[0]):
    # Entries that are not strings (for example NaN values produced by
    # pandas) cannot be checked character by character, so they are dropped.
    # A word is also dropped when it contains any character that is not in
    # the dictionary. Each row position is recorded at most once.
    if not isinstance(txt, str) or not allowed_chars.issuperset(txt):
        remove_list.append(i)

In [12]:
# Drop the rejected rows and renumber the remaining ones from 0, then
# display the cleaned English corpus.
eng_corpus_avalible = (
    eng_corpus
    .drop(index=remove_list)
    .reset_index(drop=True)
)
eng_corpus_avalible

Unnamed: 0,0
0,the
1,of
2,and
3,to
4,a
...,...
399651,Cycling:
399652,variation).
399653,people.'
399654,Minhtu


In [13]:
# Write the cleaned English corpus, one word per line. The encoding is set
# explicitly because open() otherwise uses a platform-dependent default; the
# allowed character set comes from a dictionary that also holds Greek letters.
with open('English_corpus.txt', 'w', encoding='utf-8') as z:

    for i in eng_corpus_avalible[0]:
        z.write(i)
        z.write('\n')

# Generate Combined Text

In [142]:
import random  # required by the generator; imported here so the cell runs on a fresh kernel


def _random_entry(frame):
    """Return the first-column value of a uniformly chosen row of ``frame``."""
    return frame.iloc[random.randint(0, len(frame) - 1), 0]


def _generate_sample():
    """Build one random sample string from the corpora and the dictionaries."""
    tokens = []

    # A sample is made of 1 to 3 tokens.
    for _ in range(random.randint(1, 3)):

        # With probability 0.5 a symbol is attached to the token.
        with_symbol = random.randint(0, 1)

        # A Greek word is chosen with probability 0.7, an English one otherwise.
        if random.random() > 0.3:
            token = _random_entry(greek_corpus_avalible)
        else:
            token = _random_entry(eng_corpus_avalible)

        if with_symbol:
            symbol = _random_entry(symbol_dict)

            # The symbol is added to the front or to the end of the token.
            if random.randint(0, 1):
                token = symbol + token
            else:
                token = token + symbol

        tokens.append(token)

    # Every new token is placed in front of the ones generated before it.
    text = ' '.join(reversed(tokens))

    # Keep a random-length prefix of at most 24 characters.
    string = text[:random.randint(1, min(24, len(text)))]

    # With probability 0.5, add one random dictionary character to the front
    # or to the end, so that every dictionary character can occur in the data.
    if random.randint(0, 1):
        at_front = random.randint(0, 1)
        dict_char = _random_entry(dictionary)
        return dict_char + string if at_front else string + dict_char

    return string


def dataset_generation(n_sample, output_txt, print_only=False):
    """Generate random text samples mixing Greek words, English words and symbols.

    Words are drawn from ``greek_corpus_avalible`` and ``eng_corpus_avalible``,
    symbols from ``symbol_dict`` and the extra character from ``dictionary``;
    all four are expected to exist as notebook globals.

    Args:
        n_sample: Number of samples to generate.
        output_txt: Path of the text file that receives one sample per line.
            It is only opened when ``print_only`` is false.
        print_only: When true, print the samples instead of writing the file.
    """
    samples = (_generate_sample() for _ in range(n_sample))

    if print_only:
        for sample in samples:
            print(sample)
        return

    # UTF-8 is requested explicitly because the samples contain Greek text and
    # the default encoding of open() depends on the platform.
    with open(output_txt, 'w', encoding='utf-8') as z:
        for sample in samples:
            z.write(sample)
            z.write('\n')

## Training Dataset

In [143]:
# Write 80000 generated samples to the training corpus file.
dataset_generation(
    n_sample=80000,
    output_txt='train_corpus_v2.txt',
    print_only=False,
)

In [144]:
# Read the generated training corpus back, one sample per row. dtype=str and
# na_filter=False keep every line as literal text; otherwise pandas would
# turn lines such as "NA" or "null" into NaN, and they would be counted as
# the characters of "nan".
whole_corpus = pd.read_csv(
    'train_corpus_v2.txt',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
    dtype=str,
    na_filter=False,
)[0]
corpus_string = ''.join(str(x) for x in whole_corpus)

# Every character of the corpus, in order of appearance.
lst = list(corpus_string)

# Number of occurrences of each character.
wordCount = dict(Counter(lst))

# Characters that occur in the corpus but are missing from the dictionary.
set(wordCount) - set(dictionary[0])

{' '}

In [145]:
# Peek at the first five (character, count) pairs of the training corpus.
take(5, wordCount.items())

[('2', 867), ('W', 970), ('A', 2729), ('T', 2022), ('E', 2114)]

## Validation Set

In [146]:
# Write 20000 generated samples to the validation corpus file.
dataset_generation(
    n_sample=20000,
    output_txt='validate_corpus_v2.txt',
    print_only=False,
)

In [147]:
# Read the generated validation corpus back, one sample per row. dtype=str
# and na_filter=False keep every line as literal text; otherwise pandas would
# turn lines such as "NA" or "null" into NaN, and they would be counted as
# the characters of "nan".
whole_corpus = pd.read_csv(
    'validate_corpus_v2.txt',
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
    dtype=str,
    na_filter=False,
)[0]
corpus_string = ''.join(str(x) for x in whole_corpus)

# Every character of the corpus, in order of appearance.
lst = list(corpus_string)

# Number of occurrences of each character.
wordCount = dict(Counter(lst))

# Characters that occur in the corpus but are missing from the dictionary.
set(wordCount) - set(dictionary[0])

{' '}

In [148]:
# Peek at the first five (character, count) pairs of the validation corpus.
take(5, wordCount.items())

[('r', 2685), ('o', 2682), ('e', 3880), ('l', 1836), (' ', 9282)]