### 1. Load and Clean Data

In [15]:
from translate.storage.tmx import tmxfile
import unicodedata
import re
import string
from pickle import dump
from numpy import array

In [2]:
def read(filepath, source_lang='en', target_lang='ne'):
    """Read a TMX translation-memory file into a list of sentence pairs.

    Args:
        filepath: Path of the ``.tmx`` file; it is opened in binary mode.
        source_lang: Source language code passed to ``tmxfile``.
        target_lang: Target language code passed to ``tmxfile``. Defaults to
            ``'ne'`` because the corpora loaded in this notebook are
            English-Nepali (e.g. ``en-ne.tmx``); the previous hard-coded
            ``'ar'`` did not describe this data.

    Returns:
        A list of ``[source, target]`` lists, one per unit, in the order
        yielded by ``tmx_file.unit_iter()``.
    """
    with open(filepath, 'rb') as fin:
        tmx_file = tmxfile(fin, source_lang, target_lang)

    # NOTE(review): as before, units are iterated after the file is closed;
    # this relies on tmxfile having consumed the stream in its constructor.
    pairs = [[node.source, node.target] for node in tmx_file.unit_iter()]
    print("Number of pairs = {}".format(len(pairs)))
    return pairs

In [3]:
def save(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Any picklable object (the notebook passes the cleaned
            sentence pairs).
        filename: Destination path; an existing file is overwritten.
    """
    # The context manager guarantees the handle is flushed and closed, even
    # if pickling fails part-way through.
    with open(filename, 'wb') as fout:
        dump(sentences, fout)
    print('Saved: %s' % filename)

In [4]:
def unicode_to_ascii(s):
    """Strip combining marks from ``s``.

    The string is decomposed with NFD normalisation and every character in
    Unicode category ``Mn`` (non-spacing mark) is dropped, so accented
    letters lose their accents. Other non-ASCII characters are kept as-is.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(w):
    """Normalise an English string to lowercase letters and basic punctuation.

    Steps:
      1. Lowercase, trim, and strip combining marks (``unicode_to_ascii``).
      2. Put a space on both sides of each of ``? . ! , ¿``
         (e.g. "he is a boy." => "he is a boy .").
      3. Replace every run of characters other than ``a-z``, ``A-Z`` and
         ``? . ! , ¿`` with a single space.
      4. Trim surrounding whitespace.

    Args:
        w: The text to clean.

    Returns:
        The cleaned string; it may be empty.
    """
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)

    # Replacing everything except (a-z, A-Z, ".", "?", "!", ",", "¿") with a
    # single space. Spaces, quotes and digits are all outside the allowed
    # set, so this one pass also collapses repeated spaces and removes
    # numbers; the separate substitutions that used to do that were no-ops.
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

    return w.strip()

def nepali_preprocess_sentence(w):
    """Delete Devanagari digits from ``w`` and trim surrounding whitespace.

    Only the ten digits ०-९ are removed; every other character, including
    ASCII digits, is left in place.
    """
    without_digits = re.sub(r"[०१२३४५६७८९]", "", w)
    return without_digits.strip()

def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: An iterable of strings.

    Returns:
        The token count of the longest line, or 0 when ``lines`` is empty
        (a bare ``max()`` would raise ``ValueError`` in that case).
    """
    return max((len(line.split()) for line in lines), default=0)

In [5]:
def clean(dataset):
    """Clean every (English, Nepali) pair token by token.

    Each side is split on single spaces. ASCII punctuation
    (``string.punctuation``) is removed from every token, then the token
    goes through ``preprocess_sentence`` (English, element 0) or
    ``nepali_preprocess_sentence`` (Nepali, element 1). Tokens that end up
    empty are dropped and the survivors are re-joined with single spaces.

    Args:
        dataset: An iterable of 2-item sequences ``(english, nepali)``.

    Returns:
        A list of ``[english, nepali]`` lists, one per input pair, in input
        order. Either string may be empty.
    """
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)

    def _clean_side(text, preprocess):
        # Run the preprocessor once per token (it used to be called twice:
        # once for the emptiness check and once for the value).
        tokens = (preprocess(word.translate(table)) for word in text.split(' '))
        return ' '.join(token for token in tokens if token)

    return [
        [
            _clean_side(line[0], preprocess_sentence),
            _clean_side(line[1], nepali_preprocess_sentence),
        ]
        for line in dataset
    ]

In [6]:
def load_dataset(filenames=None, max_len=30):
    """Read, clean and length-filter the parallel corpus.

    Args:
        filenames: Iterable of TMX file paths to read. When ``None`` the
            four corpus files used by this notebook are loaded.
        max_len: Largest number of whitespace-separated tokens allowed on
            either side of a pair; longer pairs are discarded.

    Returns:
        A list of cleaned ``[english, nepali]`` pairs in which both sides
        are non-empty and neither side exceeds ``max_len`` tokens.
    """
    if filenames is None:
        filenames = [
            "corpus/project_save.tmx",
            "corpus/NP2.tmx",
            "corpus/NP8.tmx",
            "opus_corpus/compiled/tico19v2020/en-ne.tmx",
        ]

    pairs = []
    for filename in filenames:
        pairs.extend(read(filename))

    cleaned_pairs = clean(pairs)

    # discard pairs with an empty side or with more than max_len tokens
    return [
        pair for pair in cleaned_pairs
        if pair[0].strip() and pair[1].strip() and max_length(pair) <= max_len
    ]

In [7]:
# Build the cleaned, length-filtered sentence pairs and report how many remain.
dataset = load_dataset()
print("No. of data: {}".format(len(dataset)))

Number of pairs = 1060
Number of pairs = 50
Number of pairs = 1054
Number of pairs = 3070
No. of data: 3981


In [8]:
# Preview the first 20 cleaned pairs as "[english] => [nepali]".
# Bounded by len(dataset) so a smaller corpus does not raise IndexError.
for i in range(min(20, len(dataset))):
    print("[{}] => [{}]".format(dataset[i][0], dataset[i][1]))

[although agriculture sector contributes the most to gross national production the development in irrigation has been less than expected] => [देशको कुल राष्ट्रिय उत्पादनमा कृषि क्षेत्रको योगदान सर्वाधिक रहे तापनि सित्र्चाइ क्षेत्रमा अपेक्षित विकास भएको छैन।]
[the agriculture sector has still to rely largely on rainfall] => [सित्र्चाइको लागि अझै पनि कृषिक्षेत्र वर्षा माथि निर्भर रहनुपरेको छ।]
[while planning for irrigation development it is also necessary to maintain coordination between irrigation and agricultural production programmes towards achieving maximum benefit] => [सित्र्चाइ विकासका योजना तर्जुमा गर्दा अधिकतम लाभ प्राप्त गर्नें दृष्टिकोणबाट सित्र्चाइ तथा कृषि उत्पादनका कार्यक्रमहरु बीच सामञ्जस्यता कायम गर्नु पनि जरुरी छ।]
[progress during the seventh plan department of irrigation and agriculture development bank have together developed irrigation facilities in ha of land during the seventh plan period] => [सातौं योजनाको प्रगति सातौं योजनावधिमा सित्र्चाइ विभाग एवं कृषि विकास बै

In [16]:
# Convert the list of [english, nepali] pairs to a NumPy array and pickle it;
# the split step further down loads this file again.
save(array(dataset), 'english-nepali.pkl')

Saved: english-nepali.pkl


### 2. Split Dataset

In [18]:
from pickle import dump
from pickle import load

from numpy.random import rand
from numpy.random import seed
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        Whatever object was pickled into the file.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # this notebook (or another trusted source) produced.
    # The context manager closes the handle once loading is done.
    with open(filename, 'rb') as fin:
        return load(fin)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: Any picklable object (here, arrays of sentence pairs).
        filename: Destination path; an existing file is overwritten.
    """
    # The context manager guarantees the handle is flushed and closed, even
    # if pickling fails part-way through.
    with open(filename, 'wb') as fout:
        dump(sentences, fout)
    print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('english-nepali.pkl')

# Fixed seed so the shuffle, and therefore the train/test split written to
# disk, is identical on every run of this cell.
SPLIT_SEED = 42
# Number of shuffled pairs that go to the training set; the rest is the test set.
N_TRAIN = 3000

# reduce dataset size (3981 is every pair in the current cleaned corpus)
n_sentences = 3981
# Copy the slice: shuffle() works in place, and without the copy it would
# also reorder raw_dataset through the shared view.
dataset = raw_dataset[:n_sentences, :].copy()
# random shuffle of the rows (pairs)
seed(SPLIT_SEED)
shuffle(dataset)
# split into train/test
train, test = dataset[:N_TRAIN], dataset[N_TRAIN:]
# save
save_clean_data(dataset, 'english-nepali-both.pkl')
save_clean_data(train, 'english-nepali-train.pkl')
save_clean_data(test, 'english-nepali-test.pkl')

Saved: english-nepali-both.pkl
Saved: english-nepali-train.pkl
Saved: english-nepali-test.pkl
