<a href="https://colab.research.google.com/github/imnotamr/Applied-Deep-Learning/blob/main/Applied_Deep_Learning_8_Machine_Language_Translation_Using_RNN_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string                          # To process strings ---> uppercase, lowercase and punctuation
import re                              # For Regular Expressions
from pickle import dump                # To serialized, store and save data
from unicodedata import normalize      # For Unicode text processing ( for strange symbols )
import numpy as np

In [2]:
# Load document to memory
def load_document(filename):
  """Read a whole UTF-8 text file into memory.

  Args:
    filename: Path of the text file to read.

  Returns:
    The complete file content as a single string.

  Raises:
    OSError: If the file cannot be opened or read.
    UnicodeDecodeError: If the content is not valid UTF-8.
  """
  # The context manager closes the handle even when read() raises,
  # which the previous open()/read()/close() sequence did not guarantee.
  with open(filename, mode='rt', encoding='utf-8') as file:
    return file.read()



# Split loaded document into sentences
def to_pairs(doc):
  """Split a loaded document into per-line lists of tab-separated fields.

  Leading and trailing whitespace of the whole document is stripped first,
  then every line is split on tab characters.

  Args:
    doc: The full document text, one record per line.

  Returns:
    A list with one entry per line; each entry is the list of fields
    obtained by splitting that line on tabs.
  """
  records = []
  for row in doc.strip().split('\n'):
    records.append(row.split('\t'))
  return records



# Clean list of lines
def clean_pairs(lines):
  """Normalize every sentence in a list of sentence groups.

  Each sentence is reduced to plain ASCII, split on whitespace, lowercased,
  stripped of punctuation and non-printable characters, and filtered so that
  only purely alphabetic words remain. The surviving words are re-joined
  with single spaces.

  Args:
    lines: Iterable of groups (e.g. [english, french]); each group is an
      iterable of raw sentence strings.

  Returns:
    A list of lists of cleaned sentence strings, in the same order and with
    the same grouping as the input.
  """
  # Matches any character that is not in string.printable
  non_printable = re.compile('[^%s]' % re.escape(string.printable))
  # Translation table that deletes every punctuation character
  drop_punctuation = str.maketrans('', '', string.punctuation)

  def _clean_sentence(sentence):
    # Decompose accented characters, then drop whatever is not ASCII
    ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
    kept = []
    for token in ascii_text.split():
      token = token.lower().translate(drop_punctuation)
      token = non_printable.sub('', token)
      # Discard empty tokens and tokens containing digits or other non-letters
      if token.isalpha():
        kept.append(token)
    return ' '.join(kept)

  return [[_clean_sentence(sentence) for sentence in group] for group in lines]



 # Save the list of clean sentences to file
def save_clean_data(sentences, filename):
  """Pickle the cleaned sentences to disk and report the file name.

  Args:
    sentences: The object to serialize (here, the list of cleaned pairs).
    filename: Destination path; the file is created or overwritten.

  Raises:
    OSError: If the file cannot be opened for writing.
  """
  # 'wb' because pickle writes binary data. The context manager makes sure
  # the handle is flushed and closed before anyone tries to read the file back.
  with open(filename, 'wb') as file:
    dump(sentences, file)
  print('Saved: %s' % filename)


 # Load dataset
# Location of the raw tab-separated corpus
filename = '/content/fra.txt'
doc = load_document(filename)

pairs = to_pairs(doc)
# NOTE(review): this assignment rebinds the name `clean_pairs` from the
# function to its result, so the function can only be called again after its
# `def` has been re-executed. The name is kept unchanged here in case other
# cells read `clean_pairs` as a list; consider renaming in a follow-up.
clean_pairs = clean_pairs(pairs)
save_clean_data(clean_pairs, 'english-french.pkl')

# Check: preview at most the first 100 cleaned pairs. Slicing (instead of
# indexing with range(100)) avoids an IndexError on a corpus with fewer
# than 100 lines.
for pair in clean_pairs[:100]:
    print('[%s] => [%s]' % (pair[0], pair[1]))

Saved: english-french.pkl
[go] => [va]
[hi] => [salut]
[hi] => [salut]
[run] => [cours]
[run] => [courez]
[who] => [qui]
[wow] => [ca alors]
[fire] => [au feu]
[help] => [a laide]
[jump] => [saute]
[stop] => [ca suffit]
[stop] => [stop]
[stop] => [arretetoi]
[wait] => [attends]
[wait] => [attendez]
[go on] => [poursuis]
[go on] => [continuez]
[go on] => [poursuivez]
[hello] => [bonjour]
[hello] => [salut]
[i see] => [je comprends]
[i try] => [jessaye]
[i won] => [jai gagne]
[i won] => [je lai emporte]
[i won] => [jai gagne]
[oh no] => [oh non]
[attack] => [attaque]
[attack] => [attaquez]
[cheers] => [sante]
[cheers] => [a votre sante]
[cheers] => [merci]
[cheers] => [tchintchin]
[get up] => [levetoi]
[go now] => [va maintenant]
[go now] => [allezy maintenant]
[go now] => [vasy maintenant]
[got it] => [jai pige]
[got it] => [compris]
[got it] => [pige]
[got it] => [compris]
[got it] => [tas capte]
[hop in] => [monte]
[hop in] => [montez]
[hug me] => [serremoi dans tes bras]
[hug me] => 

In [3]:
from pickle import load
from pickle import dump
from numpy.random import shuffle

# load a clean dataset
def load_clean_sentences(filename):
    """Load a pickled dataset from disk.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object stored in the pickle file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this
    # on files produced by this notebook, never on untrusted input.
    # The context manager closes the handle, which the bare open() did not.
    with open(filename, 'rb') as file:
        return load(file)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle a list of sentences to disk and report the file name.

    NOTE(review): a function with this same name is also defined in an
    earlier cell; this later definition replaces it once the cell runs.

    Args:
        sentences: The object to serialize.
        filename: Destination path; the file is created or overwritten.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Close the handle deterministically so the data is fully flushed to disk.
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print(f'Saved: {filename}')

# load dataset
# Fixed seed so the shuffle, and therefore the saved train/test split,
# is identical on every run of the notebook.
SPLIT_SEED = 42

# Load the cleaned dataset. The relative path matches the one the cleaning
# cell writes to ('english-french.pkl'), so both cells agree regardless of
# the current working directory.
raw_dataset = load_clean_sentences('english-french.pkl')

# Ensure the dataset isn't empty before proceeding
if len(raw_dataset) == 0:
    raise ValueError("Dataset is empty. Please check the input file.")

# Number of sentence pairs to keep; lower this to work on a smaller subset.
n_sentences = len(raw_dataset)
# Slicing creates a copy, so the in-place shuffle leaves raw_dataset untouched.
dataset = raw_dataset[:n_sentences]

# Shuffle in place with a locally seeded generator (does not touch NumPy's
# global random state).
np.random.default_rng(SPLIT_SEED).shuffle(dataset)

# Split dataset into training and testing sets: 80% train, 20% test
split_index = int(0.8 * len(dataset))
train, test = dataset[:split_index], dataset[split_index:]

# With very small datasets one side of the split can end up empty
if len(train) == 0 or len(test) == 0:
    raise ValueError("Training or testing set is empty. Adjust the split ratio or ensure enough data.")

# Save the cleaned and split datasets
save_clean_data(dataset, 'english-french-both.pkl')
save_clean_data(train, 'english-french-train.pkl')
save_clean_data(test, 'english-french-test.pkl')

# Check the dataset sizes
print(f"Train set size: {len(train)}")
print(f"Test set size: {len(test)}")

Saved: english-french-both.pkl
Saved: english-french-train.pkl
Saved: english-french-test.pkl
Train set size: 14009
Test set size: 3503
