In [3]:
import urllib.request

# Define the file URL
file_url = "https://www.statmt.org/europarl/v7/fr-en.tgz"

# Define the destination file path (update to your local directory)
destination_file = "fr-en.tgz"

# Download the file
urllib.request.urlretrieve(file_url, destination_file)

print(f"File downloaded to {destination_file}")


File downloaded to fr-en.tgz


In [5]:
import tarfile
extraction_path = "fr-en"
# Extract the tar file
with tarfile.open(destination_file, 'r:gz') as tar_ref:
    tar_ref.extractall(extraction_path)

print(f"File extracted to {extraction_path}")

File extracted to fr-en


In [6]:
import pickle 
from pickle import dump

In [7]:
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [8]:
def to_sentences(doc):
    return doc.strip().split('\n')

In [9]:
def sentence_lengths(sentences):
    lengths = [len(s.split()) for s in sentences]
    return min(lengths), max(lengths)

In [10]:
#preprocessing and cleaning

In [16]:
# clean lines
import re
import string
import unicodedata
def clean_lines(lines):
	cleaned = list()
	# prepare regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for line in lines:
		# normalize unicode characters
		line = unicodedata.normalize('NFD', line).encode('ascii', 'ignore')
		line = line.decode('UTF-8')
		# tokenize on white space
		line = line.split()
		# convert to lower case
		line = [word.lower() for word in line]
		# remove punctuation from each token
		line = [word.translate(table) for word in line]
		# remove non-printable chars from each token
		line = [re_print.sub('', w) for w in line]
		# remove tokens with numbers in them
		line = [word for word in line if word.isalpha()]
		# store as string
		cleaned.append(' '.join(line))
	return cleaned

In [17]:

# Load English data
filename = f'{extraction_path}/europarl-v7.fr-en.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))
cleanf = clean_lines(sentences)

# Save the cleaned data to a pickle file
pickle_filename = 'English.pkl'
with open(pickle_filename, 'wb') as outfile:
    pickle.dump(cleanf, outfile)
print(pickle_filename, "saved")

English data: sentences=2007723, min=0, max=668
English.pkl saved


In [18]:
filename = 'fr-en/europarl-v7.fr-en.fr'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('French data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))
cleanf = clean_lines(sentences)

# Save the cleaned French data to a pickle file
pickle_filename = 'French.pkl'
with open(pickle_filename, 'wb') as outfile:
    pickle.dump(cleanf, outfile)

print(pickle_filename, "saved")

French data: sentences=2007723, min=0, max=693
French.pkl saved


In [19]:
from pickle import load
from pickle import dump
from collections import Counter

In [20]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

def save_clean_sentences(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

In [21]:
def to_vocab(lines):
    vocab = Counter()
    for line in lines:
        tokens = line.split()
        vocab.update(tokens)
    return vocab

In [22]:
def trim_vocab(vocab, min_occurance):
    tokens = [k for k, c in vocab.items() if c >= min_occurance]
    return set(tokens)

In [23]:
def update_dataset(lines, vocab):
    new_lines = list()
    for line in lines:
        new_tokens = list()
        for token in line.split():
            if token in vocab:
                new_tokens.append(token)
            else:
                new_tokens.append('unk')

        new_line = ' '.join(new_tokens)
        new_lines.append(new_line)
    return new_line

In [25]:
filename = 'English.pkl'
lines = load_clean_sentences(filename)

vocab = to_vocab(lines)
print('English Vocabulary: %d' % len(vocab))
vocab = trim_vocab(vocab, 5)
print('New English Vocabulary: %d' % len(vocab))
lines = update_dataset(lines, vocab)
filename = 'english_vocab.pkl'
save_clean_sentences(lines, filename)

for i in range(20):
    print("line", i, ":", lines[i])

English Vocabulary: 105357
New English Vocabulary: 41746
Saved: english_vocab.pkl
line 0 : t
line 1 : h
line 2 : e
line 3 :  
line 4 : s
line 5 : i
line 6 : t
line 7 : t
line 8 : i
line 9 : n
line 10 : g
line 11 :  
line 12 : w
line 13 : a
line 14 : s
line 15 :  
line 16 : c
line 17 : l
line 18 : o
line 19 : s


In [26]:
filename = 'French.pkl'
lines = load_clean_sentences(filename)
# calculate vocabulary
vocab = to_vocab(lines)
print('French Vocabulary: %d' % len(vocab))
# reduce vocabulary
vocab = trim_vocab(vocab, 5)
print('New French Vocabulary: %d' % len(vocab))
# mark out of vocabulary words
lines = update_dataset(lines, vocab)
# save updated dataset
filename = 'french_vocab.pkl'
save_clean_sentences(lines, filename)
# spot check
for i in range(20):
        print("line",i,":",lines[i])

French Vocabulary: 141642
New French Vocabulary: 58800
Saved: french_vocab.pkl
line 0 : l
line 1 : a
line 2 :  
line 3 : s
line 4 : e
line 5 : a
line 6 : n
line 7 : c
line 8 : e
line 9 :  
line 10 : e
line 11 : s
line 12 : t
line 13 :  
line 14 : l
line 15 : e
line 16 : v
line 17 : e
line 18 : e
line 19 :  


In [27]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [28]:
#Example 1
"""reference = [['the', 'cat', 'likes', 'milk'], ['cat', 'likes' 'milk']]
candidate = ['the', 'cat', 'likes', 'milk']
score = sentence_bleu(reference, candidate)
print('Example 1', score)
#Example 2
reference = [['the', 'cat', 'likes', 'milk']]
candidate = ['the', 'cat', 'likes', 'milk']
score = sentence_bleu(reference, candidate)
print('Example 2', score)"""

Example 1 1.0
Example 2 1.0


In [29]:
#Google Trax

In [1]:
import os
import numpy as np
!pip install trax


Defaulting to user installation because normal site-packages is not writeable


In [3]:
pip install --upgrade trax

Defaulting to user installation because normal site-packages is not writeable
Collecting trax
  Obtaining dependency information for trax from https://files.pythonhosted.org/packages/9d/fd/9b8da163dd156c427591970b39e34cba1be028f979f4302f7a93d57dd84f/trax-1.4.1-py2.py3-none-any.whl.metadata
  Using cached trax-1.4.1-py2.py3-none-any.whl.metadata (1.7 kB)
INFO: pip is looking at multiple versions of trax to determine which version is compatible with other requirements. This could take a while.
  Obtaining dependency information for trax from https://files.pythonhosted.org/packages/9f/d5/6dacdd37a3be876df6ecd408f6bea00948cc3ee8d26cf569c844f2d9c82c/trax-1.4.0-py2.py3-none-any.whl.metadata
  Using cached trax-1.4.0-py2.py3-none-any.whl.metadata (1.7 kB)
  Obtaining dependency information for trax from https://files.pythonhosted.org/packages/b0/92/3ec523cf1307ba89b35e0e278d97ebe4f5197c352fd8643a70df89ff16e4/trax-1.3.9-py2.py3-none-any.whl.metadata
  Using cached trax-1.3.9-py2.py3-none-any.w

In [17]:
print(trax.__version__)

NameError: name 'trax' is not defined

In [6]:
pip install --upgrade trax jax numpy

Defaulting to user installation because normal site-packages is not writeable
Collecting trax
  Obtaining dependency information for trax from https://files.pythonhosted.org/packages/9d/fd/9b8da163dd156c427591970b39e34cba1be028f979f4302f7a93d57dd84f/trax-1.4.1-py2.py3-none-any.whl.metadata
  Using cached trax-1.4.1-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/9b/0f/022ca4783b6e6239a53b988a4d315d67f9ae7126227fb2255054a558bd72/numpy-2.0.0-cp311-cp311-win_amd64.whl.metadata
  Downloading numpy-2.0.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.9 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.9 kB ? eta -:--:--
     ------------ ------------------------- 20.5/60.9 kB 217.9 kB/s eta 0:00:01
     ------------------------- ------------ 41.0/60.9 kB 281.8 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tables 3.8.0 requires blosc2~=2.0.0, which is not installed.
numba 0.57.1 requires numpy<1.25,>=1.21, but you have numpy 1.26.4 which is incompatible.


In [8]:
import trax

ValueError: no signature found for builtin function <built-in function asarray>

In [9]:
model = trax.models.Transformers

NameError: name 'trax' is not defined