In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
!pip install spacy nltk transformers
import spacy

Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.23.2->transformers)
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.46.2-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   -- ------------------------------------- 0.5/10.0 MB 4.2 MB/s et

In [6]:
#loading dataset
data = pd.read_csv("eng-french.csv", nrows = 50)
data.columns = ['english', 'french']

# Clean 'english' column
data['english'] = data['english'].str.lower()
data['english'] = data['english'].apply(lambda x: re.sub(r"[^a-z\s]", "", x))
data['english'] = data['english'].apply(lambda x: re.sub(r"\s+", " ", x).strip())
# Clean 'french' column
data['french'] = data['french'].str.lower()
data['french'] = data['french'].apply(lambda x: re.sub(r"[^a-z\s]", "", x))
data['french'] = data['french'].apply(lambda x: re.sub(r"\s+", " ", x).strip())

data.head()

Unnamed: 0,english,french
0,hi,salut
1,run,cours
2,run,courez
3,who,qui
4,wow,a alors


In [7]:
#sub word tokenization
from transformers import AutoTokenizer

# Load a pre-trained tokenizer (e.g., BERT tokenizer)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize English and French columns
data['english_subtokens'] = data['english'].apply(lambda x: tokenizer.tokenize(x))
data['french_subtokens'] = data['french'].apply(lambda x: tokenizer.tokenize(x))

# View tokenized data
print(data[['english', 'english_subtokens']].head())


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


  english english_subtokens
0      hi              [hi]
1     run             [run]
2     run             [run]
3     who             [who]
4     wow             [wow]


In [12]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 2.4 MB/s eta 0:00:06
     ----- ---------------------------------- 1.8/12.8 MB 3.9 MB/s eta 0:00:03
     ------- -------------------------------- 2.4/12.8 MB 4.2 MB/s eta 0:00:03
     -------- ------------------------------- 2.6/12.8 MB 3.1 MB/s eta 0:00:04
     ----------- ---------------------------- 3.7/12.8 MB 3.5 MB/s eta 0:00:03
     ------------- -------------------------- 4.5/12.8 MB 3.5 MB/s eta 0:00:03
     ------------- -------------------------- 4.5/12.8 MB 3.5 MB/s eta 0:00:03
     ---------------- ----------------------- 5.2/12.8 

In [13]:
#parsing into grammar Trees
nlp = spacy.load("en_core_web_sm")
# Extract dependency relationships (grammar trees)
data['english_grammar'] = data['english'].apply(lambda x: [(token.text, token.dep_, token.head.text) for token in nlp(x)])
data['french_grammar'] = data['french'].apply(lambda x: [(token.text, token.dep_, token.head.text) for token in nlp(x)])

# View grammar structures
print(data[['english', 'english_grammar']].head())


  english     english_grammar
0      hi    [(hi, ROOT, hi)]
1     run  [(run, ROOT, run)]
2     run  [(run, ROOT, run)]
3     who  [(who, ROOT, who)]
4     wow  [(wow, ROOT, wow)]
