In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
import re
dataset=pd.read_csv('../input/en-fr-translation-dataset/en-fr.csv')
dataset.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [3]:
def english_preprocessing(data , col) : 
    data[col] = data[col].astype(str) 
    data[col] = data[col].apply(lambda x: x.lower())
    data[col] = data[col].apply(lambda x: re.sub("[^A-Za-z\s]","",x)) 
    data[col] = data[col].apply(lambda x: x.replace("\s+"," "))
    data[col] = data[col].apply(lambda x: " ".join([word for word in x.split()]))
    return data 
def french_preprocessing(data , col) : 
    data[col] = data[col].astype(str) 
    data[col] = data[col].apply(lambda x : x.lower()) 
    data[col] = data[col].apply(lambda x: re.sub(r'\d','',x))
    data[col] = data[col].apply(lambda x: re.sub(r'\s+',' ',x))
    data[col] = data[col].apply(lambda x: re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,।]", "", x))
    data[col] = data[col].apply(lambda x: x.strip()) 
    data[col] = "<sos> " + data[col] + " <eos>" 
    return data

In [4]:
from collections import Counter 
def tokenizer(col):
    if col=='en':
        sents = english_preprocessing(dataset[:100] , col)[col].tolist()  
    elif col=='fr':
         sents = french_preprocessing(dataset[:100] , col)[col].tolist()  
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=100 , oov_token = "<OOV>" , filters='!#$%&()*+,-/:;<=>@«»""[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(sents) 
    tokenizer.word_index['<pad>'] = 0 
    tokenizer.index_word[0] = '<pad>' 
    vocab_to_idx = tokenizer.word_index 
    idx_to_vocab = tokenizer.index_word 
    seqs = tokenizer.texts_to_sequences(sents)  
    pad_seqs = tf.keras.preprocessing.sequence.pad_sequences(seqs , maxlen =100 , padding='post')
    return vocab_to_idx , idx_to_vocab , pad_seqs , tokenizer

In [5]:
en_vocab , en_inv_vocab , en_seqs , en_tokenizer = tokenizer('en')
fr_vocab , fr_inv_vocab , fr_seqs , fr_tokenizer = tokenizer('fr')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].apply(lambda x: re.sub("[^A-Za-z\s]","",x))
A value is trying to be set on a copy of a slice from a 

In [6]:
model = tf.keras.Sequential([ tf.keras.layers.Embedding(input_dim=10000 ,output_dim=768, input_length=100)])
def embedder(text):
    global model
    cls_embedding = model(text)
    #Positional encoding 
    seq_len,d,n=100,768,10000
    P = np.zeros((seq_len, d))
    for k in range(seq_len):
        for i in np.arange(int(d/2)):
            denominator = np.power(n, 2*i/d)
            P[k, 2*i] = np.sin(k/denominator)
            P[k, 2*i+1] = np.cos(k/denominator)
    #Adding positional encoding
    cls_embedding += P
    return tf.expand_dims(cls_embedding, axis=0)
    



In [7]:
#encoder
Wqe=tf.random.normal(shape=(1,100, 768))
Wke=tf.random.normal(shape=(1,100, 768))
Wve=tf.random.normal(shape=(1,100, 768))
W0e=tf.random.normal(shape=(1,100, 768))
W1e=tf.random.normal(shape=(1,100, 768))
#decoder
Wqd=tf.random.normal(shape=(1,100, 768))
Wkd=tf.random.normal(shape=(1,100, 768))
Wvd=tf.random.normal(shape=(1,100 ,768))
W0d=tf.random.normal(shape=(1,100 ,768))
W1d=tf.random.normal(shape=(1,100 ,768))
W01d=tf.random.normal(shape=(1,100, 768))
W11d=tf.random.normal(shape=(1,100, 768))
#train
W0t=tf.random.normal(shape=(1,100, 768))

In [8]:
import math
def encoder(embed):
    # weights of query ,keys and values and other weights
    global Wqe,Wke,Wve,W0e,W1e
    #Calculating query ,keys and values
    Query=embed*Wqe
    Key=embed*Wke
    Value=embed*Wve
    # Scaled-Dot Product Attention
    scores = tf.matmul(Query, Key, transpose_b=True) / math.sqrt(tf.cast(768, tf.float32))
    weights = tf.keras.activations.softmax(scores)
    result=tf.matmul(weights, Value)
    #Add layer
    embed+=result
    #normalize the embeddings
    normalized_embeddings = tf.keras.layers.LayerNormalization()(embed)
    #feed forward layer from scratch
    Layer1=W0e*normalized_embeddings
    result1=tf.nn.relu(Layer1)
    result=W1e*normalized_embeddings
    #Add layer
    embed+=result
    #normalize the embeddings
    normalized_embeddings = tf.keras.layers.LayerNormalization()(embed)
    Key=normalized_embeddings*Wke
    Value=normalized_embeddings*Wve
    return [Key,Value]

In [9]:
def decoder(embed, arr):
    # weights of query ,keys and values and other weights
    global Wqd,Wkd,Wvd,W0d,W1d,W01d,W11d
    #Calculating query ,keys and values
    Query=embed*Wqd
    Key=embed*Wkd
    Value=embed*Wvd
    # Scaled-Dot Product Attention
    scores = tf.matmul(Query, Key, transpose_b=True) / math.sqrt(tf.cast(768, tf.float32))
    weights = tf.keras.activations.softmax(scores)
    result=tf.matmul(weights, Value)
    #Add layer
    embed+=result
    #normalize the embeddings
    normalized_embeddings = tf.keras.layers.LayerNormalization()(embed)
    #feed forward layer from scratch
    Layer1=W0d*normalized_embeddings
    result1=tf.nn.relu(Layer1)
    result=W1d*normalized_embeddings
    #Add layer
    embed+=result
    #normalize the embeddings
    embed = tf.keras.layers.LayerNormalization()(embed)
    #key ,value of encodings output
    Key,Value=arr
    Query=embed*Wqd
    scores = tf.matmul(Query, Key, transpose_b=True) / math.sqrt(tf.cast(768, tf.float32))
    weights = tf.keras.activations.softmax(scores)
    result=tf.matmul(weights, Value)
    #Add layer
    embed+=result
    #normalize the embeddings
    normalized_embeddings = tf.keras.layers.LayerNormalization()(embed)
    #feed forward layer from scratch
    Layer1=W01d*normalized_embeddings
    result1=tf.nn.relu(Layer1)
    result=W11d*normalized_embeddings
    #Add layer
    embed+=result
    return embed

In [21]:
def train(src , trg,opt):
    global model,tokenizer,W0t,Wqe,Wke,Wve,W0e,W1e,Wqd,Wkd,Wvd,W0d,W1d,W01d,W11d
    encoder_output=encoder(embedder(src))
    output_embed=embedder(trg)
    decoder_output=decoder(output_embed,encoder_output)
    Layer1=W0t*decoder_output
    result=tf.nn.softmax(Layer1, axis=-1)
    predicted_id = tf.cast(tf.argmax(result, axis=-1), tf.int64) 
    pred_sent = ' '.join([fr_tokenizer.index_word[idx] for idx in predicted_id[0].numpy() if idx != 0 and idx != 2 and idx !=3 and idx in fr_tokenizer.index_word.keys() ])
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=output_embed, logits=result))
    print(loss.numpy())
    #backpropagation (just for now)
    W0t+=loss.numpy()*opt
    Wqe+=loss.numpy()*opt
    Wke+=loss.numpy()*opt
    Wve+=loss.numpy()*opt
    W0e+=loss.numpy()*opt
    W1e+=loss.numpy()*opt
    Wqd+=loss.numpy()*opt
    Wkd+=loss.numpy()*opt
    Wvd+=loss.numpy()*opt
    W0d+=loss.numpy()*opt
    W1d+=loss.numpy()*opt
    W01d+=loss.numpy()*opt
    W11d+=loss.numpy()*opt
    return loss

In [11]:
train_set = tf.data.Dataset.from_tensor_slices((en_seqs , fr_seqs ))
train_set = train_set.shuffle(100).batch(15 , drop_remainder = True)

In [22]:
from tqdm.auto import tqdm 
def trainloop(EPOCHS,optimizer):
    for epoch in tqdm(range(EPOCHS)) :
        for src , trg in tqdm(train_set) : 
            for i in range(15):
                print(train(src[i],trg[i],optimizer))
trainloop(1,0.0001)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

1827.7269
tf.Tensor(1827.7269, shape=(), dtype=float32)
1827.756
tf.Tensor(1827.756, shape=(), dtype=float32)
1827.5547
tf.Tensor(1827.5547, shape=(), dtype=float32)
1827.9775
tf.Tensor(1827.9775, shape=(), dtype=float32)
1827.7852
tf.Tensor(1827.7852, shape=(), dtype=float32)
1827.7985
tf.Tensor(1827.7985, shape=(), dtype=float32)
1827.875
tf.Tensor(1827.875, shape=(), dtype=float32)
1827.812
tf.Tensor(1827.812, shape=(), dtype=float32)
1828.0078
tf.Tensor(1828.0078, shape=(), dtype=float32)
1827.841
tf.Tensor(1827.841, shape=(), dtype=float32)
1827.7812
tf.Tensor(1827.7812, shape=(), dtype=float32)
1828.2572
tf.Tensor(1828.2572, shape=(), dtype=float32)
1828.0353
tf.Tensor(1828.0353, shape=(), dtype=float32)
1827.6812
tf.Tensor(1827.6812, shape=(), dtype=float32)
1828.0063
tf.Tensor(1828.0063, shape=(), dtype=float32)
1827.9689
tf.Tensor(1827.9689, shape=(), dtype=float32)
1827.8912
tf.Tensor(1827.8912, shape=(), dtype=float32)
1828.2812
tf.Tensor(1828.2812, shape=(), dtype=float32)
