# Seq2seq model with testing

An alternative implementation of the Seq2Seq model.

In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.utils import *
from tensorflow.keras.initializers import *
import tensorflow as tf
import time, random

In [None]:
# Enable "memory growth" on every GPU that TensorFlow can see.
# This calls tf.config.experimental.set_memory_growth(gpu, True) for each
# physical device; it does not change the training batch size.
# NOTE(review): per the TensorFlow GPU guide, memory growth makes TensorFlow
# allocate GPU memory as needed instead of reserving it all up front - confirm.

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    # Report how many physical and logical GPU devices are available.
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [None]:
# Mount Google Drive at /drive (only works inside Google Colab).
# NOTE(review): the CSV files loaded later in this notebook are read from
# /content, not from /drive - confirm whether this mount is still needed.
from google.colab import  drive
drive.mount('/drive')

Mounted at /drive


In [None]:
import pandas as pd



import gensim

import numpy as np
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

from gensim.models import Word2Vec, KeyedVectors   
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Load the three splits (train, then val, then test) with pandas.read_csv.
train, val, test = map(
    pd.read_csv,
    (
        '/content/train.csv',  # train dataset
        '/content/val.csv',    # validation dataset
        '/content/test.csv',   # test dataset
    ),
)

In [None]:
# Display the training frame (columns: Unnamed: 0, RawText, ICD10).
train

Unnamed: 0,RawText,ICD10
0,Thrombose veineuse profonde cuisse gauche,I802
1,Hémiplégie post-traumatique,S099
2,Masculinisation avec hyperplasie surrénale,E250
3,Hyperammoniémie cérébrale,E722
4,Fistule artérioveineuse congénitale périphériq...,Q257
...,...,...
181758,Prématurité 32 SA,P073
181759,Rétinopathie E14.3 malnutrition E12.3,H360
181760,Métastase pariétale,C798
181761,Lésion cutanée de la pinta,A673


## Cleaning the data

Here we first install the FrenchLefffLemmatizer library, which lets us lemmatize French text.

In [None]:
# Install FrenchLefffLemmatizer straight from GitHub; all pip output is
# discarded by the `&> /dev/null` redirect, so a failed install is silent here.
# NOTE(review): no tag or commit is pinned, so the installed version may change
# between runs - consider pinning a commit.
!pip install git+https://github.com/ClaudeCoulombe/FrenchLefffLemmatizer.git &> /dev/null

In [None]:
#import librairies 

import nltk
import string
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

# Download the NLTK resources used by this notebook.
# NOTE(review): 'wordnet' is not referenced directly by the code in this
# notebook - presumably needed by the lemmatizer package; confirm.
nltk.download('wordnet')
# 'punkt' backs nltk.tokenize.word_tokenize, used in the preprocessing function.
nltk.download('punkt')
# 'stopwords' provides nltk.corpus.stopwords.words('french').
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Then we initialize :

- the **stopwords**, which are words that appear very frequently but do not bring any meaning to the sentence (like "de", "le", "une")

- a **lemmatizer**, an object that reduces each word to its root form, so that two words sharing the same root are treated as the same word (example: 'voisine' and 'voisinage' will both be changed to 'voisin')

In [None]:
# French stop-word list shipped with NLTK; tokens found in it are dropped
# by the preprocessing function.
french_stopwords = nltk.corpus.stopwords.words('french')
# Single shared lemmatizer instance, reused for every token.
lemmatizer = FrenchLefffLemmatizer()

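Afterwards we build our preprocessing function, which will successively:

- remove the punctuation
- remove the numbers (note: this step is currently commented out in the code, so digits are kept)
- transform the sentences into a list of tokens (a list of words)
- remove stopwords (very frequent words that carry little meaning)
- lemmatize the remaining words
- remove capital letters (in the code, lowercasing actually happens in the first step, together with punctuation removal)
- rebuild the sentences from the remaining words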

In [None]:
def French_Preprocess_listofSentence(listofSentence):
    """Clean a collection of French sentences.

    Each sentence goes through the following steps, in this order:

    1. every character is lower-cased and the characters listed in
       ``string.punctuation`` are deleted (they are not replaced by a space,
       so ``post-traumatique`` becomes ``posttraumatique``);
    2. the text is split into tokens with ``nltk.tokenize.word_tokenize``;
    3. tokens present in the global ``french_stopwords`` are dropped;
    4. each remaining token is passed to the global ``lemmatizer.lemmatize``;
    5. the results are joined back together with single spaces.

    Digits are kept: no number-removal step is applied.

    Parameters
    ----------
    listofSentence : iterable of str
        Raw sentences, e.g. a pandas Series of text.

    Returns
    -------
    list of str
        One cleaned sentence per input sentence, in the same order.
    """
    # Function-scope import: this keeps the function usable even if the global
    # name `string` has been rebound to something other than the module
    # (a later cell of this notebook assigns a str to `string`).
    from string import punctuation

    punctuation_chars = frozenset(punctuation)
    # Built once per call so that the stop-word test is a hash lookup instead
    # of a scan of the whole stop-word list for every token.
    stopword_set = frozenset(french_stopwords)

    preprocess_list = []
    for sentence in listofSentence:
        # Lower-case character by character and drop punctuation.
        sentence_w_punct = "".join(
            char.lower() for char in sentence if char not in punctuation_chars
        )

        tokens = nltk.tokenize.word_tokenize(sentence_w_punct)

        # Drop stop words, then lemmatize what is left.
        lemmas = (
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stopword_set
        )

        preprocess_list.append(' '.join(lemmas))

    return preprocess_list

In [None]:
# Run the same preprocessing on the RawText column of each split
# (train, val, test - in that order).
cleaned_train_text, cleaned_val_text, cleaned_test_text = [
    French_Preprocess_listofSentence(split['RawText'])
    for split in (train, val, test)
]

In [None]:
# Store each cleaned text list as a new 'cleaned_text' column on its frame.
for frame, cleaned in zip(
    (train, val, test),
    (cleaned_train_text, cleaned_val_text, cleaned_test_text),
):
    frame['cleaned_text'] = cleaned

In [None]:
# Preview the first rows to compare RawText with the new cleaned_text column.
train.head()

Unnamed: 0,RawText,ICD10,cleaned_text
0,Thrombose veineuse profonde cuisse gauche,I802,thrombose veineuse profonde cuisse gauche
1,Hémiplégie post-traumatique,S099,hémiplégie posttraumatique
2,Masculinisation avec hyperplasie surrénale,E250,masculinisation hyperplasie surrénale
3,Hyperammoniémie cérébrale,E722,hyperammoniémie cérébrale
4,Fistule artérioveineuse congénitale périphériq...,Q257,fistule artérioveineuse congénitale périphériq...


For now, we concatenate the train, val and test datasets into one single dataset, so that the corpus file and the character vocabularies below are built once from all the text.

In [None]:
# Stack the three splits row-wise into one frame; ignore_index=True gives the
# combined frame a fresh 0..n-1 index.
frames = [train, val, test]
dataset = pd.concat(objs=frames, sort=False, ignore_index=True)

In [None]:
import re

# Export the (cleaned_text, ICD10) pairs as a tab-separated corpus file,
# one pair per line, without header or index.
data = dataset[['cleaned_text', 'ICD10']]
# mode='w' (not 'a'): re-running this cell must overwrite Corpus.txt instead of
# appending a second copy of the whole corpus to it.
data.to_csv('Corpus.txt', header=False, index=False, sep='\t', mode='w', encoding='utf-8')

# Strip every double-quote character from the exported file.
# The text is held in `corpus_text` so the imported `string` module (used by
# the preprocessing function) is not shadowed, and the files are opened with
# `with` and an explicit utf-8 encoding so they are closed deterministically
# and match the encoding used when the corpus is read back.
with open('Corpus.txt', 'r', encoding='utf-8') as corpus_file:
    corpus_text = corpus_file.read()
new_corpus = re.sub('"', '', corpus_text)
with open('Corpus.txt', 'w', encoding='utf-8') as corpus_file:
    corpus_file.write(new_corpus)

# Number of characters written (shown as the cell output).
len(new_corpus)

8703507

In [None]:
#Hyperparameters
# Mini-batch size passed to model.fit.
batch_size = 64
# Number of units in the encoder and decoder LSTM layers (and the size of the
# state vectors fed to the inference decoder).
latent_dim = 256
# Upper bound on the number of corpus lines read when vectorizing the data.
# NOTE(review): the vectorizing cell reports 241763 samples, so this cap is
# larger than the corpus and every line is used - confirm this is intended.
num_samples = 1888260

In [None]:
#Vectorize the data.
# Read the corpus and collect the text pairs plus the character sets.
input_texts, target_texts = [], []
input_chars, target_chars = set(), set()

with open('Corpus.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# The last element of `lines` (text after the final newline) is never used;
# `num_samples` caps how many lines are read.
for line in lines[: min(num_samples, len(lines) - 1)]:
    source, code = line.split('\t')
    # '\t' marks the start of a target sequence and '\n' marks its end.
    code = '\t' + code + '\n'
    input_texts.append(source)
    target_texts.append(code)
    input_chars.update(source)
    target_chars.update(code)

# Sorted so that every character gets a stable index.
input_chars = sorted(input_chars)
target_chars = sorted(target_chars)
num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(target_chars)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

#Print size
for label, value in (
    ('Number of samples:', len(input_texts)),
    ('Number of unique input tokens:', num_encoder_tokens),
    ('Number of unique output tokens:', num_decoder_tokens),
    ('Max sequence length for inputs:', max_encoder_seq_length),
    ('Max sequence length for outputs:', max_decoder_seq_length),
):
    print(label, value)

Number of samples: 241763
Number of unique input tokens: 61
Number of unique output tokens: 38
Max sequence length for inputs: 162
Max sequence length for outputs: 6


In [None]:
#Define data for encoder and decoder
# Character -> index lookup tables (index = position on the one-hot axis).
input_token_id = {char: index for index, char in enumerate(input_chars)}
target_token_id = {char: index for index, char in enumerate(target_chars)}

# One-hot tensors laid out as (sample, time step, character index); positions
# past the end of a text stay all-zero.
encoder_in_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32',
)
decoder_in_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32',
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32',
)

for row, (source, code) in enumerate(zip(input_texts, target_texts)):
    for step, char in enumerate(source):
        encoder_in_data[row, step, input_token_id[char]] = 1.
    for step, char in enumerate(code):
        decoder_in_data[row, step, target_token_id[char]] = 1.
    # The target is the decoder input shifted one step to the left, i.e.
    # without the leading start character.
    for step, char in enumerate(code[1:]):
        decoder_target_data[row, step, target_token_id[char]] = 1

In [None]:
#Define and process the input sequence
# Encoder input: variable-length sequences of one-hot vectors of width
# num_encoder_tokens.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# return_state=True makes the layer return its output plus the two LSTM states.
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
#We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

#Using `encoder_states` set up the decoder as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# The decoder returns the full output sequence (one output per time step) and
# its states; the states are ignored here and only used by the inference model.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the target characters at every time step.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
#Final model
# Training model: takes [encoder_inputs, decoder_inputs] and produces
# decoder_outputs (a softmax over target characters at each decoder step).
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
#Model Summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 61)]   0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None, 38)]   0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 256),        325632      ['input_1[0][0]']                
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

In [None]:
#Model data Shape
# Print the shape of each of the three arrays passed to model.fit.
for label, tensor in (
    ("encoder_in_data shape:", encoder_in_data),
    ("decoder_in_data shape:", decoder_in_data),
    ("decoder_target_data shape:", decoder_target_data),
):
    print(label, tensor.shape)

encoder_in_data shape: (241763, 162, 61)
decoder_in_data shape: (241763, 6, 38)
decoder_target_data shape: (241763, 6, 38)


In [None]:
#Compiling and training the model
# Categorical cross-entropy against the one-hot decoder targets; accuracy is
# reported as a metric.
model.compile(optimizer='Adam',loss='categorical_crossentropy', metrics=['accuracy'] )
# Trains for a fixed 50 epochs with no callbacks (no early stopping or
# checkpointing), holding out 20% of the samples for validation.
# NOTE(review): Keras documents validation_split as taking the LAST fraction of
# the arrays, before shuffling. The arrays are built from train+val+test
# concatenated in that order, so the held-out part presumably comes from the
# val/test rows - confirm this is the intended evaluation split.
model.fit([encoder_in_data, decoder_in_data], decoder_target_data, batch_size = batch_size, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f30436ef100>

In [None]:
#Define sampling models
# Inference encoder: maps an input sequence to the encoder's [h, c] states.
encoder_model = Model(encoder_inputs, encoder_states)
# Inference decoder: the LSTM states are explicit inputs, so decoding can be
# run one step at a time, feeding the returned states back in.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Reuses the trained decoder_lstm and decoder_dense layers.
# NOTE(review): this rebinds decoder_outputs, state_h and state_c, which
# previously referred to tensors of the training graph; re-running the cell
# that builds `model` after this one would pick up these inference tensors.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [decoder_inputs, h, c]; outputs: [character probabilities, new h, new c].
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [None]:
# Index -> character lookup tables (the inverse of the token-id dictionaries),
# used to turn predicted indices back into text.
reverse_input_char_index = {index: char for char, index in input_token_id.items()}
reverse_target_char_index = {index: char for char, index in target_token_id.items()}

#Define Decode Sequence
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a target string.

    The input is run through ``encoder_model`` to get the initial states, then
    ``decoder_model`` is called one step at a time, starting from the start
    character ``'\\t'``. At each step the most probable character is appended
    to the result and fed back as the next decoder input.

    Decoding stops as soon as the sampled character is ``'\\n'`` or the result
    is longer than ``max_decoder_seq_length``. The stop character, when
    sampled, is part of the returned string.

    Parameters
    ----------
    input_seq : array
        A batch of size 1 of one-hot encoded input characters.

    Returns
    -------
    str
        The decoded character sequence.
    """
    # Encoder states seed the decoder.
    states_value = encoder_model.predict(input_seq)

    # Length-1 target sequence holding only the start character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_id['\t']] = 1.

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: most probable character at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end character or once the maximum length is exceeded.
        finished = sampled_char == '\n'
        too_long = len(decoded_sentence) > max_decoder_seq_length
        if finished or too_long:
            break

        # Feed the sampled character and the new states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return decoded_sentence

In [None]:
# Decode the first 20 samples and print each input next to its prediction.
# NOTE(review): these are the first rows of the data passed to model.fit, so
# this illustrates the decoder rather than measuring held-out accuracy.
for seq_index in range(20):
    # Slice rather than index, so the leading batch axis (size 1) is kept.
    input_seq = encoder_in_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: thrombose veineuse profonde cuisse gauche
Decoded sentence: T828

-
Input sentence: hémiplégie posttraumatique
Decoded sentence: G819

-
Input sentence: masculinisation hyperplasie surrénale
Decoded sentence: Q878

-
Input sentence: hyperammoniémie cérébrale
Decoded sentence: E230

-
Input sentence: fistule artérioveineuse congénitale périphérique vaisseau pulmonaires
Decoded sentence: I772

-
Input sentence: prothèse valvulaire mécanique
Decoded sentence: Z952

-
Input sentence: tumeur maligne partie molles fesse
Decoded sentence: C718

-
Input sentence: complexe xpcs
Decoded sentence: K918

-
Input sentence: sclérodermie acrosclérotique néonatale
Decoded sentence: L940

-
Input sentence: vih pelvienne femme nca inflammatoire compliquant grossesse affectant foetus nouveauné
Decoded sentence: J387

-
Input sentence: accident travail écrasement camion
Decoded sentence: W23

-
Input sentence: eternuement
Decoded sentence: A830

-
Input sentence: spondylarthrose radiculo