In [1]:
import tensorflow as tf
import transformers
import numpy as np

In [2]:
max_length = 32  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 2

# Labels in our dataset.
labels = ["contradiction", "entailment", "neutral"]

In [4]:

class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.

    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.

    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
        truncation = True
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
            truncation=True
        )
        #print(encoded)
        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)


In [5]:
def create_model():
  #strategy = tf.distribute.MirroredStrategy()

  #with strategy.scope():
      # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )
        # Loading pretrained BERT model.

    bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
    # Freeze the BERT model to reuse the pretrained features without modifying them.
    bert_model.trainable = False

    sequence_output, pooled_output = bert_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])
    dropout = tf.keras.layers.Dropout(0.3)(concat)
    output = tf.keras.layers.Dense(3, activation="softmax")(dropout)
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["acc"],
    )
    return model

model = create_model()
#print(f"Strategy: {strategy}")
model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape     

In [6]:
path = "ml/modeles/bert-question-reponses/weights"
model.load_weights(path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1c784c6b248>

In [8]:
def check_similarity(sentence1, sentence2):
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
    test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

    proba = model.predict(test_data)[0]
    idx = np.argmax(proba)
    proba = f"{proba[idx]: .2f}%"
    pred = labels[idx]
    return pred, proba

In [9]:
sentence1 = "Do you want to eat"
sentence2 = "Do you want to see your family ?"
check_similarity(sentence1, sentence2)

('neutral', ' 0.70%')

In [11]:
sentence1 = "A soccer game with multiple males playing"
sentence2 = "Some men are playing a sport"
check_similarity(sentence1, sentence2)

('entailment', ' 0.95%')

In [12]:
sentence1 = "Tomorrow i am going to take the plane"
sentence2 = "I fligth by plane tomorrow"
check_similarity(sentence1, sentence2)

('entailment', ' 0.59%')

In [13]:
sentence1 = "Two women are observing something together."
sentence2 = "Two women are standing with their eyes closed."
check_similarity(sentence1, sentence2)

('entailment', ' 0.49%')

In [None]:
def question_answer(text):
    translated = translate(text,src='fr', dest='en')
    liste_similarity=[]
    #print("translated",translated)
    for i in range(len(fichier)):
        if len(liste_similarity) <3:
            pred,proba = check_similarity(fichier.iloc[i][0],translated,model_question_answer)
            #print("pred :",pred,"proba",proba)
            if pred == "entailment":
                all_reponses = fichier.iloc[i][1]
                temp_liste = all_reponses.split("/")
                for temp in temp_liste:
                    liste_similarity.append(temp)
        else:
            return liste_similarity

    return liste_similarity
question = "tu as faim ?"
liste_reponse = question_answer(question)

In [15]:
from googletrans import Translator  
import pandas as pd 

In [68]:
translator = Translator()
translated = translator.translate('Est-ce que tu as envie de manger?', src='fr', dest='en').text
print(translated)

Do you want to eat?


In [62]:
fichier = pd.read_csv("dataset_questions_reponses.txt", sep=";")

In [73]:
import time

liste_similarity = []
def find_all(texte,translated):
    if check_similarity(texte,translated)[0] == "entailment":
        #liste_similarity.append(texte)
        return texte
    else:
        return None
reponses =fichier.questions.apply(lambda x : find_all(x,translated))

In [53]:
liste_similarity=[]
for i in range(len(fichier)):
  if check_similarity(fichier.iloc[i][0],translated.text)[0] == "entailment":
    liste_similarity.append(fichier.iloc[i][1])

In [20]:
liste_similarity

[" Oui j'ai très faim/Non je n'ai pas faim",
 " Oui, j'ai faim/Oui, j'aimerais aller aux toilettes/Oui, j'ai besoin de quelqu'un/J'ai froid/J'ai chaud/Non, Merci ",
 ' Du poisson/De la viande/Des légumes/Des pâtes/Du riz/Du fromage/Du pain/Un yaourt/Des fruits/Des sucreries']

In [37]:
sentence_pairs = ["Two women are observing something together.","Two women are standing with their eyes closed."]
tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
encoded = tokenizer.batch_encode_plus(
    sentence_pairs,
    add_special_tokens=True,
    max_length=max_length,
    return_attention_mask=True,
    return_token_type_ids=True,
    pad_to_max_length=True,
    return_tensors="tf",
)
print(len(encoded["input_ids"][0]))
print(encoded["input_ids"])

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
128
tf.Tensor(
[[  101  2048  2308  2024 14158  2242  2362  1012   102     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     

In [15]:
import json
json_numbers = json.dumps(numbers)
print(json_numbers)
print(type(json_numbers)) #

["1", "2", "3"]
<class 'str'>


In [16]:
import json   
# Data to be written
liste = [1,2,3]
for i in len(liste):
    dictionary ={   
    "A": 5,   
    "B": "guys",   
    }

# Serializing json
json_object = json.dumps(dictionary, indent = 4)   
print(json_object)
# Output

{
    "A": 5,
    "B": "guys"
}


In [1]:
from googletrans import Translator

translator = Translator()
liste = "hello how are you"
translated = translator.translate(liste, src='en', dest='fr')
print(translated.src)
print(translated.dest)
print(translated.origin)
print(translated.text)
print(translated.pronunciation)



ModuleNotFoundError: No module named 'googletrans'

In [2]:
from google_trans_new import google_translator  


In [5]:
from transformers import pipeline, set_seed

In [6]:
english_generator = pipeline('text-generation', model='gpt2')

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def translate(texte):
  translator = Translator()
  translated = translator.translate(texte, src='fr', dest='en').text
  #print(translated)
  reponse = generator(translated, max_length=45, num_return_sequences=5)
  #print(type(reponse))
  #for res in reponse:
   # print(res,"\n")
    #print(type(res))

  
  final = translator.translate(reponse[0]["generated_text"], src='en', dest='fr').text
  print(final)

In [13]:
translate("hello how are you")

'salut comment allez-vous '

In [43]:
import re
def translate(texte,src="en",dest="fr"):
  #translator = Translator()
  #translated = translator.translate(texte, src=src, dest=dest).text
  translator = google_translator()  
  translate_text = translator.translate(texte,lang_tgt=dest)  
  return translate_text

def generate_sentences_english_gpt2(debut_phrase,num_return_sequences=1,length=20,top_p=0.40):
    debut_phrase = translate(debut_phrase,'fr',dest = 'en')
    #print(debut_phrase)
    #print(debut_phrase)
    response_debut_phrase = english_generator(debut_phrase,num_return_sequences=num_return_sequences,max_length=length,top_p=0.40)
    liste = []
    #print(response_debut_phrase)
    for res in response_debut_phrase:
      #print("res",res)
      temp = translate(res["generated_text"],'en',dest = 'fr')
      #print("reponse :",temp)
      temp = truncate(temp)
      liste.append(temp)
    return liste

def truncate(string):
  strin_clean = re.sub('\.(.*)', '.', string)
  strin_clean = re.sub('\?(.*)', '?', strin_clean)
  strin_clean = re.sub('\!(.*)', '!', strin_clean)
  return strin_clean


liste_reponses = []
liste_reponses = generate_sentences_english_gpt2('je suis allé',num_return_sequences=4,length=30)
print(liste_reponses)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence
["Je suis allé au magasin et j'ai acheté quelques choses, et je suis sûr que je reviendrai.", "Je suis allé à l'école et j'ai reçu une liste de tous les élèves qui n'étaient pas sur la liste.", "Je suis allé dans une école un peu différente de mon état d'origine, New York.", "Je suis allé a l'hôpital."]


In [92]:
translate("bonjour comment allez vous",'en','fr')

'["[\'salut comment allez-vous \', \'salut comment allez-vous \', \'salut comment allez-vous \'] ", "[\'salut comment allez-vous \', \'salut comment allez-vous \', \'salut comment allez-vous \'] ", "[\'salut comment allez-vous \', \'salut comment allez-vous \', \'salut comment allez-vous \'] "] '

In [31]:
def truncate(string):
  strin_clean = re.sub('\.(.*)', '.', string)
  strin_clean = re.sub('\?(.*)', '?', strin_clean)
  strin_clean = re.sub('\!(.*)', '!', strin_clean)
  return strin_clean

truncate("Salut tu vas bien . Moi ca va")

'Salut tu vas bien .'