In [None]:
! pip install transformers

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import string
import re
import sklearn
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
# The training cell is pinned to '/device:GPU:0', so stop right away when
# TensorFlow does not report that device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

In [None]:
# Load the train/dev splits (one JSON record per line) and discard the
# 'id' and 'img' columns, which the rest of this notebook does not read.
unused_columns = ['id', 'img']
train = pd.read_json("train.jsonl", lines=True).drop(columns=unused_columns)
dev = pd.read_json("dev.jsonl", lines=True).drop(columns=unused_columns)

In [None]:
# Helper functions that pre-process the text data:
# 1. remove stopwords
# 2. remove punctuation

def stop_words(df, column, new_column):
  """Remove English stopwords from a text column.

  Each value of ``df[column]`` is split on whitespace, tokens that appear in
  NLTK's English stopword list are dropped (exact, case-sensitive match) and
  the remaining tokens are joined back with single spaces.

  Args:
    df: DataFrame holding the text; it is modified in place.
    column: name of the column to read.
    new_column: name of the column receiving the filtered text (may be the
      same as ``column``).

  Returns:
    The same ``df`` object, with ``new_column`` written.
  """
  # Build the lookup once per call: asking the corpus for its word list on
  # every token repeats the lookup needlessly, and a set gives constant-time
  # membership tests instead of a linear scan of the list.
  english_stopwords = frozenset(stopwords.words('english'))
  df[new_column] = df[column].apply(
    lambda x: ' '.join(item for item in x.split() if item not in english_stopwords))
  return df

def punctuation(df, column, new_column):
  """Write a punctuation-free version of ``df[column]`` into ``df[new_column]``.

  Every character contained in ``string.punctuation`` is removed; all other
  characters are kept in their original order. ``df`` is modified in place
  and returned.
  """
  def strip_punctuation(value):
    kept = (symbol for symbol in value if symbol not in string.punctuation)
    return "".join(kept)

  df[new_column] = df[column].apply(strip_punctuation)
  return df

In [None]:
# Clean both splits: stopwords are removed first, then punctuation is stripped
# from that result. Both helpers write into the frame they receive, so the
# second step overwrites the 'cleaned_text' column produced by the first.
cleaned_train = punctuation(stop_words(train, 'text', 'cleaned_text'), 'cleaned_text', 'cleaned_text')
cleaned_dev = punctuation(stop_words(dev, 'text', 'cleaned_text'), 'cleaned_text', 'cleaned_text')

In [None]:
# Pull the cleaned text and the labels out of each split as arrays.
text_train, labels_train = cleaned_train['cleaned_text'].values, cleaned_train['label'].values
text_dev, labels_dev = cleaned_dev['cleaned_text'].values, cleaned_dev['label'].values

In [None]:
# The dev split is used as the validation set.
train_input, train_label = text_train, labels_train
val_input, val_label = text_dev, labels_dev

In [None]:
from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
# Finding the maximum length, measured in BERT tokens rather than characters:
# this value is later handed to the tokenizer as `max_length`, which counts
# tokens (special tokens included), so a character count is the wrong unit.
max_len_train = max(
  (len(tokenizer.encode(text, add_special_tokens = True)) for text in text_train),
  default = 0,
)

In [None]:
# Finding the maximum length, measured in BERT tokens rather than characters:
# this value is later handed to the tokenizer as `max_length`, which counts
# tokens (special tokens included), so a character count is the wrong unit.
max_len_dev = max(
  (len(tokenizer.encode(text, add_special_tokens = True)) for text in text_dev),
  default = 0,
)

In [None]:
# encoding function
def mask_inputs_for_bert(text,max_len):
  """Tokenize texts for BERT and return padded id / attention-mask tensors.

  Uses the module-level ``tokenizer``. Every text is encoded with special
  tokens and then padded or truncated to one common length so that the
  results stack into rectangular tensors. The first three texts and their
  encodings are printed as a sanity check.

  Args:
    text: iterable of strings to encode.
    max_len: target sequence length in tokens. It is capped at the
      tokenizer's ``model_max_length`` when that attribute is available.

  Returns:
    Tuple ``(input_ids, attention_masks)`` of tensors with one row per text.
  """
  # Never request sequences longer than the limit the tokenizer declares for
  # its model; longer inputs would not fit the model's position embeddings.
  model_limit = getattr(tokenizer, 'model_max_length', None)
  if model_limit is not None:
    max_len = min(max_len, model_limit)

  input_ids = []
  attention_masks = []
  for i, t in enumerate(text):
    show = i < 3  # only the first 3 texts are displayed
    if show:
      print("text :", t)
    # Explicit padding/truncation replaces the deprecated pad_to_max_length
    # flag and makes the (previously implicit) truncation intentional.
    encoded_dict = tokenizer(
      t,
      add_special_tokens = True,
      max_length = max_len,
      padding = 'max_length',
      truncation = True,
      return_attention_mask = True,
    )
    if show:  # display the tokenized form of the first 3 texts
      print('dict :', encoded_dict['input_ids'])
      print('attention masks :', encoded_dict['attention_mask'])
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
  # convert to tensors and return
  input_ids = tf.convert_to_tensor(input_ids)
  attention_masks = tf.convert_to_tensor(attention_masks)
  return input_ids,attention_masks

In [None]:
# Encode both splits; each split is padded to its own maximum length.
train_inp, train_mask = mask_inputs_for_bert(train_input, max_len_train)
val_inp, val_mask = mask_inputs_for_bert(val_input, max_len_dev)

# Labels are converted to tensors as well.
train_label, val_label = tf.convert_to_tensor(train_label), tf.convert_to_tensor(val_label)

In [None]:
# Pretrained 'bert-base-uncased' with a 2-label sequence-classification head;
# dict-style outputs and attention outputs are both requested.
bert_model = TFBertForSequenceClassification.from_pretrained(
  'bert-base-uncased',
  num_labels = 2,
  return_dict = True,
  output_attentions = True,
)

In [None]:
# Sparse categorical loss / accuracy work on integer class labels;
# from_logits=True tells the loss that it receives unnormalized scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(
  learning_rate = 1e-6,
  epsilon = 1e-08,
  weight_decay = 0.1,
)

bert_model.compile(optimizer = optimizer, loss = loss, metrics = [metric])

In [None]:
# Train on the first GPU for 4 epochs with batches of 16; the dev tensors are
# passed as validation data.
with tf.device('/device:GPU:0'):
  history = bert_model.fit(
    [train_inp, train_mask],
    train_label,
    batch_size = 16,
    epochs = 4,
    validation_data = ([val_inp, val_mask], val_label),
  )

In [None]:
# Keep a copy of the model's current weights (this cell comes after the training cell).
trained_weights = bert_model.get_weights() # returns a list consisting of NumPy arrays