In [1]:
%run ../preprocessing.py

In [2]:
import os
import re
import csv
import json
import time
import random
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight
from transformers import FlaubertTokenizer
from transformers.modeling_tf_flaubert import TFFlaubertForSequenceClassification

In [3]:
BATCH_SIZE = 8
SEQUENCE_LENGTH = 64
ROOT_FOLDER = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + "/"
MODEL_PATH = ROOT_FOLDER + "models/"
DATASET_PATH = ROOT_FOLDER + "dataset/"
LOG_PATH = ROOT_FOLDER + "logs/"

## Import dataset

In [4]:
# One Hot encoder class label by alphabetical order
labels = ['culture', 'france', 'international', 'santé', 'science_high-tech', 'sports', 'économie']
loss_weights=[[2.0014090613483635/21.57126168224299, 1.5277059590046953/21.57126168224299, 1.853869129790919/21.57126168224299, 21.57126168224299/21.57126168224299, 3.844472204871955/21.57126168224299, 1.0/21.57126168224299, 3.357883251500273/21.57126168224299]]

In [5]:
enc = OneHotEncoder()
enc.fit([[label] for label in labels])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [6]:
# Print encoder values to make sure they are ordonned as label order
print(enc.transform([["culture"]]).toarray())
print(enc.transform([["france"]]).toarray())
print(enc.transform([["international"]]).toarray())
print(enc.transform([["santé"]]).toarray())
print(enc.transform([["science_high-tech"]]).toarray())
print(enc.transform([["sports"]]).toarray())
print(enc.transform([["économie"]]).toarray())

[[1. 0. 0. 0. 0. 0. 0.]]
[[0. 1. 0. 0. 0. 0. 0.]]
[[0. 0. 1. 0. 0. 0. 0.]]
[[0. 0. 0. 1. 0. 0. 0.]]
[[0. 0. 0. 0. 1. 0. 0.]]
[[0. 0. 0. 0. 0. 1. 0.]]
[[0. 0. 0. 0. 0. 0. 1.]]


In [7]:
def get_main_category(dictOfNames):
    new_dict = {}
    try:
        for (key,value) in dictOfNames.items():
            #if "score" in key or "applenews" in key or "homepage" in key:
            #    continue
            new_key = re.sub(r'desktop_|mobile_webview_', "", key)
            new_key = re.sub(r'google_', "", new_key)
            if new_key not in labels:
                continue
            if new_key not in new_dict:
                new_dict[new_key] = 0
            new_dict[new_key] += value
        #return [key for key in new_dict.keys()]
        return max(new_dict, key=new_dict.get)
    except ValueError as e :
        return ""

In [8]:
lines = open(DATASET_PATH + 'since_january.csv').readlines()
lines = lines[1:]
random.shuffle(lines)
print("# lines : ", len(lines))
open(DATASET_PATH + 'shuffled_since_january.csv', 'w').writelines(lines)

# lines :  553030


In [9]:
def file_generator():
    samples = []
    categories = []
    idx = 0
    while 1:
        with open(DATASET_PATH + 'shuffled_since_january.csv', 'r', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            idx = 0
            for i, row in enumerate(reader):
                if len(row) < 3 or row[3] not in labels:
                    continue

                text = row[0]
                category = ""
                if row[4] != {} and row[4] != "":
                    category = get_main_category(json.loads(row[4]))
                if category == "":
                    continue

                samples.append(tokenizer.encode(text, max_length=SEQUENCE_LENGTH, pad_to_max_length=SEQUENCE_LENGTH, add_special_tokens=True))
                categories.append([category])

                idx += 1
                if idx >= BATCH_SIZE:
                    categories = enc.transform(categories).toarray()
                    yield tf.convert_to_tensor(samples, dtype=tf.int32), tf.convert_to_tensor(categories, dtype=tf.int32)
                    samples = []
                    categories = []
                    idx = 0

## Import camembert model

In [10]:
model = TFFlaubertForSequenceClassification.from_pretrained(
    "jplu/tf-flaubert-base-cased",
    num_labels=len(labels),
    max_length=SEQUENCE_LENGTH,
    #force_download=True
)

tokenizer = FlaubertTokenizer.from_pretrained("jplu/tf-flaubert-base-cased")

## Test model

In [11]:
input_ids = tokenizer.encode("Sida. Une start-up française découvre une avancée majeure dans la lutte contre le VIH", return_tensors='tf')#, add_special_tokens=True, pad_to_max_length=, return_tensors='tf')
out = model(input_ids)

#print(input_ids)
print(out)
print(np.argmax(np.abs(out[0])))
print(labels[np.argmax(np.abs(out[0]))])

(<tf.Tensor: shape=(1, 7), dtype=float32, numpy=
array([[ 0.95408237, -0.28413275,  1.5128503 , -0.20761025, -0.40679094,
         0.50068426, -0.0023958 ]], dtype=float32)>,)
2
international


In [12]:
for idx, (a, b) in enumerate(file_generator()):
    if idx > 1:
        break
    print(b)

tf.Tensor(
[[0 1 0 0 0 0 0]
 [0 0 0 0 0 1 0]
 [0 0 1 0 0 0 0]
 [0 0 0 0 0 1 0]
 [0 0 0 0 0 1 0]
 [0 0 0 0 0 1 0]
 [1 0 0 0 0 0 0]
 [0 0 0 0 0 1 0]], shape=(8, 7), dtype=int32)
tf.Tensor(
[[0 0 0 0 0 1 0]
 [0 0 1 0 0 0 0]
 [1 0 0 0 0 0 0]
 [0 0 0 0 0 1 0]
 [0 0 1 0 0 0 0]
 [0 0 0 0 0 1 0]
 [0 1 0 0 0 0 0]
 [0 0 0 0 0 1 0]], shape=(8, 7), dtype=int32)


## Train model on new dataset

In [13]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.CategoricalAccuracy('accuracy')
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=LOG_PATH+"flaubert_cased_"+time.strftime("%d%m%y/%H:%M:%S"))
checkpoint = tf.keras.callbacks.ModelCheckpoint(MODEL_PATH+"checkpoints/")

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
    loss_weights=loss_weights
)
model.summary()

Model: "tf_flaubert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequence_summary (TFSequence multiple                  5383      
_________________________________________________________________
transformer (TFFlaubertMainL multiple                  138233088 
Total params: 138,238,471
Trainable params: 138,238,471
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(
    file_generator(),
    epochs=100,
    max_queue_size=2,
    steps_per_epoch=200,
    validation_data=file_generator(),
    validation_steps=10,
    #callbacks=[checkpoint] # tensorboard
)

  ...
    to  
  ['...']
Train for 200 steps, validate for 10 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch

<tensorflow.python.keras.callbacks.History at 0x7f29dc7ec7d0>

In [19]:
input_ids = tf.constant(
    [
        tokenizer.encode(
            clean_text("PSG : Incroyable, le Real a réellement renoncé à Mbappé !"),
            add_special_tokens=True
        ),
    ], tf.int32
)
out = model(input_ids)

In [20]:
print(out)
print(np.argmax(np.abs(out[0])))
print(list(labels)[np.argmax(np.abs(out[0]))])

print(labels)

(<tf.Tensor: shape=(1, 7), dtype=float32, numpy=
array([[-1.5705025 , -0.8648224 ,  0.33070713, -4.861527  , -1.0947516 ,
         6.9560866 , -0.41979995]], dtype=float32)>,)
5
sports
['culture', 'france', 'international', 'santé', 'science_high-tech', 'sports', 'économie']


In [17]:
for idx, (X_test, y_test) in enumerate(file_generator()):
    if idx > 2:
        break
    #Confution Matrix and Classification Report
    Y_pred = model(X_test)

    y_pred = [labels[int(np.argmax(y))] for y in Y_pred[0]]
    #for (a, b) in zip(X_test, y_pred):
    #    print(b, ":", tokenizer.decode(a, skip_special_tokens=True))
    y_test = enc.inverse_transform(y_test.numpy())

    #for x, y in zip(y_test, y_pred):
    #    print(x, "/", y)

    print('Confusion Matrix')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report')

    #print(classification_report(y_test, y_pred, target_names=labels))

Confusion Matrix
[[1 0 0 0]
 [1 0 0 0]
 [0 0 1 0]
 [0 0 0 5]]
Classification Report
Confusion Matrix
[[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 2 0 0]
 [0 0 0 3 1]
 [0 0 0 0 0]]
Classification Report
Confusion Matrix
[[2 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 2 0]
 [0 0 0 0 2]]
Classification Report


In [18]:
model.save_pretrained(MODEL_PATH+"last_model")