In [1]:
import re
import csv
import json
import time
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight
from transformers import (
    CamembertTokenizer,
    TFCamembertModel,
    TFRobertaPreTrainedModel,
    TFRobertaMainLayer
)
from transformers.modeling_tf_utils import get_initializer
from tensorflow.keras.layers import * 

In [2]:
BATCH_SIZE = 32
STEPS = 32

## Import dataset

In [3]:
# One Hot encoder class label by alphabetical order
labels = ['santé', 'science_high-tech', 'sports', 'économie'] #'international', 'culture', 'france', ]# 'homepage', 
class_weights = [
    {0: 14.5},#6.06},
    {1: 3.5},#1.47},
    {2: 1.0},#0.42},
    {3: 3.16}#1.32 }
]

#class_weights = {
#    0: 1.0,
#    1: 2.0712994058382845,
#    2: 2.026282537275714,
#    3: 1.5499710032862941,
#    4: 3.541910546659304,
#    5: 14.931098696461826,
#    6: 17.023354564755838,
#    7: 3.301966436734274
#}

In [4]:
enc = OneHotEncoder()
#enc.fit(labels)
enc.fit([[label] for label in labels])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [5]:
print(enc.transform([["santé"]]).toarray())
print(enc.transform([["science_high-tech"]]).toarray())
print(enc.transform([["sports"]]).toarray())
print(enc.transform([["économie"]]).toarray())

[[1. 0. 0. 0.]]
[[0. 1. 0. 0.]]
[[0. 0. 1. 0.]]
[[0. 0. 0. 1.]]


In [6]:
stopwords = set(['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez', 'eussent'])

whitespace = re.compile("[\\s]+", re.UNICODE)
dash = re.compile("[\\-\\˗\\֊\\‐\\‑\\‒\\–\\—\\⁻\\₋\\−\\﹣\\－]")
left_parenthesis_filter = re.compile("[\\(\\[\\{\\⁽\\₍\\❨\\❪\\﹙\\（]")
right_parenthesis_filter = re.compile("[\\)\\]\\}\\⁾\\₎\\❩\\❫\\﹚\\）]")
currencies = re.compile("[¥£₪$€฿₨]")
apostrophe_filter = re.compile(
    r'&#39;|[ʼ՚＇‘’‛❛❜ߴߵ`‵´ˊˋ{}{}{}{}{}{}{}{}{}]'.format(
        chr(768), chr(769), chr(832),
        chr(833), chr(2387), chr(5151),
        chr(5152), chr(65344), chr(8242)
    ), re.UNICODE
)
basic_cleaner = re.compile(r'[^\w\s{}]'.format(re.escape("€-!?/;\"'%&<>.()@#:,|=*")), re.UNICODE)

In [7]:
def get_main_category(dictOfNames):
    new_dict = {}
    try:
        for (key,value) in dictOfNames.items():
            #if "score" in key or "applenews" in key or "homepage" in key:
            #    continue
            new_key = re.sub(r'desktop_|mobile_webview_', "", key)
            new_key = re.sub(r'google_', "", new_key)
            if new_key not in labels:
                continue
            if new_key not in new_dict:
                new_dict[new_key] = 0
            new_dict[new_key] += value
        #return [key for key in new_dict.keys()]
        return max(new_dict, key=new_dict.get)
    except ValueError as e :
        return ""

In [8]:
def clean_text(text):
    text = str.strip(str.lower(text))
    text = whitespace.sub(' ', text)
    text = dash.sub('-', text)
    text = currencies.sub('€', text)
    text = apostrophe_filter.sub("'", text)
    text = left_parenthesis_filter.sub("(", text)
    text = right_parenthesis_filter.sub(")", text)
    text = basic_cleaner.sub('', text)
    return text

In [9]:
lines = open('since_january.csv').readlines()
lines = lines[1:]
random.shuffle(lines)
print("# lines : ", len(lines))
open('shuffled_since_january.csv', 'w').writelines(lines)
del lines

# lines :  553030


In [10]:
def file_generator(steps=1):
    samples = []
    categories = []
    idx = 0
    with open('shuffled_since_january.csv', 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        idx = 0
        for row in reader:
            if len(row) < 3 or row[3] not in labels:
                continue

            idx += 1
            text = row[0]
            category = ""
            if row[4] != {} and row[4] != "":
                category = get_main_category(json.loads(row[4]))
            if category == "":
                continue

            samples.append(tokenizer.encode(clean_text(text), pad_to_max_length=16, add_special_tokens=False))
            categories.append([category])

            if idx >= BATCH_SIZE * steps:
                categories = enc.transform(categories).toarray()
                #print(categories)
                #print(len(samples))
                yield tf.convert_to_tensor(samples, dtype=tf.int32),  tf.convert_to_tensor(categories, dtype=tf.int32)
                samples = []
                categories = []
                idx = 0

## Import camembert model

In [11]:
class TFRobertaClassificationHead(tf.keras.layers.Layer):
    """Head for sentence-level classification tasks."""

    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        self.init = get_initializer(config.initializer_range)
        self.batch = BatchNormalization()
        self.merge = Concatenate(axis=1)
        self.pool = MaxPooling1D(pool_size=2)
        self.global_pool = GlobalMaxPooling1D()
        self.dropout = Dropout(config.hidden_dropout_prob)
        self.dense = Dense(config.hidden_size, activation='relu', kernel_initializer=self.init)
        self.out_proj = Dense(config.num_labels, kernel_initializer=self.init, activation="sigmoid", name="out_proj")
        self.conv = Conv1D(
            filters=config.hidden_size,
            activation='relu',
            kernel_size=1,
            input_shape=(config.max_position_embeddings, config.hidden_size),
        )
        self.conv2 = Conv1D(
            filters=config.hidden_size,
            activation='relu',
            kernel_size=2,
        )

    def call(self, features, training=False):
        x = features[:, 0, :]
        x = self.conv(features)
        x = self.pool(x)
        x = self.conv(x)
        x = self.global_pool(x)
        x = self.dense(x)
        x = self.dropout(x, training=training)
        x = self.dense(x)
        x = self.out_proj(x)
        return x

In [12]:
class TFCamembertForSequenceClassification(TFRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.classifier = TFRobertaClassificationHead(config, name="classifier")

    def call(self, inputs, **kwargs):
        outputs = self.roberta(inputs, **kwargs)
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output, training=kwargs.get("training", False))
        outputs = (logits,) + outputs[2:]
        return outputs  # logits, (hidden_states), (attentions)

In [13]:
model = TFCamembertForSequenceClassification.from_pretrained(
    "jplu/tf-camembert-base",
    num_labels=len(labels)
)
tokenizer = CamembertTokenizer.from_pretrained(
    "jplu/tf-camembert-base",
    output_hidden_states=True,
    output_attentions=True
)

In [14]:
#special_tokens_dict = {'cls_token': '<CLS>'}
#tokenizer.add_special_tokens(special_tokens_dict)

## Test model

In [15]:
input_ids = tf.constant([tokenizer.encode("Sida. Une start-up française découvre une avancée majeure dans la lutte contre le VIH", add_special_tokens=False, pad_to_max_length=34)], tf.int32)
out = model(input_ids)

print(out)
print(np.argmax(out[0]) - 1)
print(list(labels)[np.argmax(out[0])])

(<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0.50389194, 0.4961208 , 0.50139683, 0.49829933]], dtype=float32)>,)
-1
santé


In [16]:
#assert False

## Train model on new dataset

In [17]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir="./log_january/"+time.strftime("%d%m%y/%H:%M:%S"))

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["categorical_accuracy"],
    #loss_weights=list(class_weights.values())
)

In [18]:
model.summary()

Model: "tf_camembert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  110621952 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  1184260   
Total params: 111,806,212
Trainable params: 111,806,212
Non-trainable params: 0
_________________________________________________________________


In [19]:
print(model.layers[0])
model.layers[0].trainable = False

<transformers.modeling_tf_roberta.TFRobertaMainLayer object at 0x7ff523e58dd0>


In [None]:
#labels_weight = np.array([14.5, 3.5, 1.0, 3.16])
#labels_weight = np.array([labels_weight[np.argmax(q)] for q in y])

model.fit(
    file_generator(),
    epochs=100,
    class_weight=class_weights,
    #sample_weight=labels_weight,
    steps_per_epoch=STEPS * 10,
    max_queue_size=8, 
    verbose=1,
    #validation_split = 0.2
)

Train for 320 steps
Epoch 1/100

In [None]:
input_ids = tf.constant(
    [
        #tokenizer.encode(
        #    clean_text("Sida. Une start-up française découvre une avancée majeure dans la lutte contre le VIH"),
        #    add_special_tokens=True
        #),
        tokenizer.encode(
            clean_text("Annuler l’Euro 2020 ferait perdre très gros à l’UEFA"),
            add_special_tokens=True
        )
    ], tf.int32
)
out = model(input_ids)

In [None]:
print(out)
print(np.argmax(out[0]))
print(list(labels)[np.argmax(out[0])])

print(labels)

In [None]:

c, d = file_generator(2)
#Confution Matrix and Classification Report
Y_pred = model(c)
#print(Y_pred)
y_pred = [labels[int(np.argmax(y))] for y in Y_pred[0]]
d = enc.inverse_transform(d)

print(Y_pred)
print(y_pred)
print(d)

print('Confusion Matrix')
print(confusion_matrix(d, y_pred))
print('Classification Report')
target_names = ['Cats', 'Dogs', 'Horse']
print(classification_report(d, y_pred, target_names=labels))