In [None]:
import re
import csv
import json
import time
import random
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight
from transformers import (
    CamembertTokenizer,
    CamembertForSequenceClassification
)
from transformers.modeling_tf_utils import get_initializer
from tensorflow.keras.layers import * 

In [None]:
# Select CUDA device 0 as the current device for this kernel.
torch.cuda.set_device(0)

In [None]:
# Display the name of CUDA device 0.
torch.cuda.get_device_name(0)

In [None]:
# Number of samples file_generator() accumulates before yielding a batch.
BATCH_SIZE = 16

## Import dataset

In [None]:
# Class labels for the one-hot encoder, listed in alphabetical order (author's note).
# NOTE(review): 'économie' is last, presumably because 'é' sorts after the ASCII
# letters by code point -- confirm against enc.classes_.
labels = ['santé', 'science_high-tech', 'sports', 'économie'] # not used: 'international', 'culture', 'france', 'homepage'
# Author's note, presumably an earlier set of per-class weights:
# {'sports': 1.0, 'économie': 3.116034374345001, 'santé': 17.25464252553389, 'science_high-tech': 3.7884409561184444}

# Per-class weights keyed by integer class index.
# NOTE(review): not referenced anywhere else in the visible notebook; the loss is
# created without a weight argument.
class_weights = {
    0: 1.0,
    1: 1.0959043500381582,
    2: 1.0959043500381582,
    3: 1.0788880540946657,
}

In [None]:
# Fit the label binarizer on the class list; fit() returns the fitted encoder,
# which is then displayed as the cell output.
enc = LabelBinarizer().fit(labels)
enc

In [None]:
# Print the encoded row produced for each class, in the order of `labels`.
for label_name in labels:
    print(enc.transform([label_name]))

In [None]:
# French stop words; clean_text() drops every token found in this set.
stopwords = set(['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez', 'eussent'])

# Pre-compiled patterns used by clean_text() to normalise raw text.
# Runs of whitespace (replaced by a single space).
whitespace = re.compile("[\\s]+", re.UNICODE)
# Hyphen/dash look-alikes (replaced by '-').
dash = re.compile("[\\-\\˗\\֊\\‐\\‑\\‒\\–\\—\\⁻\\₋\\−\\﹣\\－]")
# Opening bracket variants (replaced by '(').
left_parenthesis_filter = re.compile("[\\(\\[\\{\\⁽\\₍\\❨\\❪\\﹙\\（]")
# Closing bracket variants (replaced by ')').
right_parenthesis_filter = re.compile("[\\)\\]\\}\\⁾\\₎\\❩\\❫\\﹚\\）]")
# Currency symbols (replaced by '€').
currencies = re.compile("[¥£₪$€฿₨]")
# The HTML entity &#39; plus apostrophe-like characters (replaced by "'").
# The nine {} placeholders are filled with the characters for the code points
# passed to chr() below.
apostrophe_filter = re.compile(
    r'&#39;|[ʼ՚＇‘’‛❛❜ߴߵ`‵´ˊˋ{}{}{}{}{}{}{}{}{}]'.format(
        chr(768), chr(769), chr(832),
        chr(833), chr(2387), chr(5151),
        chr(5152), chr(65344), chr(8242)
    ), re.UNICODE
)
# Any character that is neither a word character, whitespace, nor one of the
# punctuation marks listed in the escaped string (matches are deleted).
basic_cleaner = re.compile(r'[^\w\s{}]'.format(re.escape("€-!?/;\"'%&<>.()@#:,|=*")), re.UNICODE)

In [None]:
def get_main_category(dictOfNames, allowed_labels=None):
    """Return the label with the highest summed value in a keyed mapping.

    Each key is normalised by removing the substrings ``desktop_`` /
    ``mobile_webview_`` and then ``google_``. Values whose keys normalise to
    the same label are added together; keys that do not normalise to an
    allowed label are ignored.

    Args:
        dictOfNames: mapping of key -> numeric value. ``None`` or an empty
            mapping means "no category".
        allowed_labels: optional collection of accepted labels. Defaults to
            the notebook-level ``labels`` list.

    Returns:
        The label with the largest total (the first one encountered wins a
        tie), or ``""`` when no key maps to an allowed label.
    """
    if not dictOfNames:
        return ""
    if allowed_labels is None:
        allowed_labels = labels

    totals = {}
    for key, value in dictOfNames.items():
        # Two passes on purpose: "google_" is stripped from the result of the
        # first substitution.
        label = re.sub(r'desktop_|mobile_webview_', "", key)
        label = re.sub(r'google_', "", label)
        if label not in allowed_labels:
            continue
        totals[label] = totals.get(label, 0) + value

    # Explicit emptiness check instead of catching max()'s ValueError, so an
    # unrelated ValueError is never silently turned into "no category".
    if not totals:
        return ""
    return max(totals, key=totals.get)

In [None]:
def clean_text(text):
    """Normalise a raw text and remove French stop words.

    The text is lower-cased and stripped, then whitespace runs, dash
    variants, currency symbols, apostrophe variants and bracket variants are
    mapped to a single canonical character each, and characters rejected by
    ``basic_cleaner`` are deleted. The result is split on runs of non-word
    characters, and tokens present in ``stopwords`` are dropped.

    Args:
        text: the string to clean.

    Returns:
        The remaining tokens joined by single spaces, with no leading or
        trailing space.
    """
    text = text.lower().strip()
    text = whitespace.sub(' ', text)
    text = dash.sub('-', text)
    text = currencies.sub('€', text)
    text = apostrophe_filter.sub("'", text)
    text = left_parenthesis_filter.sub("(", text)
    text = right_parenthesis_filter.sub(")", text)
    text = basic_cleaner.sub('', text)

    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    tokens = re.split(r"\W+", text)
    # re.split yields '' when the text starts or ends with a non-word
    # character; skipping those avoids stray leading/trailing spaces.
    return " ".join(token for token in tokens if token and token not in stopwords)

In [None]:
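# One-off preprocessing, kept for reference: drop the header line of
# since_january.csv, shuffle the remaining lines and write them to
# shuffled_since_january.csv (the file read by file_generator()).
#lines = open('since_january.csv').readlines()
#lines = lines[1:]
#random.shuffle(lines)
#print("# lines : ", len(lines))
#open('shuffled_since_january.csv', 'w').writelines(lines)
#del lines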

In [None]:
def file_generator(max_length=32):
    """Endlessly yield training batches read from ``shuffled_since_january.csv``.

    The file is re-read from the start each time it is exhausted. For every
    row, column 0 is the text, column 3 must be one of ``labels`` and column 4
    is a JSON string passed to ``get_main_category`` to pick the class.
    Rows of the over-represented classes are down-sampled by row index.

    Args:
        max_length: token length every encoded sample is truncated/padded to.

    Yields:
        ``(token_ids, one_hot_labels)`` CUDA tensors holding ``BATCH_SIZE``
        samples each.

    Raises:
        ValueError: if a full pass over the file produces no usable row
            (otherwise the generator would spin forever).
    """
    samples = []
    categories = []
    while True:
        rows_kept = 0
        with open('shuffled_since_january.csv', 'r', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            for i, row in enumerate(reader):
                # Columns 0, 3 and 4 are read below, so five columns are required.
                if len(row) < 5 or row[3] not in labels:
                    continue
                if row[4] == "":
                    continue

                category = get_main_category(json.loads(row[4]))
                if category == "":
                    continue

                # Author's class counts:
                # {'sports': 74333, 'économie': 23855, 'santé': 4308, 'science_high-tech': 19621}
                # Keep only every n-th row of the larger classes.
                if category == 'sports' and i % 19 != 0:
                    continue
                elif category == 'économie' and i % 6 != 0:
                    continue
                elif category == 'science_high-tech' and i % 5 != 0:
                    continue

                # pad_to_max_length is a boolean flag; the target length goes
                # in max_length (which also truncates longer inputs).
                samples.append(
                    tokenizer.encode(
                        clean_text(row[0]),
                        add_special_tokens=True,
                        max_length=max_length,
                        pad_to_max_length=True,
                    )
                )
                categories.append(category)
                rows_kept += 1

                # Use the buffer length rather than a counter reset on every
                # reopen, so a partial batch carried over from the previous
                # pass never produces an oversized batch.
                if len(samples) >= BATCH_SIZE:
                    one_hot = enc.transform(categories)
                    yield torch.tensor(samples).cuda(), torch.tensor(one_hot).cuda()
                    samples = []
                    categories = []

        if rows_kept == 0:
            raise ValueError("shuffled_since_january.csv contains no usable row")

## Load the pretrained CamemBERT model and tokenizer

In [None]:
# Pretrained CamemBERT with a sequence-classification head sized to the label set.
model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base",
    num_labels=len(labels),
    #force_download=True
)
# output_hidden_states / output_attentions are model-configuration options and
# have no effect on tokenization, so they are not passed to the tokenizer.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

In [None]:
# Smoke test: pull the first three batches from the generator, then stop.
batch_stream = file_generator()
for idx in range(3):
    a, b = next(batch_stream)
batch_stream.close()

## Test model

In [None]:
# Run one headline through the model before any fine-tuning and print the raw output.
sample_headline = "Sida. Une start-up française découvre une avancée majeure dans la lutte contre le VIH"
sample_token_ids = tokenizer.encode(sample_headline, add_special_tokens=True)
# unsqueeze(0) adds the leading batch dimension.
input_ids = torch.tensor(sample_token_ids).unsqueeze(0)
out = model(input_ids)

print(out)

In [None]:
#assert False

## Train model on new dataset

In [None]:
import torch.nn as nn

# Learning rate for fine-tuning the pretrained transformer. The previous value
# (0.01) is several orders of magnitude above the range normally used for
# fine-tuning BERT-style models and destroys the pretrained weights.
LEARNING_RATE = 2e-5

# Move the model to the GPU before creating the optimizer over its parameters.
model.cuda()

# Loss over class indices (targets are integer class ids, not one-hot rows).
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
from sklearn.metrics import accuracy_score

# file_generator() never terminates on its own, so training needs an explicit bound.
# NOTE(review): 1000 batches of 16 is roughly one pass over the down-sampled
# data according to the class counts noted in file_generator() -- tune as needed.
MAX_TRAIN_STEPS = 1000

# from_pretrained() leaves the model in eval mode; enable dropout for training.
model.train()

losses = []
for step, (x_batch, y_batch) in enumerate(file_generator()):
    # Clear the gradients left by the previous step.
    optimizer.zero_grad()
    # Ignore padding positions in self-attention.
    attention_mask = (x_batch != tokenizer.pad_token_id).long()
    # Predict class logits for the whole batch (first element of the output).
    logits = model(x_batch, attention_mask=attention_mask)[0]
    # CrossEntropyLoss expects class indices: convert the one-hot rows.
    targets = torch.argmax(y_batch, dim=1)
    loss = criterion(logits, targets)
    # Compute gradients and update the weights.
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

    if step + 1 >= MAX_TRAIN_STEPS:
        break

In [None]:
# PyTorch modules have no Keras-style summary(); printing the module lists its layers.
print(model)

In [None]:
# Freeze the pretrained encoder so that only the classification head stays trainable
# (PyTorch equivalent of the Keras `layers[0].trainable = False`).
# NOTE(review): this only affects optimisation steps run after this cell.
print(type(model.base_model).__name__)
for param in model.base_model.parameters():
    param.requires_grad = False

In [None]:
# Classify one cleaned headline with the PyTorch model (the previous version
# built the input with TensorFlow, which is neither imported as `tf` nor
# accepted by the PyTorch model).
# Alternative sample: "Sida. Une start-up française découvre une avancée majeure dans la lutte contre le VIH"
input_ids = torch.tensor(
    [
        tokenizer.encode(
            clean_text("Annuler l’Euro 2020 ferait perdre très gros à l’UEFA"),
            add_special_tokens=True
        )
    ],
    dtype=torch.long,
    # Put the input on the same device as the model's weights.
    device=next(model.parameters()).device,
)
# Inference only: disable dropout and gradient tracking.
model.eval()
with torch.no_grad():
    out = model(input_ids)

In [None]:
# out[0] holds the logits for the single input row; argmax() without a dim
# works on the flattened tensor, and int() copies the index off the GPU.
predicted_index = int(out[0].argmax())

print(out)
print(predicted_index)
print(labels[predicted_index])

print(labels)

In [None]:

# Take a single batch for evaluation, then close the generator so the CSV
# file it holds open is released.
eval_batches = file_generator()
c, d = next(eval_batches)
eval_batches.close()

# Confusion matrix and classification report
model.eval()
with torch.no_grad():
    Y_pred = model(c, attention_mask=(c != tokenizer.pad_token_id).long())
# Pick the class index on the GPU, then move the indices to the CPU.
predicted_indices = Y_pred[0].argmax(dim=1).cpu().tolist()
y_pred = [labels[index] for index in predicted_indices]
# CUDA tensors must be copied to the CPU before converting to NumPy.
d = enc.inverse_transform(d.cpu().numpy())

for x, y in zip(d, y_pred):
    print(x, "/", y)

# Passing labels= keeps rows/columns in a fixed order and keeps the report
# valid even when a class is absent from this batch.
print('Confusion Matrix')
print(confusion_matrix(d, y_pred, labels=labels))
print('Classification Report')

print(classification_report(d, y_pred, labels=labels, target_names=labels))