# CamemBERT pre-trained model with a classification head, fine-tuned on political tweets for binary classification (left or right)

To train the model on all the data, comment out this line: `df = df.sample(100)`

## 0. Install the packages and import libraries

In [None]:
!pip install -q transformers
!pip install -q datasets
!pip install pymongo[srv]
!pip install mlflow --quiet
!pip install pytorch-lightning

In [29]:
# Set the GPU with PyTorch on a CUDA-capable system

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

# Import the data

from pymongo import MongoClient
import pandas as pd
from datasets import Dataset, DatasetDict
import getpass

# Set MLFlow

import mlflow
import mlflow.pytorch

# Preprocessing

from sklearn.preprocessing import LabelEncoder

# NLP libraries from Transformers / Huggingface

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

## 1. Connect to Databricks and set MLflow

Configure the connection to Databricks, where the models, parameters and metrics will be logged in an MLflow experiment.

In [None]:
# Point the Databricks CLI at the Community Edition host.
# NOTE(review): presumably prompts interactively for credentials — confirm.
!databricks configure --host https://community.cloud.databricks.com/

Set the MLflow experiment in Databricks (it is created if it does not exist yet).

In [31]:
# The experiment path is built from the user's Databricks account name, so
# stray whitespace from a copy-paste (or an empty answer) would produce an
# invalid path; normalise and validate before using it.
databricks_username = input("Enter Databricks username (e-mail):").strip()
if not databricks_username:
    raise ValueError("A Databricks username (e-mail) is required to locate the MLflow experiment.")

mlflow_experiment_name = "popolibot_model_search"
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment(f"/Users/{databricks_username}/{mlflow_experiment_name}")

## 2. Import the data

The data is imported from a MongoDB NoSQL database as a list of JSON-like dictionaries, then turned into a Pandas DataFrame.

In [32]:
# Enter the password for logging into the MongoDB database.
# getpass is used so the typed value is not echoed into the notebook output.
password = getpass.getpass()

··········


In [33]:
# Imported here so this cell stays self-contained.
from urllib.parse import quote_plus

# Percent-encode the password before embedding it in the connection string:
# reserved characters such as '@', ':' or '/' would otherwise make the URI
# invalid. Enter the raw password at the prompt, not an already-encoded one.
uri = (
    f"mongodb+srv://politweet:{quote_plus(password)}"
    "@cluster0.0rn9i.mongodb.net/politweet?retryWrites=true&w=majority"
)
client = MongoClient(uri)
db = client.politweet

# Shared query: keep only documents whose retweetedTweet and quotedTweet flags
# are False, and fetch only the fields used downstream.
tweet_filter = {"retweetedTweet": False, "quotedTweet": False}
tweet_fields = {"_id": 0, "content": 1, "group": 1, "date": 1}

# Retrieve tweets from the French deputies
tweets = list(db.tweets.find(tweet_filter, tweet_fields))

# Retrieve tweets from the French political parties' official Twitter accounts
party_tweets = list(db.party_tweets.find(tweet_filter, tweet_fields))
tweets = tweets + party_tweets

# Put the data in a Pandas DataFrame
df = pd.DataFrame(tweets)
df.head()

Unnamed: 0,group,date,content
0,AGIR-E,2021-02-16 18:28:53,Antoine Herth et Valérie Petit au nom d’⁦@Agir...
1,AGIR-E,2021-03-26 17:43:02,C’est avec plaisir que j’ai accueilli @franckr...
2,AGIR-E,2020-09-25 08:36:51,"Avec lui, nous veillerons à ce que la nouvelle..."
3,AGIR-E,2020-09-24 12:24:52,🎥 ➡️ https://t.co/qqxN74AQAn
4,AGIR-E,2020-09-22 06:30:55,J’apprends ce matin que Gilbert Meyer nous a q...


In [34]:
# Number of tweets per political group
df["group"].value_counts()

LAREM     153728
LR         40428
FI         36407
DEM        25608
AGIR-E     14242
SOC        11024
EDS         8987
LT          8085
UDI_I       7698
LND         6312
RN          6200
DLF         5062
GDR         3839
GE          1297
LDS          123
Name: group, dtype: int64

In [35]:
# "AGIR-E": "centre-droit", # https://fr.wikipedia.org/wiki/Groupe_Agir_ensemble
# "DEM": "centre", # https://fr.wikipedia.org/wiki/Mouvement_d%C3%A9mocrate_(France)
# "DLF": "droite-plus", # https://fr.wikipedia.org/wiki/Debout_la_France
# "EDS": "centre-gauche", # https://fr.wikipedia.org/wiki/Groupe_%C3%89cologie_d%C3%A9mocratie_solidarit%C3%A9
# "FI": "gauche-plus", # https://fr.wikipedia.org/wiki/La_France_insoumise
# "GDR": "gauche-plus", # https://fr.wikipedia.org/wiki/Groupe_de_la_Gauche_d%C3%A9mocrate_et_r%C3%A9publicaine
# "GE": "centre-gauche", # https://fr.wikipedia.org/wiki/G%C3%A9n%C3%A9ration_%C3%A9cologie
# "LAREM": "centre", # https://fr.wikipedia.org/wiki/La_R%C3%A9publique_en_marche
# "LDS": "droite-plus", # https://fr.wikipedia.org/wiki/Ligue_du_Sud_(France)
# "LND": "gauche", # https://fr.wikipedia.org/wiki/Les_Nouveaux_D%C3%A9mocrates
# "LR": "droite", # https://fr.wikipedia.org/wiki/Les_R%C3%A9publicains
# "LT": "centre", # https://fr.wikipedia.org/wiki/Groupe_Libert%C3%A9s_et_territoires
# "RN": "droite-plus", # https://fr.wikipedia.org/wiki/Rassemblement_national
# "SOC": "gauche", # https://fr.wikipedia.org/wiki/Groupe_socialiste_(Assembl%C3%A9e_nationale)
# "UDI_I": "centre-droit", # https://fr.wikipedia.org/wiki/Union_des_d%C3%A9mocrates_et_ind%C3%A9pendants

def get_target(group):
    """
    Map a political group code to a side of the political spectrum.

    Returns 'droite' or 'gauche' when the group is listed under that side,
    and 'inconnu' for any other value (e.g. a group of the centre).
    """
    members_by_side = {
        "droite": ("DLF", "LDS", "LR", "RN"),
        "gauche": ("FI", "GDR", "LND", "SOC"),
    }
    for side, members in members_by_side.items():
        if group in members:
            return side
    return "inconnu"
  

In [36]:
# Derive the left/right target from each tweet's political group
df["target"] = df["group"].map(get_target)

In [37]:
# Drop the rows labelled 'inconnu', then count the tweets per target value.
df = df[df["target"] != "inconnu"]
df["target"].value_counts()

gauche    57582
droite    51813
Name: target, dtype: int64

In [38]:
# Sample the dataframe if needed for a quick test
# (100 random rows; no random_state is passed, so the subset changes on every run).
df = df.sample(100)

In [39]:
def balance_class(df, field, random_state=None):
  '''
  Downsample every class of `field` to the size of the smallest class.

  Parameters:
    df: the DataFrame to balance.
    field: name of the column holding the class labels.
    random_state: optional seed forwarded to the sampler so the selection
      can be reproduced; None (the default) keeps it random.

  Returns a new DataFrame with a fresh 0..n-1 index in which every class of
  `field` has the same number of rows. An empty input is returned empty.
  '''
  # Nothing to balance, and min() of an empty count would raise.
  if df.empty:
    return df.reset_index(drop=True)

  min_size = df[field].value_counts().min()
  # GroupBy.sample keeps every column, including the grouping one, which
  # groupby(...).apply(...) no longer guarantees in recent pandas versions.
  return (
      df.groupby(field)
      .sample(n=min_size, random_state=random_state)
      .reset_index(drop=True)
  )

# Balance the two target classes (the larger one is cut down to the size of the smaller one)
df = balance_class(df, 'target')

# Tweet count after calling the balance_class function (should be twice the same number)
df.target.value_counts()

droite    41
gauche    41
Name: target, dtype: int64

In [40]:
# Display the balanced DataFrame
df

Unnamed: 0,group,date,content,target
0,LR,2021-01-14 11:27:23,Nous échangeons avec Jacqueline @herremans_ et...,droite
1,LR,2020-06-07 13:09:54,"🗨️ @ChJacob77 : ""700 000 jeunes vont arriver s...",droite
2,LR,2020-07-08 09:52:38,#directAN #CDDAT Mon intervention sur le fret ...,droite
3,LR,2020-08-26 18:00:00,👍🗣️ @nadine__morano sera présente pour le @rdv...,droite
4,RN,2020-07-13 09:55:01,📹 @sebchenu : « La transformation de la basili...,droite
...,...,...,...,...
77,FI,2020-08-22 07:56:36,Suivez en direct les #AMFiS2020 de la @francei...,gauche
78,FI,2021-06-18 11:57:20,"Merci à Jacques Weber, Jean Pierre Darroussin ...",gauche
79,SOC,2021-01-14 10:07:48,@Rochelaisgentil Depuis octobre nous alertons ...,gauche
80,FI,2021-05-16 09:48:08,🇵🇸 En interdisant des manifestations pro-pales...,gauche


In [41]:
# Turn the target labels into integers (0 or 1).
# The fitted encoder `le` is kept so predictions can be decoded back to labels later.
le = LabelEncoder()
le.fit(df["target"])
df["target"] = le.transform(df["target"])

In [42]:
# Convert the Pandas DataFrame to a Dataset object (Hugging Face `datasets` library),
# then display it to check its features and row count
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['group', 'date', 'content', 'target'],
    num_rows: 82
})

## 3. Preprocessing

We call the `train_test_split` method on the dataset object to create 2 new sets: train and validation.

We then call this method again on the validation dataset to split it into validation and test.

As a result we now have 3 different datasets: train (60%), validation (20%) and test (20%).

In [43]:
# First split: 60% of the rows for training, the remainder held out
first_split = dataset.train_test_split(train_size=0.6)
train_dataset = first_split["train"]
validation_dataset = first_split["test"]

In [44]:
# Second split: cut the held-out part in half to get validation and test
holdout_split = validation_dataset.train_test_split(test_size=0.5)
validation_dataset = holdout_split["train"]
test_dataset = holdout_split["test"]

We put these 3 datasets in a DatasetDict object 'raw_datasets' that will be passed in a tokenize function.

In [45]:
# Gather the three splits under the names used in the rest of the notebook
raw_datasets = DatasetDict()
raw_datasets["train"] = train_dataset
raw_datasets["validation"] = validation_dataset
raw_datasets["test"] = test_dataset
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['group', 'date', 'content', 'target'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['group', 'date', 'content', 'target'],
        num_rows: 25
    })
    test: Dataset({
        features: ['group', 'date', 'content', 'target'],
        num_rows: 25
    })
})

Now we set the checkpoint of the Transformers model, that will be also used to grab its tokenizer.

In [46]:
# The same checkpoint name is reused further down to load the classification model
checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the 'content' field of `example` with the notebook's tokenizer, truncation enabled."""
    texts = example["content"]
    return tokenizer(texts, truncation=True)


# Tokenize every split; batched=True makes map() call tokenize_function on batches of rows
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# This collate function will apply the correct amount of padding to
# the items of the dataset we want to batch together.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The tokenize function adds the features 'attention_mask' and 'input_ids' needed by the model.

In [47]:
# Inspect the features of each split after tokenization
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'content', 'date', 'group', 'input_ids', 'target'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['attention_mask', 'content', 'date', 'group', 'input_ids', 'target'],
        num_rows: 25
    })
    test: Dataset({
        features: ['attention_mask', 'content', 'date', 'group', 'input_ids', 'target'],
        num_rows: 25
    })
})

Now we remove the columns 'content', 'date' and 'group', and we rename
the 'target' column to 'labels', as this is the name the model expects.

In [48]:
# Drop the raw text/metadata columns and rename the label column in one chain
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['content', 'date', 'group'])
    .rename_column("target", "labels")
)
# Have the datasets return PyTorch tensors
tokenized_datasets.set_format("torch")
tokenized_datasets['train'].column_names

['attention_mask', 'input_ids', 'labels']

In [49]:
# Check the feature types of the columns left in the training split
tokenized_datasets['train'].features


{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'labels': Value(dtype='int64', id=None)}

We set the dataloaders that we will use to iterate over batches.

In [50]:
from torch.utils.data import DataLoader

# Settings shared by both loaders: batches of 8, padded per batch by the collator
loader_kwargs = dict(batch_size=8, collate_fn=data_collator)

# Only the training data is shuffled
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)

In [51]:
# Pull a single batch from the training loader and show the shape of each tensor in it
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'attention_mask': torch.Size([8, 82]),
 'input_ids': torch.Size([8, 82]),
 'labels': torch.Size([8])}

## 3. Train and evaluate

This cell groups the training of the model and its evaluation inside an MLflow run. This will log the parameters, the accuracy and the model in the MLflow experiment.

In [52]:
with mlflow.start_run():

  # Instantiate the model: the pre-trained checkpoint with a 2-label classification head
  model = AutoModelForSequenceClassification.from_pretrained(
      checkpoint,
      num_labels=2
      )
  model.to(device)

  # Set the hyper-parameters
  learning_rate = 5e-5
  num_epochs = 3
  num_training_steps = num_epochs * len(train_dataloader)

  # torch.optim.AdamW replaces transformers.AdamW, which is deprecated.
  # eps and weight_decay are set explicitly to the values the transformers
  # class used by default, so training behaves essentially as before.
  optimizer = torch.optim.AdamW(
      model.parameters(),
      lr=learning_rate,
      eps=1e-6,
      weight_decay=0.0
      )
  lr_scheduler = get_scheduler(
      "linear",
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps
      )
  mlflow.log_param("checkpoint", checkpoint)
  mlflow.log_param("learning_rate", learning_rate)
  mlflow.log_param("num_epochs", num_epochs)
  mlflow.log_param("num_training_steps", num_training_steps)

  # Train the model

  progress_bar = tqdm(range(num_training_steps))
  model.train()
  for epoch in range(num_epochs):
      for batch in train_dataloader:
          batch = {k: v.to(device) for k, v in batch.items()}
          outputs = model(**batch)
          loss = outputs.loss
          loss.backward()

          optimizer.step()
          lr_scheduler.step()  # one scheduler step per optimizer step
          optimizer.zero_grad()
          progress_bar.update(1)
  progress_bar.close()

  # Switch to inference mode BEFORE logging, so that the saved model is
  # (presumably) restored with dropout disabled when it is loaded back.
  model.eval()
  mlflow.pytorch.log_model(model, "model") # log the model in MLFlow

  # Model evaluation: plain accuracy on the validation set, computed directly
  # (no metric script to download).

  num_correct = 0
  num_examples = 0
  for batch in eval_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      with torch.no_grad():
          outputs = model(**batch)

      predictions = torch.argmax(outputs.logits, dim=-1)
      num_correct += (predictions == batch["labels"]).sum().item()
      num_examples += batch["labels"].numel()

  # Guard against an empty validation set instead of dividing by zero
  accuracy = num_correct / num_examples if num_examples else float("nan")
  if num_examples:
      mlflow.log_metric("accuracy", accuracy)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

  0%|          | 0/12 [00:00<?, ?it/s]



## 4. Make predictions

With the run_id of the best model, we can load it from Databricks and use it for predictions.

In [53]:
# Best model's id from MLFlow in Databricks
# NOTE(review): run_id is hard-coded to one specific run — update it after re-training.
run_id =  "6ad15808d761447fa46a07ec88f13669"
# "model" is the artifact name the training cell passes to mlflow.pytorch.log_model
model_name = "model"
best_model  = mlflow.pytorch.load_model(f"runs:/{run_id}/{model_name}")

Finally, we can test our best model using any sentence.

In [54]:
test_sentence = """
il faut demander leur passeport aux oiseaux migrateur
"""
# Class names in the order of the encoded labels 0 and 1
classes = le.inverse_transform([0, 1])

# Make sure the model sits on the same device as the inputs and is in
# inference mode: depending on how it was saved it may come back in training
# mode, where dropout makes the probabilities vary from call to call.
best_model.to(device)
best_model.eval()

# Same tokenizer call and truncation as used for the training data
test_tokens = tokenizer(test_sentence, truncation=True, return_tensors="pt").to(device)

# No gradients are needed for a prediction
with torch.no_grad():
    classification_logits = best_model(**test_tokens)[0]

# Probabilities for the single sentence in the batch
results = torch.softmax(classification_logits, dim=1).tolist()[0]

# (class, probability) pairs, most likely class first
results_dict = sorted(zip(classes, results), key=lambda x: x[1], reverse=True)
print(test_sentence)
for label, probability in results_dict:
    print(f"C'est de {label}: {round(probability * 100)}%")


il faut demander leur passeport aux oiseaux migrateur

C'est de droite: 97%
C'est de gauche: 3%
