# Combining Knowledge Graphs and Deep Learning techniques for Categorizing Tweets
## BERT Models (TweetBERT, BERT, RoBERTa, CamemBERT, DistilBERT, Albert, Flaubert)


Authors:


Experiments:
* Applying RF, RNN and Bi-LSTM models to 2 datasets for classifying 4 binary categories.
* 2 datasets: (i) textual information and (ii) textual information and embeddings obtained from knowledge graph exploitation (KGE).
 
 


In [1]:
import nltk.data
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import regex as re
import string
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_colwidth', None)
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from simpletransformers.classification import ClassificationModel
import io
import os
import json
from collections import Counter
from wordcloud import WordCloud
import re, string, unicodedata
import nltk
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
nltk.download
from ast import literal_eval
'''
tweets = pd.read_csv('ed-dataset-falcon_spacy2-embeddings-sentence.csv', sep=';', encoding='utf8', converters=
                           {
                            'entities_instances_wikidata':literal_eval,
                            'spacy_entities_ids':literal_eval,
                            'spacy_entities_labels':literal_eval,
                            'falcon_spacy_entities':literal_eval,
                            'falcon_spacy_labels':literal_eval,
                            'falcon_spacy_embeddingsmd4_mw50_RW':literal_eval,
                            'falcon_spacy_embeddingsmd2_mw100_RW':literal_eval,
                            'sent_embedding_1':literal_eval,
                            'sent_embedding_2':literal_eval},error_bad_lines=False)

'''
# Load the tweet dataset (semicolon-separated, UTF-8). The two embedding columns
# are parsed with ast.literal_eval while reading.
# `error_bad_lines=False` is deprecated since pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its equivalent: malformed rows are dropped and a
# warning is emitted for each of them.
tweets = pd.read_csv(
    'dis-dataset-falcon_spacy2-embeddings-sentence-md4.csv',
    sep=';',
    encoding='utf8',
    converters={
        'falcon_spacy_embeddingsmd4_mw50_RW': literal_eval,
        'sent_embedding_1': literal_eval,
    },
    on_bad_lines='warn',
)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
# Notebook-level constants.
# NOTE(review): none of these three names is referenced again in the visible part
# of this notebook (train_args sets its own batch sizes) — confirm they are still needed.
n_unique_words = 10000 # cut texts after this number of words
maxlen = 20
batch_size = 8 

In [3]:
# Characters that clean_tweet deletes from the text. Written as a raw string so the
# backslash before `]` is a literal backslash instead of an invalid escape sequence
# (which Python warns about); the resulting value is unchanged.
punctuations = r"¡!#$%&'()*+,-./:;<=>¿?@[\]^_`{|}~"

def read_txt(filename):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Every newline character is removed from each line; no other stripping
    is applied, so surrounding spaces are kept.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()
    return [str(raw).replace('\n', '') for raw in raw_lines]

# Stop-word list read from a local file; each line of the file becomes one entry.
# Note: this rebinds the name `stopwords`, which the import cell bound to
# nltk.corpus.stopwords.
stopwords = read_txt('english_stopwords.txt')

# English Snowball stemmer; clean_tweet only uses it when called with stem=True.
stemmer = SnowballStemmer('english')


def clean_accents(tweet):
    """Replace lowercase accented vowels, ç, ý and ÿ with their plain letter.

    Uppercase accented characters and any other character (e.g. ñ) are left
    untouched. Returns the resulting string.
    """
    substitutions = (
        (r"[àáâãäå]", "a"),
        (r"ç", "c"),
        (r"[èéêë]", "e"),
        (r"[ìíîï]", "i"),
        (r"[òóôõö]", "o"),
        (r"[ùúûü]", "u"),
        (r"[ýÿ]", "y"),
    )
    for pattern, plain in substitutions:
        tweet = re.sub(pattern, plain, tweet)
    return tweet

def clean_tweet(tweet, stem = False):
    """Normalise a raw tweet into a space-separated string of tokens.

    Steps, in order: lowercase and strip; remove URLs; remove @mentions and
    #hashtags; turn newlines into spaces; replace accented letters
    (clean_accents); collapse laughter expressions; delete every character
    listed in the module-level `punctuations`; drop tokens found in the
    module-level `stopwords`; optionally stem what is left.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    stem : bool, default False
        When True, each kept token is passed through the module-level `stemmer`.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    tweet = tweet.lower().strip()
    # `https?` covers both http:// and https:// links, so one pass is enough.
    tweet = re.sub(r'https?:\/\/\S+', '', tweet)
    # Scheme-less links such as "www.example.com". The earlier pattern
    # (`www?:\/\/\S+`) only matched the literal text "ww://" / "www://" and
    # therefore never removed a real link.
    tweet = re.sub(r'\bwww\.\S+', '', tweet)
    # Mentions and hashtags are removed together with the whitespace in front of
    # them. `(?:^|\s)` also catches one at the very start of the tweet; with a
    # plain `\s` a leading "@user" survived and ended up as the word "user".
    tweet = re.sub(r'(?:^|\s)[@#][\w_-]+', "", tweet)
    tweet = re.sub(r"\n", " ", tweet)
    tweet = clean_accents(tweet)
    # Collapse laughter ("haha", "lol", "xd", "jaja", ...) into one placeholder.
    # NOTE(review): "<" and ">" are both in `punctuations`, so after the loop
    # below the placeholder reaches the output as the bare word "risas".
    tweet = re.sub(r"\b(a*ha+h[ha]*|o?l+o+l+[ol]*|x+d+[x*d*]*|a*ja+[j+a+]+)\b", "<risas>", tweet)
    for symbol in punctuations:
        tweet = tweet.replace(symbol, "")
    # Every punctuation character was deleted above, so no token can still be a
    # piece of `punctuations`; only the stop-word filter is needed.
    kept = [token for token in tweet.split() if token not in stopwords]
    if stem:
        kept = [stemmer.stem(token) for token in kept]
    return " ".join(kept)

In [4]:
# Work on a copy of the loaded frame and store the cleaned text (clean_tweet with
# its default arguments, i.e. no stemming) in a new 'text_cleaned' column.
tweets1 = tweets.copy()
tweets1['text_cleaned'] = tweets['text'].apply(clean_tweet)
print(tweets1['text_cleaned'].head(5))

0                                                           deeds reason may allah forgive us
1                                                       forest fire near la ronge sask canada
2    residents asked shelter place notified officers evacuation shelter place orders expected
3                                           13000 people receive evacuation orders california
4                                                      got sent photo ruby smoke pours school
Name: text_cleaned, dtype: object


In [5]:
# Build the inputs for the "text + embeddings" experiment: each sample is the raw
# tweet text followed by the string form of its `sent_embedding_1` value.
# Feature-combination approach taken from
# https://mccormickml.com/2021/06/29/combining-categorical-numerical-features-with-bert/

# Start from a fresh copy of the loaded data and replace missing values with "".
data_df = tweets.copy()
data_df = data_df.fillna("")

print('Combining features ...')

# One combined string per sample (sen_w_feats) and the matching label (labels).
sen_w_feats = []
labels = []
for text, embedding, target in zip(data_df["text"], data_df["sent_embedding_1"], data_df["target"]):
    sen_w_feats.append(text + " {:} ".format(embedding))
    labels.append(target)

print('  DONE.')

print('Dataset contains {:,} samples.'.format(len(sen_w_feats)))



Combining features ...
  DONE.
Dataset contains 7,309 samples.


In [6]:
# Two alternative inputs for the same labels:
#   X  - raw text with the sentence embedding appended as text (list of str)
#   X2 - cleaned text only (the 'text_cleaned' column)
X = sen_w_feats
X2 = tweets1['text_cleaned']

# Labels come from the 'target' column.
df = tweets1.copy()
Y1 = df['target']


# 70/30 split with a fixed seed so the split is the same on every run.
X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y1, test_size=0.3, random_state=42)


In [7]:
# Relative frequency of each label in the test split.
y1_test.value_counts(normalize=True)

0.0    0.562244
1.0    0.437756
Name: target, dtype: float64

In [8]:
# Arguments passed as `args=` to every ClassificationModel created in this notebook.
train_args = {
    "reprocess_input_data": True,
    "fp16": False,
    "evaluate_during_training": False,
    "evaluate_during_training_verbose": False,
    "learning_rate": 2e-5,
    "train_batch_size": 4,
    "eval_batch_size": 4,
    "num_train_epochs": 10,
    "overwrite_output_dir": True,
    # NOTE(review): unclear whether simpletransformers reads this key — confirm.
    "evaluation_strategy": "epochs",
}
#optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)


def f1_multiclass(labels, preds):
    """Return sklearn's `f1_score` of `preds` against `labels` with average='micro'."""
    micro_f1 = f1_score(labels, preds, average='micro')
    return micro_f1

def calcule_f1(df):
    """Compute F1 from confusion counts as tp / (tp + 0.5 * (fp + fn)).

    `df` is anything that can be indexed with the keys 'tp', 'fp' and 'fn'
    (in this notebook, a DataFrame of evaluation results); the result has the
    type produced by that arithmetic.
    """
    true_pos = df['tp']
    false_total = df['fp'] + df['fn']
    return(true_pos / (true_pos + 0.5 * false_total))

import torch
import gc
from tqdm import tqdm


# Empty frame. NOTE(review): not referenced again in the visible part of the notebook.
dfEval1 = pd.DataFrame()

In [9]:
# Number of train/evaluate repetitions per model; results are averaged over them.
# NOTE(review): the following cell rebinds N_ITER to 1, so when the notebook is run
# top to bottom this value is not the one in effect.
N_ITER = 5

In [10]:
# Overrides the N_ITER set in the previous cell: each model is trained and
# evaluated once.
N_ITER = 1

In [11]:
# Run the garbage collector and release PyTorch's cached GPU memory before training.
gc.collect()
torch.cuda.empty_cache()

In [12]:
import torch
# The training cell creates every model with use_cuda=True, so this should print True.
print(torch.cuda.is_available())


True


## 1. BERT Models applied to Category I - Tweets written by people suffering Eating Disorders

In [None]:
gc.collect()
torch.cuda.empty_cache()
limitsave=0

# simpletransformers model type and the matching pretrained checkpoint, position by position.
c_model_1 = ["bertweet","bert","roberta", "distilbert","camembert",  "albert", "flaubert"]
c_model_2 = ["vinai/bertweet-base","bert-base-multilingual-cased","roberta-base","distilbert-base-cased", "camembert-base", "albert-base-v1", "flaubert/flaubert_base_cased"]


def _run_bert_models(train_df, test_df, results_csv, save_limit=0):
    """Fine-tune and evaluate every model in `c_model_1` on one train/test split.

    Each model is trained and evaluated `N_ITER` times with `train_args`. The
    evaluation results of those runs are averaged column-wise and the 'f1'
    column is then recomputed from the averaged tp/fp/fn with `calcule_f1`.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Two-column frames (text, label) passed to `train_model` / `eval_model`.
    results_csv : str
        Path of the CSV that is rewritten after each model finishes, so partial
        results survive an interrupted run.
    save_limit : int, default 0
        Runs with index < save_limit are stored with `torch.save`.

    Returns
    -------
    pandas.DataFrame
        One row of averaged metrics per model, in `c_model_1` order.
    """
    per_model_rows = []
    total = pd.DataFrame()
    for model_type, model_name in zip(c_model_1, c_model_2):
        run_results = []
        for i in range(N_ITER):
            model1 = ClassificationModel(
                model_type, model_name,
                use_cuda = True,
                args=train_args
            )
            model1.train_model(train_df)
            result1, _model_outputs, _wrong_predictions = model1.eval_model(
                test_df, f1=f1_multiclass, acc=accuracy_score
            )
            print(result1)

            if i < save_limit:
                torch.save(model1, 'model1'+str(i)+'.pt')
            # Drop the model and free GPU memory before the next one is created.
            del model1
            gc.collect()
            torch.cuda.empty_cache()

            # Results are collected per model and per experiment, so runs of
            # different experiments can never end up in the same average.
            run_results.append(result1)

        averaged = pd.DataFrame(run_results).mean(axis=0).to_frame().T
        averaged['f1'] = calcule_f1(averaged)
        per_model_rows.append(averaged)

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        total = pd.concat(per_model_rows)
        total.to_csv(results_csv)
    return total


# Experiment 1: raw text with the sentence embedding appended (X).
X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y1, test_size=0.3, random_state=42)
train_df1 = pd.DataFrame({ 'text_cleaned': X1_train, 'target': y1_train })
test_df1 = pd.DataFrame({ 'text_cleaned': X1_test, 'target': y1_test })
dfResultsModelsTotal = _run_bert_models(train_df1, test_df1, 'dfResultsModelsTotalEMB-1.csv', limitsave)

# Experiment 2: cleaned text only (X2). Written to its own file so it does not
# overwrite the CSV produced by experiment 1.
X1_train, X1_test, y1_train, y1_test = train_test_split(X2, Y1, test_size=0.3, random_state=42)
train_df1 = pd.DataFrame({ 'text_cleaned': X1_train, 'target': y1_train })
test_df1 = pd.DataFrame({ 'text_cleaned': X1_test, 'target': y1_test })
dfResultsModelsTotalw = _run_bert_models(train_df1, test_df1, 'dfResultsModelsTotalTXT-1.csv', limitsave)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

  0%|          | 0/5116 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/1279 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/1279 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/1279 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/1279 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/1279 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/1279 [00:00<?, ?it/s]

### 1.1. Results using original dataset (texts) + embeddings obtained from knowledge graph exploitation

In [None]:
# Display names for the result rows, in the same order as the models in c_model_1.
indexBERT = ['TweetBERT','BERT','RoBERTa','DistilBERT','CamemBERT','Albert','Flaubert']

#dfResultsModelsTotal.reindex(indexBERT)
# Replace the row index with the display names; the index assignment needs
# exactly one row per name.
dfResultsModelsTotal = dfResultsModelsTotal.reset_index(drop=True)
dfResultsModelsTotal.index = indexBERT
dfResultsModelsTotal

# Keep a copy of the text + embeddings results under a separate name and display it.
dfResultssModelsTotal1 = dfResultsModelsTotal.copy()
dfResultssModelsTotal1

### 1.2. Results using original dataset (texts)

In [None]:
# Display names for the result rows, in the same order as the models in c_model_1.
indexBERT = ['TweetBERT','BERT','RoBERTa','DistilBERT','CamemBERT','Albert','Flaubert']

#dfResultsModelsTotal.reindex(indexBERT)
# Replace the row index with the display names; the index assignment needs
# exactly one row per name.
dfResultsModelsTotalw = dfResultsModelsTotalw.reset_index(drop=True)
dfResultsModelsTotalw.index = indexBERT
dfResultsModelsTotalw

# Keep a copy of the text-only results under a separate name and display it.
dfResultssModelsTotal1w = dfResultsModelsTotalw.copy()
dfResultssModelsTotal1w

## 2. BERT Models applied to Category II - Tweets promoting Eating Disorders

## 3. BERT Models applied to Category III - Informative tweets

### 3.1. Results using original dataset (texts) + embeddings obtained from knowledge graph exploitation

### 3.2. Results using original dataset (texts) 

## 4. BERT Models applied to Category IV - Scientific Tweets

### 4.1. Results using original dataset (texts) + embeddings obtained from knowledge graph exploitation

### 4.2. Results using original dataset (texts) 