# Combining Knowledge Graphs and Deep Learning techniques for Categorizing Tweets
## Random Forest vs RNN vs Bi-LSTM


Authors:
<!-- ¡ -->

Experiments:
* Applying RF, RNN and Bi-LSTM models to 2 datasets for classifying 4 binary categories.
* 2 datatasets: (i) textual information and (ii) textual information and embeddings obtained from knowledge graph exploitation (KGE).
 
 
## 1. Random Forest

In [1]:
import os
import pandas as pd
import numpy as np
import json
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re, string, unicodedata
import nltk
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
nltk.download
from ast import literal_eval
'''
tweets = pd.read_csv('ed-dataset-falcon_spacy2-embeddings-sentence.csv', sep=';', encoding='utf8', converters=
                           {
                            'entities_instances_wikidata':literal_eval,
                            'spacy_entities_ids':literal_eval,
                            'spacy_entities_labels':literal_eval,
                            'falcon_spacy_entities':literal_eval,
                            'falcon_spacy_labels':literal_eval,
                            'falcon_spacy_embeddingsmd4_mw50_RW':literal_eval,
                            'falcon_spacy_embeddingsmd2_mw100_RW':literal_eval,
                            'sent_embedding_1':literal_eval,
                            'sent_embedding_2':literal_eval},error_bad_lines=False)

'''
tweets = pd.read_csv('dis-dataset-falcon_spacy2-embeddings-sentence-md4.csv', sep=';', encoding='utf8', converters=
                           {
                            'falcon_spacy_embeddingsmd4_mw50_RW':literal_eval,
                           'sent_embedding_1':literal_eval},error_bad_lines=False)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
import pandas as pd
import numpy as np
import spacy
import nltk
import nltk.data
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import regex as re
import string
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_colwidth', None)
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from simpletransformers.classification import ClassificationModel
import io

punctuations = "¡!#$%&'()*+,-./:;<=>¿?@[\]^_`{|}~"
def read_txt(filename):
    list = []
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.readlines()
        for line in data:
            list.append(str(line).replace('\n', ''))
    return list

stopwords = read_txt('english_stopwords.txt')
stemmer = SnowballStemmer('english')
def clean_accents(tweet):
    tweet = re.sub(r"[àáâãäå]", "a", tweet)
    tweet = re.sub(r"ç", "c", tweet)
    tweet = re.sub(r"[èéêë]", "e", tweet)
    tweet = re.sub(r"[ìíîï]", "i", tweet)
    tweet = re.sub(r"[òóôõö]", "o", tweet)
    tweet = re.sub(r"[ùúûü]", "u", tweet)
    tweet = re.sub(r"[ýÿ]", "y", tweet)

    return tweet

def clean_tweet(tweet, stem = False):
    tweet = tweet.lower().strip()
    tweet = re.sub(r'https?:\/\/\S+', '', tweet)
    tweet = re.sub(r'http?:\/\/\S+', '', tweet)
    tweet = re.sub(r'www?:\/\/\S+', '', tweet)
    tweet = re.sub(r'\s([@#][\w_-]+)', "", tweet)
    tweet = re.sub(r"\n", " ", tweet)
    tweet = clean_accents(tweet)
    tweet = re.sub(r"\b(a*ha+h[ha]*|o?l+o+l+[ol]*|x+d+[x*d*]*|a*ja+[j+a+]+)\b", "<risas>", tweet)
    for symbol in punctuations:
        tweet = tweet.replace(symbol, "")
    tokens = []
    for token in tweet.strip().split():
        if token not in punctuations and token not in stopwords:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [3]:
# 	id	text_orig	ED_Patient	ProED	informative	scientific	hashtags	
# entities_instances_wikidata	spacy_entities_ids	spacy_entities_labels	
# falcon_spacy_entities	falcon_spacy_labels	falcon_spacy_embeddingsmd2_mw50_RW	
# falcon_spacy_embeddingsmd2_mw100_RW	sent_embedding_1	sent_embedding_2

cols = ['id','text_orig','ED_Patient','ProED','informative','scientific','hashtag','entities_instances_wikidata','spacy_entities_ids','spacy_entities_labels','falcon_spacy_entities'
       ,'falcon_spacy_labels','falcon_spacy_embeddingsmd2_mw50_RW','falcon_spacy_embeddingsmd2_mw100_RW','sent_embedding_1','sent_embedding_2']


cols = ['id','keyword','location','text','target','falcon2_wd_entities_labels','falcon2_wd_entities_ids','falcon2_wd_relations_ids','falcon2_wd_relations_labels','spacy_wd_entities_ids','spacy_wd_entities_urls','spacy_wd_entities_labels','falcon2_wd_entities_ids_cleaned','falcon2_wd_entities_labels_cleaned','falcon_spacy_embeddingsmd4_mw50_RW','sent_embedding_1']


In [4]:
tweets1 = tweets.copy()
# .drop(['entities_instances_wikidata','spacy_entities_ids','spacy_entities_labels'], axis=1)

tweets1['text_cleaned'] = tweets['text'].apply(lambda s : clean_tweet(s))
#print(tweets1['text_cleaned'].head(5))

In [5]:
# This will hold all of the dataset samples, as strings.
sen_w_feats = []

# The labels for the samples.
labels = []

# First, reload the dataset to undo the transformations we applied for XGBoost.
data_df = tweets.copy()

# Some of the reviews are missing either a "Title" or "Review Text", so we'll 
# replace the NaN values with empty string.
data_df = data_df.fillna("")

# Combining features following https://mccormickml.com/2021/06/29/combining-categorical-numerical-features-with-bert/
print('Combining features ...')

# For each of the samples...
for index, row in data_df.iterrows():

    # Piece it together...    
    combined = " {:} ".format(row["sent_embedding_1"])
    
    # Add the combined text to the list.
    sen_w_feats.append(combined)

    # Also record the sample's label.
    labels.append(row["target"])

print('  DONE.')

print('Dataset contains {:,} samples.'.format(len(sen_w_feats)))



Combining features ...
  DONE.
Dataset contains 7,309 samples.


Preparing data to train and test RF models using original dataset and (original dataset+KGE)

In [6]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
df = tweets1.copy()
X = df['text_cleaned']
Xc = sen_w_feats

#X = np.array(df['falcon_spacy_labels'].tolist())

Y1 = df['target']


documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)
    
documents2 = []

stemmer = WordNetLemmatizer()

for sen in range(0, len(Xc)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(Xc[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents2.append(document)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MICROSOFT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()


# Xc is Dataset + KGE information
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
Xc = vectorizer.fit_transform(documents2).toarray()
tfidfconverter = TfidfTransformer()
Xc = tfidfconverter.fit_transform(Xc).toarray()

### 1.1. RF Applied to Category I - Tweets written by people suffering from eating disorders

In [57]:
from sklearn.ensemble import RandomForestClassifier

X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y1, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(n_estimators=1000, random_state=42)
classifier.fit(X1_train, y1_train) 

y1_pred = classifier.predict(X1_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y1_test,y1_pred))
print(classification_report(y1_test,y1_pred))
print(accuracy_score(y1_test, y1_pred))

from sklearn.ensemble import RandomForestClassifier

X1_train, X1_test, y1_train, y1_test = train_test_split(Xc, Y1, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(n_estimators=1000, random_state=42)
classifier.fit(X1_train, y1_train) 

y1_pred = classifier.predict(X1_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y1_test,y1_pred))
print(classification_report(y1_test,y1_pred))
print(accuracy_score(y1_test, y1_pred))




[[1232    1]
 [ 919   41]]
              precision    recall  f1-score   support

         0.0       0.57      1.00      0.73      1233
         1.0       0.98      0.04      0.08       960

    accuracy                           0.58      2193
   macro avg       0.77      0.52      0.40      2193
weighted avg       0.75      0.58      0.45      2193

0.5804833561331509
[[1019  214]
 [ 289  671]]
              precision    recall  f1-score   support

         0.0       0.78      0.83      0.80      1233
         1.0       0.76      0.70      0.73       960

    accuracy                           0.77      2193
   macro avg       0.77      0.76      0.76      2193
weighted avg       0.77      0.77      0.77      2193

0.7706338349293206


## 2. Recurrent Neural Network (RNN)

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

In [8]:
df = tweets1.copy()

X_orig = df['text_cleaned']
X_b = np.array(df['sent_embedding_1'].tolist())
X_b = np.asarray(X_b, dtype=np.float32)

Y1 = df['target']


In [9]:
max_words = 1000
max_len = 100
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_orig)
sequences = tok.texts_to_sequences(X_orig)
X_c = sequence.pad_sequences(sequences,maxlen=max_len)

In [10]:
X_c.shape

(7309, 100)

In [11]:
X_b.shape

(7309, 100)

In [12]:
X = X_b

In [13]:
max_len=1000
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,100,input_length=max_len)(inputs)
    layer = LSTM(100)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.1)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [14]:
from sklearn.metrics import confusion_matrix
'''y1_pred = model.predict(X1_test)
y1_pred = np.argmax(y1_pred, axis=1)
conf_mat = confusion_matrix(y1_test, y1_pred)'''
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### 2.1. RNN Applied to Category I - Tweets written by people suffering from eating disorders

In [15]:
# CATEGORY 1


X1_train, X1_test, y1_train, y1_test = train_test_split(X_orig, Y1, test_size=0.3, random_state=42)

max_words = 1000
max_len = 200
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X1_train)
sequences = tok.texts_to_sequences(X1_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])
model.fit(sequences_matrix,y1_train,batch_size=128,epochs=10,
          validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
test_sequences = tok.texts_to_sequences(X1_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
#accr = model.evaluate(test_sequences_matrix,y1_test)
loss, accuracy, f1_score, precision, recall = model.evaluate(test_sequences_matrix, y1_test, verbose=0)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))


X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y1, test_size=0.3, random_state=42)

max_words = 1000
max_len = 200
#tok = Tokenizer(num_words=max_words)
#tok.fit_on_texts(X1_train)
#sequences = tok.texts_to_sequences(X1_train)
sequences_matrix = X1_train
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])
model.fit(sequences_matrix,y1_train,batch_size=128,epochs=10,
          validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
#test_sequences = X1_test
test_sequences_matrix = X1_test
#accr = model.evaluate(test_sequences_matrix,y1_test)
loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test, y1_test, verbose=0)

#print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))



Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 100)          100000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               25856     
_________________________________________________________________
activation (Activation)      (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                

### 2.2. RNN Applied to Category II - Tweets promotiong Eating Disorders

## 3. Bi-LSTM

### 3.1. Bi-LSTM Applied to Category I - Tweets written by people suffering from eating disorders

In [16]:
import tensorflow as tf

In [17]:
# CATEGORY 1
X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y1, test_size=0.3, random_state=42)

VOCAB_SIZE=1000


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=1968,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(2e-4),
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(X1_train,y1_train, epochs=14,
                    validation_data=(X1_test,y1_test), 
                    validation_steps=30)


loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test,y1_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

X1_train, X1_test, y1_train, y1_test = train_test_split(X_orig, Y1, test_size=0.3, random_state=42)

VOCAB_SIZE=1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(np.asarray(X1_train))

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(2e-4),
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(X1_train,y1_train, epochs=14,
                    validation_data=(X1_test,y1_test), 
                    validation_steps=30)


loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test,y1_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Test set
  Loss: 0.686
  Accuracy: 0.562
 fº 0.000
Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Test set
  Loss: 0.620
  Accuracy: 0.760
 fº 0.703
