# Combining Knowledge Graphs and Deep Learning techniques for Categorizing Tweets
## Random Forest vs RNN vs Bi-LSTM


Authors:
<!-- ¡ -->

Experiments:
* Applying RF, RNN and Bi-LSTM models to 2 datasets for classifying 4 binary categories.
* 2 datatasets: (i) textual information and (ii) textual information and embeddings obtained from knowledge graph exploitation (KGE).
 
 
## 1. Random Forest

In [1]:
import os
import pandas as pd
import numpy as np
import json
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re, string, unicodedata
import nltk
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
nltk.download
from ast import literal_eval
'''
tweets = pd.read_csv('ed-dataset-falcon_spacy2-embeddings-sentence.csv', sep=';', encoding='utf8', converters=
                           {
                            'entities_instances_wikidata':literal_eval,
                            'spacy_entities_ids':literal_eval,
                            'spacy_entities_labels':literal_eval,
                            'falcon_spacy_entities':literal_eval,
                            'falcon_spacy_labels':literal_eval,
                            'falcon_spacy_embeddingsmd4_mw50_RW':literal_eval,
                            'falcon_spacy_embeddingsmd2_mw100_RW':literal_eval,
                            'sent_embedding_1':literal_eval,
                            'sent_embedding_2':literal_eval},error_bad_lines=False)

'''
tweets = pd.read_csv('ed-dataset-falcon_spacy2-embeddings-sentence-md4.csv', sep=';', encoding='utf8', converters=
                           {
                            'falcon_spacy_embeddingsmd4_mw50_RW':literal_eval,
                            'sent_embedding_1':literal_eval},error_bad_lines=False)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
import pandas as pd
import numpy as np
import spacy
import nltk
import nltk.data
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import regex as re
import string
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_colwidth', None)
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from simpletransformers.classification import ClassificationModel
import io

punctuations = "¡!#$%&'()*+,-./:;<=>¿?@[\]^_`{|}~"
def read_txt(filename):
    list = []
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.readlines()
        for line in data:
            list.append(str(line).replace('\n', ''))
    return list

stopwords = read_txt('english_stopwords.txt')
stemmer = SnowballStemmer('english')
def clean_accents(tweet):
    tweet = re.sub(r"[àáâãäå]", "a", tweet)
    tweet = re.sub(r"ç", "c", tweet)
    tweet = re.sub(r"[èéêë]", "e", tweet)
    tweet = re.sub(r"[ìíîï]", "i", tweet)
    tweet = re.sub(r"[òóôõö]", "o", tweet)
    tweet = re.sub(r"[ùúûü]", "u", tweet)
    tweet = re.sub(r"[ýÿ]", "y", tweet)

    return tweet

def clean_tweet(tweet, stem = False):
    tweet = tweet.lower().strip()
    tweet = re.sub(r'https?:\/\/\S+', '', tweet)
    tweet = re.sub(r'http?:\/\/\S+', '', tweet)
    tweet = re.sub(r'www?:\/\/\S+', '', tweet)
    tweet = re.sub(r'\s([@#][\w_-]+)', "", tweet)
    tweet = re.sub(r"\n", " ", tweet)
    tweet = clean_accents(tweet)
    tweet = re.sub(r"\b(a*ha+h[ha]*|o?l+o+l+[ol]*|x+d+[x*d*]*|a*ja+[j+a+]+)\b", "<risas>", tweet)
    for symbol in punctuations:
        tweet = tweet.replace(symbol, "")
    tokens = []
    for token in tweet.strip().split():
        if token not in punctuations and token not in stopwords:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [3]:
# 	id	text_orig	ED_Patient	ProED	informative	scientific	hashtags	
# entities_instances_wikidata	spacy_entities_ids	spacy_entities_labels	
# falcon_spacy_entities	falcon_spacy_labels	falcon_spacy_embeddingsmd2_mw50_RW	
# falcon_spacy_embeddingsmd2_mw100_RW	sent_embedding_1	sent_embedding_2

cols = ['id','text_orig','ED_Patient','ProED','informative','scientific','hashtag','entities_instances_wikidata','spacy_entities_ids','spacy_entities_labels','falcon_spacy_entities'
       ,'falcon_spacy_labels','falcon_spacy_embeddingsmd2_mw50_RW','falcon_spacy_embeddingsmd2_mw100_RW','sent_embedding_1','sent_embedding_2']

In [4]:
tweets1 = tweets.copy()
# .drop(['entities_instances_wikidata','spacy_entities_ids','spacy_entities_labels'], axis=1)

tweets1['text_cleaned'] = tweets['text_orig'].apply(lambda s : clean_tweet(s))
#print(tweets1['text_cleaned'].head(5))

In [5]:
# This will hold all of the dataset samples, as strings.
sen_w_feats = []

# The labels for the samples.
labels = []

# First, reload the dataset to undo the transformations we applied for XGBoost.
data_df = tweets.copy()

# Some of the reviews are missing either a "Title" or "Review Text", so we'll 
# replace the NaN values with empty string.
data_df = data_df.fillna("")

# Combining features following https://mccormickml.com/2021/06/29/combining-categorical-numerical-features-with-bert/
print('Combining features ...')

# For each of the samples...
for index, row in data_df.iterrows():

    # Piece it together...    
    combined = " {:} ".format(row["sent_embedding_1"])
    
    # Add the combined text to the list.
    sen_w_feats.append(combined)

    # Also record the sample's label.
    labels.append(row["ProED"])

print('  DONE.')

print('Dataset contains {:,} samples.'.format(len(sen_w_feats)))



Combining features ...
  DONE.
Dataset contains 1,968 samples.


Preparing data to train and test RF models using original dataset and (original dataset+KGE)

In [6]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
df = tweets1.copy()
X = df['text_cleaned']
Xc = sen_w_feats

#X = np.array(df['falcon_spacy_labels'].tolist())

Y1 = df['ED_Patient']
Y2 = df['ProED']
Y3 = df['informative']
Y4 = df['scientific']

documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)
    
documents2 = []

stemmer = WordNetLemmatizer()

for sen in range(0, len(Xc)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(Xc[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents2.append(document)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MICROSOFT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()


# Xc is Dataset + KGE information
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
Xc = vectorizer.fit_transform(documents2).toarray()
tfidfconverter = TfidfTransformer()
Xc = tfidfconverter.fit_transform(Xc).toarray()

### 1.1. RF Applied to Category I - Tweets written by people suffering from eating disorders

In [8]:
from sklearn.ensemble import RandomForestClassifier

X1_train, X1_test, y1_train, y1_test = train_test_split(Xc, Y1, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(n_estimators=1000, random_state=42)
classifier.fit(X1_train, y1_train) 

y1_pred = classifier.predict(X1_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y1_test,y1_pred))
print(classification_report(y1_test,y1_pred))
print(accuracy_score(y1_test, y1_pred))


from sklearn.ensemble import RandomForestClassifier

X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y1, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(n_estimators=1000, random_state=42)
classifier.fit(X1_train, y1_train) 

y1_pred = classifier.predict(X1_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y1_test,y1_pred))
print(classification_report(y1_test,y1_pred))
print(accuracy_score(y1_test, y1_pred))

[[295   0]
 [292   4]]
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       295
           1       1.00      0.01      0.03       296

    accuracy                           0.51       591
   macro avg       0.75      0.51      0.35       591
weighted avg       0.75      0.51      0.35       591

0.505922165820643
[[253  42]
 [ 43 253]]
              precision    recall  f1-score   support

           0       0.85      0.86      0.86       295
           1       0.86      0.85      0.86       296

    accuracy                           0.86       591
   macro avg       0.86      0.86      0.86       591
weighted avg       0.86      0.86      0.86       591

0.856175972927242


### 1.2. RF Applied to Category II - Tweets promotiong Eating Disorders

In [9]:
from sklearn.ensemble import RandomForestClassifier

X1_train, X1_test, y1_train, y1_test = train_test_split(Xc, Y2, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X1_train, y1_train) 

y1_pred = classifier.predict(X1_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y1_test,y1_pred))
print(classification_report(y1_test,y1_pred))
print(accuracy_score(y1_test, y1_pred))


from sklearn.ensemble import RandomForestClassifier

X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y2, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X1_train, y1_train) 

y1_pred = classifier.predict(X1_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y1_test,y1_pred))
print(classification_report(y1_test,y1_pred))
print(accuracy_score(y1_test, y1_pred))

[[463   2]
 [124   2]]
              precision    recall  f1-score   support

           0       0.79      1.00      0.88       465
           1       0.50      0.02      0.03       126

    accuracy                           0.79       591
   macro avg       0.64      0.51      0.46       591
weighted avg       0.73      0.79      0.70       591

0.7868020304568528
[[420  45]
 [ 38  88]]
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       465
           1       0.66      0.70      0.68       126

    accuracy                           0.86       591
   macro avg       0.79      0.80      0.79       591
weighted avg       0.86      0.86      0.86       591

0.8595600676818951


### 1.3. RF Applied to Category III - Informatives Tweets

In [10]:
from sklearn.ensemble import RandomForestClassifier

X1_train, X1_test, y1_train, y1_test = train_test_split(Xc, Y3, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X1_train, y1_train) 

y1_pred = classifier.predict(X1_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y1_test,y1_pred))
print(classification_report(y1_test,y1_pred))
print(accuracy_score(y1_test, y1_pred))


from sklearn.ensemble import RandomForestClassifier

X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y3, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X1_train, y1_train) 

y1_pred = classifier.predict(X1_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y1_test,y1_pred))
print(classification_report(y1_test,y1_pred))
print(accuracy_score(y1_test, y1_pred))

[[343  13]
 [229   6]]
              precision    recall  f1-score   support

           0       0.60      0.96      0.74       356
           1       0.32      0.03      0.05       235

    accuracy                           0.59       591
   macro avg       0.46      0.49      0.39       591
weighted avg       0.49      0.59      0.46       591

0.5905245346869712
[[325  31]
 [ 61 174]]
              precision    recall  f1-score   support

           0       0.84      0.91      0.88       356
           1       0.85      0.74      0.79       235

    accuracy                           0.84       591
   macro avg       0.85      0.83      0.83       591
weighted avg       0.84      0.84      0.84       591

0.8443316412859561


### 1.4. RF Applied to Category IV - Scientific Tweets

In [11]:
from sklearn.ensemble import RandomForestClassifier

X1_train, X1_test, y1_train, y1_test = train_test_split(Xc, Y4, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X1_train, y1_train) 

y1_pred = classifier.predict(X1_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y1_test,y1_pred))
print(classification_report(y1_test,y1_pred))
print(accuracy_score(y1_test, y1_pred))


from sklearn.ensemble import RandomForestClassifier

X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y4, test_size=0.3, random_state=42)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X1_train, y1_train) 

y1_pred = classifier.predict(X1_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y1_test,y1_pred))
print(classification_report(y1_test,y1_pred))
print(accuracy_score(y1_test, y1_pred))

[[437   0]
 [154   0]]
              precision    recall  f1-score   support

           0       0.74      1.00      0.85       437
           1       0.00      0.00      0.00       154

    accuracy                           0.74       591
   macro avg       0.37      0.50      0.43       591
weighted avg       0.55      0.74      0.63       591

0.739424703891709


  _warn_prf(average, modifier, msg_start, len(result))


[[418  19]
 [ 28 126]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       437
           1       0.87      0.82      0.84       154

    accuracy                           0.92       591
   macro avg       0.90      0.89      0.89       591
weighted avg       0.92      0.92      0.92       591

0.9204737732656514


## 2. Recurrent Neural Network (RNN)

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

In [9]:
df['sent_embedding_1'].tolist()

[[-0.2749309577047825,
  0.41105543076992035,
  0.05028924054931849,
  -0.010098360478878021,
  0.20684843137860298,
  -0.5677935183048248,
  -0.16164803504943848,
  0.8477769494056702,
  -0.28463442623615265,
  -0.29416023194789886,
  -0.06745482329279184,
  -0.4493356943130493,
  -0.4987877458333969,
  0.13505493104457855,
  0.14811605587601662,
  -0.027825807221233845,
  0.05328531190752983,
  0.36715953052043915,
  -0.31236808001995087,
  -0.26593852788209915,
  0.37916867434978485,
  -0.10252587124705315,
  0.40555284917354584,
  -0.1692783460021019,
  0.05614596605300903,
  0.09495411440730095,
  -0.2239082083106041,
  0.031672872602939606,
  -0.19396403804421425,
  -0.20960760861635208,
  0.575357049703598,
  -0.2558159753680229,
  0.007797600701451302,
  -0.5721764266490936,
  -0.009518767707049847,
  0.032026466680690646,
  0.34997861087322235,
  0.09136635810136795,
  -0.1970382034778595,
  0.034147247672080994,
  0.13652361184358597,
  -0.2142499014735222,
  -0.5113563686609

In [8]:
df = tweets1.copy()

X_orig = df['text_cleaned']
X_b = np.array(df['sent_embedding_1'].tolist())
X_b = np.asarray(X_b, dtype=np.float32)

Y1 = df['ED_Patient']
Y2 = df['ProED']
Y3 = df['informative']
Y4 = df['scientific']

In [14]:
max_words = 1000
max_len = 100
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_orig)
sequences = tok.texts_to_sequences(X_orig)
X_c = sequence.pad_sequences(sequences,maxlen=max_len)

In [15]:
X_c.shape

(1968, 100)

In [16]:
X_b.shape

(1968, 100)

In [16]:
X = X_b

In [17]:
max_len=1000
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,100,input_length=max_len)(inputs)
    layer = LSTM(100)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.1)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [18]:
from sklearn.metrics import confusion_matrix
'''y1_pred = model.predict(X1_test)
y1_pred = np.argmax(y1_pred, axis=1)
conf_mat = confusion_matrix(y1_test, y1_pred)'''
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### 2.1. RNN Applied to Category I - Tweets written by people suffering from eating disorders

In [19]:
# CATEGORY 1

X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y1, test_size=0.3, random_state=42)

max_words = 1000
max_len = 200
#tok = Tokenizer(num_words=max_words)
#tok.fit_on_texts(X1_train)
#sequences = tok.texts_to_sequences(X1_train)
sequences_matrix = X1_train
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])
model.fit(sequences_matrix,y1_train,batch_size=128,epochs=10,
          validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
#test_sequences = X1_test
test_sequences_matrix = X1_test
#accr = model.evaluate(test_sequences_matrix,y1_test)
loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test, y1_test, verbose=0)

#print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))


X1_train, X1_test, y1_train, y1_test = train_test_split(X_orig, Y1, test_size=0.3, random_state=42)

max_words = 1000
max_len = 200
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X1_train)
sequences = tok.texts_to_sequences(X1_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])
model.fit(sequences_matrix,y1_train,batch_size=128,epochs=10,
          validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
test_sequences = tok.texts_to_sequences(X1_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
#accr = model.evaluate(test_sequences_matrix,y1_test)
loss, accuracy, f1_score, precision, recall = model.evaluate(test_sequences_matrix, y1_test, verbose=0)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 100)          100000    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               25856     
_________________________________________________________________
activation (Activation)      (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                

### 2.2. RNN Applied to Category II - Tweets promotiong Eating Disorders

In [23]:
# CATEGORY 2
X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y2, test_size=0.3, random_state=42)

max_words = 1000
max_len = 200
#tok = Tokenizer(num_words=max_words)
#tok.fit_on_texts(X1_train)
#sequences = tok.texts_to_sequences(X1_train)
sequences_matrix = X1_train
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])
model.fit(sequences_matrix,y1_train,batch_size=128,epochs=10,
          validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
#test_sequences = X1_test
test_sequences_matrix = X1_test
loss, accuracy, f1_score, precision, recall = model.evaluate(test_sequences_matrix, y1_test, verbose=0)

#print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

X1_train, X1_test, y1_train, y1_test = train_test_split(X_orig, Y2, test_size=0.3, random_state=42)

max_words = 1000
max_len = 200
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X1_train)
sequences = tok.texts_to_sequences(X1_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])
model.fit(sequences_matrix,y1_train,batch_size=128,epochs=10,
          validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
test_sequences = tok.texts_to_sequences(X1_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
loss, accuracy, f1_score, precision, recall = model.evaluate(test_sequences_matrix, y1_test, verbose=0)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

Model: "functional_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 200)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 200, 100)          100000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               25856     
_________________________________________________________________
activation_6 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                

### 2.3. RNN Applied to Category III - Informatives Tweets

In [24]:
# CATEGORY 3
X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y3, test_size=0.3, random_state=42)

max_words = 1000
max_len = 200
#tok = Tokenizer(num_words=max_words)
#tok.fit_on_texts(X1_train)
#sequences = tok.texts_to_sequences(X1_train)
sequences_matrix = X1_train
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])
model.fit(sequences_matrix,y1_train,batch_size=128,epochs=10,
          validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
#test_sequences = X1_test
test_sequences_matrix = X1_test
loss, accuracy, f1_score, precision, recall = model.evaluate(test_sequences_matrix, y1_test, verbose=0)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))


X1_train, X1_test, y1_train, y1_test = train_test_split(X_orig, Y3, test_size=0.3, random_state=42)

max_words = 1000
max_len = 200
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X1_train)
sequences = tok.texts_to_sequences(X1_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])
model.fit(sequences_matrix,y1_train,batch_size=128,epochs=10,
          validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
test_sequences = tok.texts_to_sequences(X1_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
loss, accuracy, f1_score, precision, recall = model.evaluate(test_sequences_matrix, y1_test, verbose=0)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

Model: "functional_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 200)]             0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 200, 100)          100000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               80400     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               25856     
_________________________________________________________________
activation_10 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)               

### 2.4. RNN Applied to Category IV - Scientific Tweets

In [25]:
# CATEGORY 4
X1_train, X1_test, y1_train, y1_test = train_test_split(X_orig, Y4, test_size=0.3, random_state=42)


tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X1_train)
sequences = tok.texts_to_sequences(X1_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])
model.fit(sequences_matrix,y1_train,batch_size=128,epochs=8,
          validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
test_sequences = tok.texts_to_sequences(X1_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
loss, accuracy, f1_score, precision, recall = model.evaluate(test_sequences_matrix, y1_test, verbose=0)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y4, test_size=0.3, random_state=42)

max_words = 1000
max_len = 200
#tok = Tokenizer(num_words=max_words)
#tok.fit_on_texts(X1_train)
#sequences = tok.texts_to_sequences(X1_train)
sequences_matrix = X1_train
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['acc',f1_m,precision_m, recall_m])
model.fit(sequences_matrix,y1_train,batch_size=128,epochs=10,
          validation_split=0.3,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
#test_sequences = X1_test
test_sequences_matrix = X1_test
loss, accuracy, f1_score, precision, recall = model.evaluate(test_sequences_matrix, y1_test, verbose=0)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

Model: "functional_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 200)]             0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 200, 100)          100000    
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               80400     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               25856     
_________________________________________________________________
activation_14 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)               

## 3. Bi-LSTM

### 3.1. Bi-LSTM Applied to Category I - Tweets written by people suffering from eating disorders

In [27]:
import tensorflow as tf

In [28]:
# CATEGORY 1
X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y1, test_size=0.3, random_state=42)

VOCAB_SIZE=1000


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=1968,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(2e-4),
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(X1_train,y1_train, epochs=14,
                    validation_data=(X1_test,y1_test), 
                    validation_steps=30)


loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test,y1_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

X1_train, X1_test, y1_train, y1_test = train_test_split(X_orig, Y1, test_size=0.3, random_state=42)

VOCAB_SIZE=1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(np.asarray(X1_train))

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(2e-4),
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(X1_train,y1_train, epochs=14,
                    validation_data=(X1_test,y1_test), 
                    validation_steps=30)


loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test,y1_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))





Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Test set
  Loss: 0.695
  Accuracy: 0.499
 fº 0.000
Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Test set
  Loss: 0.641
  Accuracy: 0.851
 fº 0.851


### 3.2. Bi-LSTM Applied to Category II - Tweets promotiong Eating Disorders

In [29]:
# CATEGORY 2
X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y2, test_size=0.3, random_state=42)

VOCAB_SIZE=1000


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=1968,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(2e-4),
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(X1_train,y1_train, epochs=14,
                    validation_data=(X1_test,y1_test), 
                    validation_steps=30)


loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test,y1_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

X1_train, X1_test, y1_train, y1_test = train_test_split(X_orig, Y2, test_size=0.3, random_state=42)

VOCAB_SIZE=1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(np.asarray(X1_train))

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(2e-4),
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(X1_train,y1_train, epochs=14,
                    validation_data=(X1_test,y1_test), 
                    validation_steps=30)


loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test,y1_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))




Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Test set
  Loss: 0.521
  Accuracy: 0.787
 fº 0.000
Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Test set
  Loss: 0.537
  Accuracy: 0.865
 fº 0.656


### 3.3. Bi-LSTM Applied to Category III - Informatives Tweets

In [30]:
# CATEGORY 3

X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y3, test_size=0.3, random_state=42)

VOCAB_SIZE=1000


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=1968,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(2e-4),
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(X1_train,y1_train, epochs=14,
                    validation_data=(X1_test,y1_test), 
                    validation_steps=30)


loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test,y1_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))


X1_train, X1_test, y1_train, y1_test = train_test_split(X_orig, Y3, test_size=0.3, random_state=42)

VOCAB_SIZE=1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(np.asarray(X1_train))

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(2e-4),
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(X1_train,y1_train, epochs=14,
                    validation_data=(X1_test,y1_test), 
                    validation_steps=30)


loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test,y1_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))




Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Test set
  Loss: 0.672
  Accuracy: 0.602
 fº 0.000
Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Test set
  Loss: 1.090
  Accuracy: 0.819
 fº 0.762


### 3.4. Bi-LSTM Applied to Category IV - Scientific Tweets

In [31]:
# CATEGORY 4
X1_train, X1_test, y1_train, y1_test = train_test_split(X, Y4, test_size=0.3, random_state=42)

VOCAB_SIZE=1000


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=1968,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(2e-4),
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(X1_train,y1_train, epochs=14,
                    validation_data=(X1_test,y1_test), 
                    validation_steps=30)


loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test,y1_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))


X1_train, X1_test, y1_train, y1_test = train_test_split(X_orig, Y4, test_size=0.3, random_state=42)

VOCAB_SIZE=1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(np.asarray(X1_train))

model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(2e-4),
              metrics=['acc',f1_m,precision_m, recall_m])

history = model.fit(X1_train,y1_train, epochs=14,
                    validation_data=(X1_test,y1_test), 
                    validation_steps=30)


loss, accuracy, f1_score, precision, recall = model.evaluate(X1_test,y1_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n fº {:0.3f}'.format(loss,accuracy,f1_score))

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Test set
  Loss: 0.575
  Accuracy: 0.739
 fº 0.000
Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
Test set
  Loss: 0.563
  Accuracy: 0.905
 fº 0.777
