imports

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
from tensorflow.data import Dataset
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from transformers import pipeline
import json
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer


Define constants and variables

In [2]:
prefix = './datasets/'
english_folder = prefix + 'english/'



Read datasets

In [3]:
from os import walk

def readDataset(file):
    file = open(file, 'r')
    data = []
    for line in file:
        data.append(json.loads(line))
    return pd.json_normalize(data)

def readDataInFolder(dir_path):
    dfs = []
    for (dirpath, dirnames, filenames) in walk(dir_path):
        for file in filenames:
            print(f'File: {english_folder + file}')
            df = readDataset(english_folder + file)
            dfs.append(df)
    return dfs



In [4]:
dfs = readDataInFolder(english_folder)
print(f'Número de archivos leídos: {len(dfs)}')

data = pd.concat(dfs)
print(f'Shape of the hole dataset: {data.shape}')
data.head(10)

File: ./datasets/english/Persona_en_N0_tagged.json
File: ./datasets/english/Persona_en_N3_tagged_final.json
File: ./datasets/english/Persona_en_N2_tagged.json
Número de archivos leídos: 3
Shape of the hole dataset: (3000, 10)


Unnamed: 0,index,author_id,text,lang,context_annotations,id,created_at,date_str,hashtag,tag
0,1169503,117418104,#Bucks: Milwaukee Bucks and Milwaukee Health D...,en,"[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",1387163086981898240,2021-04-27T21:53:46.000Z,2021-04-27,[Bucks],vaccination
1,1088588,726456608,Youth Ages 12-15 Now Eligible for Pfizer COVID...,en,"[{'domain': {'id': '65', 'name': 'Interests an...",1392964893469511681,2021-05-13T22:08:05.000Z,2021-05-13,[],vaccination
2,1229144,191092262,Fighting Stigma: Young people should be free t...,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1387110938088677376,2021-04-27T18:26:33.000Z,2021-04-27,[],vaccination
3,1041354,2985792143,Second jab done! Although by the state of Tesc...,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1391014287565545477,2021-05-08T12:57:04.000Z,2021-05-08,[],school-reopening
4,1190556,19809471,4/27 - Vaccination and Testing Update - 2020-2...,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1387142140606648320,2021-04-27T20:30:32.000Z,2021-04-27,[],vaccines
5,337931,1048999496347279360,If you support state authority after a year of...,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1379169272837021696,2021-04-05T20:29:13.000Z,2021-04-05,[],mental-health
6,551389,856622507427635200,We contributed to Saturday's 4-million vaccina...,en,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",1379141859654557699,2021-04-05T18:40:17.000Z,2021-04-05,[],vaccination
7,687455,1254358657132888064,Illinois bar opening event linked to 46 Covid-...,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1379114893664731137,2021-04-05T16:53:08.000Z,2021-04-05,[],school-reopening
8,399765,1719674052,Share your COVID-19 vaccination experience wit...,en,"[{'domain': {'id': '48', 'name': 'Product', 'd...",1379161149665648640,2021-04-05T19:56:56.000Z,2021-04-05,[],vaccination
9,895036,18556112,Update on the vaccine stats. \n\n187 people bl...,en,"[{'domain': {'id': '65', 'name': 'Interests an...",1378838436040298496,2021-04-04T22:34:35.000Z,2021-04-04,[],vaccines


In [5]:
print(data.columns)
data['tag'].value_counts()


Index(['index', 'author_id', 'text', 'lang', 'context_annotations', 'id',
       'created_at', 'date_str', 'hashtag', 'tag'],
      dtype='object')


vaccination           1368
vaccines               593
NONE                   589
school-reopening       271
mental-health          177
household-violence       2
Name: tag, dtype: int64

In [6]:
'''
Se dividen los datos en 60 training, 20 validación y 20 testing
'''

sentences = data['text'].values

y = data['tag'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.20, random_state=1000)

In [7]:
text_dataset = Dataset.from_tensor_slices((sentences_train))

Tokenization

In [8]:
vectorize_layer = TextVectorization(
    ngrams=None, max_tokens=None, vocabulary=None,
    output_mode='int', output_sequence_length=None, pad_to_max_tokens=True, 
)

Get vocabulary

In [9]:
vectorize_layer.adapt(text_dataset.batch(32))
len(vectorize_layer.get_vocabulary())


12746

Arquitecture

In [10]:
vocab_size = len(vectorize_layer.get_vocabulary())


In [11]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(sentences_train)


x_train = tokenizer.texts_to_matrix(sentences_train)
x_test = tokenizer.texts_to_matrix(sentences_test)

encoder = LabelBinarizer()
encoder.fit(y_train)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [12]:
num_labels = len(data["tag"].unique())
print(num_labels)

6


In [13]:
def getModel():
    model = Sequential()
    model.add(Dense(512, input_shape=(vocab_size,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.summary()

    return model

In [14]:
model = getModel()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               6526464   
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 3

Compile de model

In [15]:
print(type(model))

<class 'tensorflow.python.keras.engine.sequential.Sequential'>


In [16]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Fiting the model

In [17]:
num_epochs =10
batch_size = 128
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    verbose=2,
                    validation_split=0.2)

Epoch 1/10
15/15 - 5s - loss: 1.4073 - accuracy: 0.4536 - val_loss: 1.1995 - val_accuracy: 0.6042
Epoch 2/10
15/15 - 3s - loss: 0.7801 - accuracy: 0.7375 - val_loss: 0.8642 - val_accuracy: 0.6625
Epoch 3/10
15/15 - 3s - loss: 0.2781 - accuracy: 0.9198 - val_loss: 0.8629 - val_accuracy: 0.7042
Epoch 4/10
15/15 - 3s - loss: 0.0582 - accuracy: 0.9896 - val_loss: 0.9900 - val_accuracy: 0.7208
Epoch 5/10
15/15 - 3s - loss: 0.0147 - accuracy: 0.9984 - val_loss: 1.1028 - val_accuracy: 0.7063
Epoch 6/10
15/15 - 4s - loss: 0.0067 - accuracy: 1.0000 - val_loss: 1.1556 - val_accuracy: 0.6896
Epoch 7/10
15/15 - 5s - loss: 0.0038 - accuracy: 0.9990 - val_loss: 1.1862 - val_accuracy: 0.6917
Epoch 8/10
15/15 - 4s - loss: 0.0020 - accuracy: 1.0000 - val_loss: 1.2079 - val_accuracy: 0.7000
Epoch 9/10
15/15 - 2s - loss: 0.0018 - accuracy: 0.9995 - val_loss: 1.2528 - val_accuracy: 0.7021
Epoch 10/10
15/15 - 3s - loss: 0.0017 - accuracy: 0.9995 - val_loss: 1.2483 - val_accuracy: 0.7021


In [18]:
from sklearn.metrics import classification_report
#from sklearn.metrics import confusion_matrix


def get_metrics_by_class(model, x, y):
  y_pred = model.predict(x, batch_size=64, verbose=1)
  y_pred_bool = np.argmax(y_pred, axis=1)
  y_label = np.argmax(y, axis=1)
  #print(confusion_matrix(y_pred_bool, y_label))
  print(classification_report(y_label, y_pred_bool))

In [19]:
get_metrics_by_class(model, x_test, y_test)

              precision    recall  f1-score   support

           0       0.57      0.52      0.54       114
           2       0.78      0.83      0.81        35
           3       0.58      0.43      0.49        51
           4       0.77      0.83      0.80       282
           5       0.67      0.66      0.66       118

    accuracy                           0.70       600
   macro avg       0.67      0.65      0.66       600
weighted avg       0.70      0.70      0.70       600

