imports

In [42]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
from tensorflow.data import Dataset
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from transformers import pipeline
import json

Define constants and variables

In [3]:
prefix = './datasets/'
english_folder = prefix + 'english/'



Read datasets

In [17]:
from os import walk

def readDataset(file):
    file = open(file, 'r')
    data = []
    for line in file:
        data.append(json.loads(line))
    return pd.json_normalize(data)

def readDataInFolder(dir_path):
    dfs = []
    for (dirpath, dirnames, filenames) in walk(dir_path):
        for file in filenames:
            print(f'File: {english_folder + file}')
            df = readDataset(english_folder + file)
            dfs.append(df)
    return dfs



In [18]:
dfs = readDataInFolder(english_folder)
print(f'Número de archivos leídos: {len(dfs)}')

data = pd.concat(dfs)
print(f'Shape of the hole dataset: {data.shape}')
data.head(10)

File: ./datasets/english/Persona_en_N0_tagged.json
File: ./datasets/english/Persona_en_N2_tagged.json
File: ./datasets/english/Persona_en_N3_tagged.json
Número de archivos leídos: 3
Shape of the hole dataset: (3000, 10)


Unnamed: 0,index,author_id,text,lang,context_annotations,id,created_at,date_str,hashtag,tag
0,1169503,117418104,#Bucks: Milwaukee Bucks and Milwaukee Health D...,en,"[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",1387163086981898240,2021-04-27T21:53:46.000Z,2021-04-27,[Bucks],vaccination
1,1088588,726456608,Youth Ages 12-15 Now Eligible for Pfizer COVID...,en,"[{'domain': {'id': '65', 'name': 'Interests an...",1392964893469511681,2021-05-13T22:08:05.000Z,2021-05-13,[],vaccination
2,1229144,191092262,Fighting Stigma: Young people should be free t...,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1387110938088677376,2021-04-27T18:26:33.000Z,2021-04-27,[],vaccination
3,1041354,2985792143,Second jab done! Although by the state of Tesc...,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1391014287565545477,2021-05-08T12:57:04.000Z,2021-05-08,[],school-reopening
4,1190556,19809471,4/27 - Vaccination and Testing Update - 2020-2...,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1387142140606648320,2021-04-27T20:30:32.000Z,2021-04-27,[],vaccines
5,337931,1048999496347279360,If you support state authority after a year of...,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1379169272837021696,2021-04-05T20:29:13.000Z,2021-04-05,[],mental-health
6,551389,856622507427635200,We contributed to Saturday's 4-million vaccina...,en,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",1379141859654557699,2021-04-05T18:40:17.000Z,2021-04-05,[],vaccination
7,687455,1254358657132888064,Illinois bar opening event linked to 46 Covid-...,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1379114893664731137,2021-04-05T16:53:08.000Z,2021-04-05,[],school-reopening
8,399765,1719674052,Share your COVID-19 vaccination experience wit...,en,"[{'domain': {'id': '48', 'name': 'Product', 'd...",1379161149665648640,2021-04-05T19:56:56.000Z,2021-04-05,[],vaccination
9,895036,18556112,Update on the vaccine stats. \n\n187 people bl...,en,"[{'domain': {'id': '65', 'name': 'Interests an...",1378838436040298496,2021-04-04T22:34:35.000Z,2021-04-04,[],vaccines


In [20]:
print(data.columns)
data['tag'].value_counts()

Index(['index', 'author_id', 'text', 'lang', 'context_annotations', 'id',
       'created_at', 'date_str', 'hashtag', 'tag'],
      dtype='object')


vaccination           1368
vaccines               593
NONE                   589
school-reopening       271
mental-health          177
household-violence       2
Name: tag, dtype: int64

Encoding of classes

In [28]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(data[['tag']])
print(enc.categories_)

[array(['NONE', 'household-violence', 'mental-health', 'school-reopening',
       'vaccination', 'vaccines'], dtype=object)]


In [36]:
'''
Se dividen los datos en 60 training, 20 validación y 20 testing
'''

sentences = data['text'].values

y = enc.transform(data[['text']]).toarray()
sentences_train_val, sentences_test, y_train_val, y_test = train_test_split(sentences, y, test_size=0.20, random_state=1000)
sentences_train, sentences_val, y_train, y_val = train_test_split(sentences_train_val, y_train_val, test_size=0.25, random_state=1000)

# Once we have our handles, we format the datasets in a Keras-fit compatible
# format: a tuple of the form (text_data, label).
train_dataset = Dataset.from_tensor_slices((sentences_train, y_train))
test_dataset = Dataset.from_tensor_slices((sentences_test, y_test))

# We also create a dataset with only the textual data in it. This will be used
# to build our vocabulary later on.
text_dataset = Dataset.from_tensor_slices((sentences_train))

Tokenization

In [37]:
vectorize_layer = TextVectorization(
    ngrams=None, max_tokens=None, vocabulary=None,
    output_mode='int', output_sequence_length=None, pad_to_max_tokens=True, 
)

Get vocabulary

In [38]:
vectorize_layer.adapt(text_dataset.batch(32))
len(vectorize_layer.get_vocabulary())

10416

Get classifier

In [4]:
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

Downloading: 100%|██████████| 908/908 [00:00<00:00, 872kB/s]
Downloading: 100%|██████████| 1.63G/1.63G [02:17<00:00, 11.9MB/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 3.72MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 2.43MB/s]
Downloading: 100%|██████████| 1.36M/1.36M [00:00<00:00, 4.81MB/s]
Downloading: 100%|██████████| 26.0/26.0 [00:00<00:00, 5.26kB/s]


Arquitecture

In [44]:
vocab_size = len(vectorize_layer.get_vocabulary())
num_labels = len(enc.categories_)

In [40]:
def getModel():
    model = Sequential()
    model.add(Dense(512, input_shape=(vocab_size,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.summary()

In [45]:
model = getModel()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 512)               5333504   
_________________________________________________________________
activation_2 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_3 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                