In [3]:
import os, re, csv, math, codecs, logging
from collections import Counter
from pathlib import Path
from io import StringIO
import pickle
import gdown

import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# from keras.metrics import F1Score

In [4]:
# cargamos los datos (ya separados de forma predeterminada en train y test)
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
class_num = 20

In [3]:
# descargamos los embeddings de palabras de Fasttext para inglés y descomprimimos el archivo.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip wiki-news-300d-1M.vec.zip

--2024-06-28 21:26:57--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.96, 3.163.189.51, 3.163.189.108, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2024-06-28 21:27:00 (242 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   


In [12]:
# cargamos los embeddings de palabras
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('wiki-news-300d-1M.vec', encoding='utf-8')

for line in f:
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print(f'found {len(embeddings_index)} word vectors')

loading word embeddings...
found 999995 word vectors


In [5]:
# instanciamos el tokenizador
token = Tokenizer(num_words=30000,
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ',
                char_level=False,
                oov_token="UNK",
                document_count=0)

In [7]:
# fiteamos el tokenizador
token.fit_on_texts(newsgroups_train.data)

In [8]:
# obtenemos los diccionarios idx2word y word2idx
reverse_dictionary = token.index_word
dictionary = dict([(value, key) for (key, value) in reverse_dictionary.items()])
# CHECK QUE EMPIEZA POR 0

In [14]:
# cargamos en una matriz los embeddings de las palabras
# presentes en el vocabulario
embed_dim=300
num_words=len(dictionary)+1
embedding_matrix=np.zeros([num_words,embed_dim])
for word, idx in dictionary.items():
  if idx <= num_words and word in embeddings_index:
    embedding_matrix[idx,:]=embeddings_index[word]

In [15]:
embedding_matrix.shape

(105374, 300)

In [30]:
# Aplicamos principal component analisis
from sklearn.decomposition import PCA
pca = PCA(n_components=200)
embedding_matrix_reduced = pca.fit_transform(embedding_matrix)

In [32]:
# se tokenizan los textos
train_sequences=token.texts_to_sequences(newsgroups_train.data)
test_sequences=token.texts_to_sequences(newsgroups_test.data)

In [33]:
# En este punto seleccionamos el tamaño de contexto a procesar en la variable `max_len`
max_len=100
train_sequences=pad_sequences(train_sequences,maxlen=max_len)
test_sequences=pad_sequences(test_sequences,maxlen=max_len)

In [43]:

from keras.layers import Bidirectional, LSTM, Dense, Embedding, Dropout, BatchNormalization, Activation
from keras.models import Sequential
from keras.layers import GlobalMaxPooling1D, SpatialDropout1D
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras import layers, models

In [58]:
model = Sequential()

model.add(Embedding(input_dim=num_words, output_dim=200, weights=[embedding_matrix_reduced], input_shape=(None,), trainable = True))
model.add(SpatialDropout1D(0.5))
model.add(Bidirectional(LSTM(200, return_sequences=True)))
model.add(Dropout(0.3))

model.add(Bidirectional(LSTM(200, return_sequences=True)))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.3))

model.add(Dense(512,kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.3))

model.add(Dense(32,kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.3))

model.add(Dense(20,kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(BatchNormalization())
model.add(Activation('softmax'))

# Clasificación multiple categórica --> loss = categorical_crossentropy
# notar que usamos la versión Sparse para utilizar sólo índices en lugar de OHE
model.compile(loss=SparseCategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])
# ^ El modelo da mejores resultados con Adam (para sorpresa de nadie)

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, None, 200)         21074800  
                                                                 
 spatial_dropout1d_1 (Spati  (None, None, 200)         0         
 alDropout1D)                                                    
                                                                 
 bidirectional_6 (Bidirecti  (None, None, 400)         641600    
 onal)                                                           
                                                                 
 dropout_12 (Dropout)        (None, None, 400)         0         
                                                                 
 bidirectional_7 (Bidirecti  (None, None, 400)         961600    
 onal)                                                           
                                                      

In [62]:
# Callbacks

from tensorflow.keras.callbacks import EarlyStopping,LearningRateScheduler,ModelCheckpoint,ReduceLROnPlateau, TensorBoard
from tensorflow.keras.optimizers import Adam

# Model Checkpoint
mc = ModelCheckpoint(
    "bestweights.h5",
    monitor = "val_accuracy",
    verbose = 1,
    save_best_only = True,
    save_weights_only = True,
)

# Reduce Learning Rate on Plateau
rlrop = ReduceLROnPlateau(
    monitor = "val_accuracy",
    factor = 0.5,
    patience = 3,
    verbose = 1,
    min_lr = 1e-5
)

# Early Stopping
es = EarlyStopping(
    monitor = "val_accuracy",
    patience = 10,
    verbose = 1,
    restore_best_weights = True,
)

# Tensorboard
tb = TensorBoard(
    log_dir="logs",
)

In [63]:
history = model.fit(train_sequences, newsgroups_train.target,
                    batch_size=256,
                    epochs=100,
                    validation_split=0.2,
                    callbacks=[mc,es,rlrop,tb],
                    )


Epoch 1/100
Epoch 1: val_accuracy improved from -inf to 0.66549, saving model to bestweights.h5
Epoch 2/100
Epoch 2: val_accuracy improved from 0.66549 to 0.66681, saving model to bestweights.h5
Epoch 3/100
Epoch 3: val_accuracy improved from 0.66681 to 0.66858, saving model to bestweights.h5
Epoch 4/100
Epoch 4: val_accuracy did not improve from 0.66858
Epoch 5/100
Epoch 5: val_accuracy did not improve from 0.66858
Epoch 6/100
Epoch 6: val_accuracy did not improve from 0.66858

Epoch 6: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 7/100
Epoch 7: val_accuracy did not improve from 0.66858
Epoch 8/100
Epoch 8: val_accuracy did not improve from 0.66858
Epoch 9/100
Epoch 9: val_accuracy improved from 0.66858 to 0.67079, saving model to bestweights.h5
Epoch 10/100
Epoch 10: val_accuracy did not improve from 0.67079
Epoch 11/100
Epoch 11: val_accuracy did not improve from 0.67079
Epoch 12/100
Epoch 12: val_accuracy did not improve from 0.67079

Epoch 12: ReduceLRO

In [64]:
# Medir F1-score y accuracy en test

import numpy as np
from sklearn.metrics import f1_score, accuracy_score

y_pred = np.argmax(model.predict(test_sequences), axis=-1)

f1 = f1_score(newsgroups_test.target, y_pred, average='macro')
accuracy = accuracy_score(newsgroups_test.target, y_pred)

print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")


F1 Score: 0.6124805438308195
Accuracy: 0.6196229421136484


In [None]:
'''
Tokenizacion: opciones DONE
Elman, LSTM, GRU
Bidireccional     DONE
Tamaño de capas y cantidad    DONE
Dropout
RMSProp, ADAM     DONE
BATCH_SIZE DONE
Unloop
TPU?
Embedding entrenable    DONE
Forma de colapsar las secuencias
Reduccion de dimensionalidad embedding    PCA/DONE
'''