<a href="https://colab.research.google.com/github/jpcilfone/BalancedRobot/blob/main/RNN_TP4_C%C3%ADlfone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os, re, csv, math, codecs, logging
from collections import Counter
from pathlib import Path
from io import StringIO
import pickle
import gdown

import numpy as np
from matplotlib import pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Use fetch_20newsgroups to load the data with subset selection
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [3]:
# descargamos los embeddings de palabras de Fasttext para inglés y descomprimimos el archivo.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip wiki-news-300d-1M.vec.zip

--2024-06-28 01:14:18--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.162.163.11, 3.162.163.19, 3.162.163.34, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.162.163.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2024-06-28 01:14:28 (65.0 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   


In [4]:
# cargamos los embeddings de palabras
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('wiki-news-300d-1M.vec', encoding='utf-8')

for line in f:
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print(f'found {len(embeddings_index)} word vectors')

loading word embeddings...
found 999995 word vectors


In [5]:
# instanciamos el tokenizador
token = Tokenizer(num_words=40000,        # Cambiar el numero de palabras a 10k, 20k, 40k
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                lower=True,
                split=' ',
                char_level=False,
                oov_token="UNK",
                document_count=0)

In [6]:
# fiteamos el tokenizador
token.fit_on_texts(newsgroups_train.data)

In [7]:
# obtenemos los diccionarios idx2word y word2idx
reverse_dictionary = token.index_word
dictionary = dict([(value, key) for (key, value) in reverse_dictionary.items()])

In [8]:
# cargamos en una matriz los embeddings de las palabras
# presentes en el vocabulario
embed_dim=300
num_words=len(dictionary)+1
embedding_matrix=np.zeros([num_words,embed_dim])
for word, idx in dictionary.items():
  if idx <= num_words and word in embeddings_index:
    embedding_matrix[idx,:]=embeddings_index[word]

In [9]:
embedding_matrix.shape

(105374, 300)

In [10]:
# se tokenizan los textos
train_sequences=token.texts_to_sequences(newsgroups_train.data)
test_sequences=token.texts_to_sequences(newsgroups_test.data)

In [11]:
# En este punto seleccionamos el tamaño de contexto a procesar en la variable `max_len`
max_len=500
train_sequences=pad_sequences(train_sequences,maxlen=max_len)
test_sequences=pad_sequences(test_sequences,maxlen=max_len)

In [12]:
from keras.layers import Bidirectional, LSTM, Dense, Embedding, Dropout, BatchNormalization
from keras.models import Sequential
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping

In [13]:
model = Sequential()

model.add(Embedding(input_dim=num_words, output_dim=embed_dim, weights=[embedding_matrix], input_shape=(None,), trainable = True))

model.add(Bidirectional(LSTM(200, return_sequences=True)))
model.add(Dropout(0.3))

model.add(Bidirectional(LSTM(200, return_sequences=True)))
model.add(Dropout(0.3))

model.add(LSTM(200))  #
model.add(Dropout(0.3))

model.add(Dense(32, activation='swish'))
model.add(BatchNormalization())
model.add(Dense(20, activation='softmax'))

# Clasificación multiple categórica --> loss = categorical_crossentropy
# notar que usamos la versión Sparse para utilizar sólo índices en lugar de OHE
model.compile(loss=SparseCategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])


model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         31612200  
                                                                 
 bidirectional (Bidirection  (None, None, 400)         801600    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, None, 400)         0         
                                                                 
 bidirectional_1 (Bidirecti  (None, None, 400)         961600    
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, None, 400)         0         
                                                                 
 lstm_2 (LSTM)               (None, 200)               4

**Encontre esto para lo de Return Sequences**

When return_sequences=True, an output is generated for each timestep. So if there are 5 LSTM Cells in your layer, there will be 5 outputs, one per cell.

When return_sequences=False, only the last output of the forward pass (located at timestep T-1) AND the last output of the backward pass (located at timestep 0) are returned.

In both cases, the outputs are merged in some defined way, e.g concat, sum, etc

In [14]:
# Model Checkpoint
mc = ModelCheckpoint(
    "best.weights.h5",
    monitor = "val_accuracy",
    verbose = 1,
    save_best_only = True,
    save_weights_only = True,
)

# Reduce Learning Rate on Plateau
rlrop = ReduceLROnPlateau(
    monitor = "val_accuracy",
    factor = 0.5,
    patience = 3,
    verbose = 1,
    min_lr = 1e-5
)

# Early Stopping
es = EarlyStopping(
    monitor = "val_accuracy",
    patience = 5,
    verbose = 1,
    restore_best_weights = True,
)

# Tensorboard
tb = TensorBoard(
    log_dir="logs",
)
)

history = model.fit(train_sequences, newsgroups_train.target,
                    batch_size=64,
                    epochs=100,
                    validation_split=0.2,
                    callbacks=[mc,rlop,es,tb],
                    verbose=1
                    )


Epoch 1/100
  1/142 [..............................] - ETA: 2:07:21 - loss: 3.1063 - accuracy: 0.0312

KeyboardInterrupt: 

In [None]:
# Medir F1-score en test

In [None]:
# prompt: Medir F1-score en test

import numpy as np
from sklearn.metrics import f1_score

y_pred = model.predict(test_sequences)
y_pred = np.argmax(y_pred, axis=1)
f1 = f1_score(newsgroups_test.target, y_pred, average="weighted")
print(f"F1-Score en test: {f1}")


In [None]:
'''

Tamaño de capas y cantidad
Dropout
RMSProp, ADAM
BATCH_SIZE
Unloop
TPU?
Embedding entrenable
Forma de colapsar las secuencias
Reduccion de dimensionalidad embedding
'''

'\n\nTamaño de capas y cantidad\nDropout\nRMSProp, ADAM\nBATCH_SIZE\nUnloop\nTPU?\nEmbedding entrenable\nForma de colapsar las secuencias\nReduccion de dimensionalidad embedding\n'