<a href="https://colab.research.google.com/github/jgermanob/JobOffersClassifier/blob/master/Notebooks/Split/CNN_trainable_split_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Acceso a Google Drive

In [1]:
from google.colab import drive

# Mount the user's Google Drive inside the Colab runtime so the dataset,
# tokenizer and result files used below can be addressed by filesystem path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Bibliotecas necesarias

In [2]:
!pip3 install pickle5

Collecting pickle5
[?25l  Downloading https://files.pythonhosted.org/packages/f7/4c/5c4dd0462c8d3a6bc4af500a6af240763c2ebd1efdc736fc2c946d44b70a/pickle5-0.0.11.tar.gz (132kB)
[K     |██▌                             | 10kB 24.3MB/s eta 0:00:01[K     |█████                           | 20kB 15.0MB/s eta 0:00:01[K     |███████▍                        | 30kB 13.1MB/s eta 0:00:01[K     |██████████                      | 40kB 12.2MB/s eta 0:00:01[K     |████████████▍                   | 51kB 7.6MB/s eta 0:00:01[K     |██████████████▉                 | 61kB 8.3MB/s eta 0:00:01[K     |█████████████████▍              | 71kB 8.4MB/s eta 0:00:01[K     |███████████████████▉            | 81kB 8.7MB/s eta 0:00:01[K     |██████████████████████▎         | 92kB 8.8MB/s eta 0:00:01[K     |████████████████████████▉       | 102kB 7.4MB/s eta 0:00:01[K     |███████████████████████████▎    | 112kB 7.4MB/s eta 0:00:01[K     |█████████████████████████████▊  | 122kB 7.4MB/s eta 0:00:01

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adadelta
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical
from gensim.models import KeyedVectors
import tensorflow.keras.backend as K
import numpy as np
import io
import datetime
import pickle5 as pickle
from sklearn.metrics import classification_report

## Definición del modelo

In [4]:
class CNN:
  """Single-convolution text classifier on top of a trainable embedding layer.

  Architecture: token ids -> Embedding (initialised from `embedding_matrix`,
  fine-tuned during training) -> Conv1D(128 filters, kernel size 5, ReLU)
  -> GlobalMaxPooling1D -> Dense softmax over `num_classes` classes.

  The Keras model is built, compiled, optionally plotted and summarised in the
  constructor and is then available as `self.model`.
  """

  # Default location of the architecture diagram written by `plot_model`.
  DEFAULT_PLOT_PATH = '/content/drive/My Drive/dicode/Backtranslation/cnn_model.png'

  def __init__(self, input_dim, embedding_matrix, input_length=300,
               output_dim=300, num_classes=23, plot_path=DEFAULT_PLOT_PATH):
    """Build and compile the model.

    Args:
      input_dim: Vocabulary size; the embedding layer is created with
        `input_dim + 1` rows (presumably because index 0 is reserved for
        padding -- confirm against how the tokenizer was fitted).
      embedding_matrix: Initial embedding weights; must match the embedding
        layer's weight shape `(input_dim + 1, output_dim)`.
      input_length: Number of token ids per input sequence.
      output_dim: Dimensionality of each embedding vector.
      num_classes: Number of units in the softmax output layer.
      plot_path: Destination file for the architecture diagram, or `None`
        to skip plotting.
    """
    # Named `token_ids` rather than `input` to avoid shadowing the builtin.
    token_ids = Input(shape=(input_length,), name='input_1')
    embeddings = layers.Embedding(
        input_dim + 1, output_dim, input_length=input_length,
        weights=[embedding_matrix], trainable=True)
    conv1D = layers.Conv1D(128, 5, activation='relu', name='Conv1D_layer')

    conv_features = conv1D(embeddings(token_ids))
    # Keep only the strongest activation of every filter across the sequence.
    pooled_features = layers.GlobalMaxPooling1D()(conv_features)
    predictions = layers.Dense(
        num_classes, activation='softmax', name='dense_layer')(pooled_features)

    self.model = Model([token_ids], predictions)
    self.__compile()
    if plot_path is not None:
      plot_model(self.model, to_file=plot_path, show_shapes=True, show_dtype=True)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line after the table.
    self.model.summary()

  def __compile(self):
    """Compile `self.model` with categorical cross-entropy, Adam and accuracy."""
    self.model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        optimizer='adam',
        metrics=['accuracy'])

  def fit(self, input, targets, epochs=5, batch_size=128):
    """Train the model and return what `self.model.fit` returns.

    Args:
      input: Training sequences, forwarded to the model as a one-element list.
      targets: Training targets, forwarded unchanged (the loss is categorical
        cross-entropy, so one-hot rows are presumably expected -- confirm).
      epochs: Maximum number of training epochs.
      batch_size: Number of samples per gradient update.

    Returns:
      The value returned by `self.model.fit` (the Keras training history).
    """
    # NOTE(review): early stopping watches the *training* loss with a patience
    # of 20 epochs, so it cannot trigger unless `epochs` is larger than 20.
    early_stopping_monitor = EarlyStopping(
        monitor='loss', patience=20, min_delta=0.00001)
    callbacks = [early_stopping_monitor]
    return self.model.fit(
        [input], targets, epochs=epochs, batch_size=batch_size,
        callbacks=callbacks)

  def predict(self, input):
    """Return the model's predictions for `input`."""
    return self.model.predict([input])

  def evaluate(self, input, targets, batch_size=128):
    """Return the result of `self.model.evaluate` on `input` / `targets`."""
    return self.model.evaluate([input], targets, batch_size=batch_size)


## Entrenamiento y evaluación

In [5]:
def load_data(path, key='arr_0'):
  """Load one array from an `.npz` archive.

  Args:
    path: Location of the `.npz` file.
    key: Name of the array inside the archive. `'arr_0'` is the name
      `np.savez` assigns to the first positional array.

  Returns:
    The array stored under `key`.

  Raises:
    KeyError: If the archive has no entry named `key`.
  """
  # np.load returns a lazily-read archive object for .npz files that holds the
  # file open; the context manager closes it once the array has been read.
  with np.load(path) as archive:
    return archive[key]

In [6]:
# Locations on the mounted Drive of the dataset splits and the results folder.
BASE_DIR = '/content/drive/My Drive/dicode/Backtranslation'
TRAIN_DIR = BASE_DIR + '/Split/Train'
TEST_DIR = BASE_DIR + '/Split/Test'
REPORT_PATH = BASE_DIR + '/Resultados/Split/classification_report_cnn_trainable_base.txt'

# Load the train/test arrays and the embedding matrix built for the train split.
x_train = load_data(TRAIN_DIR + '/x_train_base.npz')
y_train = load_data(TRAIN_DIR + '/y_train_base.npz')
x_test = load_data(TEST_DIR + '/x_test_base.npz')
y_test = load_data(TEST_DIR + '/y_test_base.npz')
embedding_matrix = load_data(TRAIN_DIR + '/embedding_matrix_train_base.npz')

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load tokenizer files that come from a trusted source.
with open(TRAIN_DIR + '/tokenizer_train_base.pickle', 'rb') as handle:
  tokenizer = pickle.load(handle)

# The vocabulary size is taken from the tokenizer's word index.
input_dim = len(tokenizer.word_index)
cnn_model = CNN(input_dim=input_dim, embedding_matrix=embedding_matrix)
cnn_model.fit(x_train, y_train, epochs=10)
cnn_model.evaluate(x_test, y_test)

# Reduce the per-class scores (and the targets, assumed one-hot -- confirm)
# to class indices, which is the form classification_report compares.
y_pred = cnn_model.predict(x_test)
y_predicted = np.argmax(y_pred, axis=-1)
y_true = np.argmax(y_test, axis=-1)
report = classification_report(y_true, y_predicted)
print(report)

# `with` guarantees the report file is closed even if the write fails.
with open(REPORT_PATH, 'w', encoding='utf8') as output_file:
  output_file.write(report)


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 300)          37010400  
_________________________________________________________________
Conv1D_layer (Conv1D)        (None, 296, 128)          192128    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense_layer (Dense)          (None, 23)                2967      
Total params: 37,205,495
Trainable params: 37,205,495
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10