## 1. Importar las dependencias

In [1]:
import numpy as np
import re
import pandas as pd
from bs4 import BeautifulSoup
import warnings

In [2]:
import tensorflow as tf
warnings.filterwarnings('ignore')

# Utilizaremos el tokenizador de este módulo
import tensorflow_datasets as tfds

---
## 2. Preprocesado de datos

### Carga de Ficheros

In [3]:
# Column names assigned to the CSV files (they are read with header=None,
# so the first row of each file is treated as data).
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Parsing options shared by both read_csv calls.
csv_options = dict(header=None, names=cols, engine="python", encoding="latin1")

# NOTE(review): absolute, machine-specific paths; the notebook only runs where
# these files exist at exactly these locations.
train_data = pd.read_csv(
    r'D:\Python Scripts & Notebooks\Jupyter Notebooks\Artificial Intelligence\Deep Learning\Procesamiento del Lenguaje Natural Moderno en Python\RNC\train.csv',
    **csv_options)

test_data = pd.read_csv(
    r'D:\Python Scripts & Notebooks\Jupyter Notebooks\Artificial Intelligence\Deep Learning\Procesamiento del Lenguaje Natural Moderno en Python\RNC\test.csv',
    **csv_options)

In [4]:
# Preview the first rows of the training DataFrame
train_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


El conjunto de datos de testing tiene 3 etiquetas diferentes (una negativa, una positiva y una neutra), mientras que el conjunto de datos de entrenamiento tiene solo dos, por lo que no usaremos el archivo de testing y dividiremos el archivo de entrenamiento más tarde nosotros mismos.

In [5]:
# Work on a copy so that later edits to `data` leave `train_data` as loaded
# (DataFrame.copy() makes a deep copy by default)
data = train_data.copy()

### Limpieza

In [6]:
# Keep only the columns used downstream by discarding the unused predictors
unused_columns = ["id", "date", "query", "user"]
data = data.drop(columns=unused_columns)

In [7]:
# Function that cleans a single tweet
def clean_tweet(tweet):
    """Return ``tweet`` as plain text with mentions, URLs and symbols blanked.

    The text is first extracted from any HTML markup with BeautifulSoup
    (lxml parser); then four regex substitutions run in a fixed order, each
    replacing its matches with a single space:

    1. ``@`` followed by letters/digits (user mentions),
    2. ``http://`` / ``https://`` followed by letters, digits, ``.`` or ``/``,
    3. every character that is not a letter or one of ``. ! ? '``,
    4. runs of spaces (collapsed to one space).

    The result is not stripped, so it may start or end with a space.
    """
    text = BeautifulSoup(tweet, "lxml").get_text()

    # Order matters: mentions and URLs must be removed before step 3 erases
    # the '@', ':' and '/' characters that those patterns anchor on.
    # NOTE(review): the mention class has no '_' and the URL class has no
    # '-', '?', '=' etc., so the tail of such handles/links survives as text.
    # Changing the patterns changes the corpus the tokenizer is built from.
    substitutions = (
        (r"@[A-Za-z0-9]+", ' '),
        (r"https?://[A-Za-z0-9./]+", ' '),
        (r"[^a-zA-Z.!?']", ' '),
        (r" +", ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

In [8]:
# Run clean_tweet over every entry of the text column, keeping row order
data_clean = list(map(clean_tweet, data.text))

In [9]:
# Number of rows for each value of the target column
data.groupby('sentiment').size()

sentiment
0    800000
4    800000
dtype: int64

In [10]:
# Recode the positive label from 4 to 1 so the targets are binary (0/1).
# to_numpy(copy=True) returns an independent, writable array. Assigning into
# `data.sentiment.values` directly can rewrite the DataFrame column as a hidden
# side effect, or raise on a read-only array when pandas copy-on-write is on.
data_labels = data.sentiment.to_numpy(copy=True)
data_labels[data_labels == 4] = 1

### Tokenización

In [11]:
# Build a subword tokenizer from the cleaned tweets
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    data_clean,
    target_vocab_size=2**16,
)

# Model inputs: each cleaned tweet encoded with the tokenizer
data_inputs = list(map(tokenizer.encode, data_clean))

### Padding

In [12]:
# Length of the longest encoded tweet
max_len = max(map(len, data_inputs))

# Append 0s to every shorter sequence until it reaches max_len
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(
    data_inputs,
    maxlen=max_len,
    padding="post",
    value=0,
)

### Dividimos en los conjuntos de training y testing


In [13]:
# Layout assumed by this split: rows [0, CLASS_SIZE) belong to one label and
# rows [CLASS_SIZE, 2 * CLASS_SIZE) to the other (800000 rows per label).
# NOTE(review): this relies on train.csv being ordered by label — confirm.
CLASS_SIZE = 800000
TEST_PER_CLASS = 8000
SPLIT_SEED = 42

# Sample the held-out rows WITHOUT replacement from a seeded generator:
# randint() could return the same index several times (duplicated test rows,
# fewer rows actually removed from training) and gave a new split on each run.
rng = np.random.default_rng(SPLIT_SEED)
test_idx = rng.choice(CLASS_SIZE, size=TEST_PER_CLASS, replace=False)
# Mirror the same offsets into the second half so both labels contribute
# TEST_PER_CLASS rows each.
test_idx = np.concatenate((test_idx, test_idx + CLASS_SIZE))

# Test
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]

# Train: everything that was not selected for the test split
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

---
## 3. Construcción del modelo

### Construir y Compilar

In [14]:
# Public Keras import paths. `keras.layers.embeddings` and
# `keras.optimizers.adam_v2` are internal modules that are not available in
# every Keras release, so importing from them breaks on other versions.
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam

# Convolutional Neural Network
def CNN(vocab_size=None, learning_rate=0.0001):
    """Build and compile the 1D-convolutional binary sentiment classifier.

    Layer stack: Embedding (200-dim) -> three Conv1D layers (100 filters each,
    kernel sizes 2, 3 and 4, 'valid' padding, ReLU) -> GlobalAveragePooling1D
    -> Flatten -> Dense(256, ReLU) -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        vocab_size: Size of the embedding vocabulary. When None (default) it
            is read from the notebook-level `tokenizer.vocab_size`.
        learning_rate: Learning rate given to the Adam optimizer.

    Returns:
        The model compiled with binary cross-entropy loss and the 'accuracy'
        metric.
    """
    if vocab_size is None:
        vocab_size = tokenizer.vocab_size

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=200))
    model.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu'))
    model.add(Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu'))
    model.add(Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu'))
    model.add(GlobalAveragePooling1D())
    # Kept so the layer stack stays identical to the original definition
    # (presumably a no-op after global pooling — confirm before removing).
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))

    # Optimizer
    adam = Adam(learning_rate=learning_rate)

    # Compile
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    return model

---
## 4. Ajuste

In [15]:
# Training callbacks (this cell only defines them; nothing is compiled here)
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
filepath = './checkpoints-weights.hdf5'

# All three monitor the training loss.
# Stop after 2 epochs without improvement and restore the best weights seen.
early_stopping = EarlyStopping(monitor='loss', patience=2, restore_best_weights=True)
# Halve the learning rate after 1 epoch without improvement.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=1, verbose=1)
# mode='min': a loss improves by decreasing. With mode='max' the checkpoint
# was only overwritten when the loss got worse, so the saved "best" model was
# the one with the highest loss.
checkpoint = ModelCheckpoint(filepath, monitor='loss', mode='min', save_best_only=True, verbose=1)

### Entrenar

In [16]:
# Instantiate the model (disabled: the following cell loads a saved model)
# model = CNN()

# Retrain (disabled; uncomment together with the line above to train again)
# model.fit(train_inputs,
#           train_labels,
#           batch_size=32,
#           epochs=5,
#           callbacks = [reduce_lr, early_stopping, checkpoint])

### Cargar

In [17]:
# Load the previously saved model from disk (load_model is called on the
# model file, not on a weights-only checkpoint)
# NOTE(review): the tokenizer is rebuilt from the corpus in this session;
# presumably the saved model expects the same vocabulary/ids — confirm.
from keras.models import load_model

model = load_model('model_final.h5')

### Evaluación

In [18]:
# Evaluate the loaded model on the test split
# NOTE(review): the model file was trained elsewhere; if its training rows
# came from a different random split, some test rows may have been seen in
# training — confirm before trusting this score.
results = model.evaluate(test_inputs, test_labels, batch_size=32)
# results[1] is reported as accuracy — assumes the loaded model was compiled
# with a single 'accuracy' metric after the loss, as in CNN(); verify.
print(f'Accuracy: {round(results[1]*100, 2)}%')

Accuracy: 83.84%
