# Modelling

https://www.datacamp.com/es/tutorial/introduction-to-convolutional-neural-networks-cnns

In [1]:
# Constants & Hyperparameters to define
RANDOM_SEED = 42

NUM_WORDS = 5000
MAX_SEQ_LEN = 50
EMBEDDING_DIM = 50
NUM_FILTERS = 64
KERNEL_SIZE = 5
NUM_CLASSES = 2

In [2]:
# Import Libraries
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense

# Import functions
import sys
sys.path.append('../src')
# from support_model import

# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

2024-07-08 12:00:59.548224: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-08 12:00:59.560464: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 12:00:59.578937: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 12:00:59.578973: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 12:00:59.590100: I tensorflow/core/platform/cpu_feature_gua

In [3]:
# Import data
train_data = pd.read_csv('../data/train_data_preprocessed.csv')

print(f'train_data.shape: {train_data.shape}')
train_data = train_data.sample(frac=0.01)
print(f'train_data.shape: {train_data.shape}')

train_data.shape: (3599455, 5)
train_data.shape: (35995, 5)


In [4]:
text_data = train_data['text']

# Text preprocessing
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(text_data)
sequences = tokenizer.texts_to_sequences(text_data)

# Padding sequences
sequences = pad_sequences(
    sequences, 
    maxlen=MAX_SEQ_LEN, 
    padding='post')

print(f'sequences:\n{sequences[:5]}')

sequences:
[[   3   68   31    1  318   12    3  160   50    1  207  116    2  235
    66   18  178  161   39   66   15  180  594   78  664 4827 1191    1
    64 1484    9 1962 4939   84   44  912  130    4  577  427   66   35
    66    3  699  246 1711   11    8  410]
 [   8   13    1   30   74 1994 4062   20   12    3   19   53    2    3
   259    6    6  346   42  263   70  116  290    4  603    3   33  327
   157    6  163    8   20    9  566    2    3   37  133    6    4  198
    71  961    4   53    0    0    0    0]
 [   1  301   35  132    1   46   28  265  189   11    5  467 2465  365
     8    9   15    6  172   28   59 1936    4 1009  163    2   63  104
    39    5   20    2 4465    5  167    7   54    2  837  108  161   12
   311  375    1 1734    8    9   59   20]
 [   8    9   95   70  915    2   95   70  246    3   19  306   27  174
   226   38   14   19    5    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0  

In [5]:
labels = pd.get_dummies(train_data['labels'], prefix='_label', dtype=int)

print(f'labels:\n{labels[:5]}')

# One-hot encode labels
labels_encoded = to_categorical(
    labels['_label_0'], 
    num_classes=NUM_CLASSES)

print(f'\nlabels_encoded:\n{labels_encoded[:5]}')

labels:
         _label_0  _label_1
157027          0         1
606029          0         1
1821717         1         0
1396672         0         1
2255680         0         1

labels_encoded:
[[1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]]


In [6]:
# Split data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    sequences, 
    labels_encoded, 
    test_size=0.2, 
    random_state=RANDOM_SEED)

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)

X_train shape: (28796, 50)
y_train shape: (28796, 2)


## Model
**Sequential convolutional neural network (CNN) for text classification**

1. Embedding Layer
* `Embedding(input_dim=5000, output_dim=100, input_length=100)`
* `input_dim=5000`: This specifies the vocabulary size, meaning the model can handle up to **5000** unique words.
* `output_dim=100`: This defines the dimensionality of the embedding vector, which compresses each word into a **100**-dimensional vector.
* `input_length=100`: This sets the maximum length of the input text sequences (sentences or paragraphs) to **100** words.

2. Convolutional Layer
* `Conv1D(filters=64, kernel_size=5, activation='relu')`: This 1D convolutional layer extracts features from the embedded text sequences.
* `filters=64`: This indicates the number of filters used to identify patterns in the text.
* `kernel_size=5`: This defines the size of the window that the filter slides over the text sequence (**5** words in this case).
* `activation='relu'`: This activation function introduces non-linearity, allowing the model to learn complex relationships between words.

    * `'relu'` means Rectified Linear Unit (ReLU). 
    * For any input value $(x)$, it outputs the value itself if it's positive $(x > 0)$ and zero otherwise $(x <= 0)$. 
    * Mathematically, it can be represented as:
    * $f(x) = max(0, x)$

3. Pooling Layer
* `MaxPooling1D(pool_size=4)`: This layer reduces the dimensionality of the data by taking the maximum value from every window of size **4** along the sequence This helps control overfitting and focuses on the most important features.

4. Flattening Layer
* `Flatten()`: This layer transforms the 2D output from the convolutional layer into a 1D vector suitable for feeding into the fully connected layers.

5. Fully Connected Layers
* `Dense(10, activation='relu')`: This first fully connected layer has **10** neurons and uses the ReLU activation function. It learns higher-level features by combining the extracted features from the convolutional layers.

* `Dense(3, activation='softmax')`: This final fully connected layer has 3 neurons and uses the softmax activation function. It outputs a probability distribution over 3 categories, making it suitable for multi-class classification tasks (e.g., classifying text into 3 different genres).

    * `'softmax'`: For each element $(i)$ in the input vector, softmax calculates the probability $(p_i)$ using the following formula:
    * $p_i = exp(x_i) / Σ(exp(x_j))$  for all $j$ in the vector
    * Here, $exp(x_i)$ represents the exponentiation of the i-th element in the input vector.
    * $Σ(exp(x_j))$ represents the sum of the exponentials of all elements in the vector.

6. Compiling the Model:
* `model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])`: This compiles the model by specifying the optimizer (Adaptive Moment Estimation (Adam) for efficient training), the loss function (sparse categorical crossentropy for multi-class classification), and the metrics (accuracy to measure performance).

In [7]:
# Define the CNN model
model = Sequential([
    Embedding(
        input_dim=MAX_SEQ_LEN, 
        output_dim=EMBEDDING_DIM, 
        input_length=MAX_SEQ_LEN),
    Conv1D(
        filters=NUM_FILTERS, 
        kernel_size=KERNEL_SIZE, 
        activation='relu', 
        padding='same'),
    MaxPooling1D(
        pool_size=4, 
        padding='same'),
    Flatten(),
    Dense(1, activation='relu'),
    Dense(NUM_CLASSES, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', 
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy', 'precision', 'recall'])

# Train model
model.fit(x=X_train, 
    y=y_train, 
    epochs=10, 
    validation_split=0.2)

Epoch 1/10


2024-07-08 12:01:16.173108: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-08 12:01:16.212353: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-08 12:01:16.212541: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(None, 2), output.shape=(None, 1)

In [None]:
# model_loss, model_accuracy, model_precision, model_recall, model_categorical_crossentropy, model_auc, model_f1_score = model.evaluate(X_test, y_test)
# print("F1 Score:", model_f1_score)