# Classifier based on deep learning

## Import necessary dependencies and data

In [1]:
from sklearn.metrics import accuracy_score, f1_score
import os
import numpy as np
from data_extraction import get_raw_dataset
import tensorflow as tf

KeyboardInterrupt: 

In [None]:
# Reload Raw Data
X_train, y_train = get_raw_dataset(mode='train')
X_dev, y_dev = get_raw_dataset(mode='dev')
X_test, _ = get_raw_dataset(mode='test')

## Load the Pre-Trained DistilBERT Classification-based Model

Note: This pre-trained model has a classification head which is suitable for our problem. 

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# Load a pretrained model
# https://huggingface.co/distilbert/distilbert-base-uncased
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Initialize DistilBERT model for sequence classification
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2) # binary classification

# Convert X_train to a list of strings
X_train_list = X_train.tolist()

encoded_input = tokenizer(
    X_train_list,
    padding=True,
    truncation=True,
    max_length=64,
    return_tensors='tf'
)

# Convert y_train to a tensor
y_train_tensor = tf.convert_to_tensor(y_train.values)

outputs = model(
    input_ids=encoded_input['input_ids'],
    attention_mask=encoded_input['attention_mask'],
    labels=y_train_tensor  # for computing a loss
)
loss = outputs.loss
logits = outputs.logits

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

ResourceExhaustedError: Exception encountered when calling layer 'embeddings' (type TFEmbeddings).

{{function_node __wrapped__AddV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[119757,64,768] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu [Op:AddV2] name: 

Call arguments received by layer 'embeddings' (type TFEmbeddings):
  • input_ids=tf.Tensor(shape=(119757, 64), dtype=int32)
  • position_ids=None
  • inputs_embeds=None
  • training=False

## Fine-tuning the DistilBERT Model

In [None]:
# Construct a Tensorflow-based dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(encoded_input),  # model expects a dict of input_ids/attention_mask
    y_train_tensor
)).batch(8)

In [None]:
# Compile the Model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5) # Learning rate inspired by: https://arxiv.org/pdf/1810.04805
model.compile(optimizer=optimizer, loss=model.compute_loss)

In [None]:
# Fine-tune the Model with less epochs via another training
model.fit(train_dataset, epochs=2)  

In [None]:
# Define the directory path
dir = os.path.dirname(os.curdir)

# Save the pretrained model
model.save_pretrained(os.path.join(dir, 'models', 'model_deep_learning_distilBERT'))