In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import tensorflow_addons as tfa

In [2]:
df = pd.read_csv('/kaggle/input/financial-sentiment-analysis/data.csv')
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,Sentence,Sentiment
0,Altogether CapMan employs approximately 150 pe...,neutral
1,Work on the assignment has already started and...,neutral
2,Nordea will coordinate the syndicated loan .,neutral
3,The company had net sales of EUR 19.8 mn and a...,neutral
4,Customers wave cards in front of the reader to...,neutral
...,...,...
5837,"Motorola , the world 's second-largest maker o...",neutral
5838,"The Estonian electronic components factory , E...",negative
5839,Major Order in India Comptel Corporation has r...,positive
5840,"The add-on order contains , among others , con...",neutral


In [3]:
k = np.unique(df.Sentiment)
v = np.arange(len(k))
mapping_dict = dict(zip(k, v))
df.Sentiment = df.Sentiment.map(mapping_dict)
df

Unnamed: 0,Sentence,Sentiment
0,Altogether CapMan employs approximately 150 pe...,1
1,Work on the assignment has already started and...,1
2,Nordea will coordinate the syndicated loan .,1
3,The company had net sales of EUR 19.8 mn and a...,1
4,Customers wave cards in front of the reader to...,1
...,...,...
5837,"Motorola , the world 's second-largest maker o...",1
5838,"The Estonian electronic components factory , E...",0
5839,Major Order in India Comptel Corporation has r...,2
5840,"The add-on order contains , among others , con...",1


In [4]:
from transformers import TFAutoModel, AutoTokenizer, AutoConfig
from datasets import Dataset

In [5]:
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

# Dataset

In [6]:
df.columns = ['text', 'label']
train, valid = train_test_split(df, shuffle=True, random_state=123, test_size=0.2, stratify=df.label)

hf_train = Dataset.from_pandas(train)
hf_valid = Dataset.from_pandas(valid)

In [7]:
def tokenize_fn(row):
    return tokenizer(row['text'], max_length=128, padding='max_length', truncation=True)

In [8]:
train_tokenized = hf_train.map(tokenize_fn, batched=True)
valid_tokenized = hf_valid.map(tokenize_fn, batched=True)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
batch_size = 64

train_ds = tf.data.Dataset.from_tensor_slices(({'input_ids': train_tokenized['input_ids'],
                                         'attention_mask': train_tokenized['attention_mask'],
                                        },
                                        train_tokenized['label']))
train_ds = train_ds.shuffle(128).batch(batch_size).prefetch(-1)

valid_ds = tf.data.Dataset.from_tensor_slices(({'input_ids': valid_tokenized['input_ids'],
                                               'attention_mask': valid_tokenized['attention_mask']
                                              }, 
                                              valid_tokenized['label']))
valid_ds = valid_ds.batch(batch_size).prefetch(-1)

2022-12-21 04:26:46.020487: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-21 04:26:46.111814: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-21 04:26:46.112703: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-21 04:26:46.115726: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

# Model Development

In [10]:
def create_inputs():
    input_ids = layers.Input(shape=(128,), name='input_ids', dtype=tf.int32)
    attention_mask = layers.Input(shape=(128,), name='attention_mask', dtype=tf.int32)
    inputs = {'input_ids': input_ids,
              'attention_mask': attention_mask}
    return inputs

In [11]:
def nn_model():
    
    inputs = create_inputs()
    
    config = AutoConfig.from_pretrained(checkpoint)
    config.output_hidden_states = True

    backbone = TFAutoModel.from_pretrained(checkpoint, config=config)
    x = backbone(inputs)[0]
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(128, activation='relu')(x)
    outputs = layers.Dense(3, activation='softmax')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tfa.optimizers.AdamW(learning_rate=2e-5, weight_decay=1e-4),
                  metrics='accuracy',
                 )
    return model

In [12]:
model = nn_model()
model.summary()

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

2022-12-21 04:27:03.933962: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the 

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model (TFDistilB TFBaseModelOutput(la 66362880    attention_mask[0][0]             
                                                                 input_ids[0][0]                  
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 768)          0           tf_distil_bert_model[0][7]   

In [13]:
ckpt = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_weights.h5',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
    mode='max',
)

model.fit(train_ds,
          validation_data=valid_ds,
          epochs=3,
          callbacks=[ckpt]
         )

Epoch 1/3


2022-12-21 04:27:19.152420: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)



Epoch 00001: val_accuracy improved from -inf to 0.76989, saving model to best_weights.h5
Epoch 2/3

Epoch 00002: val_accuracy improved from 0.76989 to 0.77844, saving model to best_weights.h5
Epoch 3/3

Epoch 00003: val_accuracy improved from 0.77844 to 0.78015, saving model to best_weights.h5


<keras.callbacks.History at 0x7fd6c47319d0>

In [14]:
# load best weights
model.load_weights('./best_weights.h5')

In [16]:
# evaluate on validation set
loss, accuracy = model.evaluate(valid_ds, verbose=1)



In [17]:
print(f'Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}')

Validation Loss: 0.4559, Validation Accuracy: 0.7802
