In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from nltk.tokenize import word_tokenize

import tensorflow as tf
from tensorflow.keras import layers


from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'tensorflow.keras'

In [2]:
words = []
embeddings = []

with open('/kaggle/input/glove6b100d/glove.6B.100d.txt', 'r') as f:
    for line in tqdm(f, total=400_000):
        word, *vector = line.split()
        words.append(word)
        embeddings.append(vector)

  0%|          | 0/400000 [00:00<?, ?it/s]

In [3]:
embeddings = np.float32(np.array(embeddings))

In [4]:
embeddings.shape

(400000, 100)

In [5]:
embeddings = np.concatenate([np.zeros((2, 100)),
                            embeddings], axis=0)

In [6]:
word2idx = {w: i+2 for i, w in enumerate(words)}
idx2word = {i+2: w for i, w in enumerate(words)}

word2idx['PAD'] = 0
idx2word[0] = 'PAD'

word2idx['OOV'] = 1
idx2word[1] = 'OOV'

# Data reading and preprocecssing

In [7]:
df = pd.read_csv('/kaggle/input/financial-sentiment-analysis/data.csv')
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [8]:
keys = np.unique(df.Sentiment)
values = list(range(len(keys)))
label_mapper = dict(zip(keys, values))
label_mapper

{'negative': 0, 'neutral': 1, 'positive': 2}

In [9]:
df.Sentiment = df.Sentiment.map(label_mapper)

In [10]:
df.Sentence = df.Sentence.apply(lambda x: x.lower())
df

Unnamed: 0,Sentence,Sentiment
0,the geosolutions technology will leverage bene...,2
1,"$esi on lows, down $1.50 to $2.50 bk a real po...",0
2,"for the last quarter of 2010 , componenta 's n...",2
3,according to the finnish-russian chamber of co...,1
4,the swedish buyout firm has sold its remaining...,1
...,...,...
5837,rising costs have forced packaging producer hu...,0
5838,nordic walking was first used as a summer trai...,1
5839,"according shipping company viking line , the e...",1
5840,"in the building and home improvement trade , s...",1


In [11]:
df.Sentence = df.Sentence.apply(lambda x: word_tokenize(x))

In [12]:
df

Unnamed: 0,Sentence,Sentiment
0,"[the, geosolutions, technology, will, leverage...",2
1,"[$, esi, on, lows, ,, down, $, 1.50, to, $, 2....",0
2,"[for, the, last, quarter, of, 2010, ,, compone...",2
3,"[according, to, the, finnish-russian, chamber,...",1
4,"[the, swedish, buyout, firm, has, sold, its, r...",1
...,...,...
5837,"[rising, costs, have, forced, packaging, produ...",0
5838,"[nordic, walking, was, first, used, as, a, sum...",1
5839,"[according, shipping, company, viking, line, ,...",1
5840,"[in, the, building, and, home, improvement, tr...",1


In [13]:
def indexing_fn(tokens):
    idxs = []
    for token in tokens:
        idxs.append(word2idx.get(token, word2idx['OOV']))
    return idxs

In [14]:
df.Sentence = df.Sentence.apply(indexing_fn)

In [15]:
df

Unnamed: 0,Sentence,Sentiment
0,"[2, 1, 734, 45, 10746, 1, 11, 14652, 4802, 23,...",2
1,"[82, 80988, 15, 9831, 3, 137, 82, 17310, 6, 82...",0
2,"[12, 2, 78, 550, 5, 657, 3, 1, 11, 1052, 528, ...",2
3,"[202, 6, 2, 1, 3198, 5, 2304, 3, 66, 2, 226, 9...",1
4,"[2, 2794, 11551, 1003, 33, 921, 49, 1528, 3179...",1
...,...,...
5837,"[1539, 1140, 35, 1013, 10831, 1940, 342165, 6,...",0
5838,"[9873, 3528, 17, 60, 182, 21, 9, 742, 790, 324...",1
5839,"[202, 4562, 130, 13631, 333, 3, 2, 646, 453, 4...",1
5840,"[8, 2, 449, 7, 165, 3600, 314, 3, 528, 7842, 2...",1


In [16]:
padded_idx = tf.keras.preprocessing.sequence.pad_sequences(df.Sentence,
                                                          maxlen=128,
                                                          padding='post',
                                                          truncating='post')

In [17]:
train_x, valid_x = train_test_split(padded_idx, test_size=0.2, shuffle=True, random_state=0, stratify=df.Sentiment)
train_y, valid_y = train_test_split(df.Sentiment, test_size=0.2, shuffle=True, random_state=0, stratify=df.Sentiment)

In [18]:
train_ds = tf.data.Dataset.from_tensor_slices((train_x, train_y)).shuffle(128).batch(64).prefetch(-1)
valid_ds = tf.data.Dataset.from_tensor_slices((valid_x, valid_y)).batch(64).prefetch(-1)

2022-11-28 11:21:28.413379: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-28 11:21:28.564047: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-28 11:21:28.565225: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-28 11:21:28.568612: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [61]:
for x, y in train_ds:
    break

# Embedding layer

In [69]:
embed_layer = layers.Embedding(input_dim=400002,
                               output_dim=100,
                               embeddings_initializer=tf.keras.initializers.Constant(embeddings),
                               trainable=True,)

In [75]:
embed_layer = layers.Embedding(input_dim=400002,
                               output_dim=200,
                               trainable=True,)

# model dev

In [83]:
def nn_model():
    inputs = layers.Input(shape=(128,))
    embed = layers.Embedding(input_dim=400002,
                               output_dim=100,
                               embeddings_initializer=tf.keras.initializers.Constant(embeddings),
                               trainable=False,)
    x = embed(inputs)
    x = layers.SimpleRNN(265, return_sequences=True)(x)
    x = layers.SimpleRNN(128, return_sequences=False)(x)
    
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(3, activation='softmax')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  metrics=['accuracy'])
    return model

In [84]:
tf.keras.backend.clear_session()
model = nn_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 100)          40000200  
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128, 265)          96990     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 128)               50432     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 40,156,073
Trainable params: 155,873
Non-trainable params: 40,000,200
___________________________________________

In [None]:
model.fit(train_ds, 
         validation_data=valid_ds,
         epochs=5)

Epoch 1/5


2022-11-28 11:18:55.484904: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/5
Epoch 3/5

In [19]:
def nn_model():
    inputs = layers.Input(shape=(128,))
    embed = layers.Embedding(input_dim=400002,
                               output_dim=100,
                               embeddings_initializer=tf.keras.initializers.Constant(embeddings),
                               trainable=False,)
    x = embed(inputs)
    x = layers.LSTM(265, return_sequences=True)(x)
    x = layers.LSTM(128, return_sequences=False)(x)
    
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(3, activation='softmax')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  metrics=['accuracy'])
    return model

In [20]:
tf.keras.backend.clear_session()
model = nn_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 100)          40000200  
_________________________________________________________________
lstm (LSTM)                  (None, 128, 265)          387960    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               201728    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 40,598,339
Trainable params: 598,139
Non-trainable params: 40,000,200
___________________________________________

In [21]:
model.fit(train_ds, 
         validation_data=valid_ds,
         epochs=5)

Epoch 1/5


2022-11-28 11:22:00.984670: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-11-28 11:22:02.122578: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f83b3155550>

In [22]:
def nn_model():
    inputs = layers.Input(shape=(128,))
    embed = layers.Embedding(input_dim=400002,
                               output_dim=100,
                               embeddings_initializer=tf.keras.initializers.Constant(embeddings),
                               trainable=False,)
    x = embed(inputs)
    x = layers.GRU(265, return_sequences=True)(x)
    x = layers.GRU(128, return_sequences=False)(x)
    
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(3, activation='softmax')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  metrics=['accuracy'])
    return model

In [23]:
tf.keras.backend.clear_session()
model = nn_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 100)          40000200  
_________________________________________________________________
gru (GRU)                    (None, 128, 265)          291765    
_________________________________________________________________
gru_1 (GRU)                  (None, 128)               151680    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 40,452,096
Trainable params: 451,896
Non-trainable params: 40,000,200
___________________________________________

In [24]:
model.fit(train_ds, 
         validation_data=valid_ds,
         epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f83b3092d50>

In [25]:
def nn_model():
    inputs = layers.Input(shape=(128,))
    embed = layers.Embedding(input_dim=400002,
                               output_dim=100,
                               embeddings_initializer=tf.keras.initializers.Constant(embeddings),
                               trainable=False,)
    x = embed(inputs)
    x = layers.Bidirectional(layers.LSTM(265, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=False))(x)
    
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(3, activation='softmax')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  metrics=['accuracy'])
    return model

In [26]:
tf.keras.backend.clear_session()
model = nn_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 100)          40000200  
_________________________________________________________________
bidirectional (Bidirectional (None, 128, 530)          775920    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               674816    
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 41,467,579
Trainable params: 1,467,379
Non-trainable params: 40,000,200
_________________________________________

In [27]:
model.fit(train_ds, 
         validation_data=valid_ds,
         epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f83b3a86d50>

In [29]:
preds = model.predict(valid_ds, verbose=1)



In [30]:
preds

array([[0.21806417, 0.6731907 , 0.10874511],
       [0.16601238, 0.73893976, 0.09504792],
       [0.01270728, 0.71048933, 0.27680346],
       ...,
       [0.03087193, 0.93018484, 0.03894313],
       [0.01813367, 0.81501305, 0.16685326],
       [0.01562289, 0.92322725, 0.06114994]], dtype=float32)