In [14]:
# data processing
import numpy as np 
import pandas as pd

# ml/dl
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [7]:
df = pd.read_csv('../data/news_headlines.csv')
df = df.drop(df.columns[0], axis=1)
df.head(4)

Unnamed: 0,label,headline
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...


In [8]:
df.shape

(30846, 2)

In [76]:
df.label.value_counts()

label
positive    13811
neutral     11627
negative     5408
Name: count, dtype: int64

## Using NewsSentiment library

In [2]:
from NewsSentiment import TargetSentimentClassifier

  from .autonotebook import tqdm as notebook_tqdm





In [3]:
tsc = TargetSentimentClassifier()

In [12]:
# select data and preprocess input
N = 1000
data = list(df.headline.values[:N])
data = list(map(lambda x: ('', x, ''), data))
data

[('',
  'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
  ''),
 ('',
  'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .',
  ''),
 ('',
  'The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .',
  ''),
 ('',
  'With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .',
  ''),
 ('',
  "According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating pro

In [13]:
# predict
sentiments = tsc.infer(targets=data)
print(sentiments[0])

Processing batches:   6%|▌         | 62/1000 [00:24<06:00,  2.60batch/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Processing batches:  39%|███▉      | 393/1000 [02:27<03:54,  2.59batch/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Processing batches: 100%|██████████| 1000/1000 [06:02<00:00,  2.76batch/s]

({'class_id': 1, 'class_label': 'neutral', 'class_prob': 0.8414392471313477}, {'class_id': 2, 'class_label': 'positive', 'class_prob': 0.14529915153980255}, {'class_id': 0, 'class_label': 'negative', 'class_prob': 0.013261653482913971})





In [4]:
tsc.infer_from_text('', 'Grayscale Bitcoin ETF faces over $600 million in outflows', '')

Processing batches: 100%|██████████| 1/1 [00:00<00:00,  2.28batch/s]


({'class_id': 0, 'class_label': 'negative', 'class_prob': 0.7503341436386108},
 {'class_id': 1, 'class_label': 'neutral', 'class_prob': 0.22876092791557312},
 {'class_id': 2, 'class_label': 'positive', 'class_prob': 0.02090493217110634})

In [15]:
# extract output labels names
sentiments = list(map(lambda x: x[0]['class_label'], sentiments))
for i in sentiments:
    print(i)

neutral
positive
neutral
positive
positive
positive
positive
positive
positive
positive
positive
positive
neutral
positive
positive
positive
positive
neutral
positive
positive
positive
neutral
neutral
positive
positive
positive
positive
positive
positive
positive
neutral
positive
neutral
positive
positive
neutral
neutral
neutral
positive
positive
positive
positive
neutral
neutral
positive
positive
positive
positive
positive
neutral
positive
positive
neutral
positive
positive
positive
neutral
positive
neutral
neutral
positive
positive
neutral
neutral
neutral
neutral
neutral
neutral
positive
positive
neutral
neutral
neutral
positive
neutral
neutral
positive
positive
neutral
neutral
neutral
neutral
positive
positive
positive
neutral
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
neutral
positive
positive
positive
positive
positive
positive
positive
neutral
positive
positive
positive
neutral
positive
positive
positive
neutral
positive
neutral
neut

In [16]:
# get target labels names
target = list(df.label.values[:N])
target

['neutral',
 'neutral',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 '

In [19]:
def accuracy(pred, target):
    accuracy = 0
    for i in range(len(pred)):
        # count positive and neutral as the same
        if pred[i] != 'negative' and target[i] != 'negative':
            accuracy = accuracy + 1
        else:
            accuracy = accuracy + int(pred[i] == target[i])
    
    return accuracy/len(pred)

In [20]:
accuracy(sentiments, target)

0.972

In [32]:
target_binary = list(map(lambda x: x if x == 'negative' else 'positive', target))
sentiments_binary = list(map(lambda x: x if x == 'negative' else 'positive', sentiments))

In [34]:
le = LabelEncoder().fit(target_binary)

encoded_target = le.transform(target_binary)
encoded_pred = le.transform(sentiments_binary)

print(classification_report(encoded_target, encoded_pred, target_names=list(le.classes_)))

              precision    recall  f1-score   support

    negative       0.73      0.42      0.53        38
    positive       0.98      0.99      0.99       962

    accuracy                           0.97      1000
   macro avg       0.85      0.71      0.76      1000
weighted avg       0.97      0.97      0.97      1000



## Custom model training

In [49]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [51]:
text = list(df['headline'])
labels = list(df['label'])
labels_binary = list(map(lambda x: x if x == 'negative' else 'positive', labels))

In [52]:
# encode labels
le = LabelEncoder()
labels_encoded = le.fit_transform(labels_binary)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(text, labels_encoded, test_size=0.2, random_state=42)

In [54]:
#preprocess 
tokenizer = Tokenizer(num_words=10000, oov_token= "<OOV>")
tokenizer.fit_on_texts(text)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=120, padding='post', truncating='post')
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=120, padding='post', truncating='post')

# convert lists into numpy arrays to make it work with TensorFlow 
X_train_pad = np.array(X_train_pad)
y_train = np.array(y_train)
X_test_pad = np.array(X_test_pad)
y_test = np.array(y_test)


In [69]:
# create model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_length=120),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 120, 16)           160000    
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
 1 (GlobalAveragePooling1D)                                      
                                                                 
 dense_25 (Dense)            (None, 24)                408       
                                                                 
 dense_26 (Dense)            (None, 1)                 25        
                                                                 
Total params: 160433 (626.69 KB)
Trainable params: 160433 (626.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [70]:
epochs = 12
batch_size = 32
training = model.fit(X_train_pad, y_train, epochs=epochs, validation_data=(X_test_pad, y_test))

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
