In [1]:
# !python -m spacy download en_core_web_sm

In [2]:
#Others required libraries 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from prettytable import PrettyTable

In [3]:
training_headline_df = pd.read_csv('../input/reddit-fin-headlines-data/2k_new_labelled.csv')[['sentiment', 'title']]
serving_headline_df = pd.read_csv('../input/reddit-fin-headlines-data/8k-labelled-vader.csv')[['sentiment','title']]

kaggle_headline_df = pd.read_csv('../input/sentiment-analysis-for-financial-news/all-data.csv', sep=',', encoding='latin-1',names = ["sentiment","title"])
combined_df = pd.concat([training_headline_df, kaggle_headline_df]).sample(frac=1)

# Deep Learning

In [13]:
#Sklearn 
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#tensorflow
import keras
import tensorflow as tf
from keras import backend as K
from tensorflow.keras import optimizers
from keras.models import Model, load_model
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical,plot_model

In [14]:
finbert = "../input/finbert/"
cnnlstm = "../input/cnnlstm-weight/"

In [15]:
model_names = [
    "train_on_reddit",
    "transfer_learning_on_reddit",
    "combined_data" ]


In [16]:
# Split training_headline_df into training and test with ratio 9:1
train_df = training_headline_df.iloc[:1842]
test_df = training_headline_df.iloc[1842:]

In [17]:
model_name = model_names[-1]
cnn_cpt =  f"{cnnlstm}cnn_{model_name}.h5"
lstm_cpt =  f"{cnnlstm}lstm_{model_name}.h5"
bert_cpt = f"../input/finbert/finbert_finetuned_{model_name}.bin"

In [18]:
id2label = {
    0: "positive",
    1: "negative",
    2: "neutral"
  }

cnn_lstm_id2label = {
    0: "positive",
    1: "neutral",
    2: "negative"
  }

bert2cnn = {
    0: 0,
    1: 2,
    2: 1
  }

### Loading Keras models: CNN, LSTM

In [19]:
def recall_m(y_true, y_pred):
        true_positives     = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall             = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives      = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision           = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall    = recall_m(y_true, y_pred)
    return    2*(precision*recall)/(precision+recall+K.epsilon())

In [20]:
tokens = Tokenizer(oov_token = '<OOV>', lower=True, split=' ')
def get_vocab_size(df):
    tokens.fit_on_texts(df['title'].values)
    vocab_size = len(tokens.word_index) + 1
    return vocab_size, tokens

In [21]:
def prepare_data(headline_df):

    le = LabelEncoder()
    le.fit_transform(headline_df['title'])
    vocab_size, tokens = get_vocab_size(headline_df)
    headline_df['title'] = headline_df['title'].astype(str)
    encoded_train = tokens.texts_to_sequences(headline_df['title'])
    padded_train = pad_sequences(encoded_train, maxlen=200, padding='post', truncating='post')
    
    #label encoding
    le = LabelEncoder()
    headline_df['encoded_category'] = le.fit_transform(headline_df['sentiment'])
    y = headline_df["encoded_category"].values
    y = to_categorical(y, num_classes=3)

    return le, padded_train, y

In [22]:
def get_keras_pred(model_name, X_train):
    cnn_cpt = f"../input/cnnlstm-weight/cnn_{model_name}.h5"
    lstm_cpt = f"../input/cnnlstm-weight/lstm_{model_name}.h5"

    cnn_model = keras.models.load_model(cnn_cpt, custom_objects={"f1_m": f1_m, "precision_m": precision_m, "recall_m": recall_m})
    cnn_preds = cnn_model.predict(X_train)

    lstm_model = keras.models.load_model(lstm_cpt, custom_objects={"f1_m": f1_m, "precision_m": precision_m, "recall_m": recall_m})
    lstm_preds = lstm_model.predict(X_train)
    
    return (cnn_preds, lstm_preds)

In [23]:
def get_cnn_lstm_train_test_pred(train_df, test_df):

    le_train, padded_train, y = prepare_data(train_df)
    (cnn_preds, lstm_preds) = get_keras_pred(model_name, padded_train)

    le_test, padded_test, y_test = prepare_data(test_df)
    (test_cnn_preds, test_lstm_preds) = get_keras_pred(model_name, padded_test)
    
    return (cnn_preds, lstm_preds), (test_cnn_preds, test_lstm_preds), (y, y_test)

In [24]:
def get_cnn_lstm_serving_pred(headline_df):

    le = LabelEncoder()
    le.fit_transform(headline_df['title'])
    vocab_size, tokens = get_vocab_size(headline_df)
    headline_df['title'] = headline_df['title'].astype(str)
    encoded_train = tokens.texts_to_sequences(headline_df['title'])
    padded_train = pad_sequences(encoded_train, maxlen=200, padding='post', truncating='post')
    
    (cnn_preds, lstm_preds) = get_keras_pred(model_name, padded_train)
    
    return (cnn_preds.argmax(axis=-1), lstm_preds.argmax(axis=-1))

### Loading FinBERT

In [25]:
from transformers import (
    AutoTokenizer, 
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
)

from datasets import Dataset, load_metric

import torch
import torch.nn as nn
from tqdm.auto import tqdm

from scipy.optimize import differential_evolution
from tensorflow.keras import layers
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical,plot_model

In [26]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall}

In [27]:
def prep_testing_data(headline_df, tokenizer):

    def tokenize_fn(examples):
        return tokenizer(examples['text'], truncation=True)

    headline_df['sentiment'] = LabelEncoder().fit_transform(headline_df['sentiment'])
    X, y = headline_df['title'].values, headline_df['sentiment'].values
    
    
    dataset_raw = Dataset.from_dict({'text':X, 'labels': y})
    dataset = dataset_raw.map(tokenize_fn, batched=True)
    
    return dataset

In [28]:
def get_pred(model, dataset, tokenizer):
    data_collator = DataCollatorWithPadding(tokenizer)
    
    # arguments for Trainer
    test_args = TrainingArguments(
        output_dir = bert_cpt,
        do_train = False,
        do_predict = True,
        per_device_eval_batch_size = 64,   
        dataloader_drop_last = False    
    )

    # init trainer
    trainer = Trainer(
                  model = model, 
                  args = test_args, 
                  data_collator=data_collator,
                  compute_metrics = compute_metrics)
    
    test_results = trainer.predict(dataset)
    
    return test_results

In [29]:
def get_bert_train_test_pred(train_df, test_df):
    model = AutoModelForSequenceClassification.from_pretrained(bert_cpt)
    tokenizer = AutoTokenizer.from_pretrained(bert_cpt)

    train_dataset = prep_testing_data(train_df, tokenizer)
    train_results = get_pred(model, train_dataset, tokenizer)
    bert_preds = train_results.predictions
    
    test_dataset = prep_testing_data(test_df, tokenizer)
    test_results = get_pred(model,test_dataset, tokenizer)
    test_bert_preds = test_results.predictions
    
    return (bert_preds, test_bert_preds)
    

In [30]:
def get_bert_serve_pred(headline_df):
    model = AutoModelForSequenceClassification.from_pretrained(bert_cpt)
    tokenizer = AutoTokenizer.from_pretrained(bert_cpt)

    def tokenize_fn(examples):
        return tokenizer(examples['text'], truncation=True)

    X = headline_df['title'].values
    
    dataset_raw = Dataset.from_dict({'text':X})
    dataset = dataset_raw.map(tokenize_fn, batched=True)
    
    train_results = get_pred(model, dataset, tokenizer)
    bert_preds = train_results.predictions.argmax(axis=-1)
    
    return bert_preds
    

In [31]:
def prediction_df(cnn_preds, lstm_preds, bert_preds, y):
    train_cnn_preds, train_lstm_preds, train_bert_preds, y_train = cnn_preds.argmax(axis=-1), lstm_preds.argmax(axis=-1), bert_preds.argmax(axis=-1), y.argmax(axis=-1)
    train_bert2cnn_preds = np.vectorize(bert2cnn.get)(train_bert_preds)
    preds_df = pd.DataFrame()
    preds_df['cnn'], preds_df['lstm'], preds_df['bert'], preds_df['label'] = train_cnn_preds, train_lstm_preds, train_bert2cnn_preds, y_train
    
    return preds_df

In [32]:
def dataframe_to_dataset(dataframe):
    dataframe = dataframe.copy()
    labels = dataframe.pop("label")
    y = to_categorical(labels, num_classes=3)
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), y))
    ds = ds.shuffle(buffer_size=len(dataframe))
    return ds

In [33]:
from tensorflow.keras.layers import IntegerLookup
from tensorflow.keras.layers import StringLookup

def encode_categorical_feature(feature, name, dataset, is_string):
    lookup_class = StringLookup if is_string else IntegerLookup
    # Create a lookup layer which will turn strings into integer indices
    lookup = lookup_class(output_mode="binary")

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the set of possible string values and assign them a fixed integer index
    lookup.adapt(feature_ds)

    # Turn the string input into integer indices
    encoded_feature = lookup(feature)
    return encoded_feature


In [34]:
def create_callbacks():
    reduce_lr = ReduceLROnPlateau(monitor='val_f1_m', 
                                  mode = 'max', 
                                  factor=0.5, 
                                  patience=5, 
                                  min_lr=0.0001, 
                                  verbose=10)

    checkpoint = ModelCheckpoint(f"ensemble_training", 
                                   monitor="val_loss", 
                                   mode="min", 
                                   save_best_only = True, 
                                   verbose=1)

    earlystop = EarlyStopping(monitor = 'val_loss', 
                                mode="min", 
                                min_delta = 0, 
                                patience = 5,
                                verbose=1)
    callbacks = [reduce_lr, checkpoint, earlystop]
    return callbacks

In [35]:
def return_train_val_test_ds(preds_df, test_preds_df):
    testing_df = preds_df.sample(frac=0.1)
    training_df = preds_df.drop(testing_df.index)
    train_ds = dataframe_to_dataset(training_df).batch(64)
    val_ds = dataframe_to_dataset(testing_df).batch(64)
    test_ds = dataframe_to_dataset(test_preds_df).batch(64)   
    
    return (train_ds, val_ds, test_ds)

In [36]:
def encode_features_input(train_ds):
    # Categorical features encoded as integers
    cnn_preds_input = keras.Input(shape=(1,), name="cnn", dtype="int64")
    lstm_preds_input = keras.Input(shape=(1,), name="lstm", dtype="int64")
    bert2cnn_preds_input = keras.Input(shape=(1,), name="bert", dtype="int64")

    all_inputs = [cnn_preds_input, lstm_preds_input, bert2cnn_preds_input]

    # Integer categorical features
    cnn_preds_encoded = encode_categorical_feature(cnn_preds_input, "cnn", train_ds, False)
    lstm_preds_encoded = encode_categorical_feature(lstm_preds_input, "lstm", train_ds, False)
    bert2cnn_preds_encoded = encode_categorical_feature(bert2cnn_preds_input, "bert", train_ds, False)

    all_features = layers.concatenate(
        [ cnn_preds_encoded, lstm_preds_encoded, bert2cnn_preds_encoded]
    )
    
    return all_inputs, all_features

In [37]:
def ensembler(all_inputs, all_features):
    x = layers.Dense(64, activation="relu")(all_features)
    x = layers.Dropout(0.5)(x)

    x = layers.Dense(32, activation="relu")(all_features)
    x = layers.Dropout(0.5)(x)

    output = layers.Dense(3, activation='softmax')(x)
    model = keras.Model(all_inputs, output)

    opt = tf.keras.optimizers.Adam(learning_rate = 0.001)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=[f1_m, 'acc', precision_m, recall_m])
    
    return model

In [38]:
def plot_metric(score):
    pretty_table = PrettyTable()
    pretty_table.field_names = ['Parameters','Value']
    pretty_table.add_row(['Loss Score'                     ,score[0]])
    pretty_table.add_row(['F1 Score'                       ,score[1]])
    pretty_table.add_row(['Accuracy Score'                 ,score[2]])
    pretty_table.add_row(['Recall Score'                   ,score[3]])
    pretty_table.add_row(['Precision Score'                ,score[4]])

    print(pretty_table)

In [39]:
(cnn_preds, lstm_preds), (test_cnn_preds, test_lstm_preds), (y, y_test) = get_cnn_lstm_train_test_pred(train_df, test_df)
(bert_preds, test_bert_preds) = get_bert_train_test_pred(train_df, test_df)
train_preds_df = prediction_df(cnn_preds, lstm_preds, bert_preds, y)
test_preds_df = prediction_df(test_cnn_preds, test_lstm_preds, test_bert_preds, y_test)

(train_ds, val_ds, test_ds) = return_train_val_test_ds(train_preds_df, test_preds_df)
all_inputs, all_features = encode_features_input(train_ds)
model = ensembler(all_inputs, all_features)
model.fit(train_ds, epochs=50, callbacks=create_callbacks(), validation_data=val_ds, verbose = 0)
score = model.evaluate(test_ds)
print("---------------------------------------------------")
print("Test Metric")
plot_metric(score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == "":
2022-11-03 19:21:15.259249: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-11-03 19:21:15.860108: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1842
  Batch size = 64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 205
  Batch size = 64



Epoch 00001: val_loss improved from inf to 0.98484, saving model to ensemble_training


2022-11-03 19:24:52.297315: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.



Epoch 00002: val_loss improved from 0.98484 to 0.88715, saving model to ensemble_training

Epoch 00003: val_loss improved from 0.88715 to 0.81677, saving model to ensemble_training

Epoch 00004: val_loss improved from 0.81677 to 0.76114, saving model to ensemble_training

Epoch 00005: val_loss improved from 0.76114 to 0.70907, saving model to ensemble_training

Epoch 00006: val_loss improved from 0.70907 to 0.67046, saving model to ensemble_training

Epoch 00007: val_loss improved from 0.67046 to 0.64566, saving model to ensemble_training

Epoch 00008: val_loss improved from 0.64566 to 0.63191, saving model to ensemble_training

Epoch 00009: val_loss improved from 0.63191 to 0.62467, saving model to ensemble_training

Epoch 00010: val_loss improved from 0.62467 to 0.62112, saving model to ensemble_training

Epoch 00011: val_loss improved from 0.62112 to 0.61905, saving model to ensemble_training

Epoch 00012: val_loss improved from 0.61905 to 0.61767, saving model to ensemble_training

In [40]:
model.save(f"ensemble_{model_name}")

In [41]:
serve_df = serving_headline_df
(serve_cnn_preds, serve_lstm_preds) = get_cnn_lstm_serving_pred(serve_df)
serve_bert_preds = get_bert_serve_pred(serve_df)
serve_bert2cnn_preds = np.vectorize(bert2cnn.get)(serve_bert_preds)

serve_preds_df = pd.DataFrame()
serve_preds_df['cnn'], serve_preds_df['lstm'], serve_preds_df['bert'] = serve_cnn_preds, serve_lstm_preds, serve_bert2cnn_preds
le = LabelEncoder()
serve_preds_df['label'] = le.fit_transform(serve_preds_df['label'])
serve_preds_df.head()

loading configuration file ../input/finbert/finbert_finetuned_combined_data.bin/config.json
Model config BertConfig {
  "_name_or_path": "../input/finbert/finbert_finetuned_combined_data.bin",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_ca

  0%|          | 0/9 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 8409
  Batch size = 64


Unnamed: 0,cnn,lstm,bert
0,1,1,2
1,1,1,0
2,0,1,2
3,1,1,2
4,2,2,2


In [43]:
serve_ds = dataframe_to_dataset(serve_preds_df).batch(64)
le = LabelEncoder()
le.fit_transform(test_df['sentiment'])
predictions = model.predict(serve_ds)
submission = pd.DataFrame()
submission['title'] = serve_df['title']
submission['sentiment'] = serve_df['sentiment']
submission['prediction'] = le.inverse_transform(predictions.argmax(axis=-1))
submission.to_csv(f'ensemble_{model_name}_serving.csv', index=False)