In [1]:
import pandas as pd
import numpy as np
import json
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from transformers import BertTokenizer, TFBertModel
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate, BatchNormalization, Bidirectional
from tensorflow.keras import regularizers
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.python.client import device_lib
from keras.optimizers import Adam

from utils.system import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(tf.__version__)
print(device_lib.list_local_devices())

2.10.0
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12983914648270708445
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14267973632
locality {
  bus_id: 1
  links {
  }
}
incarnation: 70117088583653821
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3080 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


#### Function

In [8]:
def get_metric(label, prediction):
    cm = confusion_matrix(label, prediction)
    TP = cm[1, 1]  # True Positives
    TN = cm[0, 0]  # True Negatives
    FP = cm[0, 1]  # False Positives
    FN = cm[1, 0]  # False Negatives
    
    # Calculate precision and recall for the positive class
    precision_pos = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall_pos = TP / (TP + FN) if (TP + FN) != 0 else 0
    f1_pos = 2 * (precision_pos * recall_pos) / (precision_pos + recall_pos) if (precision_pos + recall_pos) != 0 else 0
    
    # Calculate precision and recall for the negative class
    precision_neg = TN / (TN + FN) if (TN + FN) != 0 else 0
    recall_neg = TN / (TN + FP) if (TN + FP) != 0 else 0
    f1_neg = 2 * (precision_neg * recall_neg) / (precision_neg + recall_neg) if (precision_neg + recall_neg) != 0 else 0
    
    # Display in a table
    metrics = pd.DataFrame({
        'Metric': ['Precision (Positive)', 'Recall (Positive)', 'F1 Score (Positive)',
                   'Precision (Negative)', 'Recall (Negative)', 'F1 Score (Negative)'],
        'Value': [precision_pos, recall_pos, f1_pos, precision_neg, recall_neg, f1_neg]
    })
    
    print(metrics)
    return metrics

#### Prep Data

In [9]:
article = pd.read_csv(get_data() / 'human_annotations_all_8000_overall.csv')
art_emb = pd.read_parquet(get_data() / 'bert_article_emb.parquet.brotli')
sentence_emb = pd.read_parquet(get_data() / 'bert_sentence_cosine.parquet.brotli')  
sent = pd.read_parquet(get_data() / 'bert_sentiment.parquet.brotli')

In [10]:
sent['sent_score'] = sent.apply(
    lambda row: 1 if row['sent_article'] == 'POSITIVE' and row['conf_article'] > 0.75 else 
    (-1 if row['sent_article'] == 'NEGATIVE' and row['conf_article'] > 0.75 else 0), axis=1)

In [11]:
merged_emb = pd.merge(art_emb, sentence_emb, on='id', how='inner').merge(sent[['sent_score']], on='id', how='inner')

In [12]:
merged_emb['comb_emb'] = merged_emb.apply(lambda row: [*row['bert_emb_art'], *row['bert_emb_min'], *row['bert_emb_max']], axis=1)

#### Out of Sample Train Model (Embedding + Sent + Bert)

In [53]:
undersample = merged_emb.sort_values('overall_label')
df_class_0 = undersample[undersample['overall_label'] == 0]
df_class_1 = undersample[undersample['overall_label'] == 1]
n_samples = min(len(df_class_0), len(df_class_1))
# Randomly sample from each class
df_class_0_under = df_class_0.sample(n_samples)
df_class_1_under = df_class_1.sample(n_samples)
# Combine the two dataframes
merged_undersample = pd.concat([df_class_0_under, df_class_1_under], axis=0)
# Shuffle the balanced dataset
merged_undersample = merged_undersample.sample(frac=1).reset_index(drop=True)

In [54]:
raw_text_data = merged_undersample['cleaned_article']
precomputed_embeddings_np = np.stack(merged_undersample['comb_emb'].values)
labels = merged_undersample['overall_label']
sent_scores_np = np.array(merged_undersample['sent_score']).reshape(-1, 1)

In [51]:
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = TFBertModel.from_pretrained(model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['distilbert.transformer.layer.5.ffn.lin1.bias', 'pre_classifier.weight', 'distilbert.transformer.layer.3.attention.k_lin.bias', 'distilbert.transformer.layer.1.attention.q_lin.weight', 'distilbert.transformer.layer.1.attention.v_lin.weight', 'distilbert.transformer.layer.1.attention.k_lin.bias', 'distilbert.transformer.layer.3.attention.v_lin.bias', 'distilbert.transformer.layer.3.sa_layer_norm.bias', 'distilbert.transformer.layer.2.attention.k_lin.bi

In [55]:
tokenized_data = tokenizer(raw_text_data.tolist(), padding=True, truncation=True, return_tensors='tf')
input_ids = tokenized_data['input_ids']
attention_mask = tokenized_data['attention_mask']

In [56]:
input_ids_np = input_ids.numpy()
attention_mask_np = attention_mask.numpy()
labels_np = labels.to_numpy()

In [57]:
# Split the data into training and testing sets
input_ids_train, input_ids_test, attention_mask_train, attention_mask_test, embeddings_train, embeddings_test, sent_scores_train, sent_scores_test, labels_train, labels_test = train_test_split(
    input_ids_np, attention_mask_np, precomputed_embeddings_np, sent_scores_np, labels_np, test_size=0.2, random_state=42
)

##### BERT

In [64]:
# BERT input
input_ids_layer = Input(shape=(input_ids_np.shape[1],), dtype=tf.int32, name='input_ids')
attention_mask_layer = Input(shape=(attention_mask_np.shape[1],), dtype=tf.int32, name='attention_mask')

# BERT model
bert_output = bert_model(input_ids_layer, attention_mask=attention_mask_layer)[1]

# Additional features input
sentiment_input = Input(shape=(sent_scores_np.shape[1],), name='sent_score')
embedding_input = Input(shape=(precomputed_embeddings_np.shape[1],), name='comb_emb')

# Concatenate BERT output with additional features
concatenated = Concatenate()([bert_output, sentiment_input, embedding_input])

# Dense layers
dense = Dense(64, activation='relu')(concatenated)
dropout = Dropout(0.5)(dense)
batch_norm = BatchNormalization()(dropout)

# Output layer
output = Dense(1, activation='sigmoid')(batch_norm)

# Construct the model
model = Model(inputs=[input_ids_layer, attention_mask_layer, sentiment_input, embedding_input], outputs=output)

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [65]:
# BERT 
model.fit(
    [input_ids_train, attention_mask_train, sent_scores_train, embeddings_train], 
    labels_train, 
    epochs=100, 
    batch_size=8, 
    validation_split=0.10,
    callbacks=[early_stopping]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<keras.callbacks.History at 0x1a44a8a3400>

In [66]:
predictions_test = model.predict([input_ids_test, attention_mask_test, sent_scores_test, embeddings_test])
predicted_labels_test = (predictions_test > 0.5).astype(int)



In [67]:
# All Embedding + Sent + Bert
metric = get_metric(labels_test, predicted_labels_test)

                 Metric     Value
0  Precision (Positive)  0.700000
1     Recall (Positive)  0.775087
2   F1 Score (Positive)  0.735632
3  Precision (Negative)  0.754253
4     Recall (Negative)  0.675127
5   F1 Score (Negative)  0.712500
