In [75]:
import pandas as pd
import numpy as np
import json
import tensorflow as tf
import time

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from transformers import BertTokenizer, TFBertModel
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate, BatchNormalization, Bidirectional
from tensorflow.keras import regularizers
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.python.client import device_lib
from keras.optimizers import Adam

from utils.system import *

In [76]:
# Report the TensorFlow version and the devices it can see, one item per line.
print(tf.__version__, device_lib.list_local_devices(), sep="\n")

2.10.0
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1806171404303208736
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14267973632
locality {
  bus_id: 1
  links {
  }
}
incarnation: 17531785447253545036
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3080 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


#### Helper Functions

In [77]:
def get_metric(label, prediction):
    """Print and return per-class precision, recall and F1 for a binary classifier.

    Args:
        label: True class labels, encoded as 0 (negative) and 1 (positive).
        prediction: Predicted class labels using the same 0/1 encoding.

    Returns:
        pandas.DataFrame with a 'Metric' and a 'Value' column holding six rows:
        precision, recall and F1 for the positive class, followed by the same
        three metrics for the negative class. Any metric whose denominator is
        zero is reported as 0.
    """
    # Pin the label order so the matrix is always 2x2, laid out as
    # [[TN, FP], [FN, TP]]. Without `labels`, a class that appears in neither
    # input shrinks the matrix and the indexing below raises IndexError.
    cm = confusion_matrix(label, prediction, labels=[0, 1])
    TP = cm[1, 1]  # True Positives
    TN = cm[0, 0]  # True Negatives
    FP = cm[0, 1]  # False Positives
    FN = cm[1, 0]  # False Negatives

    # Precision, recall and F1 for the positive class (guarding zero denominators)
    precision_pos = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall_pos = TP / (TP + FN) if (TP + FN) != 0 else 0
    f1_pos = 2 * (precision_pos * recall_pos) / (precision_pos + recall_pos) if (precision_pos + recall_pos) != 0 else 0

    # Same metrics with the negative class treated as the class of interest
    precision_neg = TN / (TN + FN) if (TN + FN) != 0 else 0
    recall_neg = TN / (TN + FP) if (TN + FP) != 0 else 0
    f1_neg = 2 * (precision_neg * recall_neg) / (precision_neg + recall_neg) if (precision_neg + recall_neg) != 0 else 0

    # Collect everything in a table
    metrics = pd.DataFrame({
        'Metric': ['Precision (Positive)', 'Recall (Positive)', 'F1 Score (Positive)',
                   'Precision (Negative)', 'Recall (Negative)', 'F1 Score (Negative)'],
        'Value': [precision_pos, recall_pos, f1_pos, precision_neg, recall_neg, f1_neg]
    })

    # Printed explicitly because callers assign the return value, which
    # suppresses the notebook's automatic display.
    print(metrics)
    return metrics

#### Prepare Data

In [78]:
# Load the annotation CSV and the four precomputed feature tables from the
# directory returned by get_data().
article = pd.read_csv(get_data() / 'human_annotations_all_8000_overall.csv')
art_emb, sentence_emb, sent, art_cos = (
    pd.read_parquet(get_data() / parquet_name)
    for parquet_name in (
        'bert_article_emb.parquet.brotli',
        'bert_sentence_cosine.parquet.brotli',
        'bert_sentiment.parquet.brotli',
        'bert_art_cosine.parquet.brotli',
    )
)

In [79]:
# Inner-join the four feature tables on 'id', so only ids present in every
# table survive.
join_on_id = {'on': 'id', 'how': 'inner'}
merged_emb = (
    art_emb
    .merge(sentence_emb, **join_on_id)
    .merge(sent, **join_on_id)
    .merge(art_cos, **join_on_id)
)

In [80]:
# Concatenate, per row, the article embedding with the min and max sentence
# embeddings into one flat list stored in 'comb_emb'.
merged_emb['comb_emb'] = pd.Series(
    [
        [*art_vec, *min_vec, *max_vec]
        for art_vec, min_vec, max_vec in zip(
            merged_emb['bert_emb_art'],
            merged_emb['bert_emb_min'],
            merged_emb['bert_emb_max'],
        )
    ],
    index=merged_emb.index,
)

#### Out of Sample Train Model (Embedding + LSTM)

In [81]:
# Undersample: keep the same number of rows from each class so the two labels
# are balanced, then shuffle the result.
# Every sampling step is seeded so the balanced set (and therefore the
# train/test split and the reported metrics) is repeatable across kernel
# restarts; unseeded, each run would draw different rows.
UNDERSAMPLE_SEED = 42

undersample = merged_emb.sort_values('overall_label')
df_class_0 = undersample[undersample['overall_label'] == 0]
df_class_1 = undersample[undersample['overall_label'] == 1]

# The smaller class determines how many rows are drawn from each class.
n_samples = min(len(df_class_0), len(df_class_1))
df_class_0_under = df_class_0.sample(n_samples, random_state=UNDERSAMPLE_SEED)
df_class_1_under = df_class_1.sample(n_samples, random_state=UNDERSAMPLE_SEED)

merged_undersample = pd.concat([df_class_0_under, df_class_1_under], axis=0)
# Shuffle so the classes are interleaved, and rebuild a clean 0..n-1 index.
merged_undersample = merged_undersample.sample(frac=1, random_state=UNDERSAMPLE_SEED).reset_index(drop=True)

In [82]:
# Pull the target and the raw text out of the balanced frame.
labels = merged_undersample['overall_label']
raw_text = merged_undersample['cleaned_article']

# Stack each embedding column's per-row vectors along a new leading axis.
emb_art, emb_sent, emb_all = (
    np.stack(merged_undersample[column].to_numpy())
    for column in ('bert_emb_art', 'bert_emb_max', 'comb_emb')
)

# Scalar features are turned into column vectors of shape (n_rows, 1).
sent_scores = np.array(merged_undersample['sent_score'])[:, np.newaxis]
art_cos_sim = np.array(merged_undersample['cosine_sim_art_mean'])[:, np.newaxis]

In [83]:
# Tokenize the data with the BERT WordPiece tokenizer.
# NOTE(review): `max_len` is not defined in any visible cell — it must be set
# (e.g. in a config cell) before this cell runs on a fresh kernel.
raw_text_data_list = raw_text.tolist()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Pad to exactly `max_len` tokens rather than to the longest text in the batch:
# the model's text input is declared as Input(shape=(max_len,)), so the id
# matrix must have that width even if every article is shorter than max_len.
encoded_texts = tokenizer(
    raw_text_data_list,
    padding='max_length',
    truncation=True,
    max_length=max_len,
    return_tensors="tf",
)

# Keep only the token ids, as a NumPy array, so train_test_split can slice them.
data = encoded_texts['input_ids']
data = data.numpy()

In [84]:
# Hold out 20% of the rows for testing. All arrays are passed to a single
# train_test_split call, with a fixed seed, so they are split together.
split_arrays = train_test_split(
    data, emb_all, emb_art, emb_sent, sent_scores, art_cos_sim, labels,
    test_size=0.2,
    random_state=42,
)

# train_test_split returns a (train, test) pair for each input, in input order.
(
    data_train, data_test,
    emb_all_train, emb_all_test,
    emb_art_train, emb_art_test,
    emb_sent_train, emb_sent_test,
    sent_train, sent_test,
    cos_train, cos_test,
    labels_train, labels_test,
) = split_arrays

##### LSTM

In [52]:
# Define the inputs: token ids, two precomputed embeddings, and two scalar features
input_text = Input(shape=(max_len,), name='input_text')
input_art_embedding = Input(shape=(emb_art.shape[1],), name='input_art_embedding')
input_sent_max_embedding = Input(shape=(emb_sent.shape[1],), name='input_sent_max_embedding')
input_sentiment = Input(shape=(1,), name='sentiment_score')
input_cosine = Input(shape=(1,), name='cosine_similarity')

# Text processing branch.
# The token ids come from the BERT tokenizer, so the embedding table must cover
# the tokenizer's vocabulary. The previous `len(word_index) + 1` referred to a
# `word_index` that no cell in this notebook defines (left over from a Keras
# Tokenizer) and would not match the range of BERT token ids.
vocab_size = tokenizer.vocab_size
embedding_layer = Embedding(vocab_size, 100, input_length=max_len)(input_text)
lstm_layer = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

# First precomputed embedding branch (article embedding)
dense_art_embedding_layer = Dense(64, activation='relu')(input_art_embedding)
dropout_art_embedding = Dropout(0.5)(dense_art_embedding_layer)

# Second precomputed embedding branch (max sentence embedding)
dense_sent_max_embedding_layer = Dense(64, activation='relu')(input_sent_max_embedding)
dropout_sent_max_embedding = Dropout(0.5)(dense_sent_max_embedding_layer)

# Sentiment score branch
dense_sentiment_layer = Dense(32, activation='relu')(input_sentiment)
dropout_sentiment = Dropout(0.5)(dense_sentiment_layer)

# Cosine Similarity score branch
dense_cos_layer = Dense(32, activation='relu')(input_cosine)
dropout_cos = Dropout(0.5)(dense_cos_layer)

# Concatenate LSTM output, both precomputed embeddings, sentiment score, and cosine similarity
concat_layer = Concatenate()([lstm_layer, dropout_art_embedding, dropout_sent_max_embedding, dropout_sentiment, dropout_cos])
batch_norm = BatchNormalization()(concat_layer)
dense_layer = Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01))(batch_norm)
dropout_dense = Dropout(0.5)(dense_layer)

# Output layer: a single sigmoid unit for the binary label
output_layer = Dense(1, activation='sigmoid')(dropout_dense)

# Construct the model; the input order here must match the order of the arrays
# passed to model.fit / model.predict
model = Model(inputs=[input_text, input_art_embedding, input_sent_max_embedding, input_sentiment, input_cosine], outputs=output_layer)

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Callbacks: stop once val_loss has not improved for 5 epochs and roll back to
# the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)



In [None]:
# Train on the five input arrays (same order as the model's inputs), holding
# out 10% of the training rows for validation and stopping early via the callback.
fit_options = dict(
    epochs=100,
    batch_size=32,
    validation_split=0.10,
    callbacks=[early_stopping],
)
train_inputs = [data_train, emb_art_train, emb_sent_train, sent_train, cos_train]
model.fit(train_inputs, labels_train, **fit_options)

In [56]:
# Score the held-out rows, then turn the sigmoid outputs into 0/1 labels using
# a 0.5 cut-off (strictly greater than 0.5 maps to 1).
test_inputs = [data_test, emb_art_test, emb_sent_test, sent_test, cos_test]
predictions_test = model.predict(test_inputs)
predicted_labels_test = np.greater(predictions_test, 0.5).astype(int)



In [91]:
# Run config: article embedding + sentence embedding + sentiment + LSTM,
# learning rate 0.001, no ReduceLR
metric = get_metric(label=labels_test, prediction=predicted_labels_test)

                 Metric     Value
0  Precision (Positive)  0.741379
1     Recall (Positive)  0.516295
2   F1 Score (Positive)  0.608696
3  Precision (Negative)  0.647059
4     Recall (Negative)  0.831190
5   F1 Score (Negative)  0.727657


In [25]:
# Run config: all embeddings + sentiment score, learning rate 0.001,
# BERT tokenizer, BiLSTM, no RL (presumably "no ReduceLR" — confirm)
metric = get_metric(label=labels_test, prediction=predicted_labels_test)

                 Metric     Value
0  Precision (Positive)  0.655957
1     Recall (Positive)  0.855148
2   F1 Score (Positive)  0.742424
3  Precision (Negative)  0.803318
4     Recall (Negative)  0.568792
5   F1 Score (Negative)  0.666012


In [259]:
# Run config: all embeddings, learning rate 0.001, BERT tokenizer, BiLSTM,
# no RL (presumably "no ReduceLR" — confirm)
metric = get_metric(label=labels_test, prediction=predicted_labels_test)

                 Metric     Value
0  Precision (Positive)  0.711480
1     Recall (Positive)  0.798305
2   F1 Score (Positive)  0.752396
3  Precision (Negative)  0.765286
4     Recall (Negative)  0.670121
5   F1 Score (Negative)  0.714549


In [254]:
# Run config: all embeddings, learning rate 0.001, BERT tokenizer, BiLSTM
metric = get_metric(label=labels_test, prediction=predicted_labels_test)

                 Metric     Value
0  Precision (Positive)  0.679947
1     Recall (Positive)  0.867797
2   F1 Score (Positive)  0.762472
3  Precision (Negative)  0.812500
4     Recall (Negative)  0.583765
5   F1 Score (Negative)  0.679397


In [214]:
# Run config: max embedding + article embedding, learning rate 0.0001
metric = get_metric(label=labels_test, prediction=predicted_labels_test)

                 Metric     Value
0  Precision (Positive)  0.771318
1     Recall (Positive)  0.682676
2   F1 Score (Positive)  0.724295
3  Precision (Negative)  0.716692
4     Recall (Negative)  0.798635
5   F1 Score (Negative)  0.755448


In [204]:
# Run config: max embedding + article embedding, learning rate 0.001
metric = get_metric(label=labels_test, prediction=predicted_labels_test)

                 Metric     Value
0  Precision (Positive)  0.685157
1     Recall (Positive)  0.783877
2   F1 Score (Positive)  0.731200
3  Precision (Negative)  0.749004
4     Recall (Negative)  0.641638
5   F1 Score (Negative)  0.691176


In [161]:
# Run config: max embedding only
metric = get_metric(label=labels_test, prediction=predicted_labels_test)

                 Metric     Value
0  Precision (Positive)  0.696466
1     Recall (Positive)  0.573630
2   F1 Score (Positive)  0.629108
3  Precision (Negative)  0.638081
4     Recall (Negative)  0.750427
5   F1 Score (Negative)  0.689709


In [151]:
# Run config: all embeddings
metric = get_metric(label=labels_test, prediction=predicted_labels_test)

                 Metric     Value
0  Precision (Positive)  0.647837
1     Recall (Positive)  0.899833
2   F1 Score (Positive)  0.753319
3  Precision (Negative)  0.821958
4     Recall (Negative)  0.485965
5   F1 Score (Negative)  0.610805
