In [312]:
import json
import time
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import Input, Embedding, LSTM, GRU, Dense, Dropout, Concatenate, BatchNormalization, Bidirectional, Reshape
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.python.client import device_lib
from transformers import BertTokenizer, TFBertModel

from utils.system import *
from metric import get_metric

In [289]:
# Report the TensorFlow version and the devices TensorFlow can see, one per line
print(tf.__version__, device_lib.list_local_devices(), sep='\n')

2.10.0
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14287166054919514467
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14267973632
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7454526811992541762
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3080 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


### Prep Data

In [343]:
# Read in the annotated articles and every pre-computed feature table.
# get_data() is not defined in this notebook (presumably it comes from
# `from utils.system import *`); its result is joined to a file name with `/`.
# NOTE(review): `article` is not referenced again in the visible cells — confirm it is still needed.
article = pd.read_csv(get_data() / 'human_annotations_all_8000_overall.csv')
art_emb = pd.read_parquet(get_data() / 'bert_article_emb.parquet.brotli')
sentence_emb = pd.read_parquet(get_data() / 'bert_sentence_cosine.parquet.brotli')  
sent = pd.read_parquet(get_data() / 'bert_sentiment.parquet.brotli')
art_cos = pd.read_parquet(get_data() / 'bert_art_cosine.parquet.brotli')  
emotion = pd.read_parquet(get_data() / 'bert_emotion.parquet.brotli')
topic = pd.read_parquet(get_data() / 'lda_topic.parquet.brotli')
n_gram = pd.read_parquet(get_data() / 'n_gram.parquet.brotli')
lex_div = pd.read_parquet(get_data() / 'lexical_div.parquet.brotli')
# NOTE(review): `readability` is loaded here but is not part of the merge in the next cell — confirm intentional.
readability = pd.read_parquet(get_data() / 'readability.parquet.brotli')
# NOTE: this rebinds the name `time`, hiding the `time` module imported at the top of the notebook.
time = pd.read_parquet(get_data() / 'time.parquet.brotli')
lexicon = pd.read_parquet(get_data() / 'bert_word_cosine.parquet.brotli')

In [344]:
# Inner-join every feature table onto the article embeddings, keyed on 'id'.
# The join order matches the column order expected downstream.
merged_emb = art_emb
for feature_table in (sentence_emb, sent, art_cos, emotion, lex_div, topic, n_gram, time):
    merged_emb = merged_emb.merge(feature_table, on='id', how='inner')

In [345]:
# Keep only the first 500 lexicon rows and discard index level 0
lexicon = lexicon.head(500).reset_index(level=0, drop=True)

### Undersample

In [346]:
# Balance the two classes by down-sampling each to the size of the smaller one.
# Every random draw is seeded so the balanced dataset (and therefore the
# train/test split and the reported metrics) is reproducible across runs.
UNDERSAMPLE_SEED = 42
undersample = merged_emb.sort_values('overall_label')
df_class_0 = undersample[undersample['overall_label'] == 0]
df_class_1 = undersample[undersample['overall_label'] == 1]
n_samples = min(len(df_class_0), len(df_class_1))
# Randomly sample the same number of rows from each class
df_class_0_under = df_class_0.sample(n_samples, random_state=UNDERSAMPLE_SEED)
df_class_1_under = df_class_1.sample(n_samples, random_state=UNDERSAMPLE_SEED)
# Combine the two dataframes
merged_undersample = pd.concat([df_class_0_under, df_class_1_under], axis=0)
# Shuffle the balanced dataset so the classes are interleaved, then renumber the rows
merged_undersample = merged_undersample.sample(frac=1, random_state=UNDERSAMPLE_SEED).reset_index(drop=True)

### Convert Lexicon Dictionary to Numerical Format

In [347]:
def create_binary_features(article, lexicon):
    """Flag which lexicon words occur in an article.

    Args:
        article: Article text; it is tokenised with ``str.split()`` (whitespace).
        lexicon: Iterable of words to look for.

    Returns:
        dict mapping ``'binary_<word>'`` to ``True``/``False`` for each lexicon
        word, in lexicon order.
    """
    vocabulary = set(article.split())
    return {f'binary_{term}': term in vocabulary for term in lexicon}

def create_count_features(article, lexicon):
    """Count how often each lexicon word occurs in an article.

    Args:
        article: Article text; it is tokenised with ``str.split()`` (whitespace).
        lexicon: Iterable of words to count.

    Returns:
        dict mapping ``'count_<word>'`` to the number of occurrences of that
        word among the article's tokens (0 when absent), in lexicon order.
    """
    tally = Counter(article.split())
    return {f'count_{term}': tally[term] for term in lexicon}

In [348]:
# Words to look for in each article
lexicon_list = lexicon['word'].tolist()
cleaned_articles = merged_undersample['cleaned_article']
# One dict of presence flags per article
merged_undersample['binary_features'] = cleaned_articles.apply(create_binary_features, lexicon=lexicon_list)
# One dict of occurrence counts per article
merged_undersample['count_features'] = cleaned_articles.apply(create_count_features, lexicon=lexicon_list)

#### Format Features

In [349]:
# Display the column index to check which feature columns are present after the merge and lexicon steps
merged_undersample.columns

Index(['text', 'overall_label', 'cleaned_article', 'bert_emb_art',
       'bert_emb_min', 'bert_emb_max', 'sent_score', 'cosine_sim_art_mean',
       'cosine_sim_0', 'cosine_sim_1', 'cosine_sim_2', 'cosine_sim_3',
       'cosine_sim_4', 'emotion_num', 'ttr', 'Topic_0', 'Topic_1', 'Topic_2',
       'Topic_3', 'Topic_4', 'Topic_5', 'Topic_6', 'Topic_7', 'Topic_8',
       'Topic_9', 'n_gram_1', 'n_gram_2', 'n_gram_3', 'n_gram_4', 'n_gram_5',
       'n_gram_6', 'n_gram_7', 'n_gram_8', 'n_gram_9', 'n_gram_10',
       'n_gram_11', 'n_gram_12', 'n_gram_13', 'n_gram_14', 'n_gram_15',
       'n_gram_16', 'n_gram_17', 'n_gram_18', 'n_gram_19', 'n_gram_20',
       'n_gram_21', 'n_gram_22', 'n_gram_23', 'n_gram_24', 'n_gram_25',
       'n_gram_26', 'n_gram_27', 'n_gram_28', 'n_gram_29', 'n_gram_30',
       'n_gram_31', 'n_gram_32', 'n_gram_33', 'n_gram_34', 'n_gram_35',
       'n_gram_36', 'n_gram_37', 'n_gram_38', 'n_gram_39', 'n_gram_40',
       'time_reference_count', 'binary_features', 'count_

In [350]:
# Expand the per-article count dicts into one column per lexicon word
lexicon_feature = pd.json_normalize(merged_undersample['count_features'])

# Embedding columns store one array per row (presumably equal-length vectors);
# stack each column into a single array
article_emb_feature = np.stack(merged_undersample['bert_emb_art'].to_numpy())
max_sentence_emb_feature = np.stack(merged_undersample['bert_emb_max'].to_numpy())
min_sentence_emb_feature = np.stack(merged_undersample['bert_emb_min'].to_numpy())

# Scalar columns become (n_rows, 1) arrays
emotion_feature = merged_undersample['emotion_num'].to_numpy().reshape(-1, 1)
cosine_feature = merged_undersample['cosine_sim_art_mean'].to_numpy().reshape(-1, 1)

# Binary target
label = merged_undersample['overall_label'].to_numpy()

#### Out of Sample Train Model

In [353]:
# Hold out 20% of the rows for testing. All feature arrays and the label are
# split in a single call so their rows stay aligned.
(
    emb_art_train, emb_art_test,
    max_emb_sent_train, max_emb_sent_test,
    min_emb_sent_train, min_emb_sent_test,
    lexicon_train, lexicon_test,
    emotion_train, emotion_test,
    cosine_train, cosine_test,
    label_train, label_test,
) = train_test_split(
    article_emb_feature,
    max_sentence_emb_feature,
    min_sentence_emb_feature,
    lexicon_feature,
    emotion_feature,
    cosine_feature,
    label,
    test_size=0.2,
    random_state=42,
)

#### LSTM and GRU Models

In [363]:
def get_metric(label, prediction):
    """Compute per-class precision, recall and F1 for a binary classifier.

    This definition replaces the ``get_metric`` imported from ``metric`` at the
    top of the notebook for every cell that runs after it.

    Args:
        label: True class labels (0 or 1).
        prediction: Predicted class labels (0 or 1).

    Returns:
        pandas.DataFrame with columns ``'Metric'`` and ``'Value'`` holding the
        precision, recall and F1 score of the positive class (1) followed by
        those of the negative class (0). A ratio whose denominator is zero is
        reported as 0. The table is also printed.
    """
    def _ratio(numerator, denominator):
        # Report 0 instead of dividing by zero when a class is never hit
        return numerator / denominator if denominator != 0 else 0

    # Pin the label set so the matrix is always 2x2. Without it, input that
    # contains a single class yields a 1x1 matrix and the lookups below fail.
    cm = confusion_matrix(label, prediction, labels=[0, 1])
    TP = cm[1, 1]  # True Positives
    TN = cm[0, 0]  # True Negatives
    FP = cm[0, 1]  # False Positives
    FN = cm[1, 0]  # False Negatives

    # Precision, recall and F1 for the positive class
    precision_pos = _ratio(TP, TP + FP)
    recall_pos = _ratio(TP, TP + FN)
    f1_pos = _ratio(2 * (precision_pos * recall_pos), precision_pos + recall_pos)

    # Precision, recall and F1 for the negative class
    precision_neg = _ratio(TN, TN + FN)
    recall_neg = _ratio(TN, TN + FP)
    f1_neg = _ratio(2 * (precision_neg * recall_neg), precision_neg + recall_neg)

    # Display in a table
    metrics = pd.DataFrame({
        'Metric': ['Precision (Positive)', 'Recall (Positive)', 'F1 Score (Positive)',
                   'Precision (Negative)', 'Recall (Negative)', 'F1 Score (Negative)'],
        'Value': [precision_pos, recall_pos, f1_pos, precision_neg, recall_neg, f1_neg]
    })
    print(metrics)
    return metrics

def eval_result(model, feature_test, label_test):
    """Score a fitted model on held-out data.

    Args:
        model: Object exposing ``predict``; its output is compared against 0.5.
        feature_test: Test inputs, passed to ``model.predict`` unchanged.
        label_test: True labels for the test inputs.

    Returns:
        The metrics table produced by ``get_metric``.
    """
    probabilities = model.predict(feature_test)
    # Strictly above 0.5 counts as the positive class
    hard_labels = (probabilities > 0.5).astype(int)
    return get_metric(label_test, hard_labels)

def create_feature(units, dropout, feature_data):
    """Build one input branch: Input -> Dense(relu) -> Dropout.

    Args:
        units: Width of the Dense layer.
        dropout: Dropout rate applied after the Dense layer.
        feature_data: 2-D array-like; only ``shape[1]`` is read, to size the input.

    Returns:
        Tuple ``(input_tensor, branch_output_tensor)``.
    """
    n_columns = feature_data.shape[1]
    branch_input = Input(shape=(n_columns,))
    hidden = Dense(units, activation='relu')(branch_input)
    return branch_input, Dropout(dropout)(hidden)
    
def train_lstm(units, dropout, l2, learn_rate, feature_train, label_train, feature_test, label_test):
    """Build, train and evaluate a multi-input classifier with two LSTM branches.

    Each entry of ``feature_train`` gets its own Dense+Dropout branch (see
    ``create_feature``). The branches for entries 0 and 1 are each reshaped to
    a length-1 sequence and passed through an LSTM; any further branches are
    used as-is. All branch outputs are concatenated, batch-normalised and fed
    through an L2-regularised Dense layer, Dropout and a sigmoid output unit.

    Args:
        units: Width of every Dense and LSTM layer.
        dropout: Dropout rate used in the feature branches, the LSTMs (input
            and recurrent) and the final Dense block.
        l2: L2 regularisation factor for the final Dense layer's kernel.
        learn_rate: Learning rate for the Adam optimizer.
        feature_train: Sequence of at least two 2-D feature arrays. The order
            matters: entries 0 and 1 go through the LSTM branches.
        label_train: Binary training labels.
        feature_test: Test feature arrays, in the same order as ``feature_train``.
        label_test: Binary test labels.

    Returns:
        Tuple ``(model, metric)``: the fitted Keras model and the metrics table
        returned by ``eval_result``.

    Raises:
        ValueError: If fewer than two feature arrays are supplied.
    """
    # Both LSTM branches index into the feature list, so fail early and clearly
    if len(feature_train) < 2:
        raise ValueError(
            "train_lstm needs at least two feature arrays "
            f"(one per LSTM branch); got {len(feature_train)}"
        )

    # Create one Dense+Dropout branch per feature array
    inputs = []
    features = []
    for feature_data in feature_train:
        # Named feature_input rather than input to avoid shadowing the builtin
        feature_input, feature = create_feature(units, dropout, feature_data)
        inputs.append(feature_input)
        features.append(feature)

    # LSTM branch for the first feature (treated as a one-step sequence)
    lstm_first_input = Reshape((1, units))(features[0])
    lstm_first = LSTM(units, dropout=dropout, recurrent_dropout=dropout)(lstm_first_input)

    # LSTM branch for the second feature (treated as a one-step sequence)
    lstm_second_input = Reshape((1, units))(features[1])
    lstm_second = LSTM(units, dropout=dropout, recurrent_dropout=dropout)(lstm_second_input)

    # Concatenate the LSTM outputs with the remaining plain branches
    concat_layer = Concatenate()([lstm_first, lstm_second] + features[2:])
    batch_norm = BatchNormalization()(concat_layer)
    dense_layer = Dense(units, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2))(batch_norm)
    dropout_dense = Dropout(dropout)(dense_layer)

    # Output layer
    output_layer = Dense(1, activation='sigmoid')(dropout_dense)

    # Compile Model
    model = Model(inputs=inputs, outputs=output_layer)
    optimizer = Adam(learning_rate=learn_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    # Stop once validation loss has not improved for 5 epochs and keep the best weights
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    model.fit(
        list(feature_train),
        label_train,
        epochs=1000,  # upper bound; early stopping normally ends training sooner
        batch_size=32,
        validation_split=0.10,
        callbacks=[early_stopping]
    )

    # Evaluate Results (separator keeps the table apart from the epoch log, as in train_gru)
    print("-"*60)
    metric = eval_result(model, feature_test, label_test)
    return model, metric

def train_gru(units, dropout, l2, learn_rate, feature_train, label_train, feature_test, label_test):
    """Build, train and evaluate a multi-input classifier with one GRU branch.

    Each entry of ``feature_train`` gets its own Dense+Dropout branch (see
    ``create_feature``). The branch for entry 0 is reshaped to a length-1
    sequence and passed through a GRU; the other branches are used as-is.
    Everything is concatenated, batch-normalised and fed through an
    L2-regularised Dense layer, Dropout and a sigmoid output unit.

    Args:
        units: Width of every Dense layer and of the GRU.
        dropout: Dropout rate used in the feature branches, the GRU (input
            and recurrent) and the final Dense block.
        l2: L2 regularisation factor for the final Dense layer's kernel.
        learn_rate: Learning rate for the Adam optimizer.
        feature_train: Sequence of 2-D feature arrays; entry 0 feeds the GRU.
        label_train: Binary training labels.
        feature_test: Test feature arrays, in the same order as ``feature_train``.
        label_test: Binary test labels.

    Returns:
        Tuple ``(model, metric)``: the fitted Keras model and the metrics table
        returned by ``eval_result``.
    """
    # One (input, branch output) pair per feature array, built in order
    branches = [create_feature(units, dropout, feature_data) for feature_data in feature_train]
    inputs = [branch_input for branch_input, _ in branches]
    features = [branch_output for _, branch_output in branches]

    # GRU over the first branch, treated as a one-step sequence
    gru_sequence = Reshape((1, units))(features[0])
    gru_output = GRU(units, dropout=dropout, recurrent_dropout=dropout)(gru_sequence)

    # Merge the GRU output with the remaining plain branches
    merged = Concatenate()([gru_output] + features[1:])
    normalised = BatchNormalization()(merged)
    hidden = Dense(units, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2))(normalised)
    hidden = Dropout(dropout)(hidden)

    # Single sigmoid unit for the binary target
    prediction = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=inputs, outputs=prediction)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=learn_rate),
        metrics=['accuracy'],
    )

    # Stop once validation loss has not improved for 5 epochs and keep the best weights
    stopper = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(
        list(feature_train),
        label_train,
        epochs=1000,
        batch_size=32,
        validation_split=0.10,
        callbacks=[stopper],
    )

    # Separator between the epoch log and the metrics table
    print("-"*60)
    return model, eval_result(model, feature_test, label_test)

In [364]:
# Hyperparameters shared by the LSTM and GRU runs
units = 64
dropout = 0.5
l2 = 0.01
learn_rate = 0.0001

# Feature arrays are consumed by position: train_lstm sends entries 0 and 1
# through LSTM branches, train_gru sends entry 0 through its GRU branch.
# Train and test lists must use the same order.
# label_train / label_test come straight from the train/test split above.
feature_train = [emb_art_train, max_emb_sent_train, lexicon_train]
feature_test = [emb_art_test, max_emb_sent_test, lexicon_test]

In [365]:
# NOTE: train_lstm returns a (model, metric) tuple, so `lstm_model` is bound to that
# tuple rather than to the bare Keras model; use lstm_model[0] for the model itself.
lstm_model = train_lstm(units, dropout, l2, learn_rate, feature_train, label_train, feature_test, label_test)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
                 Metric     Value
0  Precision (Positive)  0.697865
1  

In [None]:
# NOTE: train_gru returns a (model, metric) tuple, so `gru_model` is bound to that
# tuple rather than to the bare Keras model; use gru_model[0] for the model itself.
gru_model = train_gru(units, dropout, l2, learn_rate, feature_train, label_train, feature_test, label_test)