In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from keras.callbacks import EarlyStopping
from keras.layers import (BatchNormalization, Concatenate, Conv1D, Dense, Dropout,
                          Embedding, Flatten, GlobalMaxPooling1D, Input, Reshape)
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model

## CNN that takes in text and numerical features for rumor detection

### Retrieve Test and Training Data

In [None]:
# Load the full dataset from disk.
data = pd.read_csv('updated_data.csv')

# Target column and raw tweet text.
label = data['label']
text_data = data['tweet_text']

# Numeric features, extracted in this column order as a NumPy array.
numeric_columns = [
    'retweet_count',
    'favorite_count',
    'follower_count',
    'difference',
    'ratio',
    'verified',
    'time_delayed (min)',
    'neg_sent',
    'pos_sent',
    'comp_sent',
]
numerical_data = data[numeric_columns].values

In [None]:
# Split text, numeric features and labels in a single call so the rows stay aligned.
# train_test_split takes each array as its own positional argument and returns a
# (train, test) pair per array, in argument order.
(X_train_text, X_test_text,
 X_train_num, X_test_num,
 y_train, y_test) = train_test_split(
    text_data, numerical_data, label,
    test_size=0.2, random_state=42, stratify=label)

print(f'Shape of X_train_text: {X_train_text.shape}')
print(f'Shape of X_train_num: {X_train_num.shape}')
print(f'Shape of X_test_text: {X_test_text.shape}')
print(f'Shape of X_test_num: {X_test_num.shape}')
print(f'Shape of y_train: {y_train.shape}')
print(f'Shape of y_test: {y_test.shape}')

### Preprocess

#### Preprocess Text

In [None]:
# TensorFlow Hub handle of the ELMo (version 3) module; loaded once into `elmo`.
ELMO_HANDLE = "https://tfhub.dev/google/elmo/3"
elmo = hub.load(ELMO_HANDLE)

In [None]:
def elmo_embeddings(input_tensor):
    """Return the ELMo "elmo" output for a column of raw strings.

    Args:
        input_tensor: string tensor of shape (batch, 1); axis 1 is squeezed away
            so the module receives a flat batch of sentences.

    Returns:
        The value stored under the "elmo" key of the module's output dict.
    """
    text_list = tf.squeeze(input_tensor, axis=1)
    # hub.load() on a TF1-format module (such as ELMo) returns an object that is
    # not callable; inference goes through its "default" serving signature.
    # Prefer a direct call when the loaded object supports it.
    embed_fn = elmo if callable(elmo) else elmo.signatures["default"]
    return embed_fn(text_list)["elmo"]

In [None]:
def process_in_batches(texts, batch_size=32, max_len=None):
    """Embed `texts` with ELMo batch by batch and return one dense array.

    Args:
        texts: sequence of strings (list, NumPy array or pandas Series); a
            (n, 1) column is accepted as well.
        batch_size: number of texts sent to ELMo per call.
        max_len: optional fixed length for the sequence axis. Longer sequences
            are truncated and shorter ones zero-padded. When None, everything is
            padded to the longest sequence produced for `texts`.

    Returns:
        A 3-D array: the per-batch `elmo_embeddings` outputs stacked along
        axis 0 with a common length on axis 1. An empty input gives an empty
        1-D array.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    flat_texts = np.asarray(texts, dtype=object).reshape(-1)

    batch_arrays = []
    for start in range(0, len(flat_texts), batch_size):
        chunk = flat_texts[start:start + batch_size]
        # elmo_embeddings squeezes axis 1, so hand it a (batch, 1) string tensor.
        batch_tensor = tf.constant(chunk.reshape(-1, 1).tolist(), dtype=tf.string)
        batch_arrays.append(elmo_embeddings(batch_tensor).numpy())

    if not batch_arrays:
        return np.array([])

    # Per the TF Hub ELMo docs the sequence axis is sized by the longest text in
    # each batch, so batches must be brought to a common length before stacking.
    if max_len is not None:
        target_len = max_len
    else:
        target_len = max(arr.shape[1] for arr in batch_arrays)

    aligned = []
    for arr in batch_arrays:
        arr = arr[:, :target_len]
        missing = target_len - arr.shape[1]
        if missing > 0:
            arr = np.pad(arr, ((0, 0), (0, missing), (0, 0)))
        aligned.append(arr)
    return np.concatenate(aligned, axis=0)

In [None]:
# Turn the raw tweet text of each split into ELMo embeddings.
elmo_train = process_in_batches(X_train_text)
elmo_test = process_in_batches(X_test_text)

# Report the shapes of the embeddings that were just computed.
print(f'Shape of elmo_train: {elmo_train.shape}')
print(f'Shape of elmo_test: {elmo_test.shape}')

#### Preprocess Numbers

In [None]:
# Standardise the numeric features. The scaler is fitted on the training split
# only and then applied unchanged to the test split, so no test-set statistics
# leak into preprocessing.
scaler = StandardScaler()
norm_train_num = scaler.fit_transform(X_train_num)
norm_test_num = scaler.transform(X_test_num)

print(norm_test_num.shape)

### Build Model

In [None]:
def cnn_model(embedding_dim=1024, num_features=10):
    """Build and compile the two-branch rumor-detection network.

    The text branch consumes precomputed ELMo embeddings (the arrays returned by
    `process_in_batches`), matching what the training cell passes to `fit`. It
    applies a 1-D convolution followed by global max pooling. The numeric branch
    is a dense layer with dropout. Both branches are concatenated and fed to a
    dense head with a single sigmoid output.

    Args:
        embedding_dim: size of the last axis of the text embeddings. The default
            of 1024 is the width of the ELMo v3 "elmo" output per its TF Hub page.
        num_features: number of numeric feature columns.

    Returns:
        A compiled Keras model with inputs [input_text, input_numeric].
    """
    # Text branch: (batch, tokens, embedding_dim); the token axis is left
    # unspecified so any sequence length is accepted.
    input_layer = Input(shape=(None, embedding_dim), dtype='float32', name='input_text')
    conv_layer = Conv1D(filters=128, kernel_size=5, activation='relu')(input_layer)
    # Global max pooling already yields a (batch, filters) tensor, so no Flatten is needed.
    pooled_text = GlobalMaxPooling1D()(conv_layer)

    # Numeric branch
    input_numeric = Input(shape=(num_features,), name='input_numeric')
    dense_num = Dense(128, activation='relu')(input_numeric)
    dropout_num = Dropout(0.5)(dense_num)

    # Merge both branches and classify
    concatenated_layer = Concatenate()([pooled_text, dropout_num])
    dense_merged = Dense(128, activation='relu')(concatenated_layer)
    dropout_merged = Dropout(0.5)(dense_merged)
    output = Dense(1, activation='sigmoid')(dropout_merged)

    model = Model(inputs=[input_layer, input_numeric], outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        # Metric objects instead of strings: string lookup of names such as
        # 'recall' is not accepted by every Keras version.
        metrics=[
            'accuracy',
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.FalseNegatives(name='false_negatives'),
        ],
    )
    return model

In [None]:
model = cnn_model()

# `validation_split` holds out the last 20% of the training data for validation.
# (`validation_data` expects an explicit dataset or tuple, not a fraction.)
# The labels are converted to a NumPy array so Keras can slice them for the split.
history = model.fit(
    [elmo_train, norm_train_num],
    np.asarray(y_train),
    batch_size=16,
    epochs=1,
    validation_split=0.2,
)

### Bayesian Optimization (work in progress)

In [None]:
# Wrap plain accuracy as a scikit-learn scorer object.
score_acc = make_scorer(score_func=accuracy_score)

In [None]:
def model_cnn(neurons=64, activation='relu', optimizer=None,
              conv_layers=1, conv_filters=64, conv_kernel_size=3,
              dense_layers=1, normalization=0.0, dropout=0.0,
              dropout_rate=0.5, input_dim=1034, verbose=False):
    """Build and compile a configurable 1-D CNN over a flat feature vector.

    All hyperparameters are explicit keyword arguments; the defaults are
    arbitrary values inside the search ranges used for optimisation.

    Args:
        neurons: units in each dense layer.
        activation: activation used by the convolutional and dense layers.
        optimizer: Keras optimizer; a fresh Adam(learning_rate=0.001) when None.
        conv_layers: number of stacked Conv1D layers.
        conv_filters: filters per Conv1D layer.
        conv_kernel_size: kernel size of each Conv1D layer.
        dense_layers: number of dense layers after pooling.
        normalization: switch; values above 0.5 add BatchNormalization after
            every convolution.
        dropout: switch; values above 0.5 add Dropout after every dense layer.
        dropout_rate: rate of those Dropout layers.
        input_dim: length of the input vector (presumably 1024 ELMo dimensions
            plus 10 numeric features; TODO confirm).
        verbose: print intermediate tensor shapes while building.

    Returns:
        A compiled Keras model with a single sigmoid output.
    """
    input_x = Input(shape=(input_dim,))
    if verbose:
        print(f"Concatenated model: {input_x.shape}")

    # Conv1D expects (steps, channels): treat every feature as one step with one channel.
    x = Reshape((input_dim, 1))(input_x)
    if verbose:
        print(f"Concatenated model after reshape: {x.shape}")

    # Convolutional stack
    for i in range(conv_layers):
        x = Conv1D(filters=conv_filters, kernel_size=conv_kernel_size,
                   activation=activation)(x)
        if verbose:
            print(f"Concatenated model after iteration {i + 1}: {x.shape}")
        if normalization > 0.5:
            x = BatchNormalization()(x)

    # Global max pooling already returns a flat (batch, channels) tensor.
    x = GlobalMaxPooling1D()(x)
    if verbose:
        print(f"Shape after GlobalMaxPooling1D layer: {x.shape}")

    # Dense stack
    for _ in range(dense_layers):
        x = Dense(neurons, activation=activation)(x)
        if dropout > 0.5:
            x = Dropout(dropout_rate, seed=123)(x)

    # Output layer where the final prediction is made
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_x, outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer if optimizer is not None else Adam(learning_rate=0.001),
        metrics=[
            'accuracy',
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.FalseNegatives(name='false_negatives'),
        ],
    )
    return model


def text_num_cnn(neurons, learning_rate, batch_size, epochs,
                conv_layers, conv_filters, conv_kernel_size,
                dense_layers, normalization, dropout, dropout_rate):
    """Objective for Bayesian optimisation: mean 5-fold cross-validated accuracy.

    The optimiser proposes every hyperparameter as a float. Count-like values
    are rounded to integers, while `learning_rate` is kept as a float (rounding
    it would turn every value in the search range into 0).

    Returns:
        Mean accuracy over the 5 stratified folds.
    """
    # Integer-valued hyperparameters
    neurons = round(neurons)
    batch_size = round(batch_size)
    epochs = round(epochs)
    conv_layers = round(conv_layers)
    conv_filters = round(conv_filters)
    conv_kernel_size = round(conv_kernel_size)
    dense_layers = round(dense_layers)
    # Continuous / fixed hyperparameters
    learning_rate = float(learning_rate)
    activation = 'relu'

    def create_model():
        # A new optimizer per model: optimizer instances hold per-model state
        # and should not be shared between the models of different folds.
        return model_cnn(
            neurons=neurons,
            activation=activation,
            optimizer=Adam(learning_rate=learning_rate),
            conv_layers=conv_layers,
            conv_filters=conv_filters,
            conv_kernel_size=conv_kernel_size,
            dense_layers=dense_layers,
            normalization=normalization,
            dropout=dropout,
            dropout_rate=dropout_rate,
        )

    # NOTE(review): X_train_norm is not defined in the visible part of this
    # notebook. It is expected to be the 2-D feature matrix whose width matches
    # model_cnn's `input_dim`; confirm where it is built.
    features = np.asarray(X_train_norm)
    targets = np.asarray(y_train)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    fold_scores = []
    for train_idx, val_idx in kfold.split(features, targets):
        # Stop a fold early once training accuracy has not improved for 20 epochs.
        es = EarlyStopping(monitor='accuracy', mode='max', verbose=0, patience=20)
        fold_model = create_model()
        fold_model.fit(features[train_idx], targets[train_idx],
                       epochs=epochs, batch_size=batch_size,
                       verbose=0, callbacks=[es])
        probabilities = fold_model.predict(features[val_idx], verbose=0)
        # Assumes 0/1 labels: threshold the sigmoid output at 0.5.
        predictions = (np.ravel(probabilities) > 0.5).astype(int)
        fold_scores.append(accuracy_score(targets[val_idx], predictions))

    return float(np.mean(fold_scores))

In [None]:
from bayes_opt import BayesianOptimization

# Search space: (lower, upper) bounds for every argument of text_num_cnn.
# 'normalization' and 'dropout' are on/off switches that the model compares
# against 0.5, so both must span 0..1; with an upper bound of 0.5 the dropout
# switch could never turn on and 'dropout_rate' would have no effect.
params_cnn = {
    'neurons': (10, 100),
    'learning_rate': (0.001, 0.1),
    'batch_size': (16, 128),
    'epochs': (10, 50),
    'conv_layers': (1, 3),
    'conv_filters': (32, 128),
    'conv_kernel_size': (2, 5),
    'dense_layers': (1, 3),
    'dropout': (0.0, 1.0),
    'normalization': (0.0, 1.0),
    'dropout_rate': (0.1, 0.5)
}

# Bayesian Optimization: maximise the cross-validated accuracy returned by text_num_cnn.
cnn_bo_optimizer = BayesianOptimization(f=text_num_cnn, pbounds=params_cnn, random_state=111)

# 10 random initial evaluations followed by 10 guided iterations.
cnn_bo_optimizer.maximize(init_points=10, n_iter=10)