In [25]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F
import nltk
from sklearn.metrics import roc_auc_score
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from keras.layers import Input, Embedding, Bidirectional, GRU, Dense, Concatenate
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [26]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

In [27]:
# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ki_shari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ki_shari\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ki_shari\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ki_shari\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [28]:
# Define the preprocessing functions
def remove_special_chars(text):
    # Remove special characters and punctuation
    clean_text = re.sub(r'[^\w\s]', '', text)
    return clean_text

def preprocess_text(text):
    # Tokenization
    tokens = nltk.word_tokenize(text.lower())

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]

    return lemmatized_tokens

In [29]:
df = pd.read_csv(r'C:\Users\ki_shari\Downloads\DFF.csv', encoding='latin-1')

In [30]:
df.loc[df["LABEL"] == "__label1__", "LABEL"] = 1
df.loc[df["LABEL"] == "__label2__", "LABEL"] = 0
df['LABEL']=pd.to_numeric(df['LABEL'])

In [31]:
# Apply the preprocessing steps
df['ORIGINAL_TEXT'] = df['ORIGINAL_TEXT'].apply(remove_special_chars)
df['ORIGINAL_TEXT'] = df['ORIGINAL_TEXT'].apply(preprocess_text)

In [32]:
# Split into features and target
X = df['ORIGINAL_TEXT'].values
y = df['LABEL'].values

# Convert labels to binary format (0, 1)
le = LabelEncoder()
y = le.fit_transform(y)

In [33]:
# Define the parameters for tokenization and padding
max_features = 10000  # Maximum number of words to keep based on word frequency
maxlen = 100  # Maximum length of each review (truncate or pad with zeros)

# Tokenize the text
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)
X_tokenized = tokenizer.texts_to_sequences(X)

# Pad the sequences
X_padded = pad_sequences(X_tokenized, maxlen=maxlen)

# Load GloVe embeddings
embedding_dim = 300
embedding_path = 'C:\\Users\\ki_shari\\Downloads\\glove.6B.300d.txt\\glove.6B.300d.txt'

embedding_matrix = np.zeros((max_features, embedding_dim))
with open(embedding_path, encoding='utf-8') as f:
    for line in f:
        word, vec = line.split(' ', 1)
        if word in tokenizer.word_index and tokenizer.word_index[word] < max_features:
            embedding_matrix[tokenizer.word_index[word]] = np.fromstring(vec, sep=' ')

In [10]:
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, GRU, Dense
from keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

# Define the model architecture
input1 = Input(shape=(maxlen,))
embedding_layer = Embedding(max_features, embedding_dim, weights=[embedding_matrix], trainable=False)(input1)
gru_layer = Bidirectional(GRU(64))(embedding_layer)
output = Dense(1, activation='sigmoid')(gru_layer)

model = Model(inputs=input1, outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Perform 5-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []
recall_scores = []
precision_scores = []
auc_scores = []

for train_index, test_index in kf.split(X_padded, df['LABEL']):
    X_train, X_test = X_padded[train_index], X_padded[test_index]
    y_train, y_test = df['LABEL'][train_index], df['LABEL'][test_index]

    # Train the model
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, min_lr=0.0001)
    model.fit(X_train, y_train, validation_data=(X_test, y_test),
              batch_size=16, epochs=10, callbacks=[reduce_lr], verbose=0)

    # Evaluate the model
    y_pred = model.predict(X_test)
    y_pred_binary = np.round(y_pred).flatten()
    accuracy_scores.append(accuracy_score(y_test, y_pred_binary))
    f1_scores.append(f1_score(y_test, y_pred_binary))
    recall_scores.append(recall_score(y_test, y_pred_binary))
    precision_scores.append(precision_score(y_test, y_pred_binary))
    auc_scores.append(roc_auc_score(y_test, y_pred))

# Print evaluation metrics
print("Accuracy:", np.mean(accuracy_scores))
print("F1 Score:", np.mean(f1_scores))
print("Recall:", np.mean(recall_scores))
print("Precision:", np.mean(precision_scores))
print("AUC:", np.mean(auc_scores))

Accuracy: 0.8595714285714285
F1 Score: 0.858105313811819
Recall: 0.8554285714285715
Precision: 0.8609623058905225
AUC: 0.9089017006802722


In [None]:
####CNN+BiGRU

In [23]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Bidirectional, GRU
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Define the CNN model
def create_cnn_model(vocab_size, embedding_dim, maxlen):
    input_text = Input(shape=(maxlen,))
    
    embedding_layer = Embedding(vocab_size, embedding_dim, input_length=maxlen)(input_text)
    conv1 = Conv1D(128, 3, activation='relu')(embedding_layer)
    conv2 = Conv1D(128, 4, activation='relu')(embedding_layer)
    conv3 = Conv1D(128, 5, activation='relu')(embedding_layer)
    
    pooling1 = GlobalMaxPooling1D()(conv1)
    pooling2 = GlobalMaxPooling1D()(conv2)
    pooling3 = GlobalMaxPooling1D()(conv3)
    
    concatenated = Concatenate()([pooling1, pooling2, pooling3])
    
    dense1 = Dense(64, activation='relu')(concatenated)
    dropout1 = Dropout(0.5)(dense1)
    
    output = Dense(1, activation='sigmoid')(dropout1)
    
    model = Model(inputs=input_text, outputs=output)
    return model

# Define the BiGRU model
input1 = Input(shape=(maxlen,))
embedding_layer = Embedding(max_features, embedding_dim, weights=[embedding_matrix], trainable=False)(input1)
gru_layer = Bidirectional(GRU(64))(embedding_layer)
output = Dense(1, activation='sigmoid')(gru_layer)

model = Model(inputs=input1, outputs=output)

# Combine the models
combined_input = Input(shape=(maxlen,))
cnn_model = create_cnn_model(max_features, embedding_dim, maxlen)(combined_input)
gru_model = model(combined_input)

concatenated = Concatenate()([cnn_model, gru_model])

output = Dense(1, activation='sigmoid')(concatenated)

hybrid_model = Model(inputs=combined_input, outputs=output)

# Compile the hybrid model
hybrid_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define evaluation metrics
def evaluate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return accuracy, precision, recall, f1, auc

# Define the adaptive learning rate callback
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=0.0001)

# Perform 5-fold cross-validation
num_folds = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

accuracies = []
precisions = []
recalls = []
f1_scores = []
auc_scores = []

for train_index, test_index in skf.split(X_padded, y):
    X_train, X_test = X_padded[train_index], X_padded[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the hybrid model
    hybrid_model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0, callbacks=[reduce_lr], validation_data=(X_test, y_test))
    
    # Predict on the test set
    y_pred = hybrid_model.predict(X_test)
    y_pred = np.round(y_pred).flatten()
    
    # Evaluate metrics
    accuracy, precision, recall, f1, auc = evaluate_metrics(y_test, y_pred)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    auc_scores.append(auc)

# Calculate average metrics
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recall = np.mean(recalls)
average_f1 = np.mean(f1_scores)
average_auc = np.mean(auc_scores)

print("Average Metrics:")
print(f"Accuracy: {average_accuracy}")
print(f"Precision: {average_precision}")
print(f"Recall: {average_recall}")
print(f"F1-Score: {average_f1}")
print(f"AUC: {average_auc}")

Average Metrics:
Accuracy: 0.8111428571428572
Precision: 0.8217282210176166
Recall: 0.8135238095238094
F1-Score: 0.8156437380161308
AUC: 0.8111428571428572


In [None]:
####CNN+BiLSTM

In [24]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Bidirectional, LSTM
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Define the CNN model
def create_cnn_model(vocab_size, embedding_dim, maxlen):
    input_text = Input(shape=(maxlen,))
    
    embedding_layer = Embedding(vocab_size, embedding_dim, input_length=maxlen)(input_text)
    conv1 = Conv1D(128, 3, activation='relu')(embedding_layer)
    conv2 = Conv1D(128, 4, activation='relu')(embedding_layer)
    conv3 = Conv1D(128, 5, activation='relu')(embedding_layer)
    
    pooling1 = GlobalMaxPooling1D()(conv1)
    pooling2 = GlobalMaxPooling1D()(conv2)
    pooling3 = GlobalMaxPooling1D()(conv3)
    
    concatenated = Concatenate()([pooling1, pooling2, pooling3])
    
    dense1 = Dense(64, activation='relu')(concatenated)
    dropout1 = Dropout(0.5)(dense1)
    
    output = Dense(1, activation='sigmoid')(dropout1)
    
    model = Model(inputs=input_text, outputs=output)
    return model

# Define the BiLSTM model
input1 = Input(shape=(maxlen,))
embedding_layer = Embedding(max_features, embedding_dim, weights=[embedding_matrix], trainable=False)(input1)
lstm_layer = Bidirectional(LSTM(64))(embedding_layer)
output = Dense(1, activation='sigmoid')(lstm_layer)

model = Model(inputs=input1, outputs=output)

# Combine the models
combined_input = Input(shape=(maxlen,))
cnn_model = create_cnn_model(max_features, embedding_dim, maxlen)(combined_input)
lstm_model = model(combined_input)

concatenated = Concatenate()([cnn_model, lstm_model])

output = Dense(1, activation='sigmoid')(concatenated)

hybrid_model = Model(inputs=combined_input, outputs=output)

# Compile the hybrid model
hybrid_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define evaluation metrics
def evaluate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return accuracy, precision, recall, f1, auc

# Define the adaptive learning rate callback
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=0.0001)

# Perform 5-fold cross-validation
num_folds = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

accuracies = []
precisions = []
recalls = []
f1_scores = []
auc_scores = []

for train_index, test_index in skf.split(X_padded, y):
    X_train, X_test = X_padded[train_index], X_padded[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the hybrid model
    hybrid_model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0, callbacks=[reduce_lr], validation_data=(X_test, y_test))
    
    # Predict on the test set
    y_pred = hybrid_model.predict(X_test)
    y_pred = np.round(y_pred).flatten()
    
    # Evaluate metrics
    accuracy, precision, recall, f1, auc = evaluate_metrics(y_test, y_pred)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    auc_scores.append(auc)

# Calculate average metrics
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recall = np.mean(recalls)
average_f1 = np.mean(f1_scores)
average_auc = np.mean(auc_scores)

print("Average Metrics:")
print(f"Accuracy: {average_accuracy}")
print(f"Precision: {average_precision}")
print(f"Recall: {average_recall}")
print(f"F1-Score: {average_f1}")
print(f"AUC: {average_auc}")

Average Metrics:
Accuracy: 0.8149047619047618
Precision: 0.820073064625516
Recall: 0.8078095238095238
F1-Score: 0.8135864794501128
AUC: 0.8149047619047618


In [None]:
##BiGRU+gridsearch

In [36]:
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, GRU, Dense, Dropout
from keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Define the model architecture
def create_model(dropout_rate=0.0, gru_units=64):
    input1 = Input(shape=(maxlen,))
    embedding_layer = Embedding(max_features, embedding_dim, weights=[embedding_matrix], trainable=False)(input1)
    gru_layer = Bidirectional(GRU(gru_units, dropout=dropout_rate))(embedding_layer)
    dropout_layer = Dropout(dropout_rate)(gru_layer)
    output = Dense(1, activation='sigmoid')(dropout_layer)

    model = Model(inputs=input1, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Perform grid search for hyperparameters
param_grid = {
    'dropout_rate': [0.2, 0.3, 0.4],
    'gru_units': [32, 64, 128],
}

model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=16, verbose=0)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_result = grid.fit(X_padded, df['LABEL'])

# Get the best model from grid search
best_params = grid_result.best_params_
best_dropout_rate = best_params['dropout_rate']
best_gru_units = best_params['gru_units']

best_model = create_model(dropout_rate=best_dropout_rate, gru_units=best_gru_units)

# Perform 5-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []
recall_scores = []
precision_scores = []
auc_scores = []

for train_index, test_index in kf.split(X_padded, df['LABEL']):
    X_train, X_test = X_padded[train_index], X_padded[test_index]
    y_train, y_test = df['LABEL'][train_index], df['LABEL'][test_index]

    # Train the model
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, min_lr=0.0001)
    best_model.fit(X_train, y_train, validation_data=(X_test, y_test),
                   batch_size=16, epochs=10, callbacks=[reduce_lr], verbose=0)

    # Evaluate the model
    y_pred = best_model.predict(X_test)
    y_pred_binary = np.round(y_pred).flatten()
    accuracy_scores.append(accuracy_score(y_test, y_pred_binary))
    f1_scores.append(f1_score(y_test, y_pred_binary))
    recall_scores.append(recall_score(y_test, y_pred_binary))
    precision_scores.append(precision_score(y_test, y_pred_binary))
    auc_scores.append(roc_auc_score(y_test, y_pred))

# Print evaluation metrics
print("Best Hyperparameters:")
print("Dropout Rate:", best_dropout_rate)
print("GRU Units:", best_gru_units)
print("Accuracy:", np.mean(accuracy_scores))
print("F1 Score:", np.mean(f1_scores))
print("Recall:", np.mean(recall_scores))
print("Precision:", np.mean(precision_scores))
print("AUC:", np.mean(auc_scores))

  model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=16, verbose=0)


Best Hyperparameters:
Dropout Rate: 0.2
GRU Units: 128
Accuracy: 0.8752380952380954
F1 Score: 0.8788474229083162
Recall: 0.8908571428571428
Precision: 0.8675390121611393
AUC: 0.9165354648526078


In [None]:
##CNN+BiGRU+gridsearch

In [37]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Bidirectional, GRU, BatchNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import numpy as np

# Define the CNN model
def create_cnn_model(vocab_size, embedding_dim, maxlen, kernel_size=3, filters=128, dense_units=64, dropout_rate=0.0):
    input_text = Input(shape=(maxlen,))
    
    embedding_layer = Embedding(vocab_size, embedding_dim, input_length=maxlen)(input_text)
    conv = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
    batchnorm = BatchNormalization()(conv)
    
    pooling = GlobalMaxPooling1D()(batchnorm)
    
    dense = Dense(dense_units, activation='relu')(pooling)
    dropout = Dropout(dropout_rate)(dense)
    
    return dropout

# Define the BiGRU model
def create_gru_model(embedding_dim, maxlen, gru_units=64, dropout_rate=0.0):
    input1 = Input(shape=(maxlen,))
    embedding_layer = Embedding(max_features, embedding_dim, weights=[embedding_matrix], trainable=False)(input1)
    gru_layer = Bidirectional(GRU(gru_units))(embedding_layer)
    dropout = Dropout(dropout_rate)(gru_layer)
    return dropout

# Combine the models
combined_input = Input(shape=(maxlen,))
cnn_model = create_cnn_model(max_features, embedding_dim, maxlen)
gru_model = create_gru_model(embedding_dim, maxlen)
concatenated = Concatenate()([cnn_model, gru_model])

output = Dense(1, activation='sigmoid')(concatenated)

hybrid_model = Model(inputs=combined_input, outputs=output)

# Compile the hybrid model
hybrid_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define evaluation metrics
def evaluate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return accuracy, precision, recall, f1, auc

# Define the adaptive learning rate callback
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=0.0001)

# Perform grid search for hyperparameters
param_grid = {
    'kernel_size': [3, 4, 5],
    'filters': [64, 128, 256],
    'dense_units': [32, 64],
    'gru_units': [32, 64, 128],
    'dropout_rate': [0.2, 0.5],
}

grid = GridSearchCV(estimator=hybrid_model, param_grid=param_grid, cv=5)
grid_result = grid.fit(X_padded, y)

# Get the best hyperparameters
best_kernel_size = grid_result.best_params_['kernel_size']
best_filters = grid_result.best_params_['filters']
best_dense_units = grid_result.best_params_['dense_units']
best_gru_units = grid_result.best_params_['gru_units']
best_dropout_rate = grid_result.best_params_['dropout_rate']

# Perform 5-fold cross-validation
num_folds = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

accuracies = []
precisions = []
recalls = []
f1_scores = []
auc_scores = []

for train_index, test_index in skf.split(X_padded, y):
    X_train, X_test = X_padded[train_index], X_padded[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the hybrid model with the best hyperparameters
    cnn_model = create_cnn_model(max_features, embedding_dim, maxlen, kernel_size=best_kernel_size, filters=best_filters, dense_units=best_dense_units, dropout_rate=best_dropout_rate)
    gru_model = create_gru_model(embedding_dim, maxlen, gru_units=best_gru_units, dropout_rate=best_dropout_rate)
    concatenated = Concatenate()([cnn_model, gru_model])
    output = Dense(1, activation='sigmoid')(concatenated)
    hybrid_model = Model(inputs=combined_input, outputs=output)
    hybrid_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    hybrid_model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0, callbacks=[reduce_lr], validation_data=(X_test, y_test))
    
    # Predict on the test set
    y_pred = hybrid_model.predict(X_test)
    y_pred = np.round(y_pred).flatten()
    
    # Evaluate metrics
    accuracy, precision, recall, f1, auc = evaluate_metrics(y_test, y_pred)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    auc_scores.append(auc)

# Calculate average metrics
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recall = np.mean(recalls)
average_f1 = np.mean(f1_scores)
average_auc = np.mean(auc_scores)

print("Best Hyperparameters:")
print("Kernel Size:", best_kernel_size)
print("Filters:", best_filters)
print("Dense Units:", best_dense_units)
print("GRU Units:", best_gru_units)
print("Dropout Rate:", best_dropout_rate)

print("\nAverage Metrics:")
print(f"Accuracy: {average_accuracy}")
print(f"Precision: {average_precision}")
print(f"Recall: {average_recall}")
print(f"F1-Score: {average_f1}")
print(f"AUC: {average_auc}")

ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, 100), dtype=tf.float32, name='input_63'), name='input_63', description="created by layer 'input_63'") at layer "embedding_57". The following previous layers were accessed without issue: []

In [None]:
##CNN+BiLSTM+gridsearch

In [38]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate, Bidirectional, GRU, BatchNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import numpy as np

# Define the CNN model
def create_cnn_model(vocab_size, embedding_dim, maxlen, kernel_size=3, filters=128, dense_units=64, dropout_rate=0.0):
    input_text = Input(shape=(maxlen,))
    
    embedding_layer = Embedding(vocab_size, embedding_dim, input_length=maxlen)(input_text)
    conv = Conv1D(filters, kernel_size, activation='relu')(embedding_layer)
    batchnorm = BatchNormalization()(conv)
    
    pooling = GlobalMaxPooling1D()(batchnorm)
    
    dense = Dense(dense_units, activation='relu')(pooling)
    dropout = Dropout(dropout_rate)(dense)
    
    return dropout

# Define the BiGRU model
def create_gru_model(embedding_dim, maxlen, gru_units=64, dropout_rate=0.0):
    input1 = Input(shape=(maxlen,))
    embedding_layer = Embedding(max_features, embedding_dim, weights=[embedding_matrix], trainable=False)(input1)
    lst_layer = Bidirectional(LSTM(gru_units))(embedding_layer)
    dropout = Dropout(dropout_rate)(lst_layer)
    return dropout

# Combine the models
combined_input = Input(shape=(maxlen,))
cnn_model = create_cnn_model(max_features, embedding_dim, maxlen)
gru_model = create_gru_model(embedding_dim, maxlen)
concatenated = Concatenate()([cnn_model, gru_model])

output = Dense(1, activation='sigmoid')(concatenated)

hybrid_model = Model(inputs=combined_input, outputs=output)

# Compile the hybrid model
hybrid_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define evaluation metrics
def evaluate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return accuracy, precision, recall, f1, auc

# Define the adaptive learning rate callback
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=0.0001)

# Perform grid search for hyperparameters
param_grid = {
    'kernel_size': [3, 4, 5],
    'filters': [64, 128, 256],
    'dense_units': [32, 64],
    'gru_units': [32, 64, 128],
    'dropout_rate': [0.2, 0.5],
}

grid = GridSearchCV(estimator=hybrid_model, param_grid=param_grid, cv=5)
grid_result = grid.fit(X_padded, y)

# Get the best hyperparameters
best_kernel_size = grid_result.best_params_['kernel_size']
best_filters = grid_result.best_params_['filters']
best_dense_units = grid_result.best_params_['dense_units']
best_gru_units = grid_result.best_params_['gru_units']
best_dropout_rate = grid_result.best_params_['dropout_rate']

# Perform 5-fold cross-validation
num_folds = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

accuracies = []
precisions = []
recalls = []
f1_scores = []
auc_scores = []

for train_index, test_index in skf.split(X_padded, y):
    X_train, X_test = X_padded[train_index], X_padded[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train the hybrid model with the best hyperparameters
    cnn_model = create_cnn_model(max_features, embedding_dim, maxlen, kernel_size=best_kernel_size, filters=best_filters, dense_units=best_dense_units, dropout_rate=best_dropout_rate)
    gru_model = create_gru_model(embedding_dim, maxlen, lst_units=best_lst_units, dropout_rate=best_dropout_rate)
    concatenated = Concatenate()([cnn_model, gru_model])
    output = Dense(1, activation='sigmoid')(concatenated)
    hybrid_model = Model(inputs=combined_input, outputs=output)
    hybrid_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    hybrid_model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0, callbacks=[reduce_lr], validation_data=(X_test, y_test))
    
    # Predict on the test set
    y_pred = hybrid_model.predict(X_test)
    y_pred = np.round(y_pred).flatten()
    
    # Evaluate metrics
    accuracy, precision, recall, f1, auc = evaluate_metrics(y_test, y_pred)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    auc_scores.append(auc)

# Calculate average metrics
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recall = np.mean(recalls)
average_f1 = np.mean(f1_scores)
average_auc = np.mean(auc_scores)

print("Best Hyperparameters:")
print("Kernel Size:", best_kernel_size)
print("Filters:", best_filters)
print("Dense Units:", best_dense_units)
print("LSTM Units:", best_lst_units)
print("Dropout Rate:", best_dropout_rate)

print("\nAverage Metrics:")
print(f"Accuracy: {average_accuracy}")
print(f"Precision: {average_precision}")
print(f"Recall: {average_recall}")
print(f"F1-Score: {average_f1}")
print(f"AUC: {average_auc}")

ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, 100), dtype=tf.float32, name='input_66'), name='input_66', description="created by layer 'input_66'") at layer "embedding_59". The following previous layers were accessed without issue: []