# Building and Optimizing Neural Network

In [3]:
import pandas as pd
import numpy as np


In [4]:
data=pd.read_csv('DataSet.csv')
params=pd.read_csv('bestparameters.csv')


In [7]:
data.columns

Index(['text', 'generated', 'Text_length', 'n_sentences', 'n_1pov',
       'Unique_words', 'Unique_Per_Size', 'sent_per_text', '1pov_per_size',
       'n_ubigrams', 'n_ubigrams_per_size', 'stopwords', 'stop_ratio',
       'entity_counts', 'person_counts', 'person_ratio', 'brunets_index',
       'entity_dictionary', 'org_count', 'loc_count', 'date_count',
       'time_count', 'quantity_count', 'cardinal_count', 'money_count',
       'percent_count', 'health_count', 'product_count', 'event_count',
       'law_count', 'nationality_count', 'title_count', 'avg_word_length',
       'pos_tag_dict', 'flesch_reading_ease', 'ari_score', 'cli_score',
       'smog_score', 'clean_text', 'org_count_ratio', 'loc_count_ratio',
       'date_count_ratio', 'time_count_ratio', 'quantity_count_ratio',
       'cardinal_count_ratio', 'money_count_ratio', 'percent_count_ratio',
       'health_count_ratio', 'product_count_ratio', 'event_count_ratio',
       'law_count_ratio', 'nationality_count_ratio', 'title_

In [73]:
columns_to_exclude = ['entity_counts','entity_dictionary','pos_tag_dict','4_group_cluster']
subset_columns = [col for col in data.columns if col not in columns_to_exclude]
data = data[subset_columns]


In [74]:
numeric_count = sum(data[col].dtype in [int, float] for col in data.columns)
non_numeric_count = len(data.columns) - numeric_count

In [75]:
def getMax(arr):
    max_index=0
    for i in range(len(arr)):
        if arr[i][1]>arr[max_index][1]:
            max_index=i
    print(arr[max_index][0],arr[max_index][1])     
    return arr[max_index][0]

In [76]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  # Import MinMaxScaler from scikit-learn
from sklearn.metrics import precision_score, recall_score, f1_score

params=pd.read_csv('bestparameters.csv')

row = params.loc[params['f1'].idxmax()]


#Getting the best parameters till now
splitting_state_1=row['splitting_state_1']
splitting_state_2=row['splitting_state_2']
splitting_size_1=row['splitting_size_1']
splitting_size_2=row['splitting_size_2']
tf_state=row['tf_state']
layer_1_neurons=row['layer_1_neurons']
layer_2_neurons=row['layer_2_neurons']
layer_3_neurons=row['layer_3_neurons']
layer_4_neurons=row['layer_4_neurons']
layer_1_act=row['layer_1_act']
layer_2_act=row['layer_2_act']
layer_3_act=row['layer_3_act']
layer_4_act=row['layer_4_act']
optimizer=row['optimizer']
output_act=row['output_act']
n_epochs=row['n_epochs']
batch_s=row['batch_s']


maxF1=-1
maxAcc=-1
maxRec=-1
maxLoss=-1
maxPrec=-1
maxSpec=-1
maxRoc=-1
maxFpr=-1
maxFnr=-1
#shuffling the data
shuffled_data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Select columns for input features (X) and target (y)
X = shuffled_data.drop(columns=['generated', 'text', 'clean_text'])  # Input features
y = shuffled_data['generated']  # Target variable

results=[]

optimizers=['SGD', 'Adam', 'RMSprop', 'Adagrad', 'Adadelta', 'Adamax', 'Nadam', 'FTRL']
hidden_activation_functions = ['relu',  'tanh', 'sigmoid', 'swish', 'elu', 'selu']


for i in [16,32,64]:
    
    batch_s=i #The parameter we want to optimize
    tf.random.set_seed(tf_state)
    

    # Split scaled data into training, validation, and testing sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=splitting_size_1, random_state=splitting_state_1)
    X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=splitting_size_2, random_state=splitting_state_2)

    scaler = MinMaxScaler()

    X_train = scaler.fit_transform(X_train)
    X_validation = scaler.transform(X_validation)
    X_test = scaler.transform(X_test)

    # Define the model
    model = Sequential([
        Dense(layer_1_neurons, activation=layer_1_act, input_shape=(len(X_train[0]),)),
        Dense(layer_2_neurons, activation=layer_2_act),
        Dense(layer_3_neurons, activation=layer_3_act),
        Dense(layer_4_neurons, activation=layer_4_act),
        Dense(1, activation=output_act)  # Output layer for binary classification
    ])

    # Compile the model
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Define callbacks for model checkpointing and early stopping
    checkpoint = ModelCheckpoint("best_model.h5", monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)

    # Train the model with callbacks
    history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_s, validation_data=(X_validation, y_validation),
                        callbacks=[checkpoint, early_stopping],shuffle=False)

    # Load the best model weights saved during training
    best_model = tf.keras.models.load_model("best_model.h5")

    # Evaluate the best model on the testing set
    test_loss, test_accuracy = best_model.evaluate(X_test, y_test)
    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)

    # Calculate F1 score for the test set
    y_pred = best_model.predict(X_test)
    y_pred_binary = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)

    results.append((i,f1))
    print("Test Precision:", precision)
    print("Test Recall:", recall)
    print("Test F1 Score:", f1)
    
    conf_matrix = confusion_matrix(y_test, y_pred_binary)

    # Calculate specificity (TNR)
    specificity = conf_matrix[1, 1] / (conf_matrix[1, 0] + conf_matrix[1, 1])
    print('Specificity: ',specificity)
    # Calculate ROC-AUC score
    roc_auc = roc_auc_score(y_test, y_pred)
    print('roc_auc: ',roc_auc)

    # Calculate True Positive Rate (TPR)
    tpr = recall

    # Calculate True Negative Rate (TNR)
    tnr = specificity

    # Calculate False Positive Rate (FPR)
    fpr = 1 - tnr

    # Calculate False Negative Rate (FNR)
    fnr = 1 - tpr
    print('FNR: ',fnr)
    print('FPR: ',fpr)
    
    if f1>maxF1:
        maxF1=f1
        maxPrec=precision
        maxAcc=test_accuracy
        maxRec=recall
        maxLoss=test_loss
        maxSpec=specificity
        maxFnr=fnr
        maxFpr=fpr
        maxRoc=roc_auc
        

    

print(results)
batch_s=getMax(results) #Getting the best value for the parameter

param={'splitting_state_1':splitting_state_1,'splitting_state_2':splitting_state_2,'splitting_size_1':splitting_size_1,'splitting_size_2':splitting_size_2,
      'tf_state':tf_state,'layer_1_neurons':layer_1_neurons,'layer_2_neurons':layer_2_neurons,'layer_3_neurons':layer_3_neurons,
      'layer_4_neurons':layer_4_neurons,'layer_1_act':layer_1_act,'layer_2_act':layer_2_act,'layer_3_act':layer_3_act,
      'layer_4_act':layer_4_act,'optimizer':optimizer,'output_act':output_act,'n_epochs':n_epochs,'batch_s':batch_s,'test_loss':maxLoss,'test_accuracy':maxAcc,'precision':maxPrec,
      'recall':maxRec,'f1':maxF1,'specificity':maxSpec,'roc_auc':maxRoc,'FPR':maxFpr,'FNR':maxFnr}

par=pd.DataFrame([param])
params=pd.concat([params,par])

params.to_csv('bestparameters.csv',index=False)

Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.36398, saving model to best_model.h5
Epoch 2/20
Epoch 00002: val_loss improved from 0.36398 to 0.35791, saving model to best_model.h5
Epoch 3/20
Epoch 00003: val_loss improved from 0.35791 to 0.34699, saving model to best_model.h5
Epoch 4/20
Epoch 00004: val_loss improved from 0.34699 to 0.33593, saving model to best_model.h5
Epoch 5/20
Epoch 00005: val_loss improved from 0.33593 to 0.32684, saving model to best_model.h5
Epoch 6/20
Epoch 00006: val_loss improved from 0.32684 to 0.32013, saving model to best_model.h5
Epoch 7/20
Epoch 00007: val_loss did not improve from 0.32013
Epoch 8/20
Epoch 00008: val_loss improved from 0.32013 to 0.31856, saving model to best_model.h5
Epoch 9/20
Epoch 00009: val_loss improved from 0.31856 to 0.31451, saving model to best_model.h5
Epoch 10/20
Epoch 00010: val_loss did not improve from 0.31451
Epoch 11/20
Epoch 00011: val_loss improved from 0.31451 to 0.30343, saving model to best_model.h5
Epoch

Epoch 11/20
Epoch 00011: val_loss improved from 0.31169 to 0.30523, saving model to best_model.h5
Epoch 12/20
Epoch 00012: val_loss did not improve from 0.30523
Epoch 13/20
Epoch 00013: val_loss did not improve from 0.30523
Epoch 14/20
Epoch 00014: val_loss did not improve from 0.30523
Epoch 15/20
Epoch 00015: val_loss improved from 0.30523 to 0.30334, saving model to best_model.h5
Epoch 16/20
Epoch 00016: val_loss did not improve from 0.30334
Epoch 17/20
Epoch 00017: val_loss did not improve from 0.30334
Epoch 18/20
Epoch 00018: val_loss did not improve from 0.30334
Epoch 19/20
Epoch 00019: val_loss did not improve from 0.30334
Epoch 20/20
Epoch 00020: val_loss did not improve from 0.30334
Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping
Test Loss: 0.27873727679252625
Test Accuracy: 0.8802644610404968
Test Precision: 0.9079450821516993
Test Recall: 0.9048900852400179
Test F1 Score: 0.9064150095494888
Specificity:  0.9048900852400179
roc_auc:  0.94836

Epoch 18/20
Epoch 00018: val_loss improved from 0.30645 to 0.30424, saving model to best_model.h5
Epoch 19/20
Epoch 00019: val_loss did not improve from 0.30424
Epoch 20/20
Epoch 00020: val_loss did not improve from 0.30424
Test Loss: 0.2784191370010376
Test Accuracy: 0.8766709566116333
Test Precision: 0.9164738546968996
Test Recall: 0.8885150291610587
Test F1 Score: 0.9022779043280182
Specificity:  0.8885150291610587
roc_auc:  0.9481227663788709
FNR:  0.11148497083894127
FPR:  0.11148497083894127
[(16, 0.901322482197355), (32, 0.9064150095494888), (64, 0.9022779043280182)]
32 0.9064150095494888


# Grid Search

In [78]:
#Trying random parameters to get find higher scores

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  # Import MinMaxScaler from scikit-learn
from sklearn.metrics import precision_score, recall_score, f1_score
import random

params=pd.read_csv('bestparameters.csv')

row = params.loc[params['f1'].idxmax()]


#Getting the best parameters till now
splitting_state_1=row['splitting_state_1']
splitting_state_2=row['splitting_state_2']
splitting_size_1=row['splitting_size_1']
splitting_size_2=row['splitting_size_2']
tf_state=row['tf_state']
layer_1_neurons=row['layer_1_neurons']
layer_2_neurons=row['layer_2_neurons']
layer_3_neurons=row['layer_3_neurons']
layer_4_neurons=row['layer_4_neurons']
layer_1_act=row['layer_1_act']
layer_2_act=row['layer_2_act']
layer_3_act=row['layer_3_act']
layer_4_act=row['layer_4_act']
optimizer=row['optimizer']
output_act=row['output_act']
n_epochs=row['n_epochs']
batch_s=row['batch_s']


maxF1=-1
maxAcc=-1
maxRec=-1
maxLoss=-1
maxPrec=-1
maxSpec=-1
maxRoc=-1
maxFpr=-1
maxFnr=-1
#shuffling the data
shuffled_data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Select columns for input features (X) and target (y)
X = shuffled_data.drop(columns=['generated', 'text', 'clean_text'])  # Input features
y = shuffled_data['generated']  # Target variable

results=[]

optimizers=['SGD', 'Adam', 'RMSprop', 'Adagrad', 'Adadelta', 'Adamax', 'Nadam', 'FTRL']
hidden_activation_functions = ['relu',  'tanh', 'sigmoid', 'swish', 'elu', 'selu']



for i in range(0,10):
    
    splitting_state_1=random.randint(1, 100)
    splitting_state_2=random.randint(1, 100)
    tf_state=random.randint(1, 100)
    print(splitting_state_1)
    print(splitting_state_2)
    print(tf_state)


    tf.random.set_seed(tf_state)
    

    # Split scaled data into training, validation, and testing sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=splitting_size_1, random_state=splitting_state_1)
    X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=splitting_size_2, random_state=splitting_state_2)

    scaler = MinMaxScaler()

    X_train = scaler.fit_transform(X_train)
    X_validation = scaler.transform(X_validation)
    X_test = scaler.transform(X_test)

    # Define the model
    model = Sequential([
        Dense(layer_1_neurons, activation=layer_1_act, input_shape=(len(X_train[0]),)),
        Dense(layer_2_neurons, activation=layer_2_act),
        Dense(layer_3_neurons, activation=layer_3_act),
        Dense(layer_4_neurons, activation=layer_4_act),
        Dense(1, activation=output_act)  # Output layer for binary classification
    ])

    # Compile the model
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Define callbacks for model checkpointing and early stopping
    checkpoint = ModelCheckpoint("best_model.h5", monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)

    # Train the model with callbacks
    history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_s, validation_data=(X_validation, y_validation),
                        callbacks=[checkpoint, early_stopping],shuffle=False)

    # Load the best model weights saved during training
    best_model = tf.keras.models.load_model("best_model.h5")

    # Evaluate the best model on the testing set
    test_loss, test_accuracy = best_model.evaluate(X_test, y_test)
    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)

    # Calculate F1 score for the test set
    y_pred = best_model.predict(X_test)
    y_pred_binary = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)

    results.append((i,f1))
    print("Test Precision:", precision)
    print("Test Recall:", recall)
    print("Test F1 Score:", f1)
    
    conf_matrix = confusion_matrix(y_test, y_pred_binary)

    # Calculate specificity (TNR)
    specificity = conf_matrix[1, 1] / (conf_matrix[1, 0] + conf_matrix[1, 1])
    print('Specificity: ',specificity)
    # Calculate ROC-AUC score
    roc_auc = roc_auc_score(y_test, y_pred)
    print('roc_auc: ',roc_auc)

    # Calculate True Positive Rate (TPR)
    tpr = recall

    # Calculate True Negative Rate (TNR)
    tnr = specificity

    # Calculate False Positive Rate (FPR)
    fpr = 1 - tnr

    # Calculate False Negative Rate (FNR)
    fnr = 1 - tpr
    print('FNR: ',fnr)
    print('FPR: ',fpr)
    
    if f1>maxF1:
        maxF1=f1
        maxPrec=precision
        maxAcc=test_accuracy
        maxRec=recall
        maxLoss=test_loss
        maxSpec=specificity
        maxFnr=fnr
        maxFpr=fpr
        maxRoc=roc_auc
    print('---------------------------------------------------------')

    



29
68
21
Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.37996, saving model to best_model.h5
Epoch 2/20
Epoch 00002: val_loss improved from 0.37996 to 0.33498, saving model to best_model.h5
Epoch 3/20
Epoch 00003: val_loss improved from 0.33498 to 0.32442, saving model to best_model.h5
Epoch 4/20
Epoch 00004: val_loss improved from 0.32442 to 0.31563, saving model to best_model.h5
Epoch 5/20
Epoch 00005: val_loss improved from 0.31563 to 0.30890, saving model to best_model.h5
Epoch 6/20
Epoch 00006: val_loss improved from 0.30890 to 0.30564, saving model to best_model.h5
Epoch 7/20
Epoch 00007: val_loss improved from 0.30564 to 0.30398, saving model to best_model.h5
Epoch 8/20
Epoch 00008: val_loss did not improve from 0.30398
Epoch 9/20
Epoch 00009: val_loss improved from 0.30398 to 0.30389, saving model to best_model.h5
Epoch 10/20
Epoch 00010: val_loss did not improve from 0.30389
Epoch 11/20
Epoch 00011: val_loss did not improve from 0.30389
Epoch 12/20
Epoch 00012: val_lo

Test Precision: 0.9015829318651066
Test Recall: 0.8863328822733424
Test F1 Score: 0.8938928693278745
Specificity:  0.8863328822733424
roc_auc:  0.9426425733052936
FNR:  0.11366711772665761
FPR:  0.11366711772665761
---------------------------------------------------------
42
12
40
Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.36941, saving model to best_model.h5
Epoch 2/20
Epoch 00002: val_loss improved from 0.36941 to 0.33124, saving model to best_model.h5
Epoch 3/20
Epoch 00003: val_loss improved from 0.33124 to 0.32443, saving model to best_model.h5
Epoch 4/20
Epoch 00004: val_loss improved from 0.32443 to 0.31638, saving model to best_model.h5
Epoch 5/20
Epoch 00005: val_loss improved from 0.31638 to 0.30772, saving model to best_model.h5
Epoch 6/20
Epoch 00006: val_loss improved from 0.30772 to 0.30208, saving model to best_model.h5
Epoch 7/20
Epoch 00007: val_loss improved from 0.30208 to 0.30048, saving model to best_model.h5
Epoch 8/20
Epoch 00008: val_loss improved f

Epoch 6/20
Epoch 00006: val_loss improved from 0.34870 to 0.34149, saving model to best_model.h5
Epoch 7/20
Epoch 00007: val_loss improved from 0.34149 to 0.32327, saving model to best_model.h5
Epoch 8/20
Epoch 00008: val_loss improved from 0.32327 to 0.32218, saving model to best_model.h5
Epoch 9/20
Epoch 00009: val_loss improved from 0.32218 to 0.31329, saving model to best_model.h5
Epoch 10/20
Epoch 00010: val_loss did not improve from 0.31329
Epoch 11/20
Epoch 00011: val_loss did not improve from 0.31329
Epoch 12/20
Epoch 00012: val_loss improved from 0.31329 to 0.31222, saving model to best_model.h5
Epoch 13/20
Epoch 00013: val_loss did not improve from 0.31222
Epoch 14/20
Epoch 00014: val_loss did not improve from 0.31222
Epoch 15/20
Epoch 00015: val_loss did not improve from 0.31222
Epoch 16/20
Epoch 00016: val_loss did not improve from 0.31222
Epoch 17/20
Epoch 00017: val_loss did not improve from 0.31222
Restoring model weights from the end of the best epoch.
Epoch 00017: earl

Epoch 15/20
Epoch 00015: val_loss did not improve from 0.30418
Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping
Test Loss: 0.3105349540710449
Test Accuracy: 0.8602846264839172
Test Precision: 0.9209080047789725
Test Recall: 0.8573971078976641
Test F1 Score: 0.8880184331797236
Specificity:  0.8573971078976641
roc_auc:  0.935149895768292
FNR:  0.1426028921023359
FPR:  0.1426028921023359
---------------------------------------------------------
61
95
45
Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.38078, saving model to best_model.h5
Epoch 2/20
Epoch 00002: val_loss improved from 0.38078 to 0.37548, saving model to best_model.h5
Epoch 3/20
Epoch 00003: val_loss improved from 0.37548 to 0.35917, saving model to best_model.h5
Epoch 4/20
Epoch 00004: val_loss improved from 0.35917 to 0.33324, saving model to best_model.h5
Epoch 5/20
Epoch 00005: val_loss improved from 0.33324 to 0.32985, saving model to best_model.h5
Epoch 6/20
Epoch 00006: val_l

Epoch 2/20
Epoch 00002: val_loss improved from 0.39126 to 0.33784, saving model to best_model.h5
Epoch 3/20
Epoch 00003: val_loss improved from 0.33784 to 0.33536, saving model to best_model.h5
Epoch 4/20
Epoch 00004: val_loss improved from 0.33536 to 0.32951, saving model to best_model.h5
Epoch 5/20
Epoch 00005: val_loss did not improve from 0.32951
Epoch 6/20
Epoch 00006: val_loss did not improve from 0.32951
Epoch 7/20
Epoch 00007: val_loss did not improve from 0.32951
Epoch 8/20
Epoch 00008: val_loss did not improve from 0.32951
Epoch 9/20
Epoch 00009: val_loss did not improve from 0.32951
Restoring model weights from the end of the best epoch.
Epoch 00009: early stopping
Test Loss: 0.32633379101753235
Test Accuracy: 0.8460543155670166
Test Precision: 0.904639175257732
Test Recall: 0.8532596685082873
Test F1 Score: 0.8781985670419652
Specificity:  0.8532596685082873
roc_auc:  0.9267104354463507
FNR:  0.14674033149171273
FPR:  0.14674033149171273
------------------------------------

In [5]:
#The best score till now

column_names = params.columns.tolist()

last_row_values = params.iloc[-1]

for column, value in zip(column_names, last_row_values):
    print(f"{column}: {value}")

splitting_state_1: 112
splitting_state_2: 124
splitting_size_1: 0.25
splitting_size_2: 0.5
tf_state: 227
layer_1_neurons: 120
layer_2_neurons: 130
layer_3_neurons: 150
layer_4_neurons: 130
layer_1_act: relu
layer_2_act: relu
layer_3_act: relu
layer_4_act: relu
optimizer: adam
output_act: sigmoid
n_epochs: 20
batch_s: 32
test_loss: 0.2787372767925262
test_accuracy: 0.8802644610404968
precision: 0.9079450821516992
recall: 0.904890085240018
f1: 0.9064150095494888
specificity: 0.904890085240018
roc_auc: 0.9483662464537184
FPR: 0.0951099147599821
FNR: 0.0951099147599821
