In [1]:
# -*- coding: utf-8 -*-
"""
Created on Friday Febr 22 12:16:19 2023

@author: iliaskaloup
"""

import tensorflow

import os, json, glob, time, sys, re
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import json
import math

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Masking
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras import optimizers
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.layers import Bidirectional, BatchNormalization
from tensorflow.keras.initializers import glorot_uniform, RandomUniform, lecun_uniform, Constant
from collections import OrderedDict
from sklearn.model_selection import StratifiedKFold
import tensorflow.keras.backend as K
from tensorflow.keras.constraints import max_norm

import io
from contextlib import redirect_stdout
#from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, \
roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.utils import shuffle
import time
import random


# define seeder
seed = 123
np.random.seed(seed)
random.seed(seed)
tensorflow.random.set_seed(seed)


In [2]:
def recall_metric(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = (true_positives + K.epsilon()) / (possible_positives + K.epsilon())
        return recall

def precision_metric(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = (true_positives + K.epsilon()) / (predicted_positives + K.epsilon())
        return precision

def f1_metric(y_true, y_pred):

    prec = precision_metric(y_true, y_pred)
    rec = recall_metric(y_true, y_pred)
    f1 = 2*((prec*rec)/(prec+rec+K.epsilon()))
    return f1

def f2_metric(y_true, y_pred):

    prec = precision_metric(y_true, y_pred)
    rec = recall_metric(y_true, y_pred)
    f2 = 5*((prec*rec)/(4*prec+rec+K.epsilon()))
    return f2


In [3]:
def buildMLP():
    learning_rate = 0.001
    nIn = 768
    model = Sequential()
    model.add(Dense(100, input_dim=nIn)) # hidden
    model.add(Activation('relu'))
    #model.add(Dropout(0.15))
    model.add(Dense(1))
    model.add(Activation('sigmoid')) # Output
    #model.add(Dropout(0.15))
    sgd = optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=[f2_metric])
    return model


In [4]:
def buildLstm():
    model = Sequential()
    model.add(LSTM(100, input_dim=1, input_length=768, stateful=False))
    model.add(Activation('relu')) #dropout=0.2, recurrent_dropout=0.2, kernel_constraint=max_norm(3), bias_constraint=max_norm(3)
    #model.add(BatchNormalization(momentum=0.0))
    model.add(Dense(1,activation='sigmoid'))
    #model.compile(loss=f2_loss, optimizer='adam', metrics=[f2_metric])
    model.compile(loss="binary_crossentropy", optimizer='adam', metrics=[f2_metric])
    return model

In [5]:
train_val = pd.read_csv('train_val_embeddings.csv', sep =',')
train_val = train_val.sample(random_state=seed, frac=1).reset_index(drop=True)
train_val_y = train_val['Label'].values.tolist()
train_val_y = pd.DataFrame(train_val_y)
train_val_y = np.array(train_val_y)
train_val_X = train_val.drop('Label', axis=1)
train_val_X = np.array(train_val_X)

test = pd.read_csv('test_embeddings.csv', sep =',')
test = test.sample(random_state=seed, frac=1).reset_index(drop=True)
test_y = test['Label'].values.tolist()
test_y = pd.DataFrame(test_y)
test_y = np.array(test_y)
test_X = test.drop('Label', axis=1)
test_X = np.array(test_X)

train = pd.read_csv('train_embeddings.csv', sep =',')
train = train.sample(random_state=seed, frac=1).reset_index(drop=True)
train_y = train['Label'].values.tolist()
train_y = pd.DataFrame(train_y)
train_y = np.array(train_y)
train_X = train.drop('Label', axis=1)
train_X = np.array(train_X)

val = pd.read_csv('val_embeddings.csv', sep =',')
val = val.sample(random_state=seed, frac=1).reset_index(drop=True)
val_y = val['Label'].values.tolist()
val_y = pd.DataFrame(val_y)
val_y = np.array(val_y)
val_X = val.drop('Label', axis=1)
val_X = np.array(val_X)

train_X

array([[-3.7029777e-03,  4.0793600e-01, -2.5765502e-01, ...,
        -3.2464272e-01, -4.6883753e-01,  6.6156566e-01],
       [-5.4515070e-01,  3.5717532e-01, -3.4790817e-01, ...,
         4.3896335e-01, -2.9661950e-01, -3.9072060e-01],
       [-4.5674380e-01,  5.6255543e-01, -1.0349541e+00, ...,
        -1.0931984e-01, -8.8110840e-03, -6.4844990e-01],
       ...,
       [-1.1811732e+00,  1.1784925e+00, -2.5114307e+00, ...,
         2.3985744e+00,  2.2563240e-01, -3.8030213e-01],
       [ 4.5845336e-01,  3.5698715e-01,  3.5300934e-01, ...,
         3.6557147e-01, -1.3089369e-01,  1.9057815e-01],
       [ 9.6241530e-02,  3.5551400e-01,  4.9386650e-04, ...,
        -9.7265035e-02,  7.0922000e-02, -3.7174487e-01]])

In [6]:
'''train_val_X = train_val_X.reshape((train_val_X.shape[0], 768, 1))

train_X = train_X.reshape((train_X.shape[0], 768, 1))

val_X = val_X.reshape((val_X.shape[0], 768, 1))

test_X = test_X.reshape((test_X.shape[0], 768, 1))'''

'train_val_X = train_val_X.reshape((train_val_X.shape[0], 768, 1))\n\ntrain_X = train_X.reshape((train_X.shape[0], 768, 1))\n\nval_X = val_X.reshape((val_X.shape[0], 768, 1))\n\ntest_X = test_X.reshape((test_X.shape[0], 768, 1))'

In [7]:
# train-val
nb_epoch = 100
BS = 64

myModel = buildMLP() 
print("model summary\m",myModel.summary())

csv_logger = CSVLogger('log.csv', append=True, separator=',')
es = EarlyStopping(monitor='val_f2_metric', mode='max', verbose=1, patience=10)
mc = ModelCheckpoint('best_mlp.h5', monitor='val_f2_metric', mode='max', verbose=1, save_best_only=True)
history = myModel.fit(train_X, train_y, validation_data=(val_X, val_y), epochs = nb_epoch, batch_size = BS, shuffle=True, verbose=1, callbacks=[csv_logger,es,mc])

#load best model
#myModel = load_model('best_model.h5')
myModel.load_weights("best_mlp.h5")

scores = myModel.evaluate(val_X, val_y, verbose=0)
#predictions = myModel.predict_classes(X_test, verbose=0)
predictions = (myModel.predict(val_X) > 0.5).astype("int32")
predScores = myModel.predict(val_X)

accuracy=accuracy_score(val_y, predictions)
precision=precision_score(val_y, predictions)
recall=recall_score(val_y, predictions)
f1=f1_score(val_y, predictions)
roc_auc=roc_auc_score(val_y, predictions)
f2 = 5*precision*recall / (4*precision+recall)
#f2=fbeta_score(Y_test, predictions, beta=0.5)
print(confusion_matrix(val_y, predictions, labels=[0, 1]))
tn, fp, fn, tp = confusion_matrix(val_y, predictions).ravel()
fpr = fp / (fp+tn)
acc = ((tp+tn)/(tp+tn+fp+fn))
print("Accuracy:%.2f%%"%(acc*100))
print("Precision:%.2f%%"%(precision*100))
print("Recall:%.2f%%"%(recall*100))
print("F1 score:%.2f%%"%(f1*100))
print("Roc_Auc score:%.2f%%"%(roc_auc*100))
print("F2 score:%.2f%%"%(f2*100))
print("FPR score:%.2f%%"%(fpr*100))
print(classification_report(val_y, predictions))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               76900     
                                                                 
 activation (Activation)     (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
 activation_1 (Activation)   (None, 1)                 0         
                                                                 
Total params: 77,001
Trainable params: 77,001
Non-trainable params: 0
_________________________________________________________________
model summary\m None
Epoch 1/100
Epoch 1: val_f2_metric improved from -inf to 0.42611, saving model to best_mlp.h5
Epoch 2/100
Epoch 2: val_f2_metric improved from 0.42611 to 0.67145, 

Epoch 25: val_f2_metric improved from 0.91526 to 0.92962, saving model to best_mlp.h5
Epoch 26/100
Epoch 26: val_f2_metric did not improve from 0.92962
Epoch 27/100
Epoch 27: val_f2_metric did not improve from 0.92962
Epoch 28/100
Epoch 28: val_f2_metric did not improve from 0.92962
Epoch 29/100
Epoch 29: val_f2_metric did not improve from 0.92962
Epoch 30/100
Epoch 30: val_f2_metric did not improve from 0.92962
Epoch 31/100
Epoch 31: val_f2_metric did not improve from 0.92962
Epoch 32/100
Epoch 32: val_f2_metric did not improve from 0.92962
Epoch 33/100
Epoch 33: val_f2_metric did not improve from 0.92962
Epoch 34/100
Epoch 34: val_f2_metric improved from 0.92962 to 0.93006, saving model to best_mlp.h5
Epoch 35/100
Epoch 35: val_f2_metric did not improve from 0.93006
Epoch 36/100
Epoch 36: val_f2_metric did not improve from 0.93006
Epoch 37/100
Epoch 37: val_f2_metric did not improve from 0.93006
Epoch 38/100
Epoch 38: val_f2_metric did not improve from 0.93006
Epoch 39/100
Epoch 39: 

In [8]:
# train-test
del myModel

nb_epoch = 16
BS = 64

myModel = buildMLP() 
print("model summary\m",myModel.summary())

myModel.fit(train_val_X, train_val_y)

scores = myModel.evaluate(test_X, test_y, verbose=0)
#predictions = myModel.predict_classes(X_test, verbose=0)
predictions = (myModel.predict(test_X) > 0.5).astype("int32")
predScores = myModel.predict(test_X)

accuracy=accuracy_score(test_y, predictions)
precision=precision_score(test_y, predictions)
recall=recall_score(test_y, predictions)
f1=f1_score(test_y, predictions)
roc_auc=roc_auc_score(test_y, predictions)
f2 = 5*precision*recall / (4*precision+recall)
#f2=fbeta_score(Y_test, predictions, beta=0.5)
print(confusion_matrix(test_y, predictions, labels=[0, 1]))
tn, fp, fn, tp = confusion_matrix(test_y, predictions).ravel()
fpr = fp / (fp+tn)
acc = ((tp+tn)/(tp+tn+fp+fn))
print("Accuracy:%.2f%%"%(acc*100))
print("Precision:%.2f%%"%(precision*100))
print("Recall:%.2f%%"%(recall*100))
print("F1 score:%.2f%%"%(f1*100))
print("Roc_Auc score:%.2f%%"%(roc_auc*100))
print("F2 score:%.2f%%"%(f2*100))
print("FPR score:%.2f%%"%(fpr*100))
print(classification_report(test_y, predictions))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 100)               76900     
                                                                 
 activation_2 (Activation)   (None, 100)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
 activation_3 (Activation)   (None, 1)                 0         
                                                                 
Total params: 77,001
Trainable params: 77,001
Non-trainable params: 0
_________________________________________________________________
model summary\m None
[[857  33]
 [153 122]]
Accuracy:84.03%
Precision:78.71%
Recall:44.36%
F1 score:56.74%
Roc_Auc score:70.33%
F2 score:48.61%
FPR score:3.71%
              precisio

In [9]:
predictions

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])