<a href="https://colab.research.google.com/github/idoFinder/RNN_LSTM_API_CALLS/blob/master/RNN_LSTM_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from keras.models import Sequential, load_model ,Model
from keras.layers import Dense, LSTM,Embedding,GlobalMaxPooling1D,Activation
from keras.utils import to_categorical
from keras.optimizers import SGD
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score, confusion_matrix, accuracy_score, recall_score,plot_confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing, metrics
from itertools import groupby
import matplotlib.pyplot as plt 
import tensorflow as tf
import numpy as np
import pandas as pd
import random
import os

Using TensorFlow backend.


In [2]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data

## Read raw data

In [0]:
def load_raw_parquets(benign_path, malware_path):
  # load benign parquets
  all_data = pd.DataFrame(columns=['file_name','API_sequence','class'])
  for idx, batch_path in enumerate(os.listdir(benign_path)):
      path = '{}/{}'.format(folder_path,batch_path)
      batch = pd.read_parquet(path)
      files = batch['file_name'].unique()
      new_records = []
      for file in files:
          sub_df = batch[batch['file_name']==file].sort_values(['normalized_time'])
          new_record = {'file_name':file,'API_sequence':sub_df['API'].values,'class':'benign'}
          new_records.append(new_record)
      all_data = all_data.append(new_records, ignore_index=True)
      print('batch',idx+1, ' loaded')
      
  # load malicious parquets
  for idx, batch_path in enumerate(os.listdir(malware_path)):
      path = '{}/{}'.format(folder_path,batch_path)
      batch = pd.read_parquet(path)
      files = batch['file_name'].unique()
      new_records = []
      for file in files:
          sub_df = batch[batch['file_name']==file].sort_values(['normalized_time'])
          new_record = {'file_name':file,'API_sequence':sub_df['API'].values,'class':'malicious'}
          new_records.append(new_record)
      all_data = all_data.append(new_records, ignore_index=True)
      print('batch',idx+1, ' loaded')
    
  return all_data



def get_classes_distribution_df(df):
    num_of_benign = df[df['class'] == 'benign'].shape[0]
    num_of_malware = df[df['class'] == 'malicious'].shape[0]
    total = num_of_benign + num_of_malware
    benign_perc = round((num_of_benign / total) * 100, 2)
    malware_perc = round((num_of_malware / total) * 100, 2)
    res_str = 'Benign:{} ({}%) | Malicious:{} ({}%)'.format(num_of_benign, benign_perc, num_of_malware, malware_perc)
    return res_str
  

## Data cleaning + creating API corpus


In [0]:
def remove_repitations(sequence):
    new_sequence = [x[0] for x in groupby(sequence)]
    return new_sequence


def data_cleaning(sequence_MIN_length,all_data):
  # remove repitation in all data
  all_data['API_sequence'] =all_data.apply(lambda x: remove_repitations(x['API_sequence']), axis=1)

  # remove sequences shorter then sequence_MIN_length
  print('Remove files with short sequence')
  print('files before:',all_data.shape[0])
  to_drop = []
  for file in all_data['file_name']:
      if len(all_data[all_data['file_name']==file].API_sequence.values[0]) < sequence_MIN_length:
          to_drop.append(file)
  all_data = all_data[~all_data['file_name'].isin(to_drop)]
  print('files after:',all_data.shape[0])
  all_data = all_data.reset_index(drop=True)
  return all_data


def use_spesific_files(dataset_path,all_data):
  balanced_csv = pd.read_csv(dataset_path)
  print('files before:',all_data.shape[0])
  all_data = all_data[all_data['file_name'].isin(balanced_csv['file_name'].values)]
  print('files after:',all_data.shape[0])
  all_data = all_data.reset_index(drop=True)
  return all_data

# create API calls corpus
def create_API_corpus(all_data):
  API_corpus = []
  for file in all_data['file_name']:
      seq = all_data[all_data['file_name']==file].API_sequence.values[0]
      API_corpus.extend(set(seq))
  API_corpus = list(set(API_corpus))
  print('API_corpus:',len(API_corpus))
  return API_corpus

# Representation Learning Model

## Functions: one-hot-encoding


In [0]:
def convert_API_to_vector(Unique_API_calls,API):
    if API != 0:
        vec = np.zeros((len(Unique_API_calls),), dtype=np.float32)
        vec[Unique_API_calls.index(API)] = 1.0
    else:
        vec = np.zeros((len(Unique_API_calls),), dtype=np.float32)
    return list(vec)


def create_sequence_n_nextWord_one_hot(data):
    vectorized_data = []
    for idx in range(len(data)):
        seq = data.iloc[idx]
        vector = []
        for api in seq:
            vector.append(convert_API_to_vector(API_corpus,api))
        vectorized_data.append(vector)
    
    sequences = []
    next_words = []
    for seq in vectorized_data:
        sequences.append(seq[:-1])
        next_words.append(seq[len(seq)-1])
    
    return np.array(sequences),np.array(next_words)


def create_sequence_one_hot(data):
    vectorized_data = []
    for idx in range(len(data)):
        seq = data.iloc[idx]
        vector = []
        for api in seq:
            vector.append(convert_API_to_vector(API_corpus,api))
        vectorized_data.append(vector)
    
    return np.array(vectorized_data)


def pruning_n_padding(seq,sequence_MAX_length):
    final_seq = []
    if len(seq) < sequence_MAX_length+1:
        gap = (sequence_MAX_length) - len(seq)
        zeros = [0] * gap
        final_seq = list(zeros) + list(seq)
    
    elif len(seq) == sequence_MAX_length:
        final_seq = list(seq)
    
    else:
        final_seq = seq[:sequence_MAX_length]
    return final_seq



## Predict next API call 

### Prepare the data

In [0]:
def prepare_data_nextWord(data,sequence_MAX_length):

  # pruning the sequences to match sequence_MAX_length + next API as a label
  LSTM_train_seq =  data.apply(lambda x: pruning_n_padding(x,sequence_MAX_length))
  # create sequences and nextWords
  train_vectorized_seq, train_vectorized_nextWord = create_sequence_n_nextWord_one_hot(LSTM_train_seq)

  # print('train_vectorized_seq:',train_vectorized_seq.shape, ' train_vectorized_nextWord:',train_vectorized_nextWord.shape)

  return train_vectorized_seq, train_vectorized_nextWord

### Architecture 1:###
- Input: One-hot-encoding
- Train model with 2 LSTM layers + Dense using softmax
- Predict the next API call using the second layer last-cell (argmax)

**First stage - train LSTM**

In [0]:
def LSTM_nextWord_arch_1(output_dim,input_seq_size,input_time_dim,nb_units,dropout):
  seed_value= 10
  os.environ['PYTHONHASHSEED']=str(seed_value)
  random.seed(seed_value)
  np.random.seed(seed_value)
  # tf.set_random_seed(seed_value)
  tf.random.set_seed(seed_value)

  # build the model
  model_1 = Sequential()
  model_1.add(LSTM(units=nb_units, input_shape=(input_time_dim,input_seq_size),
                  activation='tanh',dropout=dropout ,return_sequences=True))
  model_1.add(LSTM(units=nb_units, input_shape=(input_time_dim,input_seq_size),
                  activation='tanh',dropout=dropout,return_sequences=False))
  model_1.add(Dense(output_dim,activation='softmax'))
  model_1.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
  # print(model_1.summary())
  return model_1

**Evaluate the LSTM training**

In [0]:
def evaluate_LSTM_nextWord(model,x_test,y_test):
  predictions = model.predict(x_test)
  decoded_preds = []
  for idx in range(predictions.shape[0]):
      decoded = API_corpus[np.argmax(predictions[idx])]
      decoded_preds.append(decoded)
      
  decoded_true = []
  for idx in range(y_test.shape[0]):
      decoded = API_corpus[np.argmax(x_test[idx])]
      decoded_true.append(decoded)

  print('predicted corpus:', str(set(decoded_preds)))

  correct = 0
  for idx in range(len(decoded_preds)):
      if decoded_preds[idx] == decoded_true[idx]:
          correct += 1
          
  print('LSTM representation accuracy:', round(correct/len(decoded_preds),3))

### Architecture 2: (Optional)###
- Input: integers
- Train model with 2 LSTM layers + Dense using softmax
- Predict the next API call using the second layer last-cell (argmax)

### Architecture 3: (Optional)###
- Input: One-hot-encoding
- Train model with 1 LSTM layers (return_sequence=True)+ Dense using softmax
- Feed the model with the next API for each input cell (offset of the APIs)
- Next, remove the last Dense layer and perform GlobalMaxPooling for the representation

## Predict final label (0/1)

### Prepare the data

In [0]:
def prepare_data_finalLabel(x_data, y_data,sequence_MAX_length):
  # print('Preparing data - final Label LSTM')

  # pruning the sequences to match sequence_MAX_length
  LSTM_train_seq =  x_data.apply(lambda x: pruning_n_padding(x,sequence_MAX_length))

  # convert API sequence to one-hot-encoding vectors
  train_vectorized_seq = create_sequence_one_hot(LSTM_train_seq)

  # convert class to 0 and 1
  rep_train_labels = np.array(y_data.apply(lambda x: 1 if x == 'malicious' else 0).values,dtype=np.float32)

  # print('x_data shape:',train_vectorized_seq.shape, ' y_data shape:',rep_train_labels.shape)

  return np.array(train_vectorized_seq), np.array(rep_train_labels)


### Architecture 1:###
- Train model with 2 LSTM layers + Dense using sigmoid
- Predict the class using the second layer last-cell 
- Next, remove the second LSTM layer and perform GlobalMaxPooling for the representation

**First stage - train LSTM**

In [0]:
def LSTM_finalLabel_arch_1(input_seq_size,input_time_dim,nb_units,dropout):
  # Seed value
  seed_value= 10
  os.environ['PYTHONHASHSEED']=str(seed_value)
  random.seed(seed_value)
  np.random.seed(seed_value)
  # tf.set_random_seed(seed_value)
  tf.random.set_seed(seed_value)

  print('Bulding LSTM model - final Label (arch 1)')

  # build the model
  model_2 = Sequential()
  model_2.add(LSTM(units=nb_units, input_shape=(input_time_dim,input_seq_size),
                  activation='tanh',dropout=dropout ,return_sequences=True))
  model_2.add(LSTM(units=nb_units, input_shape=(input_time_dim,input_seq_size),
                  activation='tanh',dropout=dropout,return_sequences=False))
  model_2.add(Dense(1, activation='sigmoid'))
  model_2.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
  # print(model_2.summary())
  return model_2

**Evaluate the LSTM training**

In [0]:
def evaluate_LSTM_finalLabel(model,x_test,y_test):

  predictions = model.predict(x_test)
  final_preds = []
  for pred in predictions:
      if pred > 0.5:
          final_preds.append(1.0)
      else:
          final_preds.append(0.0)
          
  correct = 0
  for idx, pred in enumerate(final_preds):
      if final_preds[idx] == y_test[idx]:
          correct+=1
  print('LSTM representation accuracy:',round(correct/len(final_preds),4))

# Feature Extraction - from LSTM

## Extract features using LSTM

In [0]:
def feature_vector_maxPool_n_lastCell(trained_model, sequence_data):
  # Seed value
  seed_value= 0
  os.environ['PYTHONHASHSEED']=str(seed_value)
  random.seed(seed_value)
  np.random.seed(seed_value)
  # tf.set_random_seed(seed_value)
  tf.random.set_seed(seed_value)

  # remove the last two layers: Dense + last LSTM 
  new_model = Sequential()
  for layer in trained_model.layers[:-2]:
      new_model.add(layer)

  preds = new_model.predict(sequence_data)

  # save all the last lstm cell outputs
  last_cell_outputs = []
  for file in preds:
      last_cell_outputs.append(file[-1]) 

  # remove the last two layers: Dense + last LSTM 
  new_model_max_pooling = Sequential()
  for layer in trained_model.layers[:-2]:
      new_model_max_pooling.add(layer)

  # add softmax + max pooling layer
  new_model_max_pooling.add(GlobalMaxPooling1D())
  max_pooling_vectors = new_model_max_pooling.predict(sequence_data)

  
  # Creating the representation vectors:
  # concatenating the max-pooling with the last output
  final_representation = []
  for idx,max_pool in enumerate(max_pooling_vectors):
      new_vector = list(max_pool) + list(last_cell_outputs[idx])
      final_representation.append(new_vector)
  final_representation = np.array(final_representation,dtype=np.float32)
  # print('final shape:', str(final_representation.shape) )
  
  return final_representation


In [0]:
def final_feature_vector(trained_model,x_data,y_data,one_hot_encoding,sequence_MAX_length):
  # pruning the sequences to match sequence_MAX_length 
  rep_train_seq =  x_data.apply(lambda x: pruning_n_padding(x,sequence_MAX_length))

  # convert class to 0 and 1
  labels = y_data.apply(lambda x: 1 if x == 'malicious' else 0).values

  if one_hot_encoding:
    # print('creating one-hot-encoding')
    # create sequences (one-hote encoding format) *we dont use the nextWord here
    train_vectorized_seq = create_sequence_one_hot(rep_train_seq)
    # print('Creating feature vectors')
    train_feature_vector = feature_vector_maxPool_n_lastCell(trained_model, train_vectorized_seq)

  return np.array(train_feature_vector), np.array(labels, dtype=np.float32)

# Classification

In [0]:
def calc_metrics(conf_matrix, Y_proba, Y_test):
    AUC = roc_auc_score(Y_test, Y_proba)

    TP = conf_matrix[1][1]
    TN = conf_matrix[0][0]
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]

    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)
    ACC = (TP + TN) / (TP + TN + FP + FN)
    Precision = TP / (TP + FP)
    F1_score = 2 * ((Precision * TPR) / (Precision + TPR))

    metrics_dict = {
        'TPR': round(TPR, 2),
        'FPR': round(FPR, 2),
        'ACC': round(ACC, 2),
        'Precision': round(Precision, 2),
        'F1_score': round(F1_score, 2),
        'AUC': round(AUC, 4),
    }

    return metrics_dict


def convert_class_to_binary(label):
    if label == 'malicious':
        return 1
    else:
        return 0



def get_predictions(Y_proba, threshold):
    Y_pred_final = []
    for p in Y_proba:
        if p >= threshold:
            Y_pred_final.append(1.0)
        else:
            Y_pred_final.append(0.0)

    return np.array(Y_pred_final)

def LR_fit(x_train, y_train, max_iter):

  # normalizing the data
  x_train_final = StandardScaler().fit_transform(x_train)
  # init model + fit
  LR = LogisticRegression(max_iter=max_iter)
  LR.fit(x_train_final, y_train)

  return LR


def LR_predict(model,x_test):
  # Get probability for each class
  x_test_final = StandardScaler().fit_transform(x_test)
  y_proba = model.predict_proba(x_test_final)[:, 1]
  # convert probabilities into predictions based on the threshold
  y_pred = get_predictions(y_proba, threshold=0.5)

  return y_pred,y_proba

def LSTM_predict(model, x_test):
  # Get probability for each class
  y_proba = model.predict(x_test)
  # convert probabilities into predictions based on the threshold
  y_pred = get_predictions(y_proba, threshold=0.5)

  return y_pred,y_proba


def print_final_results(y_test,folds_TPR,folds_FPR,folds_ACC,folds_Precision,folds_F1_score,folds_AUC):
  print('\nFinal Results: {} samples'.format(y_test.shape[0]))
  print('TPR:', round(np.mean(folds_TPR),4))
  print('FPR:', round(np.mean(folds_FPR),4))
  print('ACC:', round(np.mean(folds_ACC),4))
  print('Precision:', round(np.mean(folds_Precision),4))
  print('F1_score:', round(np.mean(folds_F1_score),4))
  print('AUC:', round(np.mean(folds_AUC),4))



# Main



## Load & clean data

In [16]:
benign_path = '../Cuckoo_Parser/Parsed_Data/win10/benign_parquet_updated'
malware_path = '../Cuckoo_Parser/Parsed_Data/win10/malware_parquet_updated'

# all_data = load_raw_parquets(benign_path, malware_path)
# all_data.to_pickle('data_for_LSTM.pkl')
all_data = pd.read_pickle('/content/drive/My Drive/Thesis/RNN_LSTM/data_for_LSTM.pkl')

# Clean data (remove short sequence & remove duplications)
sequence_MIN_length = 15 # (Pascano, 2015)
all_data = data_cleaning(sequence_MIN_length,all_data)

# use file from other experiments
dataset_name = '1_Balanced_Frequencies_first1000sec_minAPI_15_minRunTime_0_withYara_No.csv'
folder_path = '/content/drive/My Drive/Thesis/RNN_LSTM/balaned_datasets'
full_path = '{}/{}'.format(folder_path,dataset_name)
all_data = use_spesific_files(full_path,all_data)

API_corpus = create_API_corpus(all_data)

all_data_distribuation = get_classes_distribution_df(all_data)
print(all_data_distribuation)

Remove files with short sequence
files before: 7156
files after: 6166
files before: 6166
files after: 4531
API_corpus: 303
Benign:2277 (50.25%) | Malicious:2254 (49.75%)


## strat cross-validation

In [17]:
### --- set experiment parameters --- ###

# prepare data for representation model (Optional)
LSTM_train_seq_MAX_length = 50 # (Athiwaratkun, 2017)
LSTM_train_seq_MAX_length = 100 # (Pascanu, 2015)

# LSTM learning params
output_dim = len(API_corpus)
input_seq_size = len(API_corpus)
input_time_dim = None
nb_units = 1500 # (pascanu 2015, Agrawal 2018, Athiwaratkun 2017)
nb_units = 100 # (Zhang 2020)
dropout = 0.1
epochs = 5
nb_folds = 10

# Classification params
classification_seq_MAX_length = 200 #(Agrawal 2018 , Athiwaratkun 2017)
one_hot_encoding = True


data_sequences = all_data['API_sequence']
data_labels = all_data['class']


folds_TPR, folds_FPR, folds_ACC, folds_F1_score, folds_Precision, folds_AUC = [], [], [], [], [], []
kfold = StratifiedKFold(n_splits=nb_folds, shuffle=True, random_state=10)
fold = 0
for train, test in kfold.split(data_sequences, data_labels):
  fold += 1

  print(' ------------- Fold {} -------------'.format(fold))

  x_train, y_train = data_sequences.iloc[train], data_labels.iloc[train]
  x_test, y_test = data_sequences.iloc[test], data_labels.iloc[test]
  
  x_train = x_train.reset_index(drop=True)
  y_train = y_train.reset_index(drop=True)
  x_test = x_test.reset_index(drop=True)
  y_test = y_test.reset_index(drop=True)

  ### --------------- Phase 1: Representation Learning ------------------- ##

  # prepare data for LSTM training with the final label (0/1)
  x_train_rep,y_train_rep = prepare_data_finalLabel(x_train,y_train,LSTM_train_seq_MAX_length)
  x_test_rep, y_test_rep = prepare_data_finalLabel(x_test, y_test,LSTM_train_seq_MAX_length)




  # create the LSTM model
  LSTM_model = LSTM_finalLabel_arch_1(input_seq_size,
                                      input_time_dim,
                                      nb_units,
                                      dropout)

  # train model on training data
  LSTM_model.fit(x_train_rep,y_train_rep, epochs=epochs)

  
  # mid-process evaluation of the representation learning model
  evaluate_LSTM_finalLabel(LSTM_model,x_test_rep,y_test_rep)

  # save the model
  # model_2.save('./LSTM_models/arch_1_LSTM_finalLabel.hdf5')

  ### --------------- Phase 2: Classification --------------------------- ##

  # create feature vector - feature extraction
  x_train_final, y_train_final = final_feature_vector(LSTM_model,
                                       x_train,
                                       y_train,
                                       one_hot_encoding,
                                       classification_seq_MAX_length)
  
  # create feature vector - feature extraction
  x_test_final, y_test_final = final_feature_vector(LSTM_model,
                                       x_test,
                                       y_test,
                                       one_hot_encoding,
                                       classification_seq_MAX_length)

  # train classifier
  LR_classifier = LR_fit(x_train_final, y_train_final, max_iter=1000)

  # predict 
  y_preds,y_proba = LR_predict(LR_classifier,x_test_final)

  # evaluate classifier
  conf_matrix = confusion_matrix(y_test_final, y_preds)
  # calc all the relevant metrics
  metrics = calc_metrics(conf_matrix, y_proba, y_test_final)
  metrics_print = '\tTPR:{}\tFPR:{}\tACC:{}\tPrecision:{}\tF1_score:{}\tAUC:{}'.format(metrics['TPR'],
                                                                                        metrics['FPR'],
                                                                                        metrics['ACC'],
                                                                                        metrics['Precision'],
                                                                                        metrics['F1_score'],
                                                                                        metrics['AUC'])

  print('[Fold {}/{}] -\t{}:\t{} '.format(fold, nb_folds, 'LR', metrics_print))
  folds_TPR.append(metrics['TPR'])
  folds_FPR.append(metrics['FPR'])
  folds_ACC.append(metrics['ACC'])
  folds_Precision.append(metrics['Precision'])
  folds_F1_score.append(metrics['F1_score'])
  folds_AUC.append(metrics['AUC'])



print_final_results(y_train_final,folds_TPR,folds_FPR,folds_ACC,folds_Precision,folds_F1_score,folds_AUC)


 ------------- Fold 1 -------------
Bulding LSTM model - final Label (arch 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM representation accuracy: 0.8767
[Fold 1/10] -	LR:		TPR:0.87	FPR:0.09	ACC:0.89	Precision:0.91	F1_score:0.89	AUC:0.9555 
 ------------- Fold 2 -------------
Bulding LSTM model - final Label (arch 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM representation accuracy: 0.9139
[Fold 2/10] -	LR:		TPR:0.92	FPR:0.09	ACC:0.91	Precision:0.91	F1_score:0.91	AUC:0.9499 
 ------------- Fold 3 -------------
Bulding LSTM model - final Label (arch 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM representation accuracy: 0.9051
[Fold 3/10] -	LR:		TPR:0.91	FPR:0.07	ACC:0.92	Precision:0.93	F1_score:0.92	AUC:0.9668 
 ------------- Fold 4 -------------
Bulding LSTM model - final Label (arch 1)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM representation accuracy: 0.8653
[Fold 4/10] -	LR:		TPR:0.85	FPR:0.07	ACC:0.89	Precision:0.92	F1_score:0.89	AUC:

In [20]:
# save results

columns = ['dataset', 'classes_distribution','analysis_time',
           'num_of_features','model','representation_model',
           'representation_model_conf','representation_Max_seq','classification_Max_seq','classifier','classifier_conf',
           'TPR','FPR','ACC','Precision','F1_score','AUC']


result_df = pd.DataFrame(columns=columns)

new_record = {
    'dataset':dataset_name,
    'classes_distribution':all_data_distribuation,
    'analysis_time':1000,
    'num_of_features':2*nb_units,
    'model':'LSTM_finalLabel_MaxPooling_LR',
    'representation_model':'LSTM_finalLabel',
    'representation_model_conf':LSTM_model.get_config(),
    'representation_Max_seq':LSTM_train_seq_MAX_length,
    'classification_Max_seq':classification_seq_MAX_length,
    'classifier':'LogisticRegression',
    'classifier_conf':LR_classifier.get_params(),
    'TPR':np.mean(folds_TPR),
    'FPR':np.mean(folds_FPR),
    'ACC': np.mean(folds_ACC),
    'Precision':np.mean(folds_Precision),
    'F1_score':np.mean(folds_F1_score),
    'AUC':np.mean(folds_AUC)
}

result_df = result_df.append(new_record, ignore_index=True)
result_df

Unnamed: 0,dataset,classes_distribution,analysis_time,num_of_features,model,representation_model,representation_model_conf,representation_Max_seq,classification_Max_seq,classifier,classifier_conf,TPR,FPR,ACC,Precision,F1_score,AUC
0,1_Balanced_Frequencies_first1000sec_minAPI_15_...,Benign:2277 (50.25%) | Malicious:2254 (49.75%),1000,200,LSTM_finalLabel_MaxPooling_LR,LSTM_finalLabel,"{'name': 'sequential_46', 'layers': [{'class_n...",100,200,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.878,0.082,0.898,0.912,0.897,0.94951


In [0]:
result_df.to_csv('/content/drive/My Drive/Thesis/RNN_LSTM/LSTM_finalLabel_MaxPooling_LR_level_0_results')