<a href="https://colab.research.google.com/github/idoFinder/RNN_LSTM_API_CALLS/blob/master/RNN_LSTM_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from keras.models import Sequential, load_model ,Model
from keras.layers import Dense, LSTM,Embedding,GlobalMaxPooling1D,Activation
from keras.utils import to_categorical
from keras.optimizers import SGD
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score, confusion_matrix, accuracy_score, recall_score,plot_confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing, metrics
from itertools import groupby
import matplotlib.pyplot as plt 
import tensorflow as tf
import numpy as np
import pandas as pd
import random
import os

In [11]:
# Colab line magic requesting the TensorFlow 2.x runtime.
# NOTE(review): TensorFlow is already imported in the first cell; presumably this magic
# should run before that import — confirm the intended cell order.
%tensorflow_version 2.x
import tensorflow as tf
# Ask TensorFlow which GPU device it reports.
device_name = tf.test.gpu_device_name()
# Abort early when the reported device is not '/device:GPU:0'.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [4]:
# Mount Google Drive at /content/drive; the pickled dataset and the balanced-dataset CSV
# used in the "Main" section are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Data

## Read raw data

In [0]:
def _load_class_batches(batches_dir, class_label):
  """Read every parquet batch in `batches_dir` into per-file records labelled `class_label`."""
  records = []
  # os.listdir order is arbitrary; sorting makes the batch (and row) order reproducible
  for idx, batch_name in enumerate(sorted(os.listdir(batches_dir))):
    batch = pd.read_parquet(os.path.join(batches_dir, batch_name))
    # a single grouped pass over the batch; sort=False keeps files in order of first appearance
    for file, sub_df in batch.groupby('file_name', sort=False):
      ordered = sub_df.sort_values(['normalized_time'])
      records.append({'file_name': file, 'API_sequence': ordered['API'].values, 'class': class_label})
    print('batch',idx+1, ' loaded')
  return records


def load_raw_parquets(benign_path, malware_path):
  """Build one row per file from the raw benign and malicious parquet batches.

  Each batch is expected to provide the columns 'file_name', 'API' and
  'normalized_time'. For every file, its API calls are ordered by
  'normalized_time' and stored as a single array.

  Args:
    benign_path: directory holding the benign parquet batches.
    malware_path: directory holding the malicious parquet batches.

  Returns:
    DataFrame with columns ['file_name', 'API_sequence', 'class'], where
    'class' is 'benign' or 'malicious'. Benign rows come first.
  """
  # collect plain records first and build the frame once, instead of appending
  # to a DataFrame inside the loop
  records = _load_class_batches(benign_path, 'benign')
  records += _load_class_batches(malware_path, 'malicious')
  return pd.DataFrame(records, columns=['file_name','API_sequence','class'])

## Data cleaning + creating API corpus


In [0]:
def remove_repitations(sequence):
    """Collapse every run of consecutive equal items in `sequence` into one item."""
    collapsed = []
    for item, _run in groupby(sequence):
        collapsed.append(item)
    return collapsed


def data_cleaning(sequence_MIN_length,all_data):
  """De-duplicate consecutive API calls and drop files whose sequence is too short.

  Args:
    sequence_MIN_length: minimum number of API calls (after removing
      consecutive repetitions) a file needs in order to be kept.
    all_data: DataFrame with at least 'file_name' and 'API_sequence' columns.
      It is not modified.

  Returns:
    A new DataFrame holding the cleaned sequences of the surviving files.
  """
  # remove repitation in all data (assign returns a new frame, the caller's stays untouched)
  cleaned = all_data.assign(API_sequence=all_data['API_sequence'].map(remove_repitations))

  # remove sequences shorter then sequence_MIN_length
  print('Remove files with short sequence')
  print('files before:',cleaned.shape[0])
  lengths = cleaned['API_sequence'].map(len)
  # a file name is judged by its first row only; every row carrying a rejected name is dropped
  is_first_row = ~cleaned['file_name'].duplicated()
  to_drop = cleaned.loc[is_first_row & (lengths < sequence_MIN_length), 'file_name']
  cleaned = cleaned[~cleaned['file_name'].isin(to_drop)]
  print('files after:',cleaned.shape[0])
  return cleaned


def use_spesific_files(dataset_path,all_data):
  """Keep only the rows of `all_data` whose 'file_name' is listed in the CSV at `dataset_path`.

  The CSV must contain a 'file_name' column. Row counts before and after the
  selection are printed.
  """
  listing = pd.read_csv(dataset_path)
  print('files before:',all_data.shape[0])
  keep_mask = all_data['file_name'].isin(listing['file_name'].values)
  selected = all_data[keep_mask]
  print('files after:',selected.shape[0])
  return selected

# create API calls corpus
def create_API_corpus(all_data):
  """Return the list of distinct API calls found in all_data['API_sequence'].

  The list is sorted so that the position of every API call (which the
  one-hot encoding uses as its index) is the same on every run; a bare
  list(set(...)) gives no such ordering guarantee.

  Args:
    all_data: DataFrame with an 'API_sequence' column of iterables.

  Returns:
    Sorted list of the unique API calls.
  """
  unique_apis = set()
  # one pass over the column instead of re-filtering the frame for every file
  for seq in all_data['API_sequence']:
    unique_apis.update(seq)
  # key=str keeps the sort well-defined even if the values are not all the same type
  API_corpus = sorted(unique_apis, key=str)
  print('API_corpus:',len(API_corpus))
  return API_corpus

# Representation Learning Model

## Functions: one-hot-encoding


In [0]:
def convert_API_to_vector(Unique_API_calls,API):
    """One-hot encode `API` against `Unique_API_calls`.

    Returns a list of float32 values as long as `Unique_API_calls`. The slot at
    the position of `API` is 1.0; when `API` equals 0 every slot stays 0.
    """
    one_hot = np.zeros((len(Unique_API_calls),), dtype=np.float32)
    if API != 0:
        position = Unique_API_calls.index(API)
        one_hot[position] = 1.0
    return list(one_hot)


def create_sequence_n_nextWord_one_hot(data):
    """One-hot encode every sequence in `data` and split off its last element.

    Uses the module-level API_corpus for the encoding. Returns two arrays:
    the encoded sequences without their last step, and the encoded last step
    of every sequence.
    """
    encoded = [
        [convert_API_to_vector(API_corpus, api) for api in data.iloc[position]]
        for position in range(len(data))
    ]

    # everything but the last step is the input, the last step is the target
    sequences = [steps[:-1] for steps in encoded]
    next_words = [steps[len(steps)-1] for steps in encoded]

    return np.array(sequences),np.array(next_words)


def create_sequence_one_hot(data):
    """One-hot encode every sequence in `data` using the module-level API_corpus.

    Returns the encoded sequences as a single numpy array.
    """
    vectorized_data = []
    for position in range(len(data)):
        api_calls = data.iloc[position]
        vectorized_data.append([convert_API_to_vector(API_corpus, call) for call in api_calls])

    return np.array(vectorized_data)


def pruning_n_padding(seq,sequence_MAX_length):
    """Force `seq` to exactly `sequence_MAX_length` items.

    Shorter sequences are left-padded with 0, longer ones keep only their
    first `sequence_MAX_length` items.

    Args:
        seq: sized, sliceable sequence of API calls.
        sequence_MAX_length: target length.

    Returns:
        A new list of length `sequence_MAX_length`, whatever the type of `seq`.
    """
    gap = sequence_MAX_length - len(seq)
    if gap > 0:
        # padding goes in front, so the real calls sit at the end of the sequence
        return [0] * gap + list(seq)
    # exact length or too long: keep the leading items
    return list(seq[:sequence_MAX_length])



## Predict next API call 

### Prepare the data

In [0]:
def prepare_data_nextWord(data,sequence_MAX_length):
  """Build the inputs and targets for the next-API-call model.

  Every sequence in `data` is padded/pruned to `sequence_MAX_length` and then
  one-hot encoded; the last step of each encoded sequence becomes its target.
  Returns (encoded sequences, encoded next calls).
  """
  def _fit_length(seq):
    return pruning_n_padding(seq,sequence_MAX_length)

  # pruning the sequences to match sequence_MAX_length (the last API is the label)
  fixed_length_seq = data.apply(_fit_length)

  # create sequences and nextWords
  encoded_pair = create_sequence_n_nextWord_one_hot(fixed_length_seq)
  train_vectorized_seq, train_vectorized_nextWord = encoded_pair

  print('train_vectorized_seq:',train_vectorized_seq.shape, ' train_vectorized_nextWord:',train_vectorized_nextWord.shape)

  return train_vectorized_seq, train_vectorized_nextWord

### Architecture 1:###
- Input: One-hot-encoding
- Train model with 2 LSTM layers + Dense using softmax
- Predict the next API call using the second layer last-cell (argmax)

**First stage - train LSTM**

In [0]:
# Hyper-parameters of the next-API-call LSTM
nb_units = 500
epochs = 10
input_time_dim = None                # first entry of the LSTM input_shape
input_seq_size = len(API_corpus)     # width of a one-hot step: one slot per API in the corpus
output_dim = len(API_corpus)         # size of the final Dense layer

In [0]:
# Seed value
seed_value= 0
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
# TensorFlow 2.x exposes tf.random.set_seed; tf.set_random_seed only exists in 1.x,
# so pick whichever the installed version provides.
set_tf_seed = getattr(getattr(tf, 'random', None), 'set_seed', None) or tf.set_random_seed
set_tf_seed(seed_value)

# build the model: two stacked LSTM layers followed by a softmax over the API corpus
model_1 = Sequential()
model_1.add(LSTM(units=nb_units, input_shape=(input_time_dim,input_seq_size),
                 activation='tanh',dropout=0.1 ,return_sequences=True))
# only the first layer of a Sequential model needs input_shape
model_1.add(LSTM(units=nb_units, activation='tanh',dropout=0.1,return_sequences=False))
model_1.add(Dense(output_dim,activation='softmax'))
model_1.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_21 (LSTM)               (None, None, 500)         1608000   
_________________________________________________________________
lstm_22 (LSTM)               (None, 500)               2002000   
_________________________________________________________________
dense_11 (Dense)             (None, 303)               151803    
Total params: 3,761,803
Trainable params: 3,761,803
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train the next-API-call model for `epochs` epochs.
# NOTE(review): train_vectorized_seq / train_vectorized_nextWord are not assigned in any
# visible cell; presumably they come from prepare_data_nextWord(...) — confirm and add that call.
model_1.fit(train_vectorized_seq,train_vectorized_nextWord, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
 928/4932 [====>.........................] - ETA: 22:33 - loss: 3.4824 - acc: 0.1670

KeyboardInterrupt: 

**Evaluate the LSTM training**

In [0]:
# Decode the predicted and the true next API call of every test sequence:
# the arg-max position of each vector is looked up in API_corpus.
predictions = model_1.predict(test_vectorized_seq)
decoded_preds = [
    API_corpus[np.argmax(predictions[row])]
    for row in range(predictions.shape[0])
]

decoded_true = [
    API_corpus[np.argmax(test_vectorized_nextWord[row])]
    for row in range(test_vectorized_nextWord.shape[0])
]

# distinct API calls the model predicted at least once
set(decoded_preds)

In [0]:
# share of test sequences whose decoded prediction equals the decoded true next call
correct = sum(
    1
    for position in range(len(decoded_preds))
    if decoded_preds[position] == decoded_true[position]
)

print('test accuracy:', round(correct/len(decoded_preds),3))

**Save Model**

In [0]:
# save the trained next-API-call model to disk
# NOTE(review): assumes the ./LSTM_models directory already exists — TODO confirm that
# save() creates it, otherwise create it before this call.
model_1.save('./LSTM_models/arch_1_LSTM_nextWord.hdf5')

### Architecture 2: (Optional)###
- Input: integers
- Train model with 2 LSTM layers + Dense using softmax
- Predict the next API call using the second layer last-cell (argmax)

### Architecture 3: (Optional)###
- Input: One-hot-encoding
- Train model with 1 LSTM layer (return_sequences=True) + Dense using softmax
- Feed the model with the next API for each input cell (offset of the APIs)
- Next, remove the last Dense layer and perform GlobalMaxPooling for the representation

## Predict final label (0/1)

### Prepare the data

In [0]:
def prepare_data_finalLabel(x_data, y_data,sequence_MAX_length):
  """Build the inputs and binary targets for the final-label LSTM.

  Every sequence in `x_data` is padded/pruned to `sequence_MAX_length` and
  one-hot encoded; every label in `y_data` becomes 1.0 for 'malicious' and
  0.0 otherwise. Returns (encoded sequences, float32 label array).
  """
  def _fit_length(seq):
    return pruning_n_padding(seq,sequence_MAX_length)

  def _as_binary(label):
    return 1 if label == 'malicious' else 0

  # pruning the sequences to match sequence_MAX_length
  fixed_length_seq = x_data.apply(_fit_length)

  # convert API sequence to one-hot-encoding vectors
  train_vectorized_seq = create_sequence_one_hot(fixed_length_seq)

  # convert class to 0 and 1
  binary_labels = y_data.apply(_as_binary).values
  rep_train_labels = np.array(binary_labels,dtype=np.float32)

  print('train_vectorized_seq:',train_vectorized_seq.shape, ' train_labels:',rep_train_labels.shape)

  return train_vectorized_seq, rep_train_labels


### Architecture 1:###
- Train model with 2 LSTM layers + Dense using sigmoid
- Predict the class using the second layer last-cell 
- Next, remove the second LSTM layer and perform GlobalMaxPooling for the representation

**First stage - train LSTM**

In [0]:
def LSTM_finalLabel_arch_1(input_seq_size,input_time_dim,nb_units,dropout):
  """Build and compile the two-layer LSTM that predicts the final binary label.

  Args:
    input_seq_size: width of one input step (second entry of input_shape).
    input_time_dim: number of time steps (first entry of input_shape).
    nb_units: units in each of the two LSTM layers.
    dropout: dropout value passed to both LSTM layers.

  Returns:
    The compiled, untrained Sequential model. Its summary is printed as a
    side effect, and the Python, NumPy and TensorFlow seeds are reset to 0.
  """
  # Seed value
  seed_value= 0
  os.environ['PYTHONHASHSEED']=str(seed_value)
  random.seed(seed_value)
  np.random.seed(seed_value)
  # TensorFlow 2.x exposes tf.random.set_seed; tf.set_random_seed only exists in 1.x
  set_tf_seed = getattr(getattr(tf, 'random', None), 'set_seed', None) or tf.set_random_seed
  set_tf_seed(seed_value)

  # build the model
  model_2 = Sequential()
  model_2.add(LSTM(units=nb_units, input_shape=(input_time_dim,input_seq_size),
                  activation='tanh',dropout=dropout ,return_sequences=True))
  # only the first layer of a Sequential model needs input_shape
  model_2.add(LSTM(units=nb_units, activation='tanh',dropout=dropout,return_sequences=False))
  model_2.add(Dense(1, activation='sigmoid'))
  model_2.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
  # summary() prints by itself; wrapping it in print() only adds a stray "None"
  model_2.summary()
  return model_2

**Evaluate the LSTM training**

In [0]:
def evaluate_LSTM(model,test_vectorized_seq,rep_test_labels):
  """Print and return the accuracy of `model` on a labelled test set.

  A prediction counts as class 1.0 when its score is above 0.5, else 0.0.

  Args:
    model: object with a predict(x) method returning one score per sample.
    test_vectorized_seq: input passed unchanged to model.predict.
    rep_test_labels: true labels (0/1), one per sample.

  Returns:
    The accuracy rounded to 4 decimals.

  Raises:
    ValueError: if there are no samples, or the number of predictions and
      labels differ.
  """
  scores = np.asarray(model.predict(test_vectorized_seq)).reshape(-1)
  true_labels = np.asarray(rep_test_labels).reshape(-1)

  if scores.size == 0:
    raise ValueError('evaluate_LSTM received no samples to evaluate')
  if scores.size != true_labels.size:
    raise ValueError('evaluate_LSTM got {} predictions but {} labels'.format(scores.size, true_labels.size))

  final_preds = (scores > 0.5).astype(np.float32)
  correct = int(np.count_nonzero(final_preds == true_labels))
  accuracy = round(correct/len(final_preds),4)
  print('accuracy:',accuracy)
  return accuracy

accuracy: 0.889


# Feature Extraction - from LSTM

## Prepare Data

In [0]:

def prepare_data_classification(data, sequence_MAX_length):
  """Pad or prune every sequence in `data` to `sequence_MAX_length` items."""
  def _fit_length(seq):
    return pruning_n_padding(seq,sequence_MAX_length)

  return data.apply(_fit_length)

## Extract features using LSTM

In [0]:
def feature_vector_maxPool_n_lastCell(trained_model, sequence_data):
  """Turn sequences into fixed-size feature vectors using a trained LSTM.

  The last two layers of `trained_model` are dropped and the remaining layers
  are run on `sequence_data`. For every sample the result is the max over the
  time axis of those outputs, concatenated with the output of the last step.

  Args:
    trained_model: trained Sequential model; its layers except the last two
      must emit one vector per time step.
    sequence_data: model input, as accepted by predict().

  Returns:
    float32 array with one row per sample: [max-pooled vector, last-step vector].
  """
  # Seed value
  seed_value= 0
  os.environ['PYTHONHASHSEED']=str(seed_value)
  random.seed(seed_value)
  np.random.seed(seed_value)
  # TensorFlow 2.x exposes tf.random.set_seed; tf.set_random_seed only exists in 1.x
  set_tf_seed = getattr(getattr(tf, 'random', None), 'set_seed', None) or tf.set_random_seed
  set_tf_seed(seed_value)

  # remove the last two layers: Dense + last LSTM
  new_model = Sequential()
  for layer in trained_model.layers[:-2]:
    new_model.add(layer)

  # one forward pass is enough: both parts of the representation come from the same outputs
  # assumes the outputs are (samples, time steps, units) — TODO confirm for other architectures
  preds = np.asarray(new_model.predict(sequence_data))

  # output of the last lstm cell of every sample
  last_cell_outputs = preds[:, -1, :]
  # max over the time axis, i.e. what a GlobalMaxPooling1D layer on top would return
  max_pooling_vectors = preds.max(axis=1)

  # Creating the representation vectors:
  # concatenating the max-pooling with the last output
  final_representation = np.concatenate([max_pooling_vectors, last_cell_outputs], axis=1)
  return final_representation.astype(np.float32)


In [0]:
def create_final_feature_vector(trained_model,rep_train_seq,one_hot_encoding):
  """One-hot encode `rep_train_seq` and extract its LSTM feature vectors.

  Args:
    trained_model: trained model handed to feature_vector_maxPool_n_lastCell.
    rep_train_seq: fixed-length API sequences.
    one_hot_encoding: must be truthy; one-hot is the only supported encoding.

  Returns:
    The feature vectors produced by feature_vector_maxPool_n_lastCell.

  Raises:
    NotImplementedError: if `one_hot_encoding` is falsy.
  """
  if not one_hot_encoding:
    # without this guard the function falls through to an unbound local variable
    raise NotImplementedError('create_final_feature_vector only supports one_hot_encoding=True')

  print('creating one-hot-encoding')
  # create sequences (one-hote encoding format) *we dont use the nextWord here
  train_vectorized_seq = create_sequence_one_hot(rep_train_seq)
  print('Creating feature vectors')
  train_feature_vector = feature_vector_maxPool_n_lastCell(trained_model, train_vectorized_seq)

  return train_feature_vector

# Classification

## Model Training

In [0]:
def data_preprocessing(x_data,y_data,scaler=None):
  """Convert class names to 0/1 and normalize the feature matrix.

  Args:
    x_data: feature matrix.
    y_data: pandas Series of class names; 'malicious' maps to 1, anything else to 0.
    scaler: optional, already fitted object with a transform(x) method. Pass
      the scaler fitted on the training features when preprocessing test
      features, so both are scaled with the same statistics. When omitted, a
      new StandardScaler is fitted on `x_data` itself.

  Returns:
    (normalized data, label array)
  """
  # convert class to 0 and 1
  labels = y_data.apply(lambda x: 1 if x == 'malicious' else 0).values
  # Normalizing the data
  if scaler is None:
    data = StandardScaler().fit_transform(x_data)
  else:
    data = scaler.transform(x_data)

  return data,labels



## Model Evaluation

### Functions: classifier evaluation


In [0]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


def calc_metrics(conf_matrix, Y_proba, Y_test):
    """Compute the evaluation metrics of a binary classifier.

    Args:
        conf_matrix: 2x2 confusion matrix indexed as [true class][predicted class].
        Y_proba: predicted scores for the positive class.
        Y_test: true binary labels.

    Returns:
        dict with 'TPR', 'FPR', 'ACC', 'Precision' and 'F1_score' rounded to
        2 decimals and 'AUC' rounded to 4. A ratio whose denominator is zero
        (e.g. precision when nothing was predicted positive) is reported as 0.0.
    """
    AUC = roc_auc_score(Y_test, Y_proba)

    TP = conf_matrix[1][1]
    TN = conf_matrix[0][0]
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]

    TPR = _safe_ratio(TP, TP + FN)
    FPR = _safe_ratio(FP, FP + TN)
    ACC = _safe_ratio(TP + TN, TP + TN + FP + FN)
    Precision = _safe_ratio(TP, TP + FP)
    F1_score = _safe_ratio(2 * Precision * TPR, Precision + TPR)

    metrics_dict = {
        'TPR': round(TPR, 2),
        'FPR': round(FPR, 2),
        'ACC': round(ACC, 2),
        'Precision': round(Precision, 2),
        'F1_score': round(F1_score, 2),
        'AUC': round(AUC, 4),
    }

    return metrics_dict


def convert_class_to_binary(label):
    """Map the class name 'malicious' to 1 and every other value to 0."""
    return 1 if label == 'malicious' else 0



def get_predictions(Y_proba, threshold):
    """Turn scores into 0/1 predictions: 1 where the score is at least `threshold`."""
    return [1 if score >= threshold else 0 for score in Y_proba]

### Evaluation

**CrossValidation**

In [0]:
# TODO: 

# Main



## Load & clean data

In [31]:
# Locations of the raw parquet batches; only needed when the dataset is rebuilt
# with load_raw_parquets (see the commented-out lines below).
benign_path = '../Cuckoo_Parser/Parsed_Data/win10/benign_parquet_updated'
malware_path = '../Cuckoo_Parser/Parsed_Data/win10/malware_parquet_updated'

# all_data = load_raw_parquets(benign_path, malware_path)
# all_data.to_pickle('data_for_LSTM.pkl')
# Load the previously pickled dataset from Google Drive.
# NOTE(review): unpickling can execute arbitrary code — only load files you created yourself.
all_data = pd.read_pickle('/content/drive/My Drive/Thesis/RNN_LSTM/data_for_LSTM.pkl')

# Clean data (remove short sequence & remove duplications)
sequence_MIN_length = 15 # (Pascanu, 2015)
all_data = data_cleaning(sequence_MIN_length,all_data)

# use file from other experiments: keep only the files listed in the balanced-dataset CSV
dataset_name = '1_Balanced_Frequencies_first1000sec_minAPI_15_minRunTime_0_withYara_No.csv'
folder_path = '/content/drive/My Drive/Thesis/RNN_LSTM/balaned_datasets'
full_path = '{}/{}'.format(folder_path,dataset_name)
all_data = use_spesific_files(full_path,all_data)


Remove files with short sequence
files before: 7156
files after: 6166
files before: 6166
files after: 4531


## Start cross-validation

In [0]:

# Stratified 5-fold cross-validation of the whole pipeline. In every fold:
#   1. train the final-label LSTM on the training sequences,
#   2. use it to extract a feature vector for every training and test file,
#   3. train a logistic regression on the training vectors and score it on the test vectors.
data_sequences = all_data['API_sequence']
data_labels = all_data['class']

# The one-hot helpers read the module-level API_corpus, so it has to exist before the loop.
API_corpus = create_API_corpus(all_data)

# LSTM parameters
output_dim = len(API_corpus)
input_seq_size = len(API_corpus)
input_time_dim = None
nb_units = 100
dropout = 0.1
epochs = 5

# sequence lengths: one for training the LSTM, one for the feature extraction
lstm_sequence_MAX_length = 100 # (Pascanu, 2015); (Athiwaratkun, 2017) use 50
feature_sequence_MAX_length = 200 #(Agrawal 2018 , Athiwaratkun 2017)
one_hot_encoding = True

n_splits = 5
folds_TPR, folds_FPR, folds_ACC, folds_F1_score, folds_Precision, folds_AUC = [], [], [], [], [], []
evaluated_samples = 0
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=10)
fold = 0
for train, test in kfold.split(data_sequences, data_labels):
    fold += 1

    # split() yields positions, and all_data was filtered (its index has gaps),
    # so the rows must be selected by position, not by label
    X_train, Y_train = data_sequences.iloc[train], data_labels.iloc[train]
    X_test, Y_test = data_sequences.iloc[test], data_labels.iloc[test]

    # prepare data for representation model
    train_vectorized_seq, rep_train_labels = prepare_data_finalLabel(X_train, Y_train, lstm_sequence_MAX_length)
    test_vectorized_seq, rep_test_labels = prepare_data_finalLabel(X_test, Y_test, lstm_sequence_MAX_length)

    # Init LSTM model
    LSTM_model = LSTM_finalLabel_arch_1(input_seq_size,input_time_dim,nb_units,dropout)

    # train model on training data
    LSTM_model.fit(train_vectorized_seq,rep_train_labels, epochs=epochs)

    # mid-process evaluation of the representation learning model
    evaluate_LSTM(LSTM_model,test_vectorized_seq,rep_test_labels)

    # save the model
    # LSTM_model.save('./LSTM_models/arch_1_LSTM_finalLabel.hdf5')

    # create feature vectors - feature extraction for both parts of the fold
    rep_train_seq = prepare_data_classification(X_train, feature_sequence_MAX_length)
    rep_test_seq = prepare_data_classification(X_test, feature_sequence_MAX_length)

    train_feature_vector = create_final_feature_vector(LSTM_model,rep_train_seq,one_hot_encoding)
    test_feature_vector = create_final_feature_vector(LSTM_model,rep_test_seq,one_hot_encoding)

    # normalize: fit the scaler on the training vectors only and reuse it for the test vectors
    scaler = StandardScaler().fit(train_feature_vector)
    train_feature_vector_norm = scaler.transform(train_feature_vector)
    test_feature_vector_norm = scaler.transform(test_feature_vector)

    # integer 0/1 labels, so the true labels and the predictions share one type
    train_labels = rep_train_labels.astype(int)
    test_labels = rep_test_labels.astype(int)

    # train classifier
    LR = LogisticRegression(max_iter=1000)
    LR.fit(train_feature_vector_norm, train_labels)

    # predict
    # Get probability for each class
    Y_proba = LR.predict_proba(test_feature_vector_norm)[:, 1]
    # convert probabilities into predictions based on the threshold
    Y_pred = get_predictions(Y_proba, threshold=0.5)

    # labels=[0, 1] keeps the matrix 2x2 even if a class is missing from the predictions
    conf_matrix = confusion_matrix(test_labels, Y_pred, labels=[0, 1])
    # calc all the relevant metrics (named fold_metrics so the imported sklearn `metrics` module is not shadowed)
    fold_metrics = calc_metrics(conf_matrix, Y_proba, test_labels)
    metrics_print = '\tTPR:{}\tFPR:{}\tACC:{}\tPrecision:{}\tF1_score:{}\tAUC:{}'.format(fold_metrics['TPR'],
                                                                                         fold_metrics['FPR'],
                                                                                         fold_metrics['ACC'],
                                                                                         fold_metrics['Precision'],
                                                                                         fold_metrics['F1_score'],
                                                                                         fold_metrics['AUC'])

    print('[Fold {}/{}] -\t{}:\t{} '.format(fold, n_splits, 'LR', metrics_print))

    folds_TPR.append(fold_metrics['TPR'])
    folds_FPR.append(fold_metrics['FPR'])
    folds_ACC.append(fold_metrics['ACC'])
    folds_Precision.append(fold_metrics['Precision'])
    folds_F1_score.append(fold_metrics['F1_score'])
    folds_AUC.append(fold_metrics['AUC'])
    evaluated_samples += len(test_labels)

# print CV results (averages over the folds; the sample count covers all test folds)
print('\nFinal Results: {} samples'.format(evaluated_samples))
print('TPR:', round(np.mean(folds_TPR),4))
print('FPR:', round(np.mean(folds_FPR),4))
print('ACC:', round(np.mean(folds_ACC),4))
print('Precision:', round(np.mean(folds_Precision),4))
print('F1_score:', round(np.mean(folds_F1_score),4))
print('AUC:', round(np.mean(folds_AUC),4))