# Сессия 3
## Команда Cerebro
### Участники: Власов Глеб, Калентьев Леон, Арастумян Александр


In [22]:
import pandas as pd
import tensorflow as tf
import numpy as np
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, average_precision_score, roc_auc_score


data_train = pd.read_csv('RESULT/train_ord_encode.csv')
data_test = pd.read_csv('RESULT/test_ord_encode.csv')

## Подготовка данных к передаче в нейросеть

In [2]:
def parse_sample(sample):
    record_defaults = [0] * 64
    parsed = tf.io.decode_csv(records=sample, record_defaults=record_defaults)
    x = tf.stack(parsed[0:63])
    y = tf.stack(parsed[63])
    
    return x, y

def data_processing(filename, batch_size, epochs, has_header=True):
    if has_header:
        dataset = tf.data.TextLineDataset(filename).skip(1)
    else:
        dataset = tf.data.TextLineDataset(filename)
    
    dataset = dataset.repeat(epochs)
    dataset = dataset.map(parse_sample)
    dataset = dataset.batch(batch_size)
    
    return dataset

In [3]:
batch_size = 32
epochs = 20

train_set = data_processing('train_ord_encode.csv', batch_size, epochs)
val_set = data_processing('val_ord_encode.csv', batch_size, epochs)
test_set = data_processing('test_ord_encode.csv', batch_size, epochs)

2021-11-27 10:15:40.924647: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-27 10:15:40.950348: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-27 10:15:40.950644: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-27 10:15:40.951996: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-27 10:15:40.952564: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read f

## Построение GRU модели
Первым делом было принято решения реализовать GRU модель.
GRU - это улучшенная версия стандартной RNN, которая решает пробелую затухающего градиента.

In [None]:
model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(input_dim=41, output_dim=128, input_length=63),
        tf.keras.layers.Conv1D(filters=128, kernel_size=3, strides=1 ,padding='same', activation='relu'),
        tf.keras.layers.MaxPool1D(pool_size=2, padding='same'),
        tf.keras.layers.GRU(units=128),
        tf.keras.layers.Dense(256, activation='tanh'),
        tf.keras.layers.Dense(1, activation='sigmoid')
        ])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

checkpoint_path = os.path.join(r'./checkpoint')

checkpoint = tf.keras.callbacks.ModelCheckpoint(
                        filepath=checkpoint_path, 
                        monitor='val_loss', 
                        verbose=0, 
                        save_best_only=True, 
                        save_weights_only=False, 
                        mode='min', 
                        save_freq='epoch')

In [110]:
model.fit(train_set, epochs=20, callbacks=checkpoint, 
            validation_data=val_set, steps_per_epoch = 124800 // 32, 
            validation_steps = 35000 // 32) 

Epoch 1/20




INFO:tensorflow:Assets written to: ./checkpoint/assets


INFO:tensorflow:Assets written to: ./checkpoint/assets


Epoch 2/20




INFO:tensorflow:Assets written to: ./checkpoint/assets


INFO:tensorflow:Assets written to: ./checkpoint/assets


Epoch 3/20




INFO:tensorflow:Assets written to: ./checkpoint/assets


INFO:tensorflow:Assets written to: ./checkpoint/assets


Epoch 4/20
Epoch 5/20
Epoch 6/20




INFO:tensorflow:Assets written to: ./checkpoint/assets


INFO:tensorflow:Assets written to: ./checkpoint/assets


Epoch 7/20




INFO:tensorflow:Assets written to: ./checkpoint/assets


INFO:tensorflow:Assets written to: ./checkpoint/assets


Epoch 8/20
Epoch 9/20




INFO:tensorflow:Assets written to: ./checkpoint/assets


INFO:tensorflow:Assets written to: ./checkpoint/assets


Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20




INFO:tensorflow:Assets written to: ./checkpoint/assets


INFO:tensorflow:Assets written to: ./checkpoint/assets


Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f858c68b400>

In [23]:
# 
# 
# 
# 
# Функции из предыдущей сессии, будут использованы для проверки качества работы обученных весов 
# 
# 
# 
# 


dictinary = {'0':0, '1':1, '2':2, '3':3, '4':4, '5':5, '6':6,
	         '7':7, '8':8, '9':9, 'a':10, 'b':11, 'c':12, 'd':13, 
	         'e':14, 'f':15, 'g':16, 'h':17, 'i':18, 'j':19, 'k':20, 
	         'l':21, 'm':22, 'n':23, 'o':24, 'p':25, 'q':26, 'r':27,
	         's':28, 't':29, 'u':30, 'v':31, 'w':32, 'x':33, 'y':34,
	         'z':35, '!':36, '-':37, '.':38, '_':39, 'S':40}

def _pad(domain, max_length=63):
    if len(domain) < max_length:
        domain = '!' * (max_length - len(domain)) + domain
    
    return domain

# Функция по преобразованию домена 
def pad_domain(data, pad_fn=_pad):
    data['domain'] = data['domain'].map(pad_fn)
#     data.to_csv('data_bin_pad.csv', index=False)
    
    return data


def _split_domain(domain, sep='?'):
    result = []
    
    for ch in domain:
        result.append(ch)
        
    result = list(map(lambda x: x.lower(), result))
                  
    return sep.join(result)


def split_domain(data, split_fn=_split_domain, sep='?'):
    data = pd.concat([pd.DataFrame(data['domain'].map(split_fn).values),
                      data['subclass']], axis=1)
                  
    data.columns = ['domain', 'subclass']
                  
    cols = ['domain%d' %d for d in range(0, 63)]
    data[cols] = data['domain'].str.split(sep, expand=True)
    data = data[cols + ['subclass']]
#     data.to_csv('model_test/data_split.csv', index=False)
                  
    return data


def id_encoding(data, dictionary, embeding_cols):
    data[embeding_cols] = data[embeding_cols]\
    .apply(lambda x: x.map(dictionary))
    data = data[embeding_cols + ['subclass']]
#     data.to_csv('bin_data\./bin_data_encode.csv', index=False)
    
    return data


def resd(model_path, x, threshold=0.5):
	model = tf.keras.models.load_model(model_path)
	y_pred_value = model.predict(x=x)
	y_pred = np.where(y_pred_value > threshold, 1, 0)

	return y_pred


def resd_csv(model_path, x, y_label, threshold=0.5):
    result = {}
    model = tf.keras.models.load_model(model_path)
    y_pred_value = model.predict(x=x)
    y_pred = np.where(y_pred_value > threshold, 1, 0)
    result['cm'] = confusion_matrix(y_label, y_pred)
    result['f1_score'] = f1_score(y_label, y_pred)
    result['precision_score'] = precision_score(y_label, y_pred)
    result['recall_score'] = precision_score(y_label, y_pred)
    result['average_precision_score'] = average_precision_score(y_label, y_pred)
    result['roc_auc_score'] = roc_auc_score(y_label, y_pred)
    result['y_pred'] = y_pred
    return result


def prepair_test_data(x, model='gru') -> pd.DataFrame:
    return helped(pd.read_csv(x) if '.csv' in x else x, model=model)


def helped(data: str or pd.DataFrame, model='gru') -> pd.DataFrame:
    if isinstance(data, str):
        data_temp = re.sub(r'\.[a-z]*','',data)
        temp =  split_domain(pad_domain((pd.DataFrame({'domain': [data_temp], 'subclass': [0]}))))
        em_cols = ['domain%d' %d for d in range(0, 63)]
        temp = id_encoding(temp, dictinary, em_cols)
        x = temp.iloc[:, 0:64].values.astype(np.float32)
        
        if model == 'gru':
            res = resd(r'./checkpoint', x)[0][0]
        elif model == 'lstm':
            res = resd(r'./checkpointLSTM', x)[0][0]
        
        return pd.DataFrame({'domain': [data], 'subclass': ['dga' if res else 'legit']})
    
    elif isinstance(data, pd.DataFrame):
        #data['subclass'] = pd.Series(1, index=data.index)
        data['domain'] = data['domain']\
        .apply(lambda x: re.sub(r'\.[a-z]*','',x))
        data_temp = split_domain(pad_domain(data))
        em_cols = ['domain%d' %d for d in range(0, 63)]
        data_temp = id_encoding(data_temp, dictinary, em_cols)
        x = data_temp.iloc[:, 0:64].values.astype(np.float32)
        y = data_temp.iloc[:, -1].values
        
        if model == 'gru':
            result = resd_csv(r'./checkpoint', x, y)
        elif model =='lstm':
            result = resd_csv(r'./checkpointLSTM', x, y)
        
        for (x, y) in result.items():
            print((x, y))
            
        return result
    
    raise NotImplementedError(
        print("Неправильный ввод"),
    )

## Вывод различных метрик:

#### F1 SCORE
Объединяет информацию о точности и полноте. Стремится к нулю, если точность или полнота стремятся к нулю. Уменьшается одинаково при уменьшении уменьшении и точности и полноты.

#### ROC AUC
Метрика для задач бинарной классификации. Значение площади под графиком, в котором y - это ture positive rate, x - false positive rate из матрицы ошибок.

In [148]:
result = prepair_test_data('testQ1.csv')







('cm', array([[980422,  19578],
       [127140, 674527]]))
('f1_score', 0.9019115212746327)
('precision_score', 0.9717938928548274)
('recall_score', 0.9717938928548274)
('average_precision_score', 0.8882406780150595)
('roc_auc_score', 0.9109137356745383)
('y_pred', array([[0],
       [1],
       [1],
       ...,
       [0],
       [0],
       [1]]))


In [9]:
def model_eval(model_path, x, y_label, threshold=0.5):
	result = {}
	model = tf.keras.models.load_model(model_path)
	y_pred_value = model.predict(x=x)
	y_pred = np.where(y_pred_value > threshold, 1, 0)
	result['cm'] = confusion_matrix(y_label, y_pred)
	result['f1_score'] = f1_score(y_label, y_pred)
	result['precision_score'] = precision_score(y_label, y_pred)
	result['recall_score'] = precision_score(y_label, y_pred)
	result['average_precision_score'] = average_precision_score(y_label, y_pred)
	result['roc_auc_score'] = roc_auc_score(y_label, y_pred)

	return result

In [130]:
test = pd.read_csv('test2.csv')

In [8]:
testset = pd.read_csv('test_ord_encode.csv')
x = testset.iloc[:, 0:64].values.astype(np.float32)
y = testset.iloc[:, -1].values
result = model_eval(r'./checkpoint', x, y)

for (x, y) in result.items():
    print((x, y))

In [154]:
prepair_test_data('habr.ru')





Unnamed: 0,domain,subclass
0,habr.ru,legit


### Построение LSTM модели

LSTM - тип RNN с ячейкой памяти, идеально подходит для данной проблемы, т.к. мы передаем последовательность символов, которую необходимо помнить.

In [5]:
modelLSTM = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(input_dim=41, output_dim=128, input_length=63),
        tf.keras.layers.Conv1D(filters=128, kernel_size=3, strides=1 ,padding='same', activation='relu'),
        tf.keras.layers.MaxPool1D(pool_size=2, padding='same'),
        tf.keras.layers.LSTM(64),
        tf.keras.layers.Dropout(0.5),
#         tf.keras.layers.Dense(256, activation='tanh'),
        tf.keras.layers.Dense(1, activation='sigmoid')
        ])

modelLSTM.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

modelLSTM.summary()

checkpointLSTM_path = os.path.join(r'./checkpointLSTM')

checkpointLSTM = tf.keras.callbacks.ModelCheckpoint(
                        filepath=checkpointLSTM_path, 
                        monitor='val_loss', 
                        verbose=0, 
                        save_best_only=True, 
                        save_weights_only=False, 
                        mode='min', 
                        save_freq='epoch')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 63, 128)           5248      
_________________________________________________________________
conv1d (Conv1D)              (None, 63, 128)           49280     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 32, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 104,001
Trainable params: 104,001
Non-trainable params: 0
__________________________________________________

In [6]:
modelLSTM.fit(train_set, epochs=20, callbacks=checkpointLSTM, 
            validation_data=val_set, steps_per_epoch = 124800 // 45, 
            validation_steps = 32000 // 45) 

2021-11-27 10:16:06.507254: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20


2021-11-27 10:16:07.992943: I tensorflow/stream_executor/cuda/cuda_dnn.cc:381] Loaded cuDNN version 8300




2021-11-27 10:16:19.976047: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./checkpointLSTM/assets


INFO:tensorflow:Assets written to: ./checkpointLSTM/assets


Epoch 2/20




INFO:tensorflow:Assets written to: ./checkpointLSTM/assets


INFO:tensorflow:Assets written to: ./checkpointLSTM/assets


Epoch 3/20




INFO:tensorflow:Assets written to: ./checkpointLSTM/assets


INFO:tensorflow:Assets written to: ./checkpointLSTM/assets


Epoch 4/20
Epoch 5/20




INFO:tensorflow:Assets written to: ./checkpointLSTM/assets


INFO:tensorflow:Assets written to: ./checkpointLSTM/assets


Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20






<keras.callbacks.History at 0x7fc4fc2b0190>

In [11]:
testset = pd.read_csv('test_ord_encode.csv')
x = testset.iloc[:, 0:64].values.astype(np.float32)
y = testset.iloc[:, -1].values
result = model_eval(r'./checkpointLSTM', x, y)

for (x, y) in result.items():
    print((x, y))





('cm', array([[8153,  292],
       [ 563, 6243]]))
('f1_score', 0.9359118506858557)
('precision_score', 0.9553175210405509)
('recall_score', 0.9553175210405509)
('average_precision_score', 0.9132081897954123)
('roc_auc_score', 0.9413510994982834)


In [25]:
prepair_test_data('google.com', model='lstm')





Unnamed: 0,domain,subclass
0,google.com,legit
