In [None]:
!pip install hmmlearn

Collecting hmmlearn
[?25l  Downloading https://files.pythonhosted.org/packages/ff/7b/33f629a443a0671161c019e55c3f1b511c7e9fdce5ab8c8c3c33470eb939/hmmlearn-0.2.3-cp36-cp36m-manylinux1_x86_64.whl (363kB)
[K     |████████████████████████████████| 368kB 2.9MB/s 
Installing collected packages: hmmlearn
Successfully installed hmmlearn-0.2.3


In [2]:
# 挂载google云盘
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [1]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow as tf
import csv
from datetime import datetime
import time
import random

from sklearn.model_selection import train_test_split

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


### preprocessing

In [2]:
#load data
def dataLoader(input_file):
    data = pd.read_csv(input_file)
    alerts = data.event
    labels = data.stage
    return alerts.tolist(), labels.tolist()

#### metrics

In [3]:
def eva_metrics1(y_true,y_pred,stages,hmm=False):
    pre,rec = [],[]
    for t,p in zip(y_true,y_pred):
        if hmm:
            t = [t[0]]+[t[i] for i in range(1,len(t)) if t[i] != t[i-1]]
            p = [p[0]]+[p[i] for i in range(1,len(p)) if p[i] != p[i-1]]
        num_correct = len([i for i,j in zip(t,p) if i==j])
        num_true = len(t)
        num_pre = len(p)
        pre.append(num_correct / (num_pre))
        rec.append(num_correct / (num_true))
    precision = sum(pre)/len(pre)
    recall = sum(rec)/len(rec)
    f1 = 2*precision*recall/(precision+recall)    
    print("precison: ", precision)
    print("recall: ", recall)
    print("f1: ", f1)
    return precision,recall,f1

def eva_metrics2(y_true,y_pred,stages):
    pre,rec = [],[]
    for t,p in zip(y_true,y_pred):
        #正样本
        if all([stages[i] in t for i in range(len(stages))]):
            num_correct = len([i for i,j in zip(t,p) if i==j and i!='o'])
            num_true = len([i for i in t if i!='o'])
            num_pre = len([i for i in p if i!="o"])
            pre.append(num_correct / (num_pre))
            rec.append(num_correct / (num_true))
        else:#负样本
            num_correct = len([i for i,j in zip(t,p) if i==j and i=='o'])
            num_true = len([i for i in t if i=='o'])
            num_pre = len([i for i in p if i=='o'])
            pre.append(num_correct / (num_pre))
            rec.append(num_correct / (num_true))
    precision = sum(pre)/len(pre)
    recall = sum(rec)/len(rec)
    f1 = 2*precision*recall/(precision+recall)    
    print("precison: ", precision)
    print("recall: ", recall)
    print("f1: ", f1)
    return precision,recall,f1

def eva_metrics5(y_true,y_pred,stages,hmm = False):
    tp,fp,fn,tn = 0,0,0,0
    for t,p in zip(y_true,y_pred):
        if hmm:
            t = [t[0]]+[t[i] for i in range(1,len(t)) if t[i] != t[i-1]]
            p = [p[0]]+[p[i] for i in range(1,len(p)) if p[i] != p[i-1]]
        #正样本
        if all([stages[i] in t for i in range(len(stages))]):
            if t == p:
                tp += 1
            else:
                fn += 1
        #负样本
        else:
            if t == p:
                tn += 1
            else:
                fp += 1
    if tp == 0 and fp == 0:
        return 0,0,0
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = 2*precision*recall/(precision+recall)    
    print("precison: ", precision)
    print("recall: ", recall)
    print("f1: ", f1)
    return precision,recall,f1

#### stage data

In [4]:
def generate_2_samples(s1,s2,other,label,length,numbers,k):
    """
    随机采样正样本，负样本（长度随机、采样随机）
    numbers: 样本个数
    k：负样本比例,负样本 k*numbers
    """
    px,py,phmm_y = [],[],[]
    nx,ny,nhmm_y = [],[],[]

    for i in range(numbers):
        # length = random.randint(int(pow(length,0.5)),length) #int(random.random()*length) if int(random.random()*length) >= 1 else 1
        k1 = int(random.random()*len(s1)) if int(random.random()*len(s1))>= 1 else 1
        k2 = int(random.random()*len(s2)) if int(random.random()*len(s2))>= 1 else 1
        r1 = [s1[i] for i in sorted(random.choices(range(len(s1)),k=k1))]
        r2 = [s2[i] for i in sorted(random.choices(range(len(s2)),k=k2))]        
        p1 = random.randint(int(length*0.5),length)
        p2 = random.randint(int(length*0.5),length)
        px.append([other[i] for i in sorted(random.choices(range(len(other)),k = p1))]+
                  r1+
                  [other[i] for i in sorted(random.choices(range(len(other)),k = length))]+
                  r2 + 
                  [other[i] for i in sorted(random.choices(range(len(other)),k = p2))])

        py.append(["sos","s1","s2","eos"])
        phmm_y.append(["o"]*p1+
                      ["s1"]*len(r1)+
                      ["o"]*length+
                      ["s2"]*len(r2)+
                      ["o"]*p2)
        
        if i % k == 0:
            nx.append(random.choices(other,k = length+p1+p2+len(r1)+len(r2)))
            ny.append(["sos","o","eos"])
            nhmm_y.append(["o"]*(length+p1+p2++len(r1)+len(r2)))
    x,y,hmm_y = px+nx,py+ny,phmm_y+nhmm_y
    return x,y,hmm_y


def generate_3_samples(s1,s2,s3,other,label,length,numbers,k=1):
    """
    numbers: 样本个数
    k：负样本比例
    """
    px,py,phmm_y = [],[],[]
    nx,ny,nhmm_y = [],[],[]
    for i in range(numbers):
        k1 = int(random.random()*len(s1)) if int(random.random()*len(s1))>= 1 else 1
        k2 = int(random.random()*len(s2)) if int(random.random()*len(s2))>= 1 else 1
        k3 = int(random.random()*len(s3)) if int(random.random()*len(s3))>= 1 else 1
        r1 = [s1[i] for i in sorted(random.choices(range(len(s1)),k=k1))]
        r2 = [s2[i] for i in sorted(random.choices(range(len(s2)),k=k2))]
        r3 = [s3[i] for i in sorted(random.choices(range(len(s3)),k=k3))]    
        p1 = random.randint(int(length*0.5),length)
        p2 = random.randint(int(length*0.5),length)
        px.append([other[i] for i in sorted(random.choices(range(len(other)),k = p1))]+
                  r1+
                  [other[i] for i in sorted(random.choices(range(len(other)),k = length))] + 
                  r2 +
                  [other[i] for i in sorted(random.choices(range(len(other)),k = length))] +
                  r3+
                  [other[i] for i in sorted(random.choices(range(len(other)),k = p2))])
        py.append(["sos","s1","s2","s3","eos"])
        phmm_y.append(["o"]*p1+["s1"]*len(r1)+
                      ["o"]*length+["s2"]*len(r2)+
                      ["o"]*length+["s3"]*len(r3)+["o"]*p2)
        if i%k == 0:
            nx.append(random.choices(other,k = p1+p2+2*length+len(r1)+len(r2)+len(r3)))
            ny.append(["sos","o","eos"])
            nhmm_y.append(["o"]*(p1+p2+2*length+len(r1)+len(r2)+len(r3)))
            # ix = sorted(random.choices(range(len(other)),k = 4*length+len(r1)+len(r2)+len(r3)))
            # nx.append([other[i] for i in ix])
            # l = [label[i] for i in ix]
            # ny.append(["sos"]+sorted(set(l),key = l.index)+["eos"])
            # nhmm_y.append(l)
    x,y,hmm_y = px+nx,py+ny,phmm_y+nhmm_y    

    return x,y,hmm_y

def generate_4_samples(s1,s2,s3,s4,other,label,length,numbers,k=1):
    """
    numbers: 样本个数
    k：负样本比例
    """
    px,py,phmm_y = [],[],[]
    nx,ny,nhmm_y = [],[],[]

    for i in range(numbers):
        k1 = int(random.random()*len(s1)) if int(random.random()*len(s1))>= 1 else 1
        k2 = int(random.random()*len(s2)) if int(random.random()*len(s2))>= 1 else 1
        k3 = int(random.random()*len(s3)) if int(random.random()*len(s3))>= 1 else 1
        k4 = int(random.random()*len(s4)) if int(random.random()*len(s4))>= 1 else 1
        r1 = [s1[i] for i in sorted(random.choices(range(len(s1)),k=k1))]
        r2 = [s2[i] for i in sorted(random.choices(range(len(s2)),k=k2))]
        r3 = [s3[i] for i in sorted(random.choices(range(len(s3)),k=k3))]
        r4 = [s4[i] for i in sorted(random.choices(range(len(s4)),k=k4))]    
        p1 = random.randint(int(length*0.5),length)
        p2 = random.randint(int(length*0.5),length)
        px.append([other[i] for i in sorted(random.choices(range(len(other)),k = p1))]+
                  r1+ [other[i] for i in sorted(random.choices(range(len(other)),k = length))] +
                  r2 + [other[i] for i in sorted(random.choices(range(len(other)),k = length))] + 
                  r3+[other[i] for i in sorted(random.choices(range(len(other)),k = length))]+
                  r4 + [other[i] for i in sorted(random.choices(range(len(other)),k = p2))])
        py.append(["sos","s1","s2","s3","s4","eos"])
        phmm_y.append(["o"]*p1+
                     ["s1"]*len(r1)+["o"]*length+
                     ["s2"]*len(r2)+["o"]*length+
                     ["s3"]*len(r3)+["o"]*length+
                     ["s4"]*len(r4)+["o"]*p2)
        if i%k == 0:
            nx.append(random.choices(other,k = p1+p2+3*length+len(r1)+len(r2)+len(r3)+len(r4)))
            ny.append(["sos","o","eos"])
            nhmm_y.append(["o"]*(p1+p2+3*length+len(r1)+len(r2)+len(r3)+len(r4)))
            # ix = sorted(random.choices(range(len(other)),k = p1+p2+*length+len(r1)+len(r2)+len(r3)+len(r4)))
            # nx.append([other[i] for i in ix])
            # l = [label[i] for i in ix]
            # ny.append(["sos"]+sorted(set(l),key = l.index)+["eos"])
            # nhmm_y.append(l)
    x,y,hmm_y = px+nx,py+ny,phmm_y+nhmm_y    
    return x,y,hmm_y

def generate_5_samples(s1,s2,s3,s4,s5,other,label,length,numbers,k=1):
    """
    numbers: 样本个数
    k：负样本比例
    """
    px,py,phmm_y = [],[],[]
    nx,ny,nhmm_y = [],[],[]

    for i in range(numbers):
        k1 = int(random.random()*len(s1)) if int(random.random()*len(s1))>= 1 else 1
        k2 = int(random.random()*len(s2)) if int(random.random()*len(s2))>= 1 else 1
        k3 = int(random.random()*len(s3)) if int(random.random()*len(s3))>= 1 else 1
        k4 = int(random.random()*len(s4)) if int(random.random()*len(s4))>= 1 else 1
        k5 = int(random.random()*len(s5)) if int(random.random()*len(s5))>= 1 else 1
        r1 = [s1[i] for i in sorted(random.choices(range(len(s1)),k=k1))]
        r2 = [s2[i] for i in sorted(random.choices(range(len(s2)),k=k2))]
        r3 = [s3[i] for i in sorted(random.choices(range(len(s3)),k=k3))]
        r4 = [s4[i] for i in sorted(random.choices(range(len(s4)),k=k4))]
        r5 = [s5[i] for i in sorted(random.choices(range(len(s5)),k=k5))]    
        p1 = random.randint(int(length*0.5),length)
        p2 = random.randint(int(length*0.5),length)

        px.append([other[i] for i in sorted(random.choices(range(len(other)),k = p1))]+
                  r1+ [other[i] for i in sorted(random.choices(range(len(other)),k = length))] +
                  r2 + [other[i] for i in sorted(random.choices(range(len(other)),k = length))] + 
                  r3+[other[i] for i in sorted(random.choices(range(len(other)),k = length))]+
                  r4 + [other[i] for i in sorted(random.choices(range(len(other)),k = length))]+
                  r5 + [other[i] for i in sorted(random.choices(range(len(other)),k = p2))])

        py.append(["sos","s1","s2","s3","s4","s5","eos"])
        phmm_y.append(["o"]*p1+["s1"]*len(r1)+
                      ["o"]*length+["s2"]*len(r2)+
                      ["o"]*length+["s3"]*len(r3)+
                      ["o"]*length+["s4"]*len(r4)+
                      ["o"]*length+["s5"]*len(r5)+
                      ["o"]*p2)
        if i%k == 0:
            nx.append(random.choices(other,k = p1+p2+4*length+len(r1)+len(r2)+len(r3)+len(r4)+len(r5)))
            ny.append(["sos","o","eos"])
            nhmm_y.append(["o"]*(p1+p2+4*length+len(r1)+len(r2)+len(r3)+len(r4)+len(r5)))
            # ix = sorted(random.choices(range(len(other)),k = 6*length+len(r1)+len(r2)+len(r3)+len(r4)+len(r5)))
            # nx.append([other[i] for i in ix])
            # l = [label[i] for i in ix]
            # ny.append(["sos"]+sorted(set(l),key = l.index)+["eos"])
            # nhmm_y.append(l)
    x,y,hmm_y = px+nx,py+ny,phmm_y+nhmm_y    
    return x,y,hmm_y


def generate_6_samples(s1,s2,s3,s4,s5,s6,other,label,length,numbers,k=1):
    """
    numbers: 样本个数
    k：负样本比例
    """
    px,py,phmm_y = [],[],[]
    nx,ny,nhmm_y = [],[],[]

    for i in range(numbers):
        k1 = int(random.random()*len(s1)) if int(random.random()*len(s1))>= 1 else 1
        k2 = int(random.random()*len(s2)) if int(random.random()*len(s2))>= 1 else 1
        k3 = int(random.random()*len(s3)) if int(random.random()*len(s3))>= 1 else 1
        k4 = int(random.random()*len(s4)) if int(random.random()*len(s4))>= 1 else 1
        k5 = int(random.random()*len(s5)) if int(random.random()*len(s5))>= 1 else 1
        k6 = int(random.random()*len(s6)) if int(random.random()*len(s6))>= 1 else 1
        r1 = [s1[i] for i in sorted(random.choices(range(len(s1)),k=k1))]
        r2 = [s2[i] for i in sorted(random.choices(range(len(s2)),k=k2))]
        r3 = [s3[i] for i in sorted(random.choices(range(len(s3)),k=k3))]
        r4 = [s4[i] for i in sorted(random.choices(range(len(s4)),k=k4))]
        r5 = [s5[i] for i in sorted(random.choices(range(len(s5)),k=k5))]
        r6 = [s6[i] for i in sorted(random.choices(range(len(s6)),k=k6))]    
        p1 = random.randint(int(length*0.5),length)
        p2 = random.randint(int(length*0.5),length)

        px.append([other[i] for i in sorted(random.choices(range(len(other)),k = p1))]+
                  r1+ [other[i] for i in sorted(random.choices(range(len(other)),k = length))] +
                  r2 + [other[i] for i in sorted(random.choices(range(len(other)),k = length))] + 
                  r3+[other[i] for i in sorted(random.choices(range(len(other)),k = length))]+
                  r4 + [other[i] for i in sorted(random.choices(range(len(other)),k = length))]+
                  r5 + [other[i] for i in sorted(random.choices(range(len(other)),k = length))]+
                  r6 + [other[i] for i in sorted(random.choices(range(len(other)),k = p2))])

        py.append(["sos","s1","s2","s3","s4","s5","s6","eos"])
        phmm_y.append(["o"]*p1+["s1"]*len(r1)+
                      ["o"]*length+["s2"]*len(r2)+
                      ["o"]*length+["s3"]*len(r3)+
                      ["o"]*length+["s4"]*len(r4)+
                      ["o"]*length+["s5"]*len(r5)+
                      ["o"]*length+["s6"]*len(r6)+
                      ["o"]*p2)
        if i%k == 0:
            nx.append(random.choices(other,k = p1+p2+5*length+len(r1)+len(r2)+len(r3)+len(r4)+len(r5)+len(r6)))
            ny.append(["sos","o","eos"])
            nhmm_y.append(["o"]*(p1+p2+5*length+len(r1)+len(r2)+len(r3)+len(r4)+len(r5)+len(r6)))
            # ix = sorted(random.choices(range(len(other)),k = 7*length+len(r1)+len(r2)+len(r3)+len(r4)+len(r5)+len(r6)))
            # nx.append([other[i] for i in ix])
            # l = [label[i] for i in ix]
            # ny.append(["sos"]+sorted(set(l),key = l.index)+["eos"])
            # nhmm_y.append(l)
    x,y,hmm_y = px+nx,py+ny,phmm_y+nhmm_y    
    return x,y,hmm_y

### seq2seq

In [5]:
class Seq2Seq:
    def __init__(self,alerts,labels,X,y,latent_dim,batch_size,epochs,max_single_channel_length, max_output_length=None, multi_channel=True, norm=True, verbose=0):
        
        self.latent_dim, self.batch_size, self.max_single_channel_length = latent_dim, batch_size, max_single_channel_length
        self.epochs = epochs
        self.verbose = verbose
        self.train_x,self.test_x,self.train_y,self.test_y = train_test_split(X,y,test_size=0.2)
        
        input_characters = list(set(alerts)) + ['eos']
        # target_characters = list(set(labels))+['sos','eos']
        tmp = []
        for yi in y:
            tmp.extend(yi)
        target_characters = list(set(tmp))

        input_characters = sorted(list(input_characters))
        target_characters = sorted(list(target_characters))
        self.num_encoder_tokens = len(input_characters)
        self.num_decoder_tokens = len(target_characters)
        self.max_encoder_seq_length = max([len(txt) for txt in X])
        self.max_decoder_seq_length = max([len(txt) for txt in y])

        self.max_output_length = max_output_length
        if not max_output_length:
            self.max_output_length = self.max_decoder_seq_length 

        print('Number of samples:', len(X))
        print('Number of unique input tokens:', self.num_encoder_tokens)
        print('Number of unique output tokens:', self.num_decoder_tokens)
        print('output tokens:', target_characters)
        print('Max sequence length for inputs:', self.max_encoder_seq_length)
        print('Max sequence length for outputs:', self.max_output_length )

        self.input_token_index = dict(
            [(char, i) for i, char in enumerate(input_characters)])
        self.target_token_index = dict(
            [(char, i) for i, char in enumerate(target_characters)])

        if not multi_channel:
            self._build_single_model()
            self.channels = 1
        else:
            self.channels = (self.max_encoder_seq_length) // self.max_single_channel_length
            if (self.max_encoder_seq_length) % self.max_single_channel_length != 0:
                self.channels += 1
            self._build_multi_model(norm = norm)
        
    def _prepare_data(self,x,y,max_encoder_seq_length, num_encoder_tokens,max_decoder_seq_length, num_decoder_tokens):
        encoder_input_data = np.zeros(
            (len(x), max_encoder_seq_length, num_encoder_tokens),
            dtype='float32')
        decoder_input_data = np.zeros(
            (len(x), max_decoder_seq_length, num_decoder_tokens),
            dtype='float32')
        decoder_target_data = np.zeros(
            (len(x), max_decoder_seq_length, num_decoder_tokens),
            dtype='float32')

        for i, (input_text, target_text) in enumerate(zip(x, y)):
            for t, char in enumerate(input_text):
                encoder_input_data[i, t, self.input_token_index[char]] = 1.
            encoder_input_data[i, t + 1:, self.input_token_index['eos']] = 1.
            for t, char in enumerate(target_text):
                # decoder 输出比输入早一个step 
                decoder_input_data[i, t, self.target_token_index[char]] = 1.
                if t > 0:
                    decoder_target_data[i, t - 1, self.target_token_index[char]] = 1.
            decoder_input_data[i, t + 1:, self.target_token_index['eos']] = 1.  #长度不足，用‘eos’填充
            decoder_target_data[i, t, self.target_token_index['eos']] = 1.  # 长度不足，用‘eos’填充
        return encoder_input_data, decoder_input_data, decoder_target_data

    def _build_single_model(self):
        self.encoder_inputs = keras.layers.Input(shape=(None, self.num_encoder_tokens))
        encoder = keras.layers.LSTM(self.latent_dim, return_state=True)
        encoder_outputs, state_h, state_c = encoder(self.encoder_inputs)
        self.encoder_states = [state_h, state_c]

        #decode

        self.decoder_inputs = keras.layers.Input(shape=(None, self.num_decoder_tokens))
        self.decoder_lstm = keras.layers.LSTM(self.latent_dim, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = self.decoder_lstm(self.decoder_inputs,initial_state = self.encoder_states)
        self.decoder_dense = keras.layers.Dense(self.num_decoder_tokens, activation='softmax')
        decoder_outputs = self.decoder_dense(decoder_outputs)

        #model
        self.model = keras.Model([self.encoder_inputs, self.decoder_inputs], decoder_outputs)

    def _build_multi_model(self,norm = False):
        self.encoder_inputs = []
        H, C = [], []
        for i in range(self.channels):
            self.encoder_inputs.append(keras.layers.Input(shape=(None, self.num_encoder_tokens)))
            encoder = keras.layers.LSTM(self.latent_dim, return_state=True)
            encoder_outputs, state_h, state_c = encoder(self.encoder_inputs[i])
            H.append(state_h)
            C.append(state_c)
        # k = int(np.sqrt(self.channels))+1 if self.channels>1 else 1
        # index = random.sample(range(self.channels), k=k)
        # if norm == True:
        #     self.encoder_states = [tf.reduce_mean([H[ix] for ix in index],axis=0),
                                #    tf.reduce_mean([C[ix] for ix in index],axis=0)]
        if norm == True:
            self.encoder_states = [tf.reduce_mean(H,axis=0),tf.reduce_mean(C,axis=0)]
        else:
            sum_h, sum_c = H[index[0]], C[index[0]]
            for ix in range(1,len(index)):
                sum_h = keras.layers.add([H[index[ix]],sum_h]) 
                sum_c = keras.layers.add([C[index[ix]],sum_c]) 
            self.encoder_states = [sum_h,sum_c]


        #decode

        self.decoder_inputs = keras.layers.Input(shape=(None, self.num_decoder_tokens))
        self.decoder_lstm = keras.layers.LSTM(self.latent_dim, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = self.decoder_lstm(self.decoder_inputs,initial_state = self.encoder_states)
        self.decoder_dense = keras.layers.Dense(self.num_decoder_tokens, activation='softmax')
        decoder_outputs = self.decoder_dense(decoder_outputs)

        #model
        self.model = keras.Model([self.encoder_inputs, self.decoder_inputs], decoder_outputs)

    def train(self):
        t0 = time.time()
        # Run training
        self.model.compile(optimizer='adam', loss='categorical_crossentropy',
                      metrics=['accuracy'])

        encoder_input_data, decoder_input_data, decoder_target_data = self._prepare_data(
            x = self.train_x,
            y = self.train_y,
            max_encoder_seq_length = self.max_encoder_seq_length, #modefiy 0808
            num_encoder_tokens = self.num_encoder_tokens,
            max_decoder_seq_length = self.max_decoder_seq_length, 
            num_decoder_tokens = self.num_decoder_tokens
        )

        inputs = []
        for i in range(self.channels):
            inputs.append(encoder_input_data[:,i::self.channels])
        inputs.append(decoder_input_data)

        callbacks = [keras.callbacks.EarlyStopping(monitor = 'val_accuracy',patience=1)]

        self.model.fit(inputs, decoder_target_data,
                  batch_size = self.batch_size,
                  epochs = self.epochs,
                  validation_split = 0.1,
                  verbose = self.verbose)

        t1 = time.time()
        self.training_time = (t1 - t0)/len(self.train_y)
        print("training spend time:{}".format(self.training_time))
        
    def evaluation(self):
        # #### testing
        self.reverse_input_char_index = dict(
            (i, char) for char, i in self.input_token_index.items())
        self.reverse_target_char_index = dict(
            (i, char) for char, i in self.target_token_index.items())

        # Next: inference mode (sampling).

        self.encoder_model = keras.Model(self.encoder_inputs, self.encoder_states)

        decoder_state_input_h = keras.layers.Input(shape=(self.latent_dim,))
        decoder_state_input_c = keras.layers.Input(shape=(self.latent_dim,))
        decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
        decoder_outputs, state_h, state_c = self.decoder_lstm(
            self.decoder_inputs, initial_state = decoder_states_inputs)
        decoder_states = [state_h, state_c]
        decoder_outputs = self.decoder_dense(decoder_outputs)
        self.decoder_model = keras.Model(
            [self.decoder_inputs] + decoder_states_inputs,
            [decoder_outputs] + decoder_states)


        encoder_input_data, decoder_input_data, decoder_target_data = self._prepare_data(
            x = self.test_x,
            y = self.test_y,
            max_encoder_seq_length = self.max_encoder_seq_length, 
            num_encoder_tokens = self.num_encoder_tokens,
            max_decoder_seq_length = self.max_decoder_seq_length, 
            num_decoder_tokens = self.num_decoder_tokens
        )

        y_true = [yi[1:-1] for yi in self.test_y]

        input_seq = [encoder_input_data[:,c::self.channels] for c in range(self.channels)]#encoder_input_data[i:i+1]
        decode_output = self._batch_decode_sequence(input_seq)

        return y_true, decode_output
        
    def _batch_decode_sequence(self,input_seq):
        number_samples = input_seq[0].shape[0]

        states_value = self.encoder_model.predict(input_seq)
        target_seq = np.zeros((number_samples, 1, self.num_decoder_tokens))
        for i in range(number_samples):
            target_seq[i, 0, self.target_token_index['sos']] = 1.

        decoded_sentence = [[] for _ in range(number_samples)]

        stop_condition = [False for _ in range(number_samples)]
        while not all(stop_condition):

            output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)
            sampled_token_index = np.argmax(output_tokens,axis = 2)
            sampled_char = [self.reverse_target_char_index[index[0]] for index in sampled_token_index]

            for i in range(number_samples):

                if sampled_char[i] == 'eos':# or len(decoded_sentence[i]) >= self.max_output_length:
                    stop_condition[i] = True
                else:
                    decoded_sentence[i].append(sampled_char[i])

            # Update the target sequence (of length 1).
            target_seq = np.zeros((number_samples, 1, self.num_decoder_tokens))
            for i in range(number_samples):
                target_seq[i, 0, sampled_token_index[i]] = 1.

            # Update states
            states_value = [h, c]
        return decoded_sentence

### HMM

In [6]:
class HMM:
    def __init__(self,alerts,labels,x,y,use_bw = False):
        train_x,self.test_x,train_y,self.test_y = train_test_split(x, y,test_size=0.2)#,random_state=42

#         states = list(set(labels))  #labels.unique().tolist()
        states = set()
        for yi in y:
            states = states | set(yi)
        states = list(states)
        print(states)
        obs = list(set(alerts))  #alerts.unique().tolist()
       

        self.state_index = {s:index for index,s in enumerate(states)}
        self.obs_index = {o:index for index,o in enumerate(obs)}

        self.index_state = {ix:s for s,ix in self.state_index.items()}
        self.index_obs = {ix:o for o,ix in self.obs_index.items()}

        self.max_output_seq_length = max([len(set(txt)) for txt in y])
#         max_ouput_seq = [set(txt) for txt in y if len(set(txt)) >= max_output_seq_length]

        print('Max sequence length for outputs:', self.max_output_seq_length)
        print('Number of train samples:', len(train_x))
        print('Number of test samples:', len(self.test_x))
        print('Number of states:', len(states))
        print('Number of obs:', len(obs))

        self.A = self._transition_matrix(states,state_seq = train_y)
        self.B = self._emission_matrix(states,obs,state_seq = train_y, obs_seq = train_x)
        self.pi = self._pi_vector(states,state_seq = train_y)
        if not use_bw:
            self.model = self.hmm_kit(states,obs)
        else:
            self.model = self.bw_hmm(states,obs,x)
        
    # HMM 参数计算 监督学习

    # 转移概率  
    def _transition_matrix(self,states,state_seq):
        # 状态数
        state_number = len(states)
        A = [[0]*state_number for _ in range(state_number)]

        count = {}
        for si in states:
            count[si] = {}
            for sj in states:
                count[si].setdefault(sj, 0)

        #状态转移频数
        for seq in state_seq:
            index = 0
            while index < len(seq)-1:
                count[seq[index]][seq[index+1]] += 1
                index += 1

        for i in range(state_number):
            count_i = sum(count[states[i]].values())
            for j in range(state_number):
                A[i][j] = count[states[i]][states[j]] / (count_i+0.000001)

        return A

    # 发射概率
    def _emission_matrix(self,states,obs,state_seq,obs_seq):
        # 状态数
        state_number = len(states)
        obs_number = len(obs)
        B = [[0]*obs_number for _ in range(state_number)]

        count = {}
        for si in states:
            count[si] = {}
            for sj in obs:
                count[si].setdefault(sj, 0)

        #状态到观测的发射频数
        for s_seq,o_seq in zip(state_seq,obs_seq):
            for i in range(len(s_seq)):
                count[s_seq[i]][o_seq[i]] += 1

        for i in range(state_number):
            count_i = sum(count[states[i]].values())
            for j in range(obs_number):
                B[i][j] = count[states[i]][obs[j]] / (count_i+0.000001)

        return B


    # 初始状态概率
    def _pi_vector(self,states,state_seq):
        # 所有样本中初始状态为 q 的概率
        count = {}
        for s in states:
            count.setdefault(s,0)

        for seq in state_seq:
            if seq:
                count[seq[0]] += 1

        pi = np.array([freq/len(state_seq) for freq in count.values()])
        return pi/sum(pi)


    def _hmm_viterbi(self,A,B,pi,O):
        T = len(O)
        N = len(A[0])

        delta = [[0]*N for _ in range(T)]
        psi = [[0]*N for _ in range(T)]

        #step1: init
        for i in range(N):
            delta[0][i] = pi[i]*B[i][self.obs_index[O[0]]]
            psi[0][i] = 0

        #step2: iter
        for t in range(1,T):
            for i in range(N):
                temp,maxindex = 0,0
                for j in range(N):
                    res = delta[t-1][j]*A[j][i]
                    if res>temp:
                        temp = res
                        maxindex = j

                delta[t][i] = temp*B[i][self.obs_index[O[t]]]#delta
                psi[t][i] = maxindex

        #step3: end
        p = max(delta[-1])
        for i in range(N):
            if delta[-1][i] == p:
                i_T = i

        #step4：backtrack
        path = [0]*T
        i_t = i_T
        for t in reversed(range(T-1)):
            i_t = psi[t+1][i_t]
            path[t] = i_t
        path[-1] = i_T

        return delta,psi,path  
   
    def evaluation(self):
#         y_true = [sorted(set(yi), key=yi.index) for yi in self.test_y]
        y_true = self.test_y
        y_pred = []
        for i in range(len(self.test_x)):
            # _, _, path = self._hmm_viterbi(self.A, self.B, self.pi, self.test_x[i])
            seen = np.array([self.obs_index[self.test_x[i][t]] for t in range(len(self.test_x[i]))]).reshape(-1,1)
            path = self.model.predict(seen).tolist()
            y_pred.append([self.index_state[s] for s in path])
#             y_pred.append([self.index_state[s] for s in sorted(set(path), key=path.index)])
        return y_true, y_pred
    
    def hmm_kit(self,states,obs):
        from hmmlearn import hmm

        n_states = len(states)
        n_observations = len(obs)

        start_probability = self.pi
        transition_probability = self.A
        emission_probability = self.B

        model = hmm.MultinomialHMM(n_components=n_states)
        model.startprob_=start_probability
        model.transmat_=transition_probability
        model.emissionprob_=emission_probability
        return model
    
    def bw_hmm(self,states,obs,x):
        from hmmlearn import hmm
        n_states = len(states)
        n_observations = len(obs)

        model = hmm.MultinomialHMM(n_components=n_states, n_iter=10, tol=0.01)

        O = []
        for xi in x:
            o_seq = [self.obs_index[o] for o in xi]
            O.append(o_seq)

        model.fit(np.array(O))
        return model

### main

In [7]:
path = "/content/drive/My Drive/Colab Notebooks/MSA/"
# path = "./"

In [8]:
input_file = path + "data/darpa_alert.csv" 
print(input_file)
alerts, labels = dataLoader(input_file)

/content/drive/My Drive/Colab Notebooks/MSA/data/darpa_alert.csv


#### seq2seq vs. hmm 2 stage

In [None]:

output_file = path + "results/"+ datetime.now().strftime("%Y%m%d%H%M%S") + "_darpa_2_stage_eva52.csv"

print(output_file)

/content/drive/My Drive/Colab Notebooks/MSA/results/20200821100332_darpa_2_stage_eva52.csv


In [None]:
data = pd.read_csv(input_file)

s1 = data[data.stage == "s1"].event.tolist()
s2 = data[data.stage == "s2"].event.tolist()
s3 = data[data.stage == "s3"].event.tolist()
s4 = data[data.stage == "s4"].event.tolist()
s5 = data[data.stage == "s5"].event.tolist()

other = data[(data.stage != "s1") & (data.stage != "s2")].event.tolist()
label = data[(data.stage != "s1") & (data.stage != "s2")].stage.tolist()
print(len(s1),len(s2),len(other))

stages = ["s1","s2"]
output_length = len(stages)

293 109 3531


In [None]:
# import pickle   

# with open(path+"temp/darpa_x_stage_2_len_"+str(length)+".pickle",'rb') as f:
#     X = pickle.load(f)
# with open(path+"temp/darpa_y_stage_2_len_"+str(length)+".pickle",'rb') as f:
#     y = pickle.load(f)

In [None]:
for length in [1000,1500,2000]: # 100 500 1000 1500 2000 2500 3000
    for epoch in range(5):
        print("------------- epoch {} n_steps :{} -----------".format(epoch, length))

        ## run seq2seq
        
        X, y, hmm_y = generate_2_samples(s1,s2,other,label,length = length, numbers = 1500,k=1)

        # with open(path+"temp/darpa_x_stage_2_len_"+str(length)+".pickle",'wb') as f:
        #     pickle.dump(X,f)
        # with open(path+"temp/darpa_y_stage_2_len_"+str(length)+".pickle",'wb') as f:
        #     pickle.dump(y,f)

        seq2seq = Seq2Seq(alerts = alerts, 
                        labels = labels,
                        X = X,
                        y = y, 
                        latent_dim = 64, 
                        batch_size = 32, 
                        epochs = 8,
                        max_single_channel_length = 200,
                        verbose = 0
                    )
        seq2seq.train()
        y_true, output = seq2seq.evaluation()
        precision, recall, f1 = eva_metrics5(y_true,output,stages = stages)

        
        # hmm
        hmm = HMM(alerts = alerts,
                  labels = labels,
                  x = X,
                  y = hmm_y)
        hmm_true, hmm_output = hmm.evaluation()
        hmm_precision, hmm_recall, hmm_f1 = eva_metrics1(hmm_true,hmm_output,stages = stages,hmm=True)

        csvFile = open(output_file,'a+',newline='')
        try:
            writer=csv.writer(csvFile)
            writer.writerow((input_file, len(X), length,
                             np.mean([len(xi) for xi in X]), seq2seq.max_encoder_seq_length, 
                             output_length, 'seq2seq-hmm',
                             precision, hmm_precision,recall,hmm_recall, f1, hmm_f1))
        finally:
            csvFile.close()
        

------------- epoch 0 n_steps :1000 -----------
Number of samples: 3000
Number of unique input tokens: 18
Number of unique output tokens: 5
output tokens: ['eos', 'o', 's1', 's2', 'sos']
Max sequence length for inputs: 3310
Max sequence length for outputs: 4
training spend time:0.271069896419843
precison:  0.950920245398773
recall:  1.0
f1:  0.9748427672955975
['s2', 's1', 'o']
Max sequence length for outputs: 3
Number of train samples: 2400
Number of test samples: 600
Number of states: 3
Number of obs: 17
precison:  0.7012452263838721
recall:  0.8923333333333365
f1:  0.7853324663884201
------------- epoch 1 n_steps :1000 -----------
Number of samples: 3000
Number of unique input tokens: 18
Number of unique output tokens: 5
output tokens: ['eos', 'o', 's1', 's2', 'sos']
Max sequence length for inputs: 3308
Max sequence length for outputs: 4
training spend time:0.2966631856560707
precison:  0.9235294117647059
recall:  1.0
f1:  0.9602446483180428
['s2', 's1', 'o']
Max sequence length for

#### seq2seq vs. hmm 3 stage

In [None]:
output_file = path + "results/"+ datetime.now().strftime("%Y%m%d%H%M%S") + "_darpa_3_stage_eva52.csv"
# print(output_file)

In [None]:
data = pd.read_csv(input_file)

s1 = data[data.stage == "s1"].event.tolist()
s2 = data[data.stage == "s2"].event.tolist()
s3 = data[data.stage == "s3"].event.tolist()
s4 = data[data.stage == "s4"].event.tolist()
s5 = data[data.stage == "s5"].event.tolist()

other = data[(data.stage != "s1") & (data.stage != "s2") & (data.stage != "s3")].event.tolist()
label = data[(data.stage != "s1") & (data.stage != "s2") & (data.stage != "s3")].stage.tolist()
print(len(s1),len(s2),len(s3),len(other))

stages = ["s1","s2","s3"]
output_length = len(stages)

293 109 81 3450


In [None]:
for length in [100,500,1000,1500,2000]: # darpa 100,420,900,1280,2500 #800,1300,1800,5350,7100,9700
    for epoch in range(5):
        print("------------- epoch {} n_steps :{} -----------".format(epoch, length))

        ## run seq2seq
        
        X, y, hmm_y = generate_3_samples(s1,s2,s3,other,label,length = length, numbers = 1500,k=1)
        
        seq2seq = Seq2Seq(alerts = alerts, 
                        labels = labels,
                        X = X,
                        y = y, 
                        latent_dim = 64, 
                        batch_size = 32, 
                        epochs = 8,
                        max_single_channel_length = 200,
                        verbose = 0
                    )
        seq2seq.train()
        y_true, output = seq2seq.evaluation()
        precision, recall, f1 = eva_metrics5(y_true,output,stages = stages)

        # hmm
        hmm = HMM(alerts = alerts,
                  labels = labels,
                  x = X,
                  y = hmm_y)
        hmm_true, hmm_output = hmm.evaluation()
        hmm_precision, hmm_recall, hmm_f1 = eva_metrics2(hmm_true,hmm_output,stages = stages)

        csvFile = open(output_file,'a+',newline='')
        try:
            writer=csv.writer(csvFile)
            writer.writerow((input_file, len(X), length,
                             np.mean([len(xi) for xi in X]), seq2seq.max_encoder_seq_length, 
                             output_length, 'seq2seq-hmm',
                             precision, hmm_precision,recall,hmm_recall, f1, hmm_f1))
        finally:
            csvFile.close() 

------------- epoch 0 n_steps :100 -----------
Number of samples: 3000
Number of unique input tokens: 18
Number of unique output tokens: 6
output tokens: ['eos', 'o', 's1', 's2', 's3', 'sos']
Max sequence length for inputs: 849
Max sequence length for outputs: 5
training spend time:0.08231138666470846
precison:  0.587890625
recall:  0.9435736677115988
f1:  0.7244283995186523
['s2', 's1', 's3', 'o']
Max sequence length for outputs: 4
Number of train samples: 2400
Number of test samples: 600
Number of states: 4
Number of obs: 17
precison:  0.9770747765453088
recall:  0.9985972316616651
f1:  0.9877187740996424
------------- epoch 1 n_steps :100 -----------
Number of samples: 3000
Number of unique input tokens: 18
Number of unique output tokens: 6
output tokens: ['eos', 'o', 's1', 's2', 's3', 'sos']
Max sequence length for inputs: 832
Max sequence length for outputs: 5
training spend time:0.08243245442708333
precison:  1.0
recall:  1.0
f1:  1.0
['s2', 's1', 's3', 'o']
Max sequence length f

#### seq2seq vs. hmm 4 stage

In [None]:
output_file = path + "results/"+ datetime.now().strftime("%Y%m%d%H%M%S") + "_iscx_4_stage.csv"
print(output_file)

/content/drive/My Drive/Colab Notebooks/MSA/results/20200818114431_iscx_4_stage_eva1.csv


In [None]:
data = pd.read_csv(input_file)

s1 = data[data.stage == "s1"].event.tolist()
s2 = data[data.stage == "s2"].event.tolist()
s3 = data[data.stage == "s3"].event.tolist()
s4 = data[data.stage == "s4"].event.tolist()
s5 = data[data.stage == "s5"].event.tolist()

other = data[(data.stage != "s1") & (data.stage != "s2") & (data.stage != "s3") & (data.stage != "s4")].event.tolist()
label = data[(data.stage != "s1") & (data.stage != "s2") & (data.stage != "s3") & (data.stage != "s4")].stage.tolist()
print(len(s1),len(s2),len(s3),len(other))

stages = ["s1","s2","s3","s4"]
output_length = len(stages)

78 1240 249 18345


In [None]:

for epoch in range(10):
    for length in [100,500,1000,1500,2000]: # darpa 100,420,900,1280,2500 #800,1300,1800,5350,7100,9700

        print("------------- epoch {} n_steps :{} -----------".format(epoch, length))

        ## run seq2seq
        
        X, y, hmm_y = generate_4_samples(s1,s2,s3,s4,other,label,length = length, numbers = 1000)
        
        seq2seq = Seq2Seq(alerts = alerts, 
                        labels = labels,
                        X = X,
                        y = y, 
                        latent_dim = 64, 
                        batch_size = 32, 
                        epochs = 5,
                        max_single_channel_length = 200, 
                        max_output_length = output_length
                    )
        seq2seq.train()
        y_true, output = seq2seq.evaluation()
        precision, recall, f1 = eva_metrics5(y_true,output,stages = stages)

        # csvFile = open(output_file,'a+',newline='')
        # try:
        #     writer=csv.writer(csvFile)
        #     writer.writerow((input_file, len(X), len(X[0]), output_length, 'seq2seq', 
        #                      precision, recall, f1, seq2seq.training_time, 
        #                      seq2seq.latent_dim, seq2seq.batch_size, seq2seq.epochs, seq2seq.max_single_channel_length))
        # finally:
        #     csvFile.close()

        hmm = HMM(alerts = alerts,
                  labels = labels,
                  x = X,
                  y = hmm_y)
        hmm_true, hmm_output = hmm.evaluation()
        hmm_precision, hmm_recall, hmm_f1 = eva_metrics2(hmm_true,hmm_output,stages = stages)

        csvFile = open(output_file,'a+',newline='')
        try:
            writer=csv.writer(csvFile)
            writer.writerow((input_file, len(X), length,
                             np.mean([len(xi) for xi in X]), seq2seq.max_encoder_seq_length, 
                             output_length, 'seq2seq-hmm',
                             precision, hmm_precision,recall,hmm_recall, f1, hmm_f1))
        finally:
            csvFile.close()

------------- epoch 0 n_steps :100 -----------
Number of samples: 2000
Number of unique input tokens: 58
Number of unique output tokens: 9
output tokens: ['eos', 'o', 's1', 's2', 's3', 's4', 's5', 's6', 'sos']
Max sequence length for inputs: 2812
Max sequence length for outputs: 4
training spend time:0.03936794772744179
precison:  0.9999990000010001
recall:  0.005263157867036012
f1:  0.01047119366245454
['s3', 's1', 's6', 's2', 's5', 'o', 's4']
Max sequence length for outputs: 5
Number of train samples: 1600
Number of test samples: 400
Number of states: 7
Number of obs: 57
precison:  0.28925789495483195
recall:  0.5499165290190146
f1:  0.3791047569807415
------------- epoch 0 n_steps :500 -----------
Number of samples: 2000
Number of unique input tokens: 58
Number of unique output tokens: 9
output tokens: ['eos', 'o', 's1', 's2', 's3', 's4', 's5', 's6', 'sos']
Max sequence length for inputs: 2776
Max sequence length for outputs: 4
training spend time:0.041680358499288556
precison:  0.9

#### seq2seq vs. hmm 5 stage

In [9]:
output_file = path + "results/"+ datetime.now().strftime("%Y%m%d%H%M%S") + "_darpa_5_stage.csv"
print(output_file)

/content/drive/My Drive/Colab Notebooks/MSA/results/20200823014048_darpa_5_stage.csv


In [10]:
data = pd.read_csv(input_file)

s1 = data[data.stage == "s1"].event.tolist()
s2 = data[data.stage == "s2"].event.tolist()
s3 = data[data.stage == "s3"].event.tolist()
s4 = data[data.stage == "s4"].event.tolist()
s5 = data[data.stage == "s5"].event.tolist()

other = data[(data.stage != "s1") & (data.stage != "s2") & (data.stage != "s3") & (data.stage != "s4") & (data.stage != "s5")].event.tolist()
label = data[(data.stage != "s1") & (data.stage != "s2") & (data.stage != "s3") & (data.stage != "s4") & (data.stage != "s5")].stage.tolist()
print(len(s1),len(s2),len(s3),len(s4),len(s5),len(other))

stages = ["s1","s2","s3","s4","s5"]
output_length = len(stages)

293 109 81 2 2030 1418


In [None]:
for length in [2000]:
    for epoch in range(10):
    
        print("------------- epoch {} n_steps :{} -----------".format(epoch, length))

        ## run seq2seq
        
        X, y, hmm_y = generate_5_samples(s1,s2,s3,s4,s5,other,label,length = length, numbers = 1000)
        
        seq2seq = Seq2Seq(alerts = alerts, 
                        labels = labels,
                        X = X,
                        y = y, 
                        latent_dim = 64, 
                        batch_size = 32, 
                        epochs = 15,
                        max_single_channel_length = 200, 
                        max_output_length = output_length
                    )
        seq2seq.train()
        y_true, output = seq2seq.evaluation()
        precision, recall, f1 = eva_metrics5(y_true,output,stages = stages)


        # hmm = HMM(alerts = alerts,
        #           labels = labels,
        #           x = X,
        #           y = hmm_y)
        # hmm_true, hmm_output = hmm.evaluation()
        # hmm_precision, hmm_recall, hmm_f1 = eva_metrics2(hmm_true,hmm_output,stages = stages)

        csvFile = open(output_file,'a+',newline='')
        try:
            writer=csv.writer(csvFile)
            writer.writerow((input_file, len(X), length,
                             np.mean([len(xi) for xi in X]), seq2seq.max_encoder_seq_length, 
                             output_length, 'seq2seq-hmm',
                             precision, recall, f1))
        finally:
            csvFile.close()

------------- epoch 0 n_steps :2000 -----------
Number of samples: 2000
Number of unique input tokens: 18
Number of unique output tokens: 8
output tokens: ['eos', 'o', 's1', 's2', 's3', 's4', 's5', 'sos']
Max sequence length for inputs: 14236
Max sequence length for outputs: 5
training spend time:1.7740791887044907
precison:  1.0
recall:  1.0
f1:  1.0
------------- epoch 1 n_steps :2000 -----------
Number of samples: 2000
Number of unique input tokens: 18
Number of unique output tokens: 8
output tokens: ['eos', 'o', 's1', 's2', 's3', 's4', 's5', 'sos']
Max sequence length for inputs: 14073
Max sequence length for outputs: 5
training spend time:1.743356636762619


### multi_channel vs. single

In [None]:

output_file = path + "results/"+ datetime.now().strftime("%Y%m%d%H%M%S") + "_darpa_2_stage_channel.csv"

print(output_file)

/content/drive/My Drive/Colab Notebooks/MSA/results/20200820135542_darpa_2_stage_channel.csv


In [None]:
data = pd.read_csv(input_file)

s1 = data[data.stage == "s1"].event.tolist()
s2 = data[data.stage == "s2"].event.tolist()
s3 = data[data.stage == "s3"].event.tolist()
s4 = data[data.stage == "s4"].event.tolist()
s5 = data[data.stage == "s5"].event.tolist()

other = data[(data.stage != "s1") & (data.stage != "s2")].event.tolist()
label = data[(data.stage != "s1") & (data.stage != "s2")].stage.tolist()
print(len(s1),len(s2),len(other))

stages = ["s1","s2"]
output_length = len(stages)

293 109 3531


In [None]:
logs = []
for epoch in range(5):
    for length in [100,500,1000,1500,2000]: # 100 500 1000 1500 2000 2500 3000

        print("------------- epoch {} n_steps :{} -----------".format(epoch, length))

        ## run seq2seq
        
        X, y, hmm_y = generate_2_samples(s1,s2,other,label,length = length, numbers = 1500,k=1)

        seq2seq = Seq2Seq(alerts = alerts, 
                        labels = labels,
                        X = X,
                        y = y, 
                        latent_dim = 64, 
                        batch_size = 32, 
                        epochs = 10,
                        max_single_channel_length = 200,
                        verbose = 0
                    )
        seq2seq.train()
        y_true, output = seq2seq.evaluation()
        precision, recall, f1 = eva_metrics5(y_true,output,stages = stages)

        
        # single
        seq2seq = Seq2Seq(alerts = alerts, 
                        labels = labels,
                        X = X,
                        y = y, 
                        latent_dim = 64, 
                        batch_size = 32, 
                        epochs = 8,
                        max_single_channel_length = 200,
                        verbose = 0,
                        multi_channel = False
                    )
        seq2seq.train()
        y_true, output = seq2seq.evaluation()
        s_precision, s_recall, s_f1 = eva_metrics5(y_true,output,stages = stages)


        csvFile = open(output_file,'a+',newline='')
        try:
            writer=csv.writer(csvFile)
            writer.writerow((input_file, len(X), length,
                             np.mean([len(xi) for xi in X]), seq2seq.max_encoder_seq_length, 
                             output_length, 'seq2seq-hmm',
                             precision, s_precision,recall,s_recall, f1, s_f1))
        finally:
            csvFile.close()
        

------------- epoch 0 n_steps :100 -----------
Number of samples: 3000
Number of unique input tokens: 18
Number of unique output tokens: 5
output tokens: ['eos', 'o', 's1', 's2', 'sos']
Max sequence length for inputs: 684
Max sequence length for outputs: 4
training spend time:0.05604758809010188
precison:  1.0
recall:  0.9655172413793104
f1:  0.9824561403508771
Number of samples: 3000
Number of unique input tokens: 18
Number of unique output tokens: 5
output tokens: ['eos', 'o', 's1', 's2', 'sos']
Max sequence length for inputs: 684
Max sequence length for outputs: 4
training spend time:0.06959164669116338
------------- epoch 0 n_steps :500 -----------
Number of samples: 3000
Number of unique input tokens: 18
Number of unique output tokens: 5
output tokens: ['eos', 'o', 's1', 's2', 'sos']
Max sequence length for inputs: 1834
Max sequence length for outputs: 4
training spend time:0.07634679992993673
precison:  0.7486910994764397
recall:  1.0
f1:  0.8562874251497006
Number of samples: 30