# Connection to drive and path definition

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys
# Make modules stored in the SANDS folder on Drive importable (the import cell does `import sands`).
sys.path.append('/content/drive/MyDrive/Colab Notebooks/final/SANDS/')
# Working directory on Drive: vocabularies, decoder weights, training data, generated logs and
# checkpoints are all read from / written to this folder.
path_files ='/content/drive/MyDrive/Colab Notebooks/final/UC9'

# Import libraries

In [None]:
!pip install tensorflow==2.12.0

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error
import time
from calendar import timegm, monthrange
from datetime import datetime, timedelta
import math
import sands
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import tensorflow as tf
from tensorflow import keras
from os import listdir
from os.path import isfile, join
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import CategoricalAccuracy,Accuracy,BinaryAccuracy


import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.api import ExponentialSmoothing
from scipy.stats import iqr


import json

import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.manifold import TSNE
from progressbar import ProgressBar
import h5py






# Functions for data processing

In [None]:
# Global configuration shared by the data helpers, the models and the training loop.
max_len = 210  # logs must be shorter than this; encoded logs are padded to exactly this many characters
latent_len = 512  # width of the latent vectors produced by the generator and scored by the discriminator
training_size = 20000
testing_size = 2000
total_subset = training_size+testing_size
log_len = 210  # number of time steps the decoder emits (RepeatVector length); same value as max_len
batch_size = 256
nb_epoch = 200
# Number of batches needed to cover each split; note that np.ceil returns a float.
training_steps_per_epoch = np.ceil(training_size/batch_size)
validation_steps_per_epoch = np.ceil(testing_size/batch_size)
training_file = 'log_messages.txt'
training_data_file = 'training_data_binary.h5'  # HDF5 file (inside path_files) that train() reads batch by batch
noise_len = 128  # length of each random noise vector fed to the generator

In [None]:
# Character -> index vocabulary, stored as JSON.
with open(join(path_files, 'c_n_dict_pred.txt'), 'r') as infile:
    c_n_dict = json.loads(infile.read())

# Index -> character vocabulary. JSON object keys are always strings, so they
# are converted back to ints to allow lookup by numeric index.
with open(join(path_files, 'n_c_dict_pred.txt'), 'r') as infile:
    n_c_dict = {int(key): value for key, value in json.load(infile).items()}

In [None]:
def remove_too_long(log_list):
    """Return the logs whose length is strictly below the global ``max_len``.

    The input order is preserved and a new list is returned; ``log_list``
    itself is left untouched.
    """
    return [entry for entry in log_list if len(entry) < max_len]

In [None]:
def tokenize_logs(log_list):
    """Split every log in ``log_list`` into a list of its individual characters.

    Returns a list of lists, one inner list per input log, in the same order.
    """
    return [list(entry) for entry in log_list]

In [None]:
def one_hot_encode(log_list_c):
    """One-hot encode character-tokenised logs.

    For each log the last character is dropped, the remaining characters are
    mapped to indices through the global ``c_n_dict``, and the sequence is
    right-padded with the index of ``' '`` up to the global ``max_len``.

    Returns a float array of shape ``(len(log_list_c), max_len, len(c_n_dict))``
    holding a single 1 per (log, position).
    """
    n_logs = len(log_list_c)
    index_matrix = np.zeros((n_logs, max_len))
    one_hot = np.zeros((n_logs, max_len, len(c_n_dict)))
    positions = np.arange(max_len)
    for row, chars in enumerate(log_list_c):
        indices = [c_n_dict[c] for c in chars[:-1]]
        indices.extend([c_n_dict[' ']] * (max_len - len(indices)))
        # Raises if the log is longer than max_len, exactly like a direct row assignment.
        index_matrix[row, :] = indices
        # Set the 1 for every position of this log in a single indexed write.
        one_hot[row, positions, index_matrix[row].astype(int)] = 1
    return one_hot

In [None]:
def load_data(Train_file, idx, batch_size):
  """Read the next batch of items (lines) from an open file or iterator.

  Args:
    Train_file: an open text file or any iterator; it is advanced in place.
    idx: unused; kept so existing callers keep working.
    batch_size: maximum number of items to read.

  Returns:
    A list with up to ``batch_size`` items. When the source runs out the list
    is shorter (possibly empty) instead of ``StopIteration`` escaping to the
    caller.
  """
  from itertools import islice
  return list(islice(Train_file, max(batch_size, 0)))

In [None]:
def batch_generator(Train_file, batch_size, steps):
  """Endlessly yield ``(X, X)`` batches from the 'X_train_latent' HDF5 dataset.

  Batches are consecutive row slices of ``batch_size`` rows. After ``steps``
  batches the generator wraps around to the first rows again. The file stays
  open for as long as the generator is alive.
  """
  step = 0
  with h5py.File(Train_file, 'r') as hf:
      while True:
        first_row = step * batch_size
        X_batch = hf['X_train_latent'][first_row:first_row + batch_size, :]
        yield (X_batch, X_batch)
        # Advance to the next slice, wrapping to the start after `steps` batches.
        step = step + 1 if step + 1 < steps else 0

In [None]:
def load_batch(Train_file, batch_size, start):
  """Read one batch of rows from the 'X_train_latent' dataset of an HDF5 file.

  Rows ``start`` .. ``start + batch_size`` are read, keeping only the first
  ``latent_len`` columns. The file is opened and closed on every call.

  Returns:
    ``(X_batch, X_batch)`` - the same array twice.
  """
  stop = start + batch_size
  with h5py.File(Train_file, 'r') as hf:
        X_batch = hf['X_train_latent'][start:stop, :latent_len]
  return X_batch, X_batch

In [None]:
def data_loader(Train_file, batch_size, steps):
  """Read, filter and one-hot encode up to ``steps`` batches of log lines.

  Each batch is read with ``load_data``, stripped of over-long logs with
  ``remove_too_long``, then tokenised and one-hot encoded.

  Args:
    Train_file: path of the text file with one log per line (ISO-8859-1).
    batch_size: number of lines read per batch.
    steps: number of batches to read (truncated to an int).

  Returns:
    Array of shape ``(n_kept, max_len, len(c_n_dict))`` where ``n_kept`` is the
    number of logs that survived filtering. When no log is filtered and the
    file is long enough this is ``batch_size * int(steps)`` rows.
  """
  n_steps = int(steps)
  x = np.zeros((batch_size*n_steps,max_len,len(c_n_dict)))
  filled = 0  # next free row in x
  with open(Train_file,'r', encoding = "ISO-8859-1") as myfile:
      for step in range(n_steps):
        try:
          log_batch = load_data(myfile, step, batch_size)
        except StopIteration:
          # File ran out in the middle of a batch: stop with what we have.
          break
        if not log_batch:
          break
        log_batch = remove_too_long(log_batch)
        log_batch_b = one_hot_encode(tokenize_logs(log_batch))
        # Filtering can shrink the batch, so rows are written at a running
        # cursor rather than into a fixed batch_size-wide slot. A fixed slot
        # would either raise on the shape mismatch or, when one row is left,
        # silently broadcast that row over the whole slot.
        n_rows = len(log_batch_b)
        x[filled:filled+n_rows,:,:] = log_batch_b
        filled += n_rows
  # Drop the never-filled all-zero rows at the end.
  return x[:filled]

In [None]:
def binarize_log(generated_log):
  """Turn per-position scores into 0/1 indicators of the maximum.

  ``generated_log`` is indexed as (log, position, class). For every
  (log, position) the entries equal to the maximum over the class axis become
  1.0 and all others 0.0; ties therefore produce several ones.

  Returns a float64 array with the same shape as the input.
  """
  scores = np.asarray(generated_log)
  best = scores.max(axis=-1, keepdims=True)
  return np.where(scores == best, 1.0, 0.0)

In [None]:
def print_log(binarized_log):
  """Decode one-hot logs back to text, print them and return them.

  For each log the index of the largest value along the last axis is looked
  up in the global ``n_c_dict`` and the characters are joined into a string.

  Returns the list of decoded strings, one per log.
  """
  best_indices = np.argmax(binarized_log, axis=-1)
  n_logs = binarized_log.shape[0]
  decoded = []
  for row in range(n_logs):
    text = ''.join(n_c_dict[index] for index in best_indices[row])
    print('Generated log: ', text)
    decoded.append(text)
  return decoded

# Model definitions

In [None]:
def make_decoder_model():
  """Build the latent-vector-to-log decoder and load its pre-trained weights.

  Architecture: the latent vector is repeated ``log_len`` times, passed
  through a bidirectional LSTM, and a per-time-step softmax over the
  ``len(c_n_dict)`` characters produces the output.

  Returns:
    The Keras model with weights loaded from 'weights_decoder_best.hdf5' in
    ``path_files``.
  """
  decoder_model = tf.keras.Sequential()
  # The input width is the shared latent_len constant (previously a literal 512),
  # so the decoder stays in step with the generator output and discriminator input.
  # The weights file must have been trained with the same width.
  decoder_model.add(layers.RepeatVector(log_len,input_shape=(latent_len,)))
  decoder_model.add(Bidirectional(LSTM(256,return_sequences=True)))
  decoder_model.add(layers.TimeDistributed(Dense(len(c_n_dict),activation='softmax')))
  decoder_model.load_weights(join(path_files,'weights_decoder_best.hdf5'))
  return decoder_model

In [None]:
def make_generator_model():
    """Build the generator: a dense network mapping noise to a latent vector.

    Input is a vector of ``noise_len`` values; output is a vector of
    ``latent_len`` values squashed by tanh.
    """
    return tf.keras.Sequential([
        Dense(1024, activation='relu', input_shape=(noise_len,)),
        Dense(512, activation='relu'),
        Dense(latent_len, activation='tanh'),
    ])

In [None]:
# Build the decoder and load its pre-trained weights from path_files.
decoder_model = make_decoder_model()

In [None]:
# Print the decoder's layers, output shapes and parameter counts.
decoder_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 repeat_vector_1 (RepeatVect  (None, 210, 512)         0         
 or)                                                             
                                                                 
 bidirectional_1 (Bidirectio  (None, 210, 512)         1574912   
 nal)                                                            
                                                                 
 time_distributed_1 (TimeDis  (None, 210, 68)          34884     
 tributed)                                                       
                                                                 
Total params: 1,609,796
Trainable params: 1,609,796
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Build the (untrained) generator.
generator_model = make_generator_model()

In [None]:
# Print the generator's layers, output shapes and parameter counts.
generator_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 1024)              132096    
                                                                 
 dense_3 (Dense)             (None, 512)               524800    
                                                                 
 dense_4 (Dense)             (None, 512)               262656    
                                                                 
Total params: 919,552
Trainable params: 919,552
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Smoke test: push 5 random noise vectors through the untrained generator.
# The noise is sampled as (batch, noise_len) so its rank matches the generator's
# declared input_shape=(noise_len,), rather than carrying a trailing size-1 axis
# that only works because Keras squeezes it implicitly.
noise = tf.random.normal([5, noise_len])
generated_log = generator_model(noise, training=False)
# Multiplying by 1 yields a new tensor holding the same values as generated_log.
generated_log_np = tf.multiply(generated_log,1)

In [None]:
# Display the generator output: one row of latent_len values per noise sample.
generated_log_np

<tf.Tensor: shape=(5, 512), dtype=float32, numpy=
array([[ 0.4686261 ,  0.11976805, -0.64332235, ...,  0.00119619,
         0.14156789, -0.00922195],
       [ 0.30733234,  0.4541695 , -0.3975639 , ..., -0.11179242,
         0.15578625,  0.1067327 ],
       [ 0.22862591,  0.12244097, -0.42136833, ...,  0.34721142,
         0.1372217 ,  0.16629401],
       [ 0.42050344, -0.00882316, -0.27556613, ...,  0.0197808 ,
         0.26825127,  0.392187  ],
       [ 0.03635823,  0.2617016 , -0.32123345, ...,  0.22509287,
        -0.11603758,  0.4230603 ]], dtype=float32)>

In [None]:
def make_discriminator_model():
  """Build the discriminator: a dense network scoring a latent vector.

  Input is a vector of ``latent_len`` values. The final layer is a single unit
  without activation, so the output is a raw score (logit).
  """
  return tf.keras.Sequential([
      Dense(1024, activation='relu', input_shape=(latent_len,)),
      Dense(512, activation='relu'),
      Dense(512, activation='relu'),
      Dense(512, activation='relu'),
      Dense(1),
  ])

In [None]:
# Build the (untrained) discriminator.
discriminator_model = make_discriminator_model()

In [None]:
# Print the discriminator's layers, output shapes and parameter counts.
discriminator_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 1024)              525312    
                                                                 
 dense_6 (Dense)             (None, 512)               524800    
                                                                 
 dense_7 (Dense)             (None, 512)               262656    
                                                                 
 dense_8 (Dense)             (None, 512)               262656    
                                                                 
 dense_9 (Dense)             (None, 1)                 513       
                                                                 
Total params: 1,575,937
Trainable params: 1,575,937
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Shared loss object. The discriminator's last Dense(1) layer has no activation, so its
# outputs are raw logits; from_logits=True tells the loss to treat them as such.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [None]:
def discriminator_loss(real_output, fake_output):
    """Discriminator loss: real samples should score as 1, generated ones as 0.

    Returns the sum of the cross-entropy of ``real_output`` against all-ones
    targets and of ``fake_output`` against all-zeros targets.
    """
    real_targets = tf.ones_like(real_output)
    fake_targets = tf.zeros_like(fake_output)
    return cross_entropy(real_targets, real_output) + cross_entropy(fake_targets, fake_output)

In [None]:
def generator_loss(fake_output):
    """Generator loss: cross-entropy of the discriminator's scores for generated
    samples against all-ones targets (the generator wants them judged real)."""
    wanted = tf.ones_like(fake_output)
    return cross_entropy(wanted, fake_output)

In [None]:
# One Adam optimizer per network (both constructed with 1e-4), so each keeps its own state.
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

In [None]:
# Checkpoints are written into path_files using the file prefix "ckpt".
checkpoint_dir = path_files
checkpoint_prefix = join(checkpoint_dir, "ckpt")
# Bundle both models and both optimizers into one checkpoint object; train() saves it periodically.
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 generator=generator_model,
                                 discriminator=discriminator_model)

In [None]:

# Kept defined because other cells may still reference it when sampling noise.
noise_dim = 1
num_examples_to_generate = 16

# Fixed noise reused every time sample logs are generated, so outputs from
# different epochs are comparable. Sampled as (n_examples, noise_len) so its rank
# matches the generator's declared input_shape=(noise_len,) instead of relying
# on Keras implicitly squeezing a trailing size-1 axis.
seed = tf.random.normal([num_examples_to_generate, noise_len])

# Training functions

In [None]:
def train_step(real_logs):
    """Run one adversarial update of the generator and the discriminator.

    Args:
      real_logs: batch of real latent vectors, one row per sample.

    Returns:
      Always 0; the losses are used only for the gradient updates.
    """
    # Generate exactly as many fake samples as there are real ones, so the two
    # halves of the discriminator loss stay balanced even when the caller uses
    # a batch size different from the global batch_size.
    n_real = tf.shape(real_logs)[0]
    # Noise is (batch, noise_len), matching the generator's input_shape=(noise_len,).
    noise = tf.random.normal([n_real, noise_len])
    # Two tapes: each network's loss is differentiated only w.r.t. its own variables.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
      generated_logs = generator_model(noise, training=True)
      real_output = discriminator_model(real_logs, training=True)
      fake_output = discriminator_model(generated_logs, training=True)
      gen_loss = generator_loss(fake_output)
      disc_loss = discriminator_loss(real_output, fake_output)
    gradients_of_generator = gen_tape.gradient(gen_loss, generator_model.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator_model.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator_model.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator_model.trainable_variables))
    return 0

In [None]:
def generate_and_save_logs(model, epoch, test_input):
  """Generate logs from noise, print them and save them to a per-epoch file.

  The generator output is decoded with the global ``decoder_model``, binarised
  and converted back to text. The logs are written, one per line, to
  'logs_at_epoc_<epoch>.log' inside ``path_files``, overwriting any existing
  file with that name.

  Args:
    model: generator mapping noise to latent vectors.
    epoch: integer used (zero-padded to 4 digits) in the output file name.
    test_input: noise batch fed to ``model``.
  """
  predictions = model(test_input, training=False)
  new_logs = print_log(binarize_log(decoder_model.predict(predictions)))
  out_path = join(path_files,'logs_at_epoc_{:04d}.log'.format(epoch))
  # The context manager guarantees the file is flushed and closed. writelines()
  # adds no separators itself, so each log gets an explicit newline.
  with open(out_path, 'w', encoding = "ISO-8859-1") as log_file:
    log_file.writelines(log + '\n' for log in new_logs)

In [None]:
def train(training_size, epochs, batch):
  """Train the GAN for ``epochs`` passes over the stored latent vectors.

  Every epoch walks the HDF5 training file in consecutive full batches of
  ``batch`` rows (a trailing partial batch is skipped), runs ``train_step`` on
  each, then generates and saves sample logs from the fixed global ``seed``.
  A checkpoint is saved every 15 epochs.

  Args:
    training_size: number of rows of the training file to use.
    epochs: number of passes over the data.
    batch: number of rows per training step.
  """
  data_path = join(path_files,training_data_file)
  for epoch in range(epochs):
    start = time.time()
    batch_start = 0
    # `<=` so that a final full batch ending exactly at training_size is used;
    # a strict `<` would drop it whenever training_size is a multiple of batch.
    while (batch_start+batch)<=training_size:
      (training_set,_) = load_batch(data_path,batch,batch_start)
      train_step(training_set)
      batch_start += batch
    # Produce logs as you go
    print('Epoch:',epoch)
    generate_and_save_logs(generator_model,
                             epoch + 1,
                             seed)

    # Save the model every 15 epochs
    if (epoch + 1) % 15 == 0:
      checkpoint.save(file_prefix = checkpoint_prefix)

    print ('Time for epoch {} is {} sec'.format(epoch + 1, time.time()-start))

  # Generate after the final epoch
  generate_and_save_logs(generator_model,
                           epochs,
                           seed)

# GAN Training

In [None]:
# Run the training loop: nb_epoch epochs over training_size rows in batches of batch_size.
train(training_size, nb_epoch, batch_size)

Epoch: 0
Generated log:  20222-10-01 001:10::01   E__mgr1111]::  %PRRRSNNN____RRRFF_______IAAREEEEVEN    ::    eeeeff                                              020000022222222000000----11100 00011110:::::0111       __mgr1171]]:: %%E
Generated log:  20222-10-01  01:20::01    f_mNr[111]]:: %EERRENNNNNN__GGF______IIIARREEEEEN    :::   eeeefff                                              00000022222222200000000000000000-0000  0011111:::::01111      ___mN[888]
Generated log:  20222-10-001 01::1:::11   f__mgr1111]::  %EEEEENNNNT___GRHF______IIIAAREEEEVENN    :     eeeffff                                              22222222222000000000000000000000000--0001  00111222:::::00111   bf__
Generated log:  20222-10-01  01:20::01    f_mmr[111]]:: %ERRRRSSAT___GGFFF______IAAAEEEEEVVNN   ::      eeeeff                                                   02222222200000000000000--0111110011111::::00111       ___mr[388]]
Generated log:  20222-10-01 011:15::011  f__mgr1111]::  %PRRRRSSAT___RRR3FS______HA

KeyboardInterrupt: ignored

# Test GAN

In [None]:

# Generate logs from the fixed seed with the generator's current weights.
# The output file name is derived from nb_epoch.
generate_and_save_logs(generator_model,
                           nb_epoch,
                           seed)

Generated log:  2022-10-01 02:20:01: if_mgr[2222]: %INTF-STATE_MGR-3-STATE_CHANGE_EVENT : Interface changed state to down: tt------    device E---                                                                                
Generated log:  2022-10-01 02:20:01: if_mgr[2222]: %INTF-STATE_MGR-3-STATE_CHANGE_EVENT : Interface changed state to doow:: t------    ddvvic  ---                                                                                
Generated log:  2022-10-01 02:20:01: if_mgr[2222]: %INTF-STATE_MGR-3-STATE_CHANGE_EVENT : Interface changed state to doown: t------     dvvice ----                                                                               
Generated log:  2022-10-01 02:20:01: if_mgr[2222]: %INTF-STATE_MGR-3-STATE_CHANGE_EVENT : Interface changed state to doow:: t------    ddvvic  ---                                                                                
Generated log:  2022-10-01 02:20:01: if_mgr[2222]: %INTF-STATE_MGR-3-STATE_CHANGE_EVENT : In

In [None]:
# Generate logs from freshly sampled noise to check the output on new inputs.
# Noise is (n_examples, noise_len), matching the generator's input_shape=(noise_len,).
# The file name depends only on the epoch argument, so passing nb_epoch here
# overwrites any file already written for that epoch.
new_seed = tf.random.normal([num_examples_to_generate, noise_len])
generate_and_save_logs(generator_model,
                           nb_epoch,
                           new_seed)

Generated log:  2022-10-01 02:20:01: if_mgr[2222]: %INTF-STATE_MGR-3-STATE_CHANGE_EVENT : Interface changed state to down::tt------    devvie  ---                                                                                
Generated log:  2022-10-01 02:20:01: if_mgr[2222]: %INTF-STATE_MGR-3-STATE_CHANGE_EVENT : Interface changed state to down: tt-----  i  dvvic  ---                                                                                 
Generated log:  2022-10-01 02:20:01: if_mgr[2222]: %INTF-STATE_MGR-3-STATE_CHANGE_EVENT : Interface changed state to down: tt------    device E---                                                                                
Generated log:  2022-10-01 02:20:01: if_mgr[2222]: %INTF-STATE_MGR-3-STATE_CHANGE_EVENT : Interface changed state to down: tt-----  i  dvvic  ---                                                                                 
Generated log:  2022-10-01 02:20:01: if_mgr[2222]: %INTF-STATE_MGR-3-STATE_CHANGE_EVENT : In

In [None]:
# Persist the current generator and discriminator weights as HDF5 files in path_files.
generator_model.save_weights(join(path_files,'GAN_generator_dense_best.hdf5'))
discriminator_model.save_weights(join(path_files,'GAN_discriminator_dense_best.hdf5'))
