<a href="https://colab.research.google.com/github/huanyanwei/ai-projects/blob/main/AI_on_Web_Logs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab line magic (this notebook is opened through the Colab badge above):
# it requests a TensorFlow 2.x release from the runtime.
# NOTE(review): presumably not available in plain Jupyter/IPython — confirm
# before running this notebook outside Colab.
%tensorflow_version 2.x

In [2]:
import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence

from tensorflow.keras.optimizers import Adam

In [3]:
import tensorflow as tf

# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
# The device list is empty on CPU-only runtimes, so iterate over whatever was
# found rather than indexing [0], which raised IndexError when no GPU existed.
# Every detected GPU gets the same setting.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu_device, enable=True)
    except RuntimeError as err:
        # Memory growth must be set before the GPU is initialized; re-running
        # this cell later in the session ends up here instead of aborting.
        print("Could not enable GPU memory growth:", err)

In [4]:
# Read the training and test CSV files from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv")
)

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0.1,Unnamed: 0,url,malicious
0,176,"127.0.0.1 - - [28/Apr/2021:22:30:44 -0400] ""GE...",1
1,199,"127.0.0.1 - - [28/Apr/2021:21:44:08 -0400] ""GE...",0
2,86,"127.0.0.1 - - [28/Apr/2021:22:30:38 -0400] ""GE...",1
3,230,"127.0.0.1 - - [28/Apr/2021:21:42:17 -0400] ""GE...",0
4,134,"127.0.0.1 - - [28/Apr/2021:22:14:52 -0400] ""GE...",0


In [5]:
# Pull the raw log strings (the "url" column) out of each frame as arrays.
X_train, X_test = (frame["url"].values for frame in (train_df, test_df))

# Show the first training log line.
X_train[0]

'127.0.0.1 - - [28/Apr/2021:22:30:44 -0400] "GET /DVWA/vulnerabilities/fi/?page=%5c%2e%2e%5c%2e%2e%5c%2e%2e%5c%2e%2e%5c%2e%2e%5c%2e%2e%5c%2e%2e%5c%2e%2e%5c%2e%2e%5c%2e%2e%5cetc%5cshadow HTTP/1.1" 200 1315 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0"'

In [6]:
# Training targets: the "malicious" column of the training frame.
y_train = train_df.loc[:, "malicious"].values

# Label of the first training example.
y_train[0]

1

In [7]:
# Build a Keras tokenizer with its default settings.
t = Tokenizer()

# Pool the train and test logs so the vocabulary spans both splits.
# NOTE(review): this lets test-set tokens shape the vocabulary — confirm that
# is intended.
all_comments = [*X_train, *X_test]
num_logs = len(all_comments)
print("There are a total of", num_logs, "logs in all of the data")

# Learn the word index from the pooled logs.
t.fit_on_texts(all_comments)

# Vocabulary size = number of distinct tokens the tokenizer counted.
total_num_of_words = len(t.word_counts)
print("There are a total of", total_num_of_words, "distinct words in all of the data")

There are a total of 236 logs in all of the data
There are a total of 270 distinct words in all of the data


In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every log is represented by exactly this many token ids.
max_length = 50

# Map each training log to its list of integer token ids.
X_train_encoded = t.texts_to_sequences(X_train)

# Pad at the front (padding='pre') so every row has length max_length.
X_train_encoded_padded = pad_sequences(
    X_train_encoded,
    padding='pre',
    maxlen=max_length,
)
X_train_encoded_padded[1]

array([ 0,  0,  0,  0,  0,  6,  1,  1,  3,  9, 10, 11, 31, 69, 73, 12, 24,
        5,  5, 49, 66, 50,  7,  3,  3, 57, 79,  7,  6,  1,  1,  3,  5, 35,
       28, 15, 14,  1, 16, 17, 18, 19, 20,  4,  1, 21, 22, 23,  4,  1],
      dtype=int32)

In [9]:
# Alternative architecture: embedding -> global max-pool -> dropout -> sigmoid.

from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop

# One row of max_length token ids per log line.
Inp = Input(shape=[max_length], name='inputs')

# 50-dimensional embedding over the tokenizer vocabulary.
# NOTE(review): the +1 presumably leaves room for the padding id 0 — confirm.
embedding_layer = Embedding(total_num_of_words + 1, 50, input_length=max_length)
embedded = embedding_layer(Inp)

# Collapse the sequence axis, then apply 50% dropout.
pooled = GlobalMaxPooling1D()(embedded)
dropped = Dropout(0.5, name='Dropout')(pooled)

# Single sigmoid unit: one score per log for the binary "malicious" label.
out = Dense(1, name='output', activation='sigmoid')(dropped)

In [10]:
# Wrap the input/output tensors into a trainable Keras model.
model2 = Model(outputs=out, inputs=Inp)

# Binary cross-entropy matches the single sigmoid output; Adam is created with
# 0.01 as its first positional argument; accuracy is tracked while training.
adam_optimizer = Adam(0.01)
model2.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model2.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 50)            13550     
_________________________________________________________________
global_max_pooling1d (Global (None, 50)                0         
_________________________________________________________________
Dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
output (Dense)               (None, 1)                 51        
Total params: 13,601
Trainable params: 13,601
Non-trainable params: 0
_________________________________________________________________


In [11]:
from tensorflow.keras.callbacks import EarlyStopping

# Halt training once validation loss stops improving by at least 0.001.
early_stop = EarlyStopping(min_delta=0.001, monitor='val_loss')

# At most 3 epochs, batches of 128, with 20% of the training data held back
# for validation (the split that feeds the early-stopping monitor).
fit_options = dict(
    batch_size=128,
    epochs=3,
    validation_split=0.2,
    callbacks=[early_stop],
)
model2.fit(X_train_encoded_padded, y_train, **fit_options)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f8d343952d0>

In [12]:
# Encode the test logs with the same tokenizer and padding settings as training.
X_test_encoded = t.texts_to_sequences(X_test)
X_test_encoded_padded = pad_sequences(
    X_test_encoded,
    padding='pre',
    maxlen=max_length,
)

# Inspect the first encoded test row.
first_test_row = X_test_encoded_padded[0]
print(first_test_row)

[  0   0   0   0   0   0   0   6   1   1   3   9  10  11  31  59   9  12
  24   5  36  28   7   3   3  26 260   7   6   1   1   3   5  35  28  15
  14   1  16  17  18  19  20   4   1  21  22  23   4   1]


In [13]:
# Inspect another encoded test row (index 23) for comparison with row 0.
row_23 = X_test_encoded_padded[23]
print(row_23)

[  0   0   0   0   0   6   1   1   3   9  10  11  27  44  74  12  24   5
   5  49 109  50   7   3   3  57 262   7   6   1   1   3   5  35  28  15
  14   1  16  17  18  19  20   4   1  21  22  23   4   1]


In [14]:
# Score every encoded test log with the trained model.
prediction = model2.predict(x=X_test_encoded_padded)

# Peek at the score for the first test log.
prediction[0]

array([0.40404952], dtype=float32)

In [24]:
# Round the raw sigmoid scores to two decimals for display. One decimal would
# collapse scores such as 0.41 and 0.46 onto 0.4 / 0.5, hiding the differences
# that a cut-off between them depends on.
round_predictions = np.around(prediction, decimals=2)

# Attach the rounded scores to the test frame as a new "malicious_predict"
# column, side by side with the true "malicious" label.
results_df = pd.concat(
    [test_df, pd.DataFrame(round_predictions, columns=["malicious_predict"])],
    axis=1,
)

results_df.head(50)

Unnamed: 0.1,Unnamed: 0,url,malicious,malicious_predict
0,56,"127.0.0.1 - - [28/Apr/2021:21:53:28 -0400] ""GE...",0,0.4
1,64,"127.0.0.1 - - [28/Apr/2021:21:44:08 -0400] ""GE...",0,0.4
2,129,"127.0.0.1 - - [28/Apr/2021:21:58:58 -0400] ""PO...",0,0.41
3,188,"127.0.0.1 - - [28/Apr/2021:22:49:46 -0400] ""GE...",0,0.41
4,84,"127.0.0.1 - - [28/Apr/2021:22:30:46 -0400] ""GE...",1,0.46
5,209,"127.0.0.1 - - [28/Apr/2021:22:31:13 -0400] ""GE...",1,0.46
6,25,"127.0.0.1 - - [28/Apr/2021:22:49:39 -0400] ""GE...",0,0.4
7,224,"127.0.0.1 - - [28/Apr/2021:22:30:52 -0400] ""GE...",1,0.47
8,174,"127.0.0.1 - - [28/Apr/2021:21:49:04 -0400] ""GE...",0,0.4
9,73,"127.0.0.1 - - [28/Apr/2021:22:49:40 -0400] ""GE...",0,0.4


In [29]:
# Scores at or above this cut-off are labelled malicious (1), the rest benign (0).
MALICIOUS_THRESHOLD = 0.45

# Derive the labels into a new array instead of overwriting `prediction` in
# place, so the raw scores stay available and re-running earlier cells still
# shows probabilities. The dtype is kept so the column still reads 0.0 / 1.0.
predicted_labels = (prediction >= MALICIOUS_THRESHOLD).astype(prediction.dtype)

# Attach the hard labels to the test frame next to the true "malicious" column.
results_df = pd.concat(
    [test_df, pd.DataFrame(predicted_labels, columns=["malicious_predict"])],
    axis=1,
)

results_df.head(50)

Unnamed: 0.1,Unnamed: 0,url,malicious,malicious_predict
0,56,"127.0.0.1 - - [28/Apr/2021:21:53:28 -0400] ""GE...",0,0.0
1,64,"127.0.0.1 - - [28/Apr/2021:21:44:08 -0400] ""GE...",0,0.0
2,129,"127.0.0.1 - - [28/Apr/2021:21:58:58 -0400] ""PO...",0,0.0
3,188,"127.0.0.1 - - [28/Apr/2021:22:49:46 -0400] ""GE...",0,0.0
4,84,"127.0.0.1 - - [28/Apr/2021:22:30:46 -0400] ""GE...",1,1.0
5,209,"127.0.0.1 - - [28/Apr/2021:22:31:13 -0400] ""GE...",1,1.0
6,25,"127.0.0.1 - - [28/Apr/2021:22:49:39 -0400] ""GE...",0,0.0
7,224,"127.0.0.1 - - [28/Apr/2021:22:30:52 -0400] ""GE...",1,1.0
8,174,"127.0.0.1 - - [28/Apr/2021:21:49:04 -0400] ""GE...",0,0.0
9,73,"127.0.0.1 - - [28/Apr/2021:22:49:40 -0400] ""GE...",0,0.0
