In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.metrics import confusion_matrix
# from keras.utils.np_utils import to_categorical 
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string 
import nltk 

In [None]:
# Read the raw threat-intelligence CSV from the Kaggle input directory.
csv_path = '/kaggle/input/text-based-cyber-threat-detection/cyber-threat-intelligence_all.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Column dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# Remove the 'Unnamed: 0' column from the frame.
df = df.drop(columns=['Unnamed: 0'])

In [None]:
# Rows without a label are assigned the 'benign' class.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the in-place form operates on
# an intermediate object and does not reliably update `df` under pandas
# Copy-on-Write.
df['label'] = df['label'].fillna('benign')

In [None]:
# Replace every remaining missing value in the frame with 0.
# NOTE(review): this applies to all columns, so a missing 'text' entry would
# become the integer 0 — confirm that is intended.
df = df.fillna(value=0)

In [None]:
# Re-check dtypes and non-null counts after dropping the column and filling gaps.
df.info()

In [None]:
# List the distinct label values present after filling.
df['label'].unique()

In [None]:
# Keep only the text and its label. `.copy()` makes `data` an independent
# frame, so the column assignments in later cells modify `data` itself rather
# than a slice of `df` (which triggers SettingWithCopyWarning).
data = df[['text','label']].copy()

In [None]:
# Shared resources used by preprocess_text.
punctuations = set(string.punctuation)          # single punctuation characters
stop_words = set(stopwords.words('english'))    # NLTK English stop-word list

lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    """Normalise one document into a space-joined string of lemmatised tokens.

    Steps, in order: tokenise with ``word_tokenize``; drop tokens that are a
    single punctuation character; lower-case; drop English stop words;
    lemmatise with the module-level ``lemmatizer``.

    Stop words are removed *before* lemmatisation. The WordNet lemmatizer in
    its default mode can rewrite stop words (e.g. 'was' -> 'wa', 'as' -> 'a'),
    after which they no longer match the stop-word set and leak into the
    output.

    Args:
        text: the raw document. Non-string values (for example a 0 left
            behind by a frame-wide ``fillna(0)``) are converted with ``str``
            instead of raising inside the tokenizer.

    Returns:
        The processed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        text = str(text)

    tokens = word_tokenize(text)

    # `punctuations` holds single characters, so only one-character
    # punctuation tokens are removed here.
    tokens = [token.lower() for token in tokens if token not in punctuations]

    tokens = [token for token in tokens if token not in stop_words]

    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

In [None]:
# Run the cleaning function over every document and store the result alongside the raw text.
data['text_new'] = data['text'].map(preprocess_text)

In [None]:
# The raw text is no longer needed once the cleaned column exists.
data = data.drop(columns=['text'])

In [None]:
# Delete every character that is not an ASCII letter, a digit, or whitespace.
non_alnum_pattern = r'[^a-zA-Z0-9\s]'
data['text_new'] = data['text_new'].str.replace(non_alnum_pattern, '', regex=True)

In [None]:
# Preview the cleaned text next to its label.
data.head()

In [None]:
# (rows, columns) of the working frame.
data.shape

In [None]:
# Collapse the listed labels into a single "NEED_ATTENTION" class.
# The label strings are matched exactly as written.
# NOTE(review): "Infrastucture" is kept with this spelling on the assumption
# that it matches the dataset — verify against df['label'].unique().
attention_labels = [
    "MD5", "REGISTRYKEY", "EMAIL", "Infrastucture", "DOMAIN", "SHA1", "IPV4",
    "campaign", "URL", "SHA2", "vulnerability", "FILEPATH", "tools", "TIME",
    "url", "hash",
]
# A single .loc call on the frame (row mask + column name) writes into `data`
# directly; the previous `data.label.loc[mask] = ...` was chained assignment
# through an intermediate Series.
data.loc[data['label'].isin(attention_labels), 'label'] = "NEED_ATTENTION"

In [None]:
# Class distribution after merging labels into NEED_ATTENTION.
data['label'].value_counts()

In [None]:
# Same class-distribution query as the previous cell, via attribute access.
data.label.value_counts()

In [None]:
# Show the cleaned text stored under index label 0.
data['text_new'][0]

In [None]:
# Length of the longest cleaned document, measured in characters (str.len), not tokens.
max_length = data['text_new'].str.len().max()
max_length

In [None]:
# Largest number of whitespace-separated tokens in any cleaned document.
max_length_tokens = data['text_new'].str.split().str.len().max()

In [None]:
# Largest number of characters in any cleaned document.
max_length_characters = data['text_new'].str.len().max()

# Report both maxima.
print('Maximum Sequence Length (Tokens):', max_length_tokens)
print('Maximum Sequence Length (Characters):', max_length_characters)

In [None]:
# Hyper-parameters shared by the tokenizer and the models.
MAX_NB_WORDS = 50_000    # Tokenizer num_words and Embedding input dimension
MAX_SEQ_LENGTH = 450     # maxlen passed to pad_sequences
EMBEDDING_DIM = 100      # Embedding output dimension

In [None]:
# Give the cleaned column the plain name 'text' for the modelling cells.
data = data.rename(columns=dict(text_new='text'))

In [None]:
# Word-level tokenizer limited to MAX_NB_WORDS entries; the characters in
# `filters` are stripped from the text and everything is lower-cased.
# The filter string is a raw literal: it contains a backslash followed by ']',
# which is an invalid escape sequence in a normal string literal (a warning
# today, an error in future Python versions). The resulting characters are
# unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%()*+,-./:;<=>?@[\]^_`{|}', lower=True)
tokenizer.fit_on_texts(data['text'].values)

In [None]:
# Word-to-index mapping built by fit_on_texts; its length is the number of distinct words seen.
word_index = tokenizer.word_index
len(word_index)

In [None]:
# Turn each document into a list of word indices, then pad/truncate every
# list to MAX_SEQ_LENGTH to obtain the model input matrix.
sequences = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)

In [None]:
# Display the padded index matrix.
X 

In [None]:
# (number of documents, MAX_SEQ_LENGTH)
X.shape

In [None]:
# One-hot encode the labels with pandas.
# NOTE(review): this value of Y is overwritten by the LabelBinarizer cell
# that follows before anything reads it.
Y = pd.get_dummies(data['label']).values

In [None]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the labels; the fitted binarizer is kept so that
# lb.classes_ can later map column positions back to label names.
lb = LabelBinarizer()
Y = lb.fit(data['label']).transform(data['label'])

In [None]:
# Display the encoded label matrix.
Y 

In [None]:
# Shape of the encoded label matrix.
Y.shape 

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# First classifier: Embedding -> LSTM -> Dense(50) -> Dropout -> softmax.
# The output width is taken from the encoded label matrix rather than being
# hard-coded, so the network stays in sync with the label set.
num_classes = Y.shape[1]

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.2))
# NOTE(review): this Dense layer has no activation, i.e. it is linear —
# confirm that is intended (the second model uses activation='relu').
model.add(Dense(50))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

In [None]:
# Multi-class setup: categorical cross-entropy on one-hot targets, Adam optimizer, accuracy metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Print the layer-by-layer summary of the first model.
model.summary()

In [None]:
model.save_weights("model.h5")
print("Saved model to disk")

In [None]:
epochs=10
batch_size=64
history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
# Score the first model on the held-out test split; evaluate() returns [loss, accuracy].
accr = model.evaluate(x_test, y_test)
test_loss, test_acc = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_acc))

In [None]:
# Compare the predicted class probabilities with the one-hot targets for the first five test samples.
predictions = model.predict(x_test)
for predicted_row, actual_row in zip(predictions[:5], y_test[:5]):
    print('Prediction: {}'.format(predicted_row))
    print('Actual: {}'.format(actual_row))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Training vs. validation loss per epoch.
# The second curve is val_loss, computed on the validation_split portion of
# the training data, so it is labelled 'validation' rather than 'test'.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show();

In [None]:
# Predicted class index for each test sample (position of the largest probability).
y_pred = model.predict(x_test).argmax(axis=1)

In [None]:
# Label names in the column order used by the LabelBinarizer encoding.
class_labels = lb.classes_

print(class_labels)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Class indices for the test split: ground truth from the one-hot targets,
# predictions from the first model.
y_true = np.argmax(y_test, axis=1)
y_pred = np.argmax(model.predict(x_test), axis=1)

# Passing `labels` fixes the matrix to one row/column per class in
# class_labels order. Without it, a class that is absent from both y_true and
# y_pred is dropped and the tick labels below no longer line up.
cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_labels)))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# NOTE(review): looks like a leftover debug/smoke-test cell; it has no effect on the analysis.
print('hi')

In [None]:
# Overall accuracy: correctly classified samples (matrix trace) over all samples.
accuracy = np.trace(cm) / cm.sum()
accuracy

In [None]:
# Per-class precision: true positives over everything predicted as that class
# (column sums). Classes that were never predicted get 0.0 instead of a
# NaN produced by dividing by zero.
predicted_per_class = cm.sum(axis=0)
precision = np.divide(
    cm.diagonal(),
    predicted_per_class,
    out=np.zeros(len(cm), dtype=float),
    where=predicted_per_class != 0,
)
precision

In [None]:
# Per-class recall: true positives over all samples that truly belong to the
# class (row sums). Classes with no true samples get 0.0 instead of a NaN
# produced by dividing by zero.
actual_per_class = cm.sum(axis=1)
recall = np.divide(
    cm.diagonal(),
    actual_per_class,
    out=np.zeros(len(cm), dtype=float),
    where=actual_per_class != 0,
)
recall

In [None]:
# Per-class F1: harmonic mean of precision and recall. Where both are zero
# the score is set to 0.0 instead of dividing by zero.
f1_denominator = precision + recall
f1_score = np.divide(
    2 * (precision * recall),
    f1_denominator,
    out=np.zeros_like(f1_denominator, dtype=float),
    where=f1_denominator != 0,
)
f1_score

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
  """Plot one training metric and its validation counterpart per epoch.

  Args:
    history: object whose ``history`` attribute maps metric names to
      per-epoch value lists (as returned by ``fit``).
    string: metric name; the validation series is read from ``'val_' + string``.
  """
  val_key = 'val_'+string
  for series_key in (string, val_key):
    plt.plot(history.history[series_key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()
  
# Accuracy and loss curves for the most recent training run.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

In [None]:
import matplotlib.pyplot as plt

# Build a separate, untrained network purely to illustrate the layer stack.
# It is bound to its own name so that the trained `model` from the earlier
# cells is not replaced by an uncompiled one, and it uses the tensorflow.keras
# classes imported at the top of the notebook instead of rebinding
# Sequential/Embedding/LSTM/Dropout/Dense to the standalone `keras` package.
diagram_model = Sequential()
diagram_model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
diagram_model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.2))
diagram_model.add(Dropout(0.2))
diagram_model.add(Dense(Y.shape[1], activation='softmax'))

def plot_model_diagram(model):
    """Draw one labelled box per layer showing its class name and output shape.

    Args:
        model: a model exposing ``layers``; each layer must provide
            ``output_shape``. The first entry of each shape is dropped
            (presumably the batch dimension — confirm).
    """
    layer_names = [layer.__class__.__name__ for layer in model.layers]
    output_shapes = [layer.output_shape[1:] for layer in model.layers]

    plt.figure(figsize=(5, 5))
    plt.title("Model Diagram")
    plt.xlabel("Layers")
    plt.ylabel("Output Shapes")

    for i, (layer_name, output_shape) in enumerate(zip(layer_names, output_shapes)):
        # Render the shape tuple as text, e.g. "(450, 100)" becomes "450x 100".
        output_shape_str = str(output_shape).replace(",", "x").replace("(", "").replace(")", "")

        plt.text(0.5, i, f"{layer_name}\n{output_shape_str}",
                 horizontalalignment='center',
                 verticalalignment='center',
                 bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.3'))

    plt.yticks(range(len(layer_names)), layer_names)
    plt.xticks([])
    plt.ylim(len(layer_names) - 0.5, -0.5)  # Reverse y-axis so the first layer is on top
    plt.tight_layout()
    plt.show()

# Generate the model diagram
plot_model_diagram(diagram_model)


# new fit

In [None]:
# Second classifier: Embedding -> LSTM -> Dropout -> Dense(64, relu) -> softmax.
# The output width follows the encoded label matrix instead of a hard-coded 8.
model1 = Sequential()
model1.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model1.add(LSTM(150, dropout=0.2, recurrent_dropout=0.2))
model1.add(Dropout(0.2))
model1.add(Dense(64, activation='relu'))
model1.add(Dense(Y.shape[1], activation='softmax'))

In [None]:
# Same training configuration as the first model.
model1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the second model.
model1.summary()

In [None]:
# Train the second model with a larger epoch budget and batch size.
# `history` is rebound here, so the plotting cells that follow show this run.
epochs = 15
batch_size = 75
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
history = model1.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [None]:
# Score the second model on the held-out test split. This previously called
# `model.evaluate`, which reported on the wrong network, not the `model1`
# that was just trained.
accr = model1.evaluate(x_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
# Training vs. validation loss per epoch for the latest fit.
# val_loss comes from the validation_split portion of the training data, so
# the curve is labelled 'validation' rather than 'test'.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show();

In [None]:
# Predicted class index per test sample from the second model (this
# previously used `model`, not the `model1` trained in this section).
y_pred = np.argmax(model1.predict(x_test), axis=1)

In [None]:
# Overall accuracy from the confusion matrix `cm`.
# NOTE(review): `cm` was built for the first model; the matrix for model1
# (`cm1`) is only created in a later cell, so this repeats the first model's
# figure — confirm whether this cell should move below that one and use cm1.
accuracy = (cm.diagonal().sum()) / cm.sum()
accuracy

In [None]:
# Per-class precision: diagonal over column sums of `cm`.
# NOTE(review): `cm` belongs to the first model; `cm1` does not exist yet at
# this point in the notebook — confirm which matrix is intended.
precision = cm.diagonal() / cm.sum(axis=0)
precision

In [None]:
# Per-class recall: diagonal over row sums of `cm`.
# NOTE(review): `cm` belongs to the first model; `cm1` does not exist yet at
# this point in the notebook — confirm which matrix is intended.
recall = cm.diagonal() / cm.sum(axis=1)
recall

In [None]:
# Per-class F1 score: harmonic mean of the precision and recall arrays computed in the preceding cells.
f1_score = 2 * (precision * recall) / (precision + recall)
f1_score

In [None]:
# Label names in the column order used by the LabelBinarizer encoding.
class_labels = lb.classes_

print(class_labels)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Class indices for the test split: ground truth from the one-hot targets,
# predictions from the second model (this previously called `model.predict`,
# so the matrix did not describe model1).
y_true = np.argmax(y_test, axis=1)
y_pred = np.argmax(model1.predict(x_test), axis=1)

# `labels` pins the matrix to one row/column per class in class_labels order,
# so the tick labels below always line up even if a class is absent.
cm1 = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_labels)))

plt.figure(figsize=(8, 6))
sns.heatmap(cm1, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Shape of the predicted class-index array.
y_pred.shape

In [None]:
# Shape of the true class-index array.
y_true.shape

In [None]:
# Overall accuracy of the second model, taken from its own confusion matrix
# `cm1` (this previously read `cm`, the first model's matrix).
accuracy = (cm1.diagonal().sum()) / cm1.sum()
accuracy

In [None]:
# Per-class precision of the second model from `cm1` (previously read `cm`,
# the first model's matrix). Classes that were never predicted get 0.0
# instead of a NaN produced by dividing by zero.
predicted_per_class = cm1.sum(axis=0)
precision = np.divide(
    cm1.diagonal(),
    predicted_per_class,
    out=np.zeros(len(cm1), dtype=float),
    where=predicted_per_class != 0,
)
precision

In [None]:
# Per-class recall of the second model from `cm1` (previously read `cm`,
# the first model's matrix). Classes with no true samples get 0.0 instead
# of a NaN produced by dividing by zero.
actual_per_class = cm1.sum(axis=1)
recall = np.divide(
    cm1.diagonal(),
    actual_per_class,
    out=np.zeros(len(cm1), dtype=float),
    where=actual_per_class != 0,
)
recall

In [None]:
# Per-class F1: harmonic mean of the precision and recall arrays computed in
# the preceding cells. Where both are zero the score is set to 0.0 instead of
# dividing by zero.
f1_denominator = precision + recall
f1_score = np.divide(
    2 * (precision * recall),
    f1_denominator,
    out=np.zeros_like(f1_denominator, dtype=float),
    where=f1_denominator != 0,
)
f1_score

In [None]:
# Label names in the column order used by the LabelBinarizer encoding.
class_labels = lb.classes_

print(class_labels)

In [None]:
# Training vs. validation loss per epoch for the latest fit.
# val_loss comes from the validation_split portion of the training data, so
# the curve is labelled 'validation' rather than 'test'.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show();

In [None]:
# plot_graphs is already defined earlier in this notebook; reuse that
# definition instead of declaring an identical copy that silently shadows it.
# `history` now holds the second model's training run.
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

# Another notebook covering LSTM, RNN, and GRU models, trained with different optimizers, will be released soon.