In [1]:
import pandas as pd
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
import nltk 
import string 
import re 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df_data = pd.read_csv('monoxor.csv')

In [3]:
df_data.head(3)

Unnamed: 0.1,Unnamed: 0,ip,desc,content-length,secure,protocol,issafe,isafe
0,0,::ffff:117.99.96.244,Top recognize eat. Fact whom spend area thing ...,70,False,http,,True
1,1,::ffff:117.99.96.244,As possible American many prepare four strong....,70,False,http,,True
2,2,::ffff:117.99.96.244,Tuesday Notes or 2 like 2 XSP Class,70,False,http,,False


In [4]:
l = len(df_data['desc'])

In [5]:
df_data['isafe'].value_counts()

True     572
False    428
Name: isafe, dtype: int64

In [6]:
lemmatizer = WordNetLemmatizer() 

def preprocessing(text):
    text = text.lower() 
    result = re.sub(r'\d+', '', text)
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator) 
    stop_words = set(stopwords.words("english")) 
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if word not in stop_words] 
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in filtered_text] 
    return lemmas


In [7]:
doc = []
for i in range(0, l):
    temp = preprocessing(df_data['desc'][i])
    temp = ' '.join(temp)
    doc.append(temp)

In [8]:
vectorizer = TfidfVectorizer()

In [9]:
arr = vectorizer.fit_transform(doc)

In [10]:
print('Document transform',arr.toarray())

Document transform [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.29720645 0.        ]]


In [11]:
print(vectorizer.get_feature_names())

['1230', 'ability', 'able', 'accept', 'accord', 'account', 'across', 'act', 'action', 'activity', 'actually', 'add', 'address', 'admin', 'administration', 'admit', 'adult', 'affect', 'age', 'agency', 'agent', 'ago', 'agree', 'agreement', 'ahead', 'air', 'allow', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'american', 'among', 'amount', 'analysis', 'animal', 'another', 'answer', 'anyone', 'anything', 'appear', 'apply', 'approach', 'area', 'argue', 'arm', 'around', 'arrive', 'art', 'article', 'artist', 'ask', 'assume', 'attack', 'attention', 'attorney', 'audience', 'author', 'authority', 'available', 'avoid', 'away', 'baby', 'back', 'bad', 'bag', 'ball', 'bank', 'bar', 'base', 'bear', 'beat', 'beautiful', 'become', 'bed', 'begin', 'behavior', 'behind', 'believe', 'benefit', 'best', 'better', 'beyond', 'big', 'bill', 'billion', 'bite', 'black', 'blank', 'blood', 'blue', 'board', 'body', 'book', 'box', 'boy', 'break', 'bring', 'brother', 'budget', 'build', 'busines

In [12]:
class_ = []
for i in range(0,l):
    if str(df_data['isafe'][i]) == 'True':
        class_.append(1)
    if str(df_data['isafe'][i]) == 'False':
        class_.append(0)

In [13]:
import numpy as np
class_arr = np.array(class_)

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(arr.toarray(), class_arr, test_size=0.20, random_state=16)

In [15]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x_train, y_train)

GaussianNB()

In [16]:
y_pred = model.predict(x_test)

In [17]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [18]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)

In [19]:
from sklearn.metrics import precision_score
precision = precision_score(y_test, y_pred)

In [20]:
from sklearn.metrics import recall_score
recall = recall_score(y_test, y_pred)

In [21]:
print("Accuracy is ", accuracy)
print("Precision is ", precision)
print("Recall is ", recall)
print("F1score is ", f1)

Accuracy is  0.85
Precision is  0.7941176470588235
Recall is  0.9818181818181818
F1score is  0.8780487804878049
