In [1]:
# Load EDA Pkgs
import pandas as pd
import numpy as np

In [2]:
# Load Data Viz Pkgs
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# ML Pkgs
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss,classification_report

In [4]:
### Split Dataset into Train and Test
from sklearn.model_selection import train_test_split
# Feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Multi Label Pkgs
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN

In [6]:
train_path = "/kaggle/input/code-injection-6labels-balance/code_injection_6labels_balance.csv"


In [7]:
# Load Dataset
df = pd.read_csv(train_path)

In [8]:
target_list = ["000 - Normal", '126 - Path Traversal',
               '242 - Code Injection', '274 - HTTP Verb Tampering',
               '66 - SQL Injection', '88 - OS Command Injection']

In [9]:
df

Unnamed: 0.1,Unnamed: 0,text,000 - Normal,126 - Path Traversal,242 - Code Injection,274 - HTTP Verb Tampering,66 - SQL Injection,88 - OS Command Injection
0,0,GET /,1,0,0,0,0,0
1,1,GET /blog/index.php/2020/04/04/voluptatum-repr...,1,0,0,0,0,0
2,2,GET /blog/xmlrpc.php?rsd,1,0,0,0,0,0
3,3,POST /blog/index.php/my-account/user-logout/?_...,0,0,0,0,0,1
4,4,GET /blog/index.php/2020/04/04/nihil-tenetur-e...,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
69730,69730,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1
69731,69731,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1
69732,69732,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1
69733,69733,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1


In [10]:
!pip install neattext

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3


In [11]:
import neattext as nt
import neattext.functions as nfx

In [12]:
# Explore For Noise
df['text'].apply(lambda x:nt.TextFrame(x).noise_scan())

0        {'text_noise': 40.0, 'text_length': 5, 'noise_...
1        {'text_noise': 17.28395061728395, 'text_length...
2        {'text_noise': 20.833333333333336, 'text_lengt...
3        {'text_noise': 8.996539792387544, 'text_length...
4        {'text_noise': 22.58064516129032, 'text_length...
                               ...                        
69730    {'text_noise': 8.58085808580858, 'text_length'...
69731    {'text_noise': 9.24092409240924, 'text_length'...
69732    {'text_noise': 9.30232558139535, 'text_length'...
69733    {'text_noise': 8.49673202614379, 'text_length'...
69734    {'text_noise': 16.071428571428573, 'text_lengt...
Name: text, Length: 69735, dtype: object

In [13]:
# Extract the stopwords found in each request
df['text'].apply(lambda x:nt.TextExtractor(x).extract_stopwords())

0        [get]
1        [get]
2        [get]
3           []
4        [get]
         ...  
69730       []
69731       []
69732       []
69733       []
69734    [get]
Name: text, Length: 69735, dtype: object

In [14]:
# Preview the text with stopwords removed
df['text'].apply(nfx.remove_stopwords)

0                                                        /
1        /blog/index.php/2020/04/04/voluptatum-reprehen...
2                                     /blog/xmlrpc.php?rsd
3        POST /blog/index.php/my-account/user-logout/?_...
4        /blog/index.php/2020/04/04/nihil-tenetur-et-ad...
                               ...                        
69730    POST /blog/index.php/my-account/user-logout/?_...
69731    POST /blog/index.php/my-account/user-logout/?_...
69732    POST /blog/index.php/my-account/user-logout/?_...
69733    POST /blog/index.php/my-account/user-logout/?_...
69734    /blog/index.php/my-account/edit-profile%28?mod...
Name: text, Length: 69735, dtype: object

In [15]:
# Stopword-stripped copy of the request text; reused below to train Word2Vec
# and to build the feature matrix.
# NOTE(review): the outputs of the exploration cells above show 'GET' being
# dropped as a stopword while 'POST' is kept - confirm that losing the HTTP
# verb is intended for this task.
corpus = df['text'].apply(nfx.remove_stopwords)

In [16]:
import gensim
from gensim.models import Word2Vec
# Whitespace-tokenise every (stopword-stripped) request.
sentences = [sentence.split() for sentence in corpus]
# Seed the model so the embeddings - and every metric computed from them -
# do not drift from one run to the next (42 matches the train/test split).
# NOTE: with workers > 1 thread scheduling can still add small run-to-run
# differences; use workers=1 if bit-for-bit reproducibility is required.
w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4, seed=42)

In [17]:
import numpy as np

def vectorize(sentence):
    """Embed a sentence as the mean of its in-vocabulary Word2Vec vectors.

    Parameters
    ----------
    sentence : str
        Text that is split on whitespace into tokens.

    Returns
    -------
    numpy.ndarray
        1-D array: the element-wise mean of the vectors of every token found
        in ``w2v_model.wv``. When no token is in the vocabulary, a zero vector
        of the model's own dimensionality is returned.
    """
    keyed_vectors = w2v_model.wv
    words_vecs = [keyed_vectors[word] for word in sentence.split() if word in keyed_vectors]
    if not words_vecs:
        # Take the width from the model instead of hard-coding 100, so the
        # fallback row always lines up with the real embeddings.
        return np.zeros(keyed_vectors.vector_size)
    return np.array(words_vecs).mean(axis=0)

# Feature matrix: one averaged embedding per request in `corpus`.
X = np.array(list(map(vectorize, corpus)))
# Target frame: the label columns named in `target_list`.
y = df.loc[:, target_list]

In [18]:
# Split Data 
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [19]:
from sklearn.preprocessing import MinMaxScaler #fixed import

# Fit the scaler on the training split only, then reuse the fitted scaler on
# the test split so no test-set statistics are used when fitting.
# NOTE(review): presumably the rescaling is here because the averaged
# embeddings can be negative and MultinomialNB is used below - confirm.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
### Problem Transform
import skmultilearn

In [21]:
def build_model(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    """Fit a multi-label wrapper around ``model`` and score it on the test split.

    Parameters
    ----------
    model : estimator
        Base classifier handed to ``mlb_estimator``.
    mlb_estimator : callable
        Called as ``mlb_estimator(model)`` to build the multi-label classifier
        (e.g. ``BinaryRelevance``, ``ClassifierChain``, ``LabelPowerset``).
    xtrain, ytrain
        Training features and labels passed to ``fit``.
    xtest, ytest
        Test features passed to ``predict`` and the labels to score against.

    Returns
    -------
    dict
        ``"accuracy"`` and ``"hamming_loss"`` hold the two scores. The legacy
        keys ``"accuracy:"`` and ``"hamming_score"`` carry the same values and
        are kept only for backward compatibility; note that the value under
        ``"hamming_score"`` is the Hamming *loss*.

    Side effects
    ------------
    Prints the full metric summary via ``evaluate_model``.
    """
    # Create an instance and train it
    clf = mlb_estimator(model)
    clf.fit(xtrain, ytrain)
    # Predict
    clf_predictions = clf.predict(xtest)
    # Print the detailed report, then collect the headline numbers
    evaluate_model(ytest, clf_predictions)
    acc = accuracy_score(ytest, clf_predictions)
    ham = hamming_loss(ytest, clf_predictions)
    return {
        "accuracy": acc,
        "hamming_loss": ham,
        # Legacy keys (stray colon / mislabelled loss) kept so existing
        # consumers of the result dict keep working.
        "accuracy:": acc,
        "hamming_score": ham,
    }

In [22]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,accuracy_score, f1_score

In [23]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Mean per-sample overlap between true and predicted label sets.

    For every row the score is ``|true AND pred| / |true OR pred|`` over the
    non-zero label positions; a row with no true and no predicted labels
    scores 1. The mean over all rows is returned.

    Parameters
    ----------
    y_true, y_pred : pandas.DataFrame, numpy.ndarray, scipy sparse matrix or nested list
        Label indicator matrices with one row per sample and one column per
        label. Any non-zero entry counts as "label present".
    normalize, sample_weight
        Accepted for signature compatibility only; both are ignored.

    Returns
    -------
    numpy.float64
        Mean of the per-row scores (``nan`` for zero rows, as with ``np.mean``
        of an empty list).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    def _as_dense(labels):
        # Sparse matrices expose toarray(); pandas objects expose to_numpy().
        if hasattr(labels, "toarray"):
            labels = labels.toarray()
        elif hasattr(labels, "to_numpy"):
            labels = labels.to_numpy()
        return np.asarray(labels)

    true_mask = _as_dense(y_true).astype(bool)
    pred_mask = _as_dense(y_pred).astype(bool)
    if true_mask.shape != pred_mask.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{true_mask.shape} and {pred_mask.shape}"
        )

    intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
    union = np.logical_or(true_mask, pred_mask).sum(axis=1)
    # Rows with an empty union (nothing true, nothing predicted) count as a
    # perfect match; np.maximum only guards the division for those rows.
    scores = np.where(union == 0, 1.0, intersection / np.maximum(union, 1))
    return np.mean(scores)

In [24]:
# Manual baseline: the same wrap -> fit -> predict steps that build_model performs.
clf = BinaryRelevance(MultinomialNB())
clf.fit(X_train,y_train)
# Predict
clf_predictions = clf.predict(X_test)
# The predictions are scored in the next cell with hamming_score.

In [25]:
hamming_score(y_test, clf_predictions)

0.24215302009145515

In [26]:
def evaluate_model(y_test, br_prediction):
    """Print a metric summary for a set of multi-label predictions.

    Computes micro-averaged F1, precision and recall, the per-label
    classification report (4 digits), ``accuracy_score`` (printed as "EMR"),
    ``hamming_score`` (printed as "Acc") and ``hamming_loss``, and writes them
    to stdout. Nothing is returned.

    Parameters
    ----------
    y_test
        Ground-truth label matrix.
    br_prediction
        Predicted label matrix, as returned by the classifier's ``predict``.
    """
    micro_f1 = f1_score(y_test, br_prediction, average="micro")
    micro_precision = precision_score(y_test, br_prediction, average="micro")
    micro_recall = recall_score(y_test, br_prediction, average="micro")
    per_label_report = classification_report(y_test, br_prediction, digits=4)
    exact_match = accuracy_score(y_test, br_prediction)
    overlap_score = hamming_score(y_test, br_prediction)

    print('\n clasification report:\n', per_label_report)
    summary_rows = (
        ('F1 score:', micro_f1),
        ('Recall:', micro_recall),
        ('Precision:', micro_precision),
        ('EMR:', exact_match),
        ('Acc:', overlap_score),
    )
    for label, value in summary_rows:
        print(label, value)
    # Hamming loss is computed last, at print time.
    print('Hamming Loss', hamming_loss(y_test, br_prediction))
 

In [27]:
# build_model returns a dict of scores, so despite its name this variable
# holds the Binary Relevance metrics, not a fitted classifier.
binary_rel_clf = build_model(MultinomialNB(),BinaryRelevance,X_train,y_train,X_test,y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



 clasification report:
               precision    recall  f1-score   support

           0     0.1845    0.6578    0.2882      4407
           1     0.1807    0.0220    0.0392      4775
           2     0.0000    0.0000    0.0000      4636
           3     0.5536    0.8306    0.6644      1665
           4     0.2728    0.9073    0.4195      4781
           5     0.3799    0.6069    0.4673      2226

   micro avg     0.2634    0.4480    0.3318     22490
   macro avg     0.2619    0.5041    0.3131     22490
weighted avg     0.2111    0.4480    0.2494     22490
 samples avg     0.2562    0.4591    0.3196     22490

F1 score: 0.33177477774119196
Recall: 0.44802134281903067
Precision: 0.26342483660130717
EMR: 0.016729601835476317
Acc: 0.24215302009145515
Hamming Loss 0.3233433710944346


In [28]:
# build_model returns a dict of scores, so despite its name this variable
# holds the Classifier Chain metrics, not a fitted classifier.
clf_chain_model = build_model(MultinomialNB(),ClassifierChain,X_train,y_train,X_test,y_test)


 clasification report:
               precision    recall  f1-score   support

           0     0.1845    0.6578    0.2882      4407
           1     0.2000    0.0019    0.0037      4775
           2     0.0000    0.0000    0.0000      4636
           3     0.5495    0.8306    0.6614      1665
           4     0.0535    0.0155    0.0240      4781
           5     0.3233    0.4200    0.3654      2226

   micro avg     0.2348    0.2357    0.2352     22490
   macro avg     0.2185    0.3210    0.2238     22490
weighted avg     0.1627    0.2357    0.1475     22490
 samples avg     0.2156    0.2394    0.2202     22490

F1 score: 0.23524189968930312
Recall: 0.23566029346376166
Precision: 0.23482498892334958
EMR: 0.1587878208498638
Acc: 0.20441980147539152
Hamming Loss 0.27452479964310184


  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# build_model returns a dict of scores, so despite its name this variable
# holds the Label Powerset metrics, not a fitted classifier.
clf_labelP_model = build_model(MultinomialNB(),LabelPowerset,X_train,y_train,X_test,y_test)

  _warn_prf(average, modifier, msg_start, len(result))



 clasification report:
               precision    recall  f1-score   support

           0     0.8518    0.1917    0.3130      4407
           1     0.0000    0.0000    0.0000      4775
           2     0.5734    0.0455    0.0843      4636
           3     0.9269    0.8300    0.8758      1665
           4     0.2666    0.9119    0.4126      4781
           5     0.6472    0.6065    0.6262      2226

   micro avg     0.3827    0.3623    0.3722     22490
   macro avg     0.5443    0.4309    0.3853     22490
weighted avg     0.4745    0.3623    0.2933     22490
 samples avg     0.3796    0.3634    0.3678     22490

F1 score: 0.37223326252312755
Recall: 0.36229435304579816
Precision: 0.38273286673869134
EMR: 0.3413316763061039
Acc: 0.36092124340774023
Hamming Loss 0.2189426891639979
