In [1]:
# Load EDA Pkgs
import pandas as pd
import numpy as np

In [2]:
# Load Data Viz Pkgs
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# ML Pkgs
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss,classification_report

In [4]:
### Split Dataset into Train and Text
from sklearn.model_selection import train_test_split
# Feature engineering
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Multi Label Pkgs
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN

In [6]:
train_path = "/kaggle/input/code-injection-6labels-balance/code_injection_6labels_balance.csv"


In [7]:
# Load Dataset
df = pd.read_csv(train_path)

In [8]:
target_list = ["000 - Normal", '126 - Path Traversal',
               '242 - Code Injection', '274 - HTTP Verb Tampering',
               '66 - SQL Injection', '88 - OS Command Injection']

In [9]:
df

Unnamed: 0.1,Unnamed: 0,text,000 - Normal,126 - Path Traversal,242 - Code Injection,274 - HTTP Verb Tampering,66 - SQL Injection,88 - OS Command Injection
0,0,GET /,1,0,0,0,0,0
1,1,GET /blog/index.php/2020/04/04/voluptatum-repr...,1,0,0,0,0,0
2,2,GET /blog/xmlrpc.php?rsd,1,0,0,0,0,0
3,3,POST /blog/index.php/my-account/user-logout/?_...,0,0,0,0,0,1
4,4,GET /blog/index.php/2020/04/04/nihil-tenetur-e...,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
69730,69730,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1
69731,69731,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1
69732,69732,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1
69733,69733,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1


In [10]:
!pip install neattext

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3


In [11]:
import neattext as nt
import neattext.functions as nfx

In [12]:
# Explore For Noise
df['text'].apply(lambda x:nt.TextFrame(x).noise_scan())

0        {'text_noise': 40.0, 'text_length': 5, 'noise_...
1        {'text_noise': 17.28395061728395, 'text_length...
2        {'text_noise': 20.833333333333336, 'text_lengt...
3        {'text_noise': 8.996539792387544, 'text_length...
4        {'text_noise': 22.58064516129032, 'text_length...
                               ...                        
69730    {'text_noise': 8.58085808580858, 'text_length'...
69731    {'text_noise': 9.24092409240924, 'text_length'...
69732    {'text_noise': 9.30232558139535, 'text_length'...
69733    {'text_noise': 8.49673202614379, 'text_length'...
69734    {'text_noise': 16.071428571428573, 'text_lengt...
Name: text, Length: 69735, dtype: object

In [13]:
# Explore For Noise
df['text'].apply(lambda x:nt.TextExtractor(x).extract_stopwords())

0        [get]
1        [get]
2        [get]
3           []
4        [get]
         ...  
69730       []
69731       []
69732       []
69733       []
69734    [get]
Name: text, Length: 69735, dtype: object

In [14]:
# Explore For Noise
df['text'].apply(nfx.remove_stopwords)

0                                                        /
1        /blog/index.php/2020/04/04/voluptatum-reprehen...
2                                     /blog/xmlrpc.php?rsd
3        POST /blog/index.php/my-account/user-logout/?_...
4        /blog/index.php/2020/04/04/nihil-tenetur-et-ad...
                               ...                        
69730    POST /blog/index.php/my-account/user-logout/?_...
69731    POST /blog/index.php/my-account/user-logout/?_...
69732    POST /blog/index.php/my-account/user-logout/?_...
69733    POST /blog/index.php/my-account/user-logout/?_...
69734    /blog/index.php/my-account/edit-profile%28?mod...
Name: text, Length: 69735, dtype: object

In [15]:
corpus = df['text'].apply(nfx.remove_stopwords)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
tfidf = TfidfVectorizer()

In [17]:
# Build Features
Xfeatures = tfidf.fit_transform(corpus)

In [18]:
len(tfidf.get_feature_names_out())

21634

In [19]:
print(Xfeatures.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [20]:
len(Xfeatures.toarray()[0])

21634

In [21]:
y = df[target_list]

In [22]:
# Split Data 
X_train,X_test,y_train,y_test = train_test_split(Xfeatures,y,test_size=0.3,random_state=42)

In [23]:
### Problem Transform
import skmultilearn

In [24]:
def build_model(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    # Create an Instance
    clf = mlb_estimator(model)
    clf.fit(xtrain,ytrain)
    # Predict
    clf_predictions = clf.predict(xtest)
    # Check For Accuracy
    evaluate_model(ytest, clf_predictions)
    acc = accuracy_score(ytest,clf_predictions)
    ham = hamming_loss(ytest,clf_predictions)
    result = {"accuracy:":acc,"hamming_score":ham}
    return result

In [25]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,accuracy_score, f1_score

In [26]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    y_true1 = y_true.to_numpy()
    y_pred1 = y_pred.toarray()
    acc_list = []
    
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true1[i])[0] )
        set_pred = set( np.where(y_pred1[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [27]:
clf = BinaryRelevance(MultinomialNB())
clf.fit(X_train,y_train)
# Predict
clf_predictions = clf.predict(X_test)
# Check For Accuracy

In [28]:
hamming_score(y_test, clf_predictions)

0.782531109092937

In [29]:
def evaluate_model(y_test, br_prediction):
    score = f1_score(y_test,br_prediction,average="micro")
    precision = precision_score(y_test,br_prediction,average="micro")
    recall = recall_score(y_test,br_prediction,average="micro")
    report = classification_report(y_test,br_prediction,digits=4)
    EMR = accuracy_score(y_test,br_prediction)
    acc = hamming_score(y_test,br_prediction)
    
    #classifaction_report_csv(report,precision,recall,score,0)
    print ('\n clasification report:\n', report)
    print ('F1 score:', score)
    print ('Recall:', recall)
    print ('Precision:', precision)
    print('EMR:', EMR)
    print ('Acc:', acc)
    print('Hamming Loss', hamming_loss(y_test,br_prediction))
 

In [30]:
binary_rel_clf = build_model(MultinomialNB(),BinaryRelevance,X_train,y_train,X_test,y_test)


 clasification report:
               precision    recall  f1-score   support

           0     0.8703    0.7002    0.7761      4407
           1     0.9499    0.9577    0.9538      4775
           2     0.9976    0.9068    0.9501      4636
           3     0.9901    0.4799    0.6464      1665
           4     0.9721    0.7130    0.8226      4781
           5     0.7909    0.7475    0.7686      2226

   micro avg     0.9338    0.7886    0.8551     22490
   macro avg     0.9285    0.7509    0.8196     22490
weighted avg     0.9361    0.7886    0.8492     22490
 samples avg     0.7949    0.7890    0.7889     22490

F1 score: 0.8550696687719975
Recall: 0.7885726989773233
Precision: 0.9338142375737153
EMR: 0.7624874527986234
Acc: 0.782531109092937
Hamming Loss 0.047894460111849335


  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
clf_chain_model = build_model(MultinomialNB(),ClassifierChain,X_train,y_train,X_test,y_test)


 clasification report:
               precision    recall  f1-score   support

           0     0.8703    0.7002    0.7761      4407
           1     0.9386    0.9610    0.9497      4775
           2     0.9972    0.9374    0.9664      4636
           3     1.0000    0.6252    0.7694      1665
           4     0.9408    0.7672    0.8452      4781
           5     0.7934    0.8005    0.7970      2226

   micro avg     0.9266    0.8231    0.8718     22490
   macro avg     0.9234    0.7986    0.8506     22490
weighted avg     0.9279    0.8231    0.8684     22490
 samples avg     0.8301    0.8250    0.8248     22490

F1 score: 0.8717888342084815
Recall: 0.823121387283237
Precision: 0.926572901546624
EMR: 0.7994837722862196
Acc: 0.8187467138282108
Hamming Loss 0.04337746761627073


  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
clf_labelP_model = build_model(MultinomialNB(),LabelPowerset,X_train,y_train,X_test,y_test)


 clasification report:
               precision    recall  f1-score   support

           0     0.7773    0.8713    0.8217      4407
           1     0.9886    0.9225    0.9544      4775
           2     0.9954    0.9409    0.9674      4636
           3     0.9989    0.5658    0.7224      1665
           4     0.7965    0.8695    0.8314      4781
           5     0.8385    0.7044    0.7656      2226

   micro avg     0.8837    0.8570    0.8702     22490
   macro avg     0.8992    0.8124    0.8438     22490
weighted avg     0.8937    0.8570    0.8691     22490
 samples avg     0.8792    0.8670    0.8710     22490

F1 score: 0.8701580135440181
Recall: 0.8570031124944419
Precision: 0.8837230628152224
EMR: 0.8534008890588404
Acc: 0.866840335866673
Hamming Loss 0.04582317607507608
