In [1]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold #Кросс-валидация
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import csv

### Обработка данных

In [5]:
# Load the raw CSV into two parallel lists: the text (column 0) and the label
# (column 1). A label that is missing or is anything other than the literal
# '1' is stored as the placeholder string 'NaN' and resolved by row position
# further down.
d1 = []
d2 = []
with open('sqli.csv', newline='') as File:
    reader = csv.reader(File)
    for row in reader:
        d1.append(row[0])
        try:
            raw_label = row[1]
        except IndexError:
            # The row has no second column at all. Only this error is
            # expected here, so nothing else is swallowed.
            raw_label = None
        d2.append(1 if raw_label == '1' else 'NaN')

# The first CSV row is the header, not data.
d1.pop(0)
d2.pop(0)

d1 = pd.Series(d1)
d2 = pd.Series(d2)
data = pd.concat([d1, d2], axis=1)
data.columns = ['Sentence', 'Label']

# Rows without an explicit '1' label get their class from their position in
# the file: the first POSITIVE_FALLBACK_ROWS rows become 1, all later rows 0.
# NOTE(review): presumably 1 means "SQL injection" and the file is ordered so
# that the cut-off separates the two classes; confirm against the dataset.
POSITIVE_FALLBACK_ROWS = 1129
unlabeled = data['Label'] == 'NaN'
in_head = np.arange(len(data)) < POSITIVE_FALLBACK_ROWS
# Only the Label column is touched, so a sentence whose text happens to be the
# string "NaN" is no longer overwritten with an integer.
data.loc[unlabeled & in_head, 'Label'] = 1
data.loc[unlabeled & ~in_head, 'Label'] = 0

y = data['Label']
X = data['Sentence']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### Метод SVM

In [3]:
# TF-IDF features used only for the hyper-parameter search below.
# X_train / X_test are intentionally left as raw text: the final-model cell
# further down fits its own TfidfVectorizer on X_train, and that call fails
# when X_train has already been replaced by a sparse matrix.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Parameter grid to search: C = 10^-5 ... 10^5
grid = {'C': np.power(10.0, np.arange(-5, 6))}

# 5-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# Fit a linear SVM for every C in the grid and score each by accuracy.
clf = SVC(kernel='linear', random_state=0)
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(X_train_tfidf, y_train)
gs.cv_results_



{'mean_fit_time': array([0.30860133, 0.30177016, 0.32140055, 0.40319176, 0.31499124,
        0.29778681, 0.33719077, 0.39159603, 0.37199922, 0.36360049,
        0.34878769]),
 'std_fit_time': array([0.01298545, 0.00997713, 0.01567435, 0.01016597, 0.01966505,
        0.01212447, 0.02605076, 0.02820383, 0.02712634, 0.02586625,
        0.03078692]),
 'mean_score_time': array([0.06259756, 0.06082382, 0.06659689, 0.08281503, 0.0600091 ,
        0.05422354, 0.06999683, 0.06402092, 0.06459932, 0.06400671,
        0.06261559]),
 'std_score_time': array([0.00215377, 0.0003871 , 0.00440977, 0.00690669, 0.00357242,
        0.00193906, 0.02269668, 0.00376159, 0.00794139, 0.00817575,
        0.00680632]),
 'param_C': masked_array(data=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                    1000.0, 10000.0, 100000.0],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False],
        fill_value='?',
             dtype=object)

### Вывод: параметр C=10 даёт лучший результат при кросс-валидации по 5 блокам

In [6]:
# CONCLUSION: C=10 gives the best result across the 5 folds

# Fit TF-IDF on the training texts and reuse the same vocabulary for the test
# texts. Both splits are overwritten with their sparse matrices because the
# evaluation cell reads X_test in vectorized form.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
y_train = y_train.astype('int')
X_test = vectorizer.transform(X_test)
y_test = y_test.astype('int')

clf = SVC(kernel='linear', random_state=0, C=10)  # C defaults to 1.0
clf.fit(X_train, y_train)
print(X_train)

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# One row per vocabulary term holding the absolute value of its weight.
# With two classes there is a single weight vector, so the magnitude alone
# ranks the terms; toarray() converts the sparse coef_ into a dense array.
df = pd.DataFrame(np.transpose(abs(clf.coef_.toarray())),
                  index=np.asarray(feature_names),
                  columns=["col"])


  (1, 736)	0.5595668488286685
  (1, 3283)	0.5021060376389519
  (1, 3444)	0.6593743008780867
  (2, 114)	0.22408114742580754
  (2, 1232)	0.20550087147090867
  (2, 3152)	0.1610509535839669
  (2, 3181)	0.22408114742580754
  (2, 3283)	0.16235880672046896
  (2, 3461)	0.178384229259323
  (2, 4001)	0.1809390826753678
  (2, 4666)	0.19049999422553257
  (2, 4911)	0.14586834227314394
  (2, 4959)	0.15980395330442418
  (2, 5154)	0.12995181225497202
  (2, 5554)	0.22408114742580754
  (2, 5556)	0.22408114742580754
  (2, 6076)	0.22408114742580754
  (2, 6150)	0.20550087147090867
  (2, 6151)	0.18376334209757508
  (2, 6497)	0.3106519126165559
  (2, 7121)	0.19049999422553257
  (2, 7616)	0.20550087147090867
  (2, 7689)	0.12728806631824513
  (2, 8007)	0.17390623226317678
  (2, 8163)	0.22408114742580754
  :	:
  (3139, 2392)	0.37651147653323536
  (3139, 3940)	0.29220527045640865
  (3139, 4143)	0.31407260370043555
  (3139, 5769)	0.32702984051959305
  (3139, 5929)	0.27135415818791964
  (3139, 6412)	0.158863815826

### Вывод слов (признаков) с наибольшими весами

In [7]:
# Order the terms from largest to smallest weight by reversing an ascending
# sort, then show the 30 heaviest ones listed alphabetically.
df_sort = df.sort_values(by='col').iloc[::-1]
heaviest = df_sort.head(30)
display(heaviest.sort_index())

Unnamed: 0,col
0x730065006c0065006300740020004000400076006500,1.587318
10000000,2.613383
21,1.587507
26,1.587365
28,1.58733
29,1.587337
7c,1.587304
asc,1.587462
bfilename,1.587296
desc,1.587307


In [8]:
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever the installed version provides.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())

# One row per vocabulary term holding its signed weight (no absolute value
# here, so the sign shows which class the term pushes towards).
# toarray() converts the sparse coef_ into a dense array.
df1 = pd.DataFrame(np.transpose(clf.coef_.toarray()),
                   index=np.asarray(feature_names),
                   columns=["col"])
d = df1.sort_values(by='col')

# The 30 largest weights. The slice runs to the end of the frame so the
# single largest weight is included; the previous [-30:-1] slice dropped it.
display(d.iloc[-30:])
# The 20 smallest (most negative) weights.
display(d.iloc[0:20])

Unnamed: 0,col
bfilename,1.587296
7c,1.587304
sp_password,1.587306
desc,1.587307
handler,1.587314
ues,1.587317
distinct,1.587317
0x730065006c0065006300740020004000400076006500,1.587318
objectclass,1.587318
to_timestamp_tz,1.587327


Unnamed: 0,col
quot,-1.346091
apos,-1.305127
the,-1.174359
fyodor,-1.155974
father,-0.895984
one,-0.894646
close,-0.886583
impunity,-0.852102
silencing,-0.852102
fell,-0.775527


### Вывод и предсказание
    Accuracy (доля верно угаданных) — sklearn.metrics.accuracy_score
    
    Precision (точность) — sklearn.metrics.precision_score
    
    Recall (полнота) — sklearn.metrics.recall_score
    
    F-мера — sklearn.metrics.f1_score

In [9]:
# Evaluate the fitted classifier on the held-out split.
print(clf.score(X_test, y_test))  # score is about 0.97

# Predict once and reuse the result for every label-based metric.
y_pred = clf.predict(X_test)
print('Доля верно угаданных: ', accuracy_score(y_test, y_pred))
print('F-мера: ', f1_score(y_test, y_pred))
print('Точность: ', precision_score(y_test, y_pred))
print('Полнота: ', recall_score(y_test, y_pred))
# ROC-AUC needs continuous scores, hence decision_function instead of predict.
print('ROC-AUC: ', roc_auc_score(y_test, clf.decision_function(X_test)))

# Hand-written SQL snippets used as an extra sanity check.
# The two entries containing Windows-style paths are raw strings: in a normal
# literal "\r" in "\runcommand.asp" is a carriage return and the remaining
# backslash sequences are invalid escapes.
X_test2 = ["SELECT `name`, `status`, `books` FROM `members` WHERE name = 'Demo' AND password ='111'",
           "SELECT * FROM members WHERE username = 'admin'--' AND password = 'password'",
           "/*! MYSQL Special SQL */ (M): SELECT /*!32302 1/0, */ 1 FROM tablename",
           "if ((select user) = 'sa' OR (select user) = 'dbo') select 1 else select 1/0 (S)",
           "SELECT LOAD_FILE(0x633A5C626F6F742E696E69)",
           "CONCAT(str1, str2, str3, …) (M): SELECT CONCAT(login, password) FROM members",
           "SELECT (CHaR(75)||CHaR(76)||CHaR(77)) (P)",
           "SELECT header, txt FROM news UNION ALL SELECT name, pass FROM members",
           "') or ('1'='1--",
           "GROUP BY table.columnfromerror1, columnfromerror2, columnfromerror3 HAVING 1=1 –",
           "ORDER BY 3—",
           "11223344) UNION SELECT NULL,NULL,NULL,NULL WHERE 1=2 –-",
           "11223344) UNION SELECT 1,’2’,NULL,NULL WHERE 1=2 –-",
           "INSERT INTO members(id, user, pass) VALUES(1, ''+SUBSTRING(@@version,1,10) ,10)",
           r'bcp "SELECT * FROM test..foo" queryout c:\inetpub\wwwroot\runcommand.asp -c -Slocalhost -Usa -Pfoobar',
           r"exec xp_regread HKEY_LOCAL_MACHINE, 'SYSTEM\CurrentControlSet\Services\lanmanserver\parameters', 'nullsessionshares'",
           'INSERT tbl EXEC master..xp_cmdshell OSQL /Q"DBCC SHOWCONTIG"',
           "';shutdown –",
           "EXEC sp_configure 'xp_cmdshell',1",
           "SELECT name FROM syscolumns WHERE id =(SELECT id FROM sysobjects WHERE name = 'tablenameforcolumnnames')"]

# Vectorize with the vocabulary learned on the training set, then classify.
X_test2 = vectorizer.transform(X_test2)
print(clf.predict(X_test2))

# A plain English sentence as a counter-example.
X_test3 = ['Let me introduce myself. My name is Mariya I am a 20-year-old student from Donetsk.']
X_test3 = vectorizer.transform(X_test3)
print(clf.predict(X_test3))

0.9704198473282443
Доля верно угаданных:  0.9704198473282443
F-мера:  0.9460869565217391
Точность:  0.9963369963369964
Полнота:  0.9006622516556292
ROC-AUC:  0.9943051684036717
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1]
[0]
