In [37]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

from sqliGoT import sqliGoT
from scores import print_scores

In [38]:
# Load the labelled dataset; 'Query' holds the inputs and 'Label' the targets.
df = pd.read_csv("./resource/Modified_SQL_Dataset.csv")

X, y = df['Query'], df['Label']

# Build three sqliGoT feature sets from the same queries. They share the window
# size and mode and differ only in the graph type / degree arguments.
# NOTE(review): the meaning of these options is defined in the sqliGoT module,
# which is not visible here -- confirm there.
sqligot_shared_options = dict(windows_size=5, mode="proportional")
X_sqliGoT_indegree = sqliGoT(X, type="directed", degree="in", **sqligot_shared_options)
X_sqliGoT_outdegree = sqliGoT(X, type="directed", degree="out", **sqligot_shared_options)
X_sqliGoT_undirected = sqliGoT(X, type="undirected", **sqligot_shared_options)

# For each feature set, keep only the columns holding at least one non-zero
# value, and wrap the result in a DataFrame.
undirected_nonzero_cols = (X_sqliGoT_undirected != 0).any(axis=0)
X_sqliGoT_undirected = pd.DataFrame(X_sqliGoT_undirected).loc[:, undirected_nonzero_cols]

indegree_nonzero_cols = (X_sqliGoT_indegree != 0).any(axis=0)
X_sqliGoT_indegree = pd.DataFrame(X_sqliGoT_indegree).loc[:, indegree_nonzero_cols]

outdegree_nonzero_cols = (X_sqliGoT_outdegree != 0).any(axis=0)
X_sqliGoT_outdegree = pd.DataFrame(X_sqliGoT_outdegree).loc[:, outdegree_nonzero_cols]


In [39]:
# Split the three feature views and the labels in ONE call. train_test_split
# applies the same shuffled row indices to every array it is given, so each
# view is guaranteed to line up row-for-row with y_train / y_test (previously
# this relied on repeating the same seed in three separate calls, each of which
# re-bound y_train / y_test).
# 20% of the rows are held out for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
(
    X_undirected_train, X_undirected_test,
    X_indegree_train, X_indegree_test,
    X_outdegree_train, X_outdegree_test,
    y_train, y_test,
) = train_test_split(
    X_sqliGoT_undirected,
    X_sqliGoT_indegree,
    X_sqliGoT_outdegree,
    y,
    test_size=0.2,
    random_state=43192,
)

In [40]:
# Report the shape of the train and test matrices for every feature view.
split_shape_report = (
    ("Undirected Train: ", X_undirected_train, X_undirected_test),
    ("Indegree Train: ", X_indegree_train, X_indegree_test),
    ("Outdegree Train: ", X_outdegree_train, X_outdegree_test),
)
for train_label, train_part, test_part in split_shape_report:
    print(train_label, train_part.shape, "Test: ", test_part.shape)

Undirected Train:  (24735, 411) Test:  (6184, 411)
Indegree Train:  (24735, 411) Test:  (6184, 411)
Outdegree Train:  (24735, 411) Test:  (6184, 411)


In [41]:
# Evaluate a 1-nearest-neighbour classifier on each feature view, then combine
# the three predictions by majority vote. The whole evaluation is repeated ten
# times.
# NOTE(review): brute-force 1-NN has no random_state parameter, so the ten
# repetitions are expected to print identical scores -- confirm whether the
# repetition is needed for this model.
for run in range(10):  # 'run' is deliberately distinct from any inner index
    knn_undirected_clf = KNeighborsClassifier(n_neighbors=1, metric="minkowski", algorithm="brute")
    knn_undirected_clf.fit(X_undirected_train, y_train)
    y_undirected_pred = knn_undirected_clf.predict(X_undirected_test)
    print_scores("K Neighbors (undirected)", y_undirected_pred, y_test)

    knn_indegree_clf = KNeighborsClassifier(n_neighbors=1, metric="minkowski", algorithm="brute")
    knn_indegree_clf.fit(X_indegree_train, y_train)
    y_indegree_pred = knn_indegree_clf.predict(X_indegree_test)
    print_scores("K Neighbors (indegree)", y_indegree_pred, y_test)

    knn_outdegree_clf = KNeighborsClassifier(n_neighbors=1, metric="minkowski", algorithm="brute")
    knn_outdegree_clf.fit(X_outdegree_train, y_train)
    y_outdegree_pred = knn_outdegree_clf.predict(X_outdegree_test)
    print_scores("K Neighbors (outdegree)", y_outdegree_pred, y_test)

    # Majority vote: the three predictions are summed per sample; a sum of 0 or
    # 1 maps to class 0 and anything larger maps to class 1. Vectorised instead
    # of a per-element Python loop.
    # Assumes the predicted labels are the integers 0 and 1 -- TODO confirm
    # against the 'Label' column.
    vote_counts = y_undirected_pred + y_indegree_pred + y_outdegree_pred
    y_vote_pred = np.where(vote_counts <= 1, 0, 1).astype(vote_counts.dtype)
    # result_vote keeps whatever print_scores returns for the voted prediction.
    result_vote = print_scores("K Neighbors (vote)", y_vote_pred, y_test)


In [42]:
# Evaluate logistic regression on each feature view, then combine the three
# predictions by majority vote. The whole evaluation is repeated ten times.
# max_iter is raised above the library default because the previous run of this
# cell stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the scored
# models had not converged. Raise it further (or scale the features) if the
# warning still appears.
for run in range(10):  # 'run' is deliberately distinct from any inner index
    lr_undirected_clf = LogisticRegression(dual=False, C=1.0, max_iter=1000)
    lr_undirected_clf.fit(X_undirected_train, y_train)
    y_undirected_pred = lr_undirected_clf.predict(X_undirected_test)
    print_scores("Logistic Regression (undirected)", y_undirected_pred, y_test)

    lr_indegree_clf = LogisticRegression(dual=False, C=1.0, max_iter=1000)
    lr_indegree_clf.fit(X_indegree_train, y_train)
    y_indegree_pred = lr_indegree_clf.predict(X_indegree_test)
    print_scores("Logistic Regression (indegree)", y_indegree_pred, y_test)

    lr_outdegree_clf = LogisticRegression(dual=False, C=1.0, max_iter=1000)
    lr_outdegree_clf.fit(X_outdegree_train, y_train)
    y_outdegree_pred = lr_outdegree_clf.predict(X_outdegree_test)
    print_scores("Logistic Regression (outdegree)", y_outdegree_pred, y_test)

    # Majority vote: the three predictions are summed per sample; a sum of 0 or
    # 1 maps to class 0 and anything larger maps to class 1. Vectorised instead
    # of a per-element Python loop.
    # Assumes the predicted labels are the integers 0 and 1 -- TODO confirm
    # against the 'Label' column.
    vote_counts = y_undirected_pred + y_indegree_pred + y_outdegree_pred
    y_vote_pred = np.where(vote_counts <= 1, 0, 1).astype(vote_counts.dtype)
    print_scores("Logistic Regression (vote)", y_vote_pred, y_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [43]:
# Evaluate a linear SVC (primal formulation, squared hinge loss) on each
# feature view, then combine the three predictions by majority vote. The whole
# evaluation is repeated ten times.
for run in range(10):  # 'run' is deliberately distinct from any inner index
    lsvc_undirected_clf = LinearSVC(dual=False, loss='squared_hinge', C=1.0)
    lsvc_undirected_clf.fit(X_undirected_train, y_train)
    y_undirected_pred = lsvc_undirected_clf.predict(X_undirected_test)
    print_scores("Linear SVC (undirected)", y_undirected_pred, y_test)

    lsvc_indegree_clf = LinearSVC(dual=False, loss='squared_hinge', C=1.0)
    lsvc_indegree_clf.fit(X_indegree_train, y_train)
    y_indegree_pred = lsvc_indegree_clf.predict(X_indegree_test)
    print_scores("Linear SVC (indegree)", y_indegree_pred, y_test)

    lsvc_outdegree_clf = LinearSVC(dual=False, loss='squared_hinge', C=1.0)
    lsvc_outdegree_clf.fit(X_outdegree_train, y_train)
    y_outdegree_pred = lsvc_outdegree_clf.predict(X_outdegree_test)
    print_scores("Linear SVC (outdegree)", y_outdegree_pred, y_test)

    # Majority vote: the three predictions are summed per sample; a sum of 0 or
    # 1 maps to class 0 and anything larger maps to class 1. Vectorised instead
    # of a per-element Python loop.
    # Assumes the predicted labels are the integers 0 and 1 -- TODO confirm
    # against the 'Label' column.
    vote_counts = y_undirected_pred + y_indegree_pred + y_outdegree_pred
    y_vote_pred = np.where(vote_counts <= 1, 0, 1).astype(vote_counts.dtype)
    print_scores("Linear SVC (vote)", y_vote_pred, y_test)


In [None]:
# Evaluate AdaBoost over decision trees on each feature view, then combine the
# three predictions by majority vote. The whole evaluation is repeated ten
# times.
# Each repetition is seeded with its run index: the ten runs can still differ
# from one another, but a Restart & Run All reproduces the same ten results.
for run in range(10):  # 'run' doubles as the per-repetition random seed
    ada_dt_undirected_clf = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=2, min_samples_leaf=1), n_estimators=50, random_state=run)
    ada_dt_undirected_clf.fit(X_undirected_train, y_train)
    y_undirected_pred = ada_dt_undirected_clf.predict(X_undirected_test)
    print_scores("AdaBoost + DecisionTree (undirected)", y_undirected_pred, y_test)

    ada_dt_indegree_clf = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=2, min_samples_leaf=1), n_estimators=50, random_state=run)
    ada_dt_indegree_clf.fit(X_indegree_train, y_train)
    y_indegree_pred = ada_dt_indegree_clf.predict(X_indegree_test)
    print_scores("AdaBoost + DecisionTree (indegree)", y_indegree_pred, y_test)

    ada_dt_outdegree_clf = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=2, min_samples_leaf=1), n_estimators=50, random_state=run)
    ada_dt_outdegree_clf.fit(X_outdegree_train, y_train)
    y_outdegree_pred = ada_dt_outdegree_clf.predict(X_outdegree_test)
    print_scores("AdaBoost + DecisionTree (outdegree)", y_outdegree_pred, y_test)

    # Majority vote: the three predictions are summed per sample; a sum of 0 or
    # 1 maps to class 0 and anything larger maps to class 1. Vectorised instead
    # of a per-element Python loop.
    # Assumes the predicted labels are the integers 0 and 1 -- TODO confirm
    # against the 'Label' column.
    vote_counts = y_undirected_pred + y_indegree_pred + y_outdegree_pred
    y_vote_pred = np.where(vote_counts <= 1, 0, 1).astype(vote_counts.dtype)
    print_scores("AdaBoost + DecisionTree (vote)", y_vote_pred, y_test)


In [None]:
# Evaluate a 100-tree random forest on each feature view, then combine the
# three predictions by majority vote. The whole evaluation is repeated ten
# times.
# Each repetition is seeded with its run index: the ten runs can still differ
# from one another, but a Restart & Run All reproduces the same ten results.
for run in range(10):  # 'run' doubles as the per-repetition random seed
    rm_undirected_clf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, random_state=run)
    rm_undirected_clf.fit(X_undirected_train, y_train)
    y_undirected_pred = rm_undirected_clf.predict(X_undirected_test)
    print_scores("Random Forest (undirected)", y_undirected_pred, y_test)

    rm_indegree_clf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, random_state=run)
    rm_indegree_clf.fit(X_indegree_train, y_train)
    y_indegree_pred = rm_indegree_clf.predict(X_indegree_test)
    print_scores("Random Forest (indegree)", y_indegree_pred, y_test)

    rm_outdegree_clf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, random_state=run)
    rm_outdegree_clf.fit(X_outdegree_train, y_train)
    y_outdegree_pred = rm_outdegree_clf.predict(X_outdegree_test)
    print_scores("Random Forest (outdegree)", y_outdegree_pred, y_test)

    # Majority vote: the three predictions are summed per sample; a sum of 0 or
    # 1 maps to class 0 and anything larger maps to class 1. Vectorised instead
    # of a per-element Python loop.
    # Assumes the predicted labels are the integers 0 and 1 -- TODO confirm
    # against the 'Label' column.
    vote_counts = y_undirected_pred + y_indegree_pred + y_outdegree_pred
    y_vote_pred = np.where(vote_counts <= 1, 0, 1).astype(vote_counts.dtype)
    print_scores("Random Forest (vote)", y_vote_pred, y_test)


In [None]:
# Evaluate an SGD-trained multi-layer perceptron (hidden layers of 200 and 50
# ReLU units) on each feature view, then combine the three predictions by
# majority vote. The whole evaluation is repeated ten times.
# Each repetition is seeded with its run index: the ten runs can still differ
# from one another, but a Restart & Run All reproduces the same ten results.
for run in range(10):  # 'run' doubles as the per-repetition random seed
    mlp_sgd_undirected_clf = MLPClassifier(hidden_layer_sizes=(200, 50), activation='relu', solver='sgd', max_iter=1000, random_state=run)
    mlp_sgd_undirected_clf.fit(X_undirected_train, y_train)
    y_undirected_pred = mlp_sgd_undirected_clf.predict(X_undirected_test)
    print_scores("Perceptron + SGD (undirected)", y_undirected_pred, y_test)

    mlp_sgd_indegree_clf = MLPClassifier(hidden_layer_sizes=(200, 50), activation='relu', solver='sgd', max_iter=1000, random_state=run)
    mlp_sgd_indegree_clf.fit(X_indegree_train, y_train)
    y_indegree_pred = mlp_sgd_indegree_clf.predict(X_indegree_test)
    print_scores("Perceptron + SGD (indegree)", y_indegree_pred, y_test)

    mlp_sgd_outdegree_clf = MLPClassifier(hidden_layer_sizes=(200, 50), activation='relu', solver='sgd', max_iter=1000, random_state=run)
    mlp_sgd_outdegree_clf.fit(X_outdegree_train, y_train)
    y_outdegree_pred = mlp_sgd_outdegree_clf.predict(X_outdegree_test)
    print_scores("Perceptron + SGD (outdegree)", y_outdegree_pred, y_test)

    # Majority vote: the three predictions are summed per sample; a sum of 0 or
    # 1 maps to class 0 and anything larger maps to class 1. Vectorised instead
    # of a per-element Python loop.
    # Assumes the predicted labels are the integers 0 and 1 -- TODO confirm
    # against the 'Label' column.
    vote_counts = y_undirected_pred + y_indegree_pred + y_outdegree_pred
    y_vote_pred = np.where(vote_counts <= 1, 0, 1).astype(vote_counts.dtype)
    print_scores("Perceptron + SGD (vote)", y_vote_pred, y_test)
