# Importing Libraries

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Loading the DataSet


In [18]:
data = pd.read_csv("new1.csv")

data1 = data.copy()

data["Sentence"].fillna("", inplace=True)

data.dropna(subset=["Label"], inplace=True)

data2 = data.copy()

# PreProcessing the Data

In [19]:
X = data['Sentence']
y = data['Label']

# Spliting the Data

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Vectorizing the text data using TF-IDF

In [21]:
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Building the Algorithms

In [None]:
# Build Logistic Regression classifier
logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train_vectorized, y_train)

# Build Random Forest classifier

random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=0, max_depth = 16)
random_forest_classifier.fit(X_train_vectorized, y_train)

# Build Decision Tree classifier
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(X_train_vectorized, y_train)

# Evaluating the Algorithms


In [None]:
# Evaluate Logistic Regression model on the test set
logistic_y_pred = logistic_classifier.predict(X_test_vectorized)
logistic_accuracy = accuracy_score(y_test, logistic_y_pred)
print(f'The score of Logistic regression accuracy: {logistic_accuracy:.2f}')

# Evaluate Random Forest model on the test set
random_forest_y_pred = random_forest_classifier.predict(X_test_vectorized)
random_forest_accuracy = accuracy_score(y_test, random_forest_y_pred)
print(f'The score of Random forest algorithm accuracy :  {random_forest_accuracy:.2f}')

# Evaluate Decision Tree model on the test set
decision_tree_y_pred = decision_tree_classifier.predict(X_test_vectorized)
decision_tree_accuracy = accuracy_score(y_test, decision_tree_y_pred)
print(f'The score of Decision Tree accuracy algorithm: {decision_tree_accuracy:.2f}')

# Predicting the O/P using Logistic Regression

In [None]:
# Function to predict SQL injection attempts using Logistic Regression
def detect_sql_injection_logistic(query):
    # Vectorize the user input
    query_vectorized = vectorizer.transform([query])

    # Predict using the Logistic Regression classifier
    prediction = logistic_classifier.predict(query_vectorized)

    # Return the result
    if prediction == '0':
        return "Malicious SQL Injection Attempt (Logistic Regression)"
    else:
        return "Benign SQL Query (Logistic Regression)"

# Predicting the O/P using Random Forest

In [None]:
# Function to predict SQL injection attempts using Random Forest
def detect_sql_injection_random_forest(query):
    # Vectorize the user input
    query_vectorized = vectorizer.transform([query])

    # Predict using the Random Forest classifier
    prediction = random_forest_classifier.predict(query_vectorized)

    # Return the result
    if prediction == '0':
        return "Malicious SQL Injection Attempt (Random Forest)"
    else:
        return "Benign SQL Query (Random Forest)"

# Predicting the O/P using Decision Tree

In [None]:
# Function to predict SQL injection attempts using Decision Tree
def detect_sql_injection_decision_tree(query):
    # Vectorize the user input
    query_vectorized = vectorizer.transform([query])

    # Predict using the Decision Tree classifier
    prediction = decision_tree_classifier.predict(query_vectorized)

    # Return the result
    if prediction == '0':
        return "Malicious SQL Injection Attempt (Decision Tree)"
    else:
        return "Benign SQL Query (Decision Tree)"

# Taking the I/P from User

In [27]:
# Example user input
user_input = input("Enter an SQL query: ")

# Predict using Logistic Regression
logistic_result = detect_sql_injection_logistic(user_input)
print(logistic_result)

# Predict using Random Forest
random_forest_result = detect_sql_injection_random_forest(user_input)
print(random_forest_result)

# Predict using Decision Tree
decision_tree_result = detect_sql_injection_decision_tree(user_input)
print(decision_tree_result)

#Example : select * from users where id  =  1 +$+ or 1  =  1 -- 1

Enter an SQL query: select * from users where id  =  1 +$+ or 1  =  1 -- 1
Benign SQL Query (Logistic Regression)
Malicious SQL Injection Attempt (Random Forest)
Benign SQL Query (Decision Tree)


# Describing the Logistic Regression Algorithm

In [28]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your dataset with 'sentence' and 'label' columns
data = pd.read_csv("new1.csv")

# Handle missing values
data["Sentence"].fillna("", inplace=True)
data.dropna(subset=["Label"], inplace=True)

# Preprocess the data
X = data['Sentence']
y = data['Label']

# Vectorize the text data using TF-IDF (you can try different vectorization methods)
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=0)

# Create and train the Logistic Regression classifier
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Print the classification report
report = classification_report(y_test, y_pred, target_names=['Sentence', 'Label'])
print(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

    Sentence       0.95      0.99      0.97      3795
       Label       0.99      0.92      0.95      2253

    accuracy                           0.97      6048
   macro avg       0.97      0.96      0.96      6048
weighted avg       0.97      0.97      0.97      6048



# Describing the Random Forest Algorithm

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your dataset with 'sentence' and 'label' columns
data = pd.read_csv("new1.csv")

# Handle missing values
data["Sentence"].fillna("", inplace=True)
data.dropna(subset=["Label"], inplace=True)

# Preprocess the data
X = data['Sentence']
y = data['Label']

# Vectorize the text data using TF-IDF (you can try different vectorization methods)
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=0)

# Create and train the  Random Forest classifier
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Print the classification report
report = classification_report(y_test, y_pred, target_names=['Sentence', 'Label'])
print(report)

# Describing the Decision Tree Algorithm

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Load your dataset with 'sentence' and 'label' columns
data = pd.read_csv("new1.csv")

# Handle missing values
data["Sentence"].fillna("", inplace=True)
data.dropna(subset=["Label"], inplace=True)

# Preprocess the data
X = data['Sentence']
y = data['Label']

# Vectorize the text data using TF-IDF (you can try different vectorization methods)
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=0)

# Create and train the Decision Tree classifier
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Print the classification report
report = classification_report(y_test, y_pred, target_names=['Sentence', 'Label'])
print(report)

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# plt.figure(figsize=(6, 4))
# ax = sns.countplot(x='Label', data=data2)
# # ax.set_xticklabels(['Benign','Malware'])
# ax.set_xlabel('Class')
# ax.set_ylabel('Count')
# plt.show()