In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): a blanket filter like this also hides any solver warnings
# raised during the searches below - confirm that is intended.
warnings.filterwarnings('ignore')

# Read the phishing dataset from disk
file_path = 'dataset/PhishingDataset.csv'
df = pd.read_csv(file_path)

# The target is whichever column comes last in the CSV
LABEL = df.columns[-1]

# Explicit list of the columns used as model inputs; the spellings must match
# the CSV header exactly, so they are kept verbatim.
cols = [
    'LineOfCode',
    'NoOfExternalRef',
    'LargestLineLength',
    'URLLength',
    'NoOfImage',
    'NoOfJS',
    'NoOfSelfRef',
    'NoOfCSS',
    'URLCharProb',
    'CharContinuationRate',
    'LetterRatioInURL',
    'IsHTTPS',
    'SpacialCharRatioInURL',
    'NoOfEmptyRef',
    'NoOfOtherSpecialCharsInURL',
    'HasDescription',
    'HasSocialNet',
    'DomainLength',
    'DegitRatioInURL',
    'NoOfDegitsInURL',
    'HasCopyrightInfo',
    'NoOfLettersInURL',
    'TLDLegitimateProb',
    'DomainTitleMatchScore',
    'IsResponsive',
    'HasHiddenFields',
    'HasSubmitButton',
    'NoOfSubDomain',
    'HasFavicon',
    'HasTitle',
]

# Pull the raw arrays out of the frame for scikit-learn
X = df[cols].values  # Features
y = df[LABEL].values  # Target label

# Hold out 10% of the rows as a test set; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Initialize logistic regression model.
# The liblinear solver shuffles the data internally, so random_state is pinned
# to keep the grid-search results reproducible across kernel restarts.
model = LogisticRegression(solver='liblinear', random_state=42)

In [None]:
# Coarse search space: C spans powers of ten from 0.01 to 100, crossed with
# the L1 and L2 penalties.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
)

# Exhaustive 5-fold cross-validated search over the grid, scored by F1
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the best-scoring combination
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

Based on these results, refine the search with a narrower grid of C values around the best value found above.

In [None]:
# Second pass: a narrower band of C values (0.05 to 0.2), same two penalties
param_grid = dict(
    C=[0.05, 0.075, 0.1, 0.15, 0.2],
    penalty=['l1', 'l2'],
)

# Same 5-fold, F1-scored search as before, rebuilt on the narrower grid
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the best-scoring combination
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

Next iteration: narrow the C grid further around the best value from the previous search.

In [None]:
# Third pass: a fine-grained band of C values (0.145 to 0.155), same two penalties
param_grid = dict(
    C=[0.145, 0.1475, 0.15, 0.1525, 0.155],
    penalty=['l1', 'l2'],
)

# Same 5-fold, F1-scored search; this fitted object is the one used for the
# final evaluation further down.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the best-scoring combination
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

Best parameters: {'C': 0.15, 'penalty': 'l1'}
Evaluating the model with the best parameters on the held-out test set

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# The search was built with GridSearchCV's default refit=True, so
# best_estimator_ has already been refit on the full training set with the
# winning parameters. It is used as-is; fitting it again would only repeat
# that work.
best_model = grid_search.best_estimator_

# Make predictions on the held-out test set
y_pred = best_model.predict(X_test)

# Evaluate the model (precision/recall/F1 use the default binary averaging,
# i.e. they are reported for the positive class)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# For a binary problem the matrix is laid out as [[TN, FP], [FN, TP]]
# (rows = true class, columns = predicted class), so ravel() yields the four
# counts in exactly that order.
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()

# One summary record for this model
newRow = {
    'Mode': "LogisticRegression",
    'TP': TP,
    'TN': TN,
    'FP': FP,
    'FN': FN
}

# Build the frame directly from the record. Concatenating onto an empty
# DataFrame is deprecated in recent pandas and leaves the count columns with
# object dtype; constructing it in one step keeps them numeric.
CM = pd.DataFrame([newRow], columns=['Mode', 'TP', 'TN', 'FP', 'FN'])

# Display the DataFrame
print(CM)