In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [20]:
data = pd.read_csv('https://raw.githubusercontent.com/kdemertzis/EKPA/main/Data/phishing_dataset.csv', delimiter=',')

In [40]:
print("Στήλες του dataset:", data.columns)

Στήλες του dataset: Index(['Have_IP', 'Have_At', 'URL_Length', 'URL_Depth', 'Redirection',
       'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic',
       'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over', 'Right_Click',
       'Web_Forwards'],
      dtype='object')


In [41]:
if 'Domain' in data.columns and 'Label' in data.columns:
    data = data.drop(columns=['Domain', 'Label'])

In [42]:
print(data.head())

   Have_IP  Have_At  URL_Length  URL_Depth  Redirection  https_Domain  \
0        0        0           1          1            0             0   
1        0        0           1          1            1             0   
2        0        0           1          1            0             0   
3        0        0           1          3            0             0   
4        0        0           1          3            0             0   

   TinyURL  Prefix/Suffix  DNS_Record  Web_Traffic  Domain_Age  Domain_End  \
0        0              0           0            1           1           1   
1        0              0           0            1           1           1   
2        0              0           0            1           0           1   
3        0              0           0            1           0           1   
4        0              0           0            1           0           1   

   iFrame  Mouse_Over  Right_Click  Web_Forwards  
0       0           0            1       

In [43]:
X = data.drop(columns=['Domain', 'Label'])
y = data['Label']

KeyError: "['Domain', 'Label'] not found in axis"

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [47]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [46]:
# Define classifiers
classifiers = [
    LogisticRegression(),
    SVC(),
    RandomForestClassifier(),
    MLPClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier()
]

In [45]:
# Evaluate and compare classifiers
results = {'Classifier': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}

for clf in classifiers:
    clf_name = clf.__class__.__name__

    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Display results
    print(f"\n{clf_name} Metrics:")
    print(f"Accuracy: {acc}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print("Confusion Matrix:")
    print(cm)

    # Store results for comparison
    results['Classifier'].append(clf_name)
    results['Accuracy'].append(acc)
    results['Precision'].append(precision)
    results['Recall'].append(recall)
    results['F1 Score'].append(f1)


LogisticRegression Metrics:
Accuracy: 0.794
Precision: 0.8139589731199033
Recall: 0.794
F1 Score: 0.7897973595115082
Confusion Matrix:
[[1422  115]
 [ 503  960]]

SVC Metrics:
Accuracy: 0.8296666666666667
Precision: 0.8622283020972779
Recall: 0.8296666666666667
F1 Score: 0.8249651031210172
Confusion Matrix:
[[1509   28]
 [ 483  980]]

RandomForestClassifier Metrics:
Accuracy: 0.862
Precision: 0.8761206398358945
Recall: 0.862
F1 Score: 0.8602947546569485
Confusion Matrix:
[[1478   59]
 [ 355 1108]]





MLPClassifier Metrics:
Accuracy: 0.8533333333333334
Precision: 0.8720724751415437
Recall: 0.8533333333333334
F1 Score: 0.8509693713550994
Confusion Matrix:
[[1488   49]
 [ 391 1072]]

KNeighborsClassifier Metrics:
Accuracy: 0.8426666666666667
Precision: 0.8457532148910482
Recall: 0.8426666666666667
F1 Score: 0.8420759958333631
Confusion Matrix:
[[1376  161]
 [ 311 1152]]

DecisionTreeClassifier Metrics:
Accuracy: 0.8606666666666667
Precision: 0.877514069067632
Recall: 0.8606666666666667
F1 Score: 0.8586575857745357
Confusion Matrix:
[[1489   48]
 [ 370 1093]]


In [37]:
# Display comparison results
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False)
print("\nComparison of Classifiers:")
print(results_df)


Comparison of Classifiers:
Empty DataFrame
Columns: [Classifier, Accuracy, Precision, Recall, F1 Score]
Index: []
