In [1]:
### Importing Libraries
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load the dataset and take a first look at its contents.
ds = pd.read_csv("dataset_website.csv")
print(ds.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
ds.info()
print(ds.isnull().any())  # Per-column flag: True if the column holds any null value

# No label/one-hot encoding is needed because the dataset has no categorical columns.
# Feature scaling is not required because all features are already in the same range.

# Split the frame into independent variables (x) and the dependent variable (y).
# Column 0 is skipped (it is the `index` column shown by ds.head()) and the last
# column is used as the label. Slicing with 1:-1 rather than a hard-coded 1:31 means
# the label column can never end up among the features if the column count differs.
# NOTE(review): assumes every column between the first and the last is a feature -- confirm.
x = ds.iloc[:, 1:-1].values
y = ds.iloc[:, -1].values
print(x, y)

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Model building and evaluation
# Candidate classifiers to compare, keyed by the display name used in the results table.
models = {
    "Logistic Regression": LogisticRegression(),
    # Minkowski distance with p=2 is the Euclidean metric; p=1 is the Manhattan metric.
    "KNN (Euclidean)": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    "KNN (Manhattan)": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=1),
    "SVM (RBF)": SVC(kernel='rbf'),
    "SVM (Sigmoid)": SVC(kernel='sigmoid'),
    # Seeded (like the Random Forest below) so repeated runs build the same tree
    # and report the same scores.
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "Random Forest": RandomForestClassifier(n_estimators=10, random_state=0, n_jobs=-1),
    "Naive Bayes": GaussianNB()
}

# Fit every candidate on the training split and collect its test-set metrics
# as one record (dict) per model.
results = []

for name, model in models.items():
    model.fit(x_train, y_train)

    # Every entry in `models` is a classifier, so predict() already returns class
    # labels; no per-model post-processing (such as rounding) is needed.
    y_pred = model.predict(x_test)

    accuracy = accuracy_score(y_test, y_pred)
    # average='weighted' averages the per-class scores weighted by class support.
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)

    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    })

# Tabulate the collected metrics, highest accuracy first, and show the table.
results_df = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)
print(results_df)

# Print a per-class precision/recall/F1 report for each fitted model.
for label, clf in models.items():
    print(f"Model: {label}")
    report = classification_report(y_test, clf.predict(x_test))
    print(report)
    print("\n")

# Saving the best model as pickle.
# The winner is taken from the evaluation records instead of being hard-coded, and
# the already-fitted instance from `models` is reused so the saved artifact is
# exactly the model whose test metrics were computed (not a freshly retrained one).
best_name = max(results, key=lambda record: record['Accuracy'])['Model']
best_model = models[best_name]
print(f"Saving best model: {best_name}")

# The context manager closes the file handle even if pickling raises.
with open('Phishing_Website.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)


   index  having_IPhaving_IP_Address  URLURL_Length  Shortining_Service  \
0      1                          -1              1                   1   
1      2                           1              1                   1   
2      3                           1              0                   1   
3      4                           1              0                   1   
4      5                           1              0                  -1   

   having_At_Symbol  double_slash_redirecting  Prefix_Suffix  \
0                 1                        -1             -1   
1                 1                         1             -1   
2                 1                         1             -1   
3                 1                         1             -1   
4                 1                         1             -1   

   having_Sub_Domain  SSLfinal_State  Domain_registeration_length  ...  \
0                 -1              -1                           -1  ...   
1               