In [None]:
# Ensure scikit-learn is installed
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,recall_score,precision_score,confusion_matrix,classification_report
from sklearn.feature_selection import SelectFromModel

In [None]:
# Load the phishing dataset from its CSV export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook only runs where this exact file exists.
csv_path = r"C:\Users\stanl\anaconda_projects\orbital_ml\dataset_phishing.csv"
df = pd.read_csv(csv_path)


In [None]:
# Drop rows with any missing value and encode the label column as integers
# (phishing -> 1, legitimate -> 0).
#
# The frame is rebound instead of mutated in place, and the encoding step is
# skipped when the labels are already 0/1. Without that guard, executing this
# cell a second time would push the encoded integers through the string
# mapping and replace every label with NaN.
df = df.dropna()
if not df['status'].isin([0, 1]).all():
    df = df.assign(status=df['status'].map({'phishing' : 1, 'legitimate' : 0}))

In [None]:
# Positional column window 1..89: the first column is skipped.
# NOTE(review): presumably column 0 is a non-numeric identifier such as the
# raw URL; confirm against the CSV header. The window must contain 'status',
# because the correlation step below looks that column up by name.
feature_columns = slice(1, 90)
features_df = df.iloc[:, feature_columns]
features_df

In [None]:
# Pairwise correlation between all columns of features_df.
corr_matrix = features_df.corr()
# Keep only the column holding each feature's correlation with the label.
status_cor = corr_matrix.loc[:, 'status']

In [None]:
def feature_selector(corr_matrix, threshold):
    """Pick the entries whose absolute correlation exceeds ``threshold``.

    Parameters
    ----------
    corr_matrix : pandas.Series
        Despite the name, this is a one-dimensional mapping of label ->
        correlation value (for example one column of a correlation matrix).
        Anything exposing ``.items()`` yielding ``(label, value)`` works.
    threshold : float
        Strict lower bound on ``abs(value)``; entries equal to the threshold
        are not selected. NaN values never pass the comparison and are
        therefore skipped.

    Returns
    -------
    list of tuple
        ``(label, [formatted_score])`` pairs in the input order, where
        ``formatted_score`` is the correlation rendered with three decimals.
        The score stays wrapped in a one-element list to preserve the shape
        existing callers receive.
    """
    result = []
    for name, score in corr_matrix.items():
        if abs(score) > threshold:
            # '.3f' = three decimal places. The previous spec '3f' was a
            # minimum field width of 3 with the default six decimals.
            result.append((name, ['{:.3f}'.format(score)]))
    return result

In [None]:
# Columns whose absolute correlation with the label is above 0.2.
features_selected_02 = feature_selector(status_cor, threshold=0.2)
features_selected_02



In [None]:
# Names of the correlation-selected columns, minus the label itself
# ('status' always correlates perfectly with itself, so it is filtered out).
feature_name = [name for name, _score in features_selected_02 if name != 'status']
feature_name

In [None]:

# Feature matrix restricted to the correlation-selected columns, plus labels.
X = df.loc[:, feature_name]
y = df['status']

# Shuffled 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
)

# Standardize using statistics learned from the training rows only, then
# apply the same transform to the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Baseline random forest trained on all correlation-selected, scaled features.
clf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
clf.fit(X_train_scaled, y_train)

# Accuracy on the held-out split; the value is shown as the cell output.
y_pred = clf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
test_accuracy

In [None]:
# Second-stage selection driven by random-forest feature importances, using
# 0.03 as the importance threshold.
selector = SelectFromModel(clf, threshold=0.03)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Positions (within feature_name) of the columns the selector kept.
selected_indices = selector.get_support(indices=True)
print("Selected features:", selected_indices)

# Keep only the first eight selected columns. The cut is by column position,
# not by importance ranking.
filt_idx = slice(0, 8)
print(filt_idx)
kept_names = [feature_name[i] for i in selected_indices[filt_idx]]
print("First 8 selected features:", kept_names)

X_train_selected, X_test_selected = (
    X_train_selected[:, filt_idx],
    X_test_selected[:, filt_idx],
)

# Retrain with the same hyperparameters on the reduced feature set and
# report held-out accuracy.
clf_selected = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
clf_selected.fit(X_train_selected, y_train)
y_pred = clf_selected.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy after feature selection:", accuracy)

Selected features: [ 0  7 14 15 16 17 20 21 22]
slice(0, 8, None)
First 8 selected features: ['length_url', 'nb_www', 'longest_word_path', 'phish_hints', 'nb_hyperlinks', 'ratio_intHyperlinks', 'domain_age', 'google_index']
Accuracy after feature selection: 0.9422673198040588


Using the 8 selected features ('length_url', 'nb_www', 'longest_word_path', 'phish_hints', 'nb_hyperlinks', 'ratio_intHyperlinks', 'domain_age', 'google_index'), the model achieves an accuracy score of 94.2%.

In [94]:
# Persist the final classifier. File name and pickled object are unchanged,
# so existing loaders of phishing_model.pkl keep working.
with open("phishing_model.pkl", "wb") as f:
    pickle.dump(clf_selected, f)

# clf_selected was fitted on standardized values of a subset of columns, so
# the classifier alone cannot score raw feature rows. Save the fitted scaler
# and the column bookkeeping alongside it:
#   scaler          - StandardScaler fitted on the training split
#   scaler_columns  - column order the scaler expects (all of feature_name)
#   model_columns   - the columns, in order, that clf_selected was trained on
preprocessing = {
    "scaler": scaler,
    "scaler_columns": list(feature_name),
    "model_columns": [feature_name[i] for i in selected_indices[filt_idx]],
}
with open("phishing_preprocessing.pkl", "wb") as f:
    pickle.dump(preprocessing, f)