In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Step 1: Load the dataset
file_path = 'uci-ml-phishing-dataset.csv'
df = pd.read_csv(file_path)

In [2]:
# Function to detect outliers using IQR
def detect_outliers_iqr(df):
    outlier_indices = {}
    for column in df.columns[:-1]:  # skip label column
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
        outlier_indices[column] = outliers.index.tolist()
    return outlier_indices

outliers_dict = detect_outliers_iqr(df)

# Print summary of outliers
for feature, indices in outliers_dict.items():
    print(f"{feature}: {len(indices)} outliers")

# Optional: Check top few features with most outliers
top_outliers = sorted(outliers_dict.items(), key=lambda x: len(x[1]), reverse=True)[:5]
print("\nTop features with most outliers:")
for feature, indices in top_outliers:
    print(f"{feature}: {len(indices)} outliers")

id: 0 outliers
having_IP_Address: 0 outliers
URL_Length: 2095 outliers
Shortining_Service: 1444 outliers
having_At_Symbol: 1655 outliers
double_slash_redirecting: 1429 outliers
Prefix_Suffix: 1465 outliers
having_Sub_Domain: 0 outliers
SSLfinal_State: 0 outliers
Domain_registeration_length: 0 outliers
Favicon: 2053 outliers
port: 1502 outliers
HTTPS_token: 1796 outliers
Request_URL: 0 outliers
URL_of_Anchor: 0 outliers
Links_in_tags: 0 outliers
SFH: 2615 outliers
Submitting_to_email: 2014 outliers
Abnormal_URL: 1629 outliers
Redirect: 1279 outliers
on_mouseover: 1315 outliers
RightClick: 476 outliers
popUpWidnow: 2137 outliers
Iframe: 1012 outliers
age_of_domain: 0 outliers
DNSRecord: 0 outliers
web_traffic: 0 outliers
Page_Rank: 0 outliers
Google_Index: 1539 outliers
Links_pointing_to_page: 0 outliers
Statistical_report: 1550 outliers

Top features with most outliers:
SFH: 2615 outliers
popUpWidnow: 2137 outliers
URL_Length: 2095 outliers
Favicon: 2053 outliers
Submitting_to_email: 20

In [3]:
# Identify non-numeric columns
non_numeric_cols = df.select_dtypes(exclude=['number']).columns.tolist()

# Output the result
if non_numeric_cols:
    print("Non-numeric columns found:")
    for col in non_numeric_cols:
        print(f"- {col}: {df[col].unique()[:5]} (type: {df[col].dtype})")
else:
    print("✅ All columns are numeric.")

✅ All columns are numeric.


In [4]:
# Get the label column (assuming it's the last column)
label_column = df.columns[-1]
label_counts = df[label_column].value_counts()

# Print class distribution
print("Class distribution:")
print(label_counts)

# Calculate imbalance ratio
majority_class = label_counts.max()
minority_class = label_counts.min()
imbalance_ratio = round(majority_class / minority_class, 2)

print(f"\nImbalance Ratio (majority/minority): {imbalance_ratio}:1")

Class distribution:
Result
 1    6157
-1    4898
Name: count, dtype: int64

Imbalance Ratio (majority/minority): 1.26:1


In [6]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Split features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Stratified K-Fold setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Training without any scaling
accuracies = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

In [7]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler



# Split features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Stratified K-Fold setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Training with MinMaxScaler inside each fold
accuracies = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Scale the data
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

print(f"\nFold Accuracies (with MinMaxScaler): {[round(a, 4) for a in accuracies]}")
print(f"Average Accuracy: {sum(accuracies) / len(accuracies):.4f}")



Fold Accuracies (with MinMaxScaler): [0.9665, 0.9715, 0.972, 0.9679, 0.9634]
Average Accuracy: 0.9682


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Pipeline: scaling + model
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('rf', RandomForestClassifier(random_state=42))
])

# Hyperparameter grid
param_grid = {
    'rf__n_estimators': [50, 100, 150],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2]
}

# Setup GridSearch with Stratified K-Fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)

# Run search
grid_search.fit(X, y)

# Best results
print("\nBest Parameters:")
print(grid_search.best_params_)

print(f"\nBest Accuracy: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Best Parameters:
{'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 100}

Best Accuracy: 0.9682


In [11]:
import joblib

# Save model and scaler from pipeline
best_model = grid_search.best_estimator_

# Save the entire pipeline (includes scaler + model)
joblib.dump(best_model, 'model.pkl')


['model.pkl']

In [14]:
!pip install streamlit
!pip install pyngrok

import streamlit as st
import numpy as np
import joblib

# Load model
model = joblib.load('model.pkl')

# Page config
st.set_page_config(page_title="Phishing URL Detector", layout="centered")

st.title("🔒 Phishing Website Detector")

st.markdown("Enter feature values below to predict whether a site is **Phishing (1)** or **Legit (0)**.")

# Assuming 30 features
features = []
for i in range(1, 31):
    val = st.number_input(f"Feature {i}", value=0)
    features.append(val)

# Predict
if st.button("Predict"):
    input_data = np.array(features).reshape(1, -1)
    prediction = model.predict(input_data)[0]
    label = "Phishing 🔴" if prediction == 1 else "Legitimate 🟢"
    st.success(f"Prediction: **{label}**")


Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m86.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInst

2025-04-17 03:40:58.104 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-04-17 03:40:58.114 Session state does not function when running a script without `streamlit run`


In [16]:
from google.colab import files

# Download the trained model
files.download('model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
%%writefile app.py
import streamlit as st
import numpy as np
import joblib

# Load the model
model = joblib.load('model.pkl')

# Streamlit page config
st.set_page_config(page_title="Phishing Website Detector", layout="centered")

# Title
st.title("🔒 Phishing Website Detector")

# Input fields for 30 features
st.markdown("Enter the feature values to predict if the website is phishing or legitimate.")

features = []
for i in range(1, 31):
    val = st.number_input(f"Feature {i}", value=0)
    features.append(val)

# Prediction on button click
if st.button("Predict"):
    input_data = np.array(features).reshape(1, -1)
    prediction = model.predict(input_data)[0]
    label = "Phishing 🔴" if prediction == 1 else "Legitimate 🟢"
    st.success(f"Prediction: **{label}**")


Writing app.py


In [19]:
from google.colab import files
files.download('app.py')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import joblib



# 2) Drop the 'id' and the target 'Result' columns to get exactly your 30 features
X = df.iloc[:, 1:-1]    # columns 1 through the penultimate
y = df['Result']        # target column

# 3) Build a pipeline: scaler + RandomForest
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# 4) Train on all your data
pipeline.fit(X, y)

# 5) Save the entire pipeline (so it knows the exact feature order)
joblib.dump(pipeline, 'model.pkl')
print("✅ Saved new model.pkl with all 30 features in the correct order.")


✅ Saved new model.pkl with all 30 features in the correct order.


In [24]:
from google.colab import files
files.download('model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [26]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load the full dataset

# Select exactly the 7 features your Streamlit app uses (in the same order)
feature_cols = [
    'URL_Length',
    'HTTPS_token',
    'Domain_registeration_length',
    'Abnormal_URL',
    'age_of_domain',
    'DNSRecord',
    'Shortining_Service'
]
X = df[feature_cols]
y = df['Result']

# Build and train the pipeline (scaling + RandomForest)
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])
pipeline.fit(X, y)

# Save the trained pipeline to /mnt/data/model.pkl
joblib.dump(pipeline, 'model.pkl')

# Confirm save
print("✅ Saved new model.pkl at /mnt/data/model.pkl")


✅ Saved new model.pkl at /mnt/data/model.pkl


In [27]:
from google.colab import files
files.download('model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
!pip install validators

import streamlit as st
import numpy as np
import joblib
import validators
from urllib.parse import urlparse

# Load the trained model
model = joblib.load('model.pkl')

# Streamlit page config
st.set_page_config(page_title="Phishing Website Detector", layout="centered")

# Title
st.title("🔒 Phishing Website Detection")

# Collecting user input for URL
website_url = st.text_input("Please enter the website URL:")

# If URL is entered, extract features automatically
if website_url:
    # Check if the URL is valid
    if not validators.url(website_url):
        st.error("The URL you entered is not valid. Please try again with a valid URL.")
    else:
        # Extract URL features
        url_length = len(website_url)
        has_https = 1 if 'https' in website_url else 0

        # Display extracted features
        st.markdown(f"**Extracted Features from URL:**")
        st.write(f"URL Length: {url_length}")
        st.write(f"Contains 'https': {'Yes' if has_https else 'No'}")

        # Ask for the remaining features based on the dataset
        st.markdown("### Please enter the remaining details:")

        # Domain Registration Length
        domain_reg_length = st.number_input("Enter the domain registration length:", min_value=1, value=10)

        # Suspicious URL (e.g., URL contains certain keywords like "login", "secure", etc.)
        suspicious_url = st.radio("Does the URL contain suspicious keywords?", options=[True, False])

        # Domain Age (in years)
        domain_age = st.number_input("Enter the domain age (in years):", min_value=1, value=5)

        # Has DNS Record (True = 1, False = 0)
        has_dns_record = st.radio("Does the domain have a DNS record?", options=[True, False])

        # URL Shortened (True = 1, False = 0)
        url_shortened = st.radio("Is the URL shortened?", options=[True, False])

        # Combine all feature inputs into a list
        features = [
            url_length,
            has_https,
            domain_reg_length,
            suspicious_url,
            domain_age,
            has_dns_record,
            url_shortened
        ]

        # Prediction on button click
        if st.button("Predict"):
            # Convert input data to an array
            input_data = np.array(features).reshape(1, -1)

            # Prediction
            prediction = model.predict(input_data)[0]

            # Show prediction result
            label = "Phishing 🔴" if prediction == 1 else "Legitimate 🟢"
            st.success(f"Prediction: **{label}**")


Collecting validators
  Downloading validators-0.34.0-py3-none-any.whl.metadata (3.8 kB)
Downloading validators-0.34.0-py3-none-any.whl (43 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: validators
Successfully installed validators-0.34.0




In [30]:
from google.colab import files
files.download('app.py')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [31]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import joblib

# Load the dataset
df = pd.read_csv('uci-ml-phishing-dataset.csv')

# Prepare features and target variable
X = df.drop(columns=['Result'])  # Features
y = df['Result']  # Target variable

# Preprocessing: Min-Max Scaling
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model: RandomForest
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Save the model as a .pkl file
joblib.dump(model, 'model.pkl')


['model.pkl']

In [32]:
from google.colab import files

# Download the model.pkl file
files.download('model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

# 1) Load your dataset (no extra paths!)
df = pd.read_csv('uci-ml-phishing-dataset.csv')

# 2) Pick exactly the 7 features your app asks for, by name:
feature_cols = [
    'URL_Length',                   # will match your computed url_length
    'HTTPS_token',                  # match has_https
    'Domain_registeration_length',  # domain_reg_length
    'Abnormal_URL',                 # suspicious_url
    'age_of_domain',                # domain_age
    'DNSRecord',                    # has_dns_record
    'Shortining_Service'            # url_shortened
]

X = df[feature_cols]
y = df['Result']

# 3) Build a simple pipeline and train
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])
pipeline.fit(X, y)

# 4) Save your pipeline to model.pkl (in the same folder)
joblib.dump(pipeline, 'model.pkl')
print("✅ model.pkl saved.")


✅ model.pkl saved.


In [34]:
from google.colab import files

# Download the model.pkl file
files.download('model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>