In [None]:
# Phishing Detection for Non-Web URLs using Intelligent Link Analysis
# Step 1: Install dependencies (if not already available in Colab)
!pip install tldextract scikit-learn xgboost

import math
import re
from collections import Counter
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import tldextract
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier


In [None]:
# Step 2: Build the small in-memory dataset of non-web URLs
# Each entry pairs a URL with its label (1 = phishing, 0 = legitimate), so the
# label sits right next to the URL it describes.
labelled_urls = [
    ("ftp://192.168.1.10/malicious/file.exe", 1),
    ("mailto:support@securebank.com", 0),
    ("ftp://phishy-domain.ru/login", 1),
    ("file://C:/Windows/System32/cmd.exe", 1),
    ("sms://+1234567890?body=click%20here", 1),
    ("skype://securecall?user=trusted_contact", 0),
]

# Column-oriented view of the same records, in the same order.
data = {
    "url": [address for address, _ in labelled_urls],
    "label": [flag for _, flag in labelled_urls],
}

df = pd.DataFrame(data)
print(df.head())

In [None]:
# Step 3: Feature Extraction
# Substrings whose presence anywhere in the URL sets `has_suspicious_keyword`.
_SUSPICIOUS_KEYWORDS = ('login', 'verify', 'update', 'secure', 'bank', 'account')


def _shannon_entropy(text):
    """Return the Shannon entropy of ``text`` in bits per character.

    Characters are counted in a single pass. Each term is written as
    ``p * log2(1 / p)``, which is never negative, so a string made of one
    repeated character yields ``0.0`` (not ``-0.0``). ``math.fsum`` is used to
    limit accumulated rounding error. An empty string yields ``0.0``.
    """
    total = len(text)
    return math.fsum(
        (count / total) * math.log2(total / count)
        for count in Counter(text).values()
    )


def extract_features(url):
    """Turn one URL string into a flat dict of numeric features.

    Parameters
    ----------
    url : str
        The raw URL (any scheme: ftp, file, mailto, sms, skype, ...).

    Returns
    -------
    dict
        Keys, in insertion order: ``url_length``, ``num_digits``,
        ``num_special_chars``, ``entropy``, ``is_ftp``, ``is_file``,
        ``is_mailto``, ``is_sms``, ``is_skype``, ``has_ip``,
        ``subdomain_length``, ``domain_length``, ``has_suspicious_keyword``.
        All flags are the integers 0 or 1.
    """
    parsed = urlparse(url)
    ext = tldextract.extract(url)
    scheme = parsed.scheme
    lowered = url.lower()  # lower-case once instead of once per keyword

    features = {}

    # Basic features
    features['url_length'] = len(url)
    features['num_digits'] = sum(c.isdigit() for c in url)
    # Anything that is not a letter or digit counts as special; "_" is listed
    # explicitly because \W alone does not match it.
    features['num_special_chars'] = len(re.findall(r'[\W_]', url))
    features['entropy'] = _shannon_entropy(url)

    # Protocol-based features
    features['is_ftp'] = int(scheme == "ftp")
    features['is_file'] = int(scheme == "file")
    features['is_mailto'] = int(scheme == "mailto")
    # Substring match (not equality): any scheme containing "sms" or "skype",
    # e.g. "smsto", also sets the flag.
    features['is_sms'] = int("sms" in scheme)
    features['is_skype'] = int("skype" in scheme)

    # Domain-related features
    # Flag a dotted quad of 1-3 digit groups in the extracted domain part.
    features['has_ip'] = int(re.match(r"^\d{1,3}(\.\d{1,3}){3}$", ext.domain) is not None)
    features['subdomain_length'] = len(ext.subdomain)
    features['domain_length'] = len(ext.domain)

    # Suspicious keywords
    features['has_suspicious_keyword'] = int(any(word in lowered for word in _SUSPICIOUS_KEYWORDS))

    return features

# Run the extractor over every URL; the result is a Series holding one
# feature dict per row.
feature_list = df['url'].apply(extract_features)

# Expand the dicts into a table: one column per feature key, one row per URL.
X = pd.DataFrame(list(feature_list))

# Target vector taken straight from the label column.
y = df['label']

print("\nExtracted Features:\n", X.head())


In [None]:
# Step 4: Train-Test Split
# Stratify on the label so both partitions keep the class ratio; with so few
# rows an unstratified shuffle can leave a partition with a single class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Sanity check: every row landed in exactly one partition.
assert len(X_train) + len(X_test) == len(X)

In [None]:
# Step 5: ML Model (Random Forest + XGBoost)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# argument that recent releases ignore with a warning. The seed matches the
# Random Forest so both models are configured reproducibly.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

In [None]:
# Step 6: Evaluation
def report_performance(model_name, y_true, y_pred, labels=(0, 1)):
    """Print the classification report, confusion matrix and accuracy for one model.

    Parameters
    ----------
    model_name : str
        Heading used in the printed output.
    y_true, y_pred : array-like
        Ground-truth and predicted class labels.
    labels : sequence of int, default (0, 1)
        Class labels to report on. Passing them explicitly keeps the confusion
        matrix the same shape even when a class is missing from a small test
        split.
    """
    labels = list(labels)
    print(f"\n{model_name} Performance:")
    # zero_division=0 reports 0 for metrics that are undefined (no predicted
    # or no true samples for a class) instead of emitting a warning.
    print(classification_report(y_true, y_pred, labels=labels, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred, labels=labels))
    print("Accuracy:", accuracy_score(y_true, y_pred))


report_performance("Random Forest", y_test, rf_preds)
report_performance("XGBoost", y_test, xgb_preds)

In [None]:
# Phishing Detection with Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Class Distribution
# Bar chart of how many URLs carry each label, drawn on an explicit Axes.
# NOTE(review): newer seaborn releases may warn when `palette` is given
# without `hue` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, palette="Set2", ax=ax)
ax.set_title("Distribution of Legitimate vs Phishing Non-Web URLs")
ax.set_xlabel("Class (0 = Legitimate, 1 = Phishing)")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Feature Importance (Random Forest)
# Pair each importance score with its column name and rank them, highest first.
feature_names = X.columns
importances = rf_model.feature_importances_
feat_importances = (
    pd.Series(importances, index=feature_names)
    .sort_values(ascending=False)
)

# Horizontal bars: score on the x axis, feature name on the y axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=feat_importances, y=feat_importances.index, palette="viridis", ax=ax)
ax.set_title("Feature Importance for Phishing Detection (Random Forest)")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
plt.show()

In [None]:
# Confusion Matrix Heatmap (XGBoost)
class_names = ["Legit", "Phishing"]

# Pin the label order so the matrix is always 2x2 and lines up with the two
# tick labels, even if the test split happens to contain only one class.
cm = confusion_matrix(y_test, xgb_preds, labels=[0, 1])

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion Matrix - XGBoost")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Example Predictions Visualization
# Look the URLs up by the test rows' index labels so the "URL" column holds
# the actual URL strings rather than row numbers.
sample_df = pd.DataFrame({
    "URL": df.loc[y_test.index, "url"].values,
    "Actual": y_test.values,
    "Predicted": xgb_preds,
})
sample_df = sample_df.head(10)  # Show only 10 samples

# One x position per sample; the tick labels below map positions to URLs.
positions = list(range(len(sample_df)))

plt.figure(figsize=(10,5))
sns.scatterplot(data=sample_df, x=positions, y="Actual", label="Actual", marker="o", s=100)
sns.scatterplot(data=sample_df, x=positions, y="Predicted", label="Predicted", marker="X", s=120)
plt.xticks(positions, sample_df["URL"], rotation=45, ha="right")
plt.title("Sample Predictions: Actual vs Predicted Classes")
plt.ylabel("Class (0=Legit, 1=Phishing)")
plt.legend()
plt.tight_layout()
plt.show()