In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import re

In [2]:
# Read the training table from the Kaggle input directory into a DataFrame.
train_csv_path = '/kaggle/input/tabular-dataset-ready-for-malicious-url-detection/train_dataset.csv'
data = pd.read_csv(train_csv_path)

In [3]:
# Derive simple lexical features from the raw URL string.
#
# The vectorised `.str` accessors replace per-row Python lambdas: for string
# URLs they produce the same values, they avoid a Python-level call per row,
# and a missing URL no longer raises (length/count columns become NaN for that
# row, the two flag columns become 0).
urls = data['url']

# 1. URL length (number of characters)
data['url_length'] = urls.str.len()

# 2. Counts of special characters (@, -, ., //).
#    `.str.count` interprets its argument as a regular expression, so the
#    literal dot has to be escaped.
data['count_at'] = urls.str.count('@')
data['count_dash'] = urls.str.count('-')
data['count_dot'] = urls.str.count(r'\.')
# NOTE: despite its name this column counts '//' (double slash), not dashes.
# The name is kept because later cells select the column by this name.
data['count_double_dash'] = urls.str.count('//')

# 3. Subdomain proxy: the number of dots in the whole URL.
#    This is by construction identical to `count_dot`, so the already computed
#    column is reused instead of scanning every URL a second time.
data['num_subdomains'] = data['count_dot']

# 4. Presence of an IP address in the URL
#    (four dot-separated groups of 1-3 digits; the groups are not range-checked,
#    so something like 999.1.1.1 also matches).
ipv4_like_pattern = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'
data['has_ip'] = urls.str.contains(ipv4_like_pattern, regex=True, na=False).astype('int64')

# 5. Presence of HTTPS.
#    This is a plain substring test: 'https' anywhere in the URL (scheme, host
#    or path) sets the flag.
data['https'] = urls.str.contains('https', regex=False, na=False).astype('int64')

# Display a few rows to verify the new features
data.head()
     

Unnamed: 0,url,label,source,url_has_login,url_has_client,url_has_server,url_has_admin,url_has_ip,url_isshorted,url_len,...,subdomain_len,subdomain_count_dot,url_length,count_at,count_dash,count_dot,count_double_dash,num_subdomains,has_ip,https
0,irs-profilepaymentservice.com/home,1,phishtank,0,0,0,0,0,0,34,...,0,0,34,0,1,1,0,1,0,0
1,cpuggsukabumi.id,0,majestic_million,0,0,0,0,0,0,16,...,0,0,16,0,0,1,0,1,0,0
2,members.tripod.com/~don_rc/ring.htm,0,data_clean_test_mendel,0,0,0,0,0,0,35,...,7,0,35,0,0,3,0,3,0,0
3,optuswebmailadminprovider.weebly.com/,1,phishtank,0,0,0,1,0,0,37,...,25,0,37,0,0,2,0,2,0,0
4,topoz.com.pl,0,dmoz_harvard,0,0,0,0,0,0,12,...,0,0,12,0,0,2,0,2,0,0


In [4]:
# Engineered URL columns that are screened for outliers below.
features = [
    'url_length',
    'count_at', 'count_dash', 'count_dot', 'count_double_dash',
    'num_subdomains',
    'has_ip', 'https',
]
# Function to find outliers using the IQR (Tukey fence) method
def detect_outliers(df, column, k=1.5):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1/Q3 are the
    25th/75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of the numeric column to test.
    k : float, default 1.5
        Fence multiplier. The default reproduces the usual 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of rows strictly below the lower fence or strictly above
        the upper fence. Rows where the value is NaN are never returned,
        because both comparisons are False for NaN.

    Notes
    -----
    When Q1 == Q3 the IQR is 0 and both fences collapse onto that single
    value, so every row with a different value is reported. This is what
    happens for 0/1 flag columns in which one value covers at least 75% of
    the rows.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    # Strict inequalities: values sitting exactly on a fence are not outliers.
    return df[(values < lower_bound) | (values > upper_bound)]

# Report, per screened feature, how many rows fall outside the IQR fences.
for feature in features:
    flagged_rows = detect_outliers(data, feature)
    print(f"Number of outliers in {feature}: {flagged_rows.shape[0]}")

Number of outliers in url_length: 501109
Number of outliers in count_at: 13928
Number of outliers in count_dash: 1292516
Number of outliers in count_dot: 178994
Number of outliers in count_double_dash: 4940
Number of outliers in num_subdomains: 178994
Number of outliers in has_ip: 32531
Number of outliers in https: 10616


In [5]:
# Define a function to remove outliers based on the IQR method
def remove_outliers_iqr(df, columns, k=1.5):
    """Drop rows whose value in any of ``columns`` lies outside the IQR fences.

    Columns are processed one after another. For each column the fences
    ``Q1 - k * IQR`` and ``Q3 + k * IQR`` are computed on the rows that
    survived the previous columns, so the result can depend on the order of
    ``columns``, and the number of rows removed need not equal the number
    flagged when each column is examined on the full frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified (a filtered frame is returned).
    columns : iterable of str
        Names of the numeric columns to screen, in processing order.
    k : float, default 1.5
        Fence multiplier. The default reproduces the usual 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        Rows whose value lies within the inclusive fences for every column.
        Rows with NaN in a screened column are dropped, because both
        comparisons are False for NaN. If ``columns`` is empty, ``df`` itself
        is returned.

    Notes
    -----
    When Q1 == Q3 for a column (e.g. a 0/1 flag dominated by one value) the
    IQR is 0 and only rows equal to that single value are kept.
    """
    for col in columns:
        # Quartiles of the current (already filtered) frame for this column
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - k * iqr
        upper_bound = q3 + k * iqr
        # Keep rows inside the inclusive [lower_bound, upper_bound] interval
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df

# Columns screened when removing outliers.
outlier_columns = [
    'url_length',
    'count_at', 'count_dash', 'count_dot', 'count_double_dash',
    'num_subdomains',
    'has_ip', 'https',
]

# Filter the full dataset; `data` itself is left untouched.
data_cleaned = remove_outliers_iqr(df=data, columns=outlier_columns)

# Compare (rows, columns) before and after the filtering step.
shape_before = data.shape
shape_after = data_cleaned.shape
print("Original dataset shape:", shape_before)
print("Dataset shape after outlier removal:", shape_after)

Original dataset shape: (6728848, 68)
Dataset shape after outlier removal: (5128786, 68)


In [6]:
# Share of rows flagged as outliers for each screened column,
# expressed as a percentage of the full (unfiltered) dataset.
total_rows = data.shape[0]
for feature in outlier_columns:
    flagged_count = detect_outliers(data, feature).shape[0]
    outlier_percentage = (flagged_count / total_rows) * 100
    print(f"Proportion d'outliers pour {feature}: {outlier_percentage:.2f}%")

Proportion d'outliers pour url_length: 7.45%
Proportion d'outliers pour count_at: 0.21%
Proportion d'outliers pour count_dash: 19.21%
Proportion d'outliers pour count_dot: 2.66%
Proportion d'outliers pour count_double_dash: 0.07%
Proportion d'outliers pour num_subdomains: 2.66%
Proportion d'outliers pour has_ip: 0.48%
Proportion d'outliers pour https: 0.16%


In [7]:
# Final model inputs, chosen from a feature-importance analysis.
# The order is significant: it fixes the column order of the design matrix.
selected_features = [
    'url_entropy', 'path_count_no_of_dir', 'url_3bentropy',
    'url_length', 'path_len', 'pdomain_min_distance',
    'subdomain_len', 'url_count_digit', 'url_2bentropy',
    'url_hamming_1', 'path_count_upper', 'path_count_lower',
    'tld_len', 'url_hamming_01', 'url_nunique_chars_ratio',
]

# Target vector and design matrix.
# NOTE(review): both are taken from `data` (the unfiltered frame), not from
# `data_cleaned` — confirm that skipping the outlier-filtered frame is intended.
y = data['label']
X = data[selected_features]

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a random forest with the library's default hyper-parameters
# (only the seed is fixed so that the run is repeatable).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the held-out labels
rf_accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)
print("Accuracy:", rf_accuracy)
print("Classification Report:\n", rf_report)

Accuracy: 0.9319014393246988
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96   1056595
           1       0.91      0.76      0.83    289175

    accuracy                           0.93   1345770
   macro avg       0.92      0.87      0.89   1345770
weighted avg       0.93      0.93      0.93   1345770



In [10]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression baseline on the same split
# (iteration cap raised to 1000, seed fixed).
logistic_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred_logistic = logistic_model.predict(X_test)

# Score the predictions against the held-out labels
logistic_accuracy = accuracy_score(y_test, y_pred_logistic)
logistic_report = classification_report(y_test, y_pred_logistic)
print("Logistic Regression Accuracy:", logistic_accuracy)
print("Logistic Regression Classification Report:\n", logistic_report)

 

Logistic Regression Accuracy: 0.8715746375680837
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.96      0.92   1056595
           1       0.79      0.55      0.65    289175

    accuracy                           0.87   1345770
   macro avg       0.84      0.75      0.78   1345770
weighted avg       0.87      0.87      0.86   1345770

