In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,precision_score, recall_score, confusion_matrix
file_path = 'url_dataset.csv'
data = pd.read_csv(file_path)

In [2]:
data.info(), data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Domain         10000 non-null  object
 1   Have_IP        10000 non-null  int64 
 2   Have_At        10000 non-null  int64 
 3   URL_Length     10000 non-null  int64 
 4   URL_Depth      10000 non-null  int64 
 5   Redirection    10000 non-null  int64 
 6   https_Domain   10000 non-null  int64 
 7   TinyURL        10000 non-null  int64 
 8   Prefix/Suffix  10000 non-null  int64 
 9   DNS_Record     10000 non-null  int64 
 10  Web_Traffic    10000 non-null  int64 
 11  Domain_Age     10000 non-null  int64 
 12  Domain_End     10000 non-null  int64 
 13  iFrame         10000 non-null  int64 
 14  Mouse_Over     10000 non-null  int64 
 15  Right_Click    10000 non-null  int64 
 16  Web_Forwards   10000 non-null  int64 
 17  Label          10000 non-null  int64 
dtypes: int64(17), object(1)
mem

(None,
             Have_IP       Have_At    URL_Length     URL_Depth   Redirection  \
 count  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000   
 mean       0.005500      0.022600      0.773400      3.072000      0.013500   
 std        0.073961      0.148632      0.418653      2.128631      0.115408   
 min        0.000000      0.000000      0.000000      0.000000      0.000000   
 25%        0.000000      0.000000      1.000000      2.000000      0.000000   
 50%        0.000000      0.000000      1.000000      3.000000      0.000000   
 75%        0.000000      0.000000      1.000000      4.000000      0.000000   
 max        1.000000      1.000000      1.000000     20.000000      1.000000   
 
        https_Domain       TinyURL  Prefix/Suffix    DNS_Record   Web_Traffic  \
 count  10000.000000  10000.000000   10000.000000  10000.000000  10000.000000   
 mean       0.000200      0.090300       0.093200      0.100800      0.845700   
 std        0.014141      0.

In [3]:
outliers = {}
for column in data.select_dtypes(include=['int64']).columns:
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outlier_count = ((data[column] < lower_bound) | (data[column] > upper_bound)).sum()
    outliers[column] = outlier_count

outliers

{'Have_IP': 55,
 'Have_At': 226,
 'URL_Length': 2266,
 'URL_Depth': 357,
 'Redirection': 135,
 'https_Domain': 2,
 'TinyURL': 903,
 'Prefix/Suffix': 932,
 'DNS_Record': 1008,
 'Web_Traffic': 1543,
 'Domain_Age': 0,
 'Domain_End': 1901,
 'iFrame': 909,
 'Mouse_Over': 666,
 'Right_Click': 7,
 'Web_Forwards': 1053,
 'Label': 0}

In [4]:
X = data.drop(["Label","Domain"], axis=1)
y = data["Label"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25)

In [5]:
#Neural Network 
parameters = {'activation': ['identity', 'logistic', 'tanh', 'relu'], 'solver': ['lbfgs','adam','sgd'], 'hidden_layer_sizes':np.arange(5, 15),'max_iter':[500,600,700,800,900,1000]}
mlp_clf = GridSearchCV(MLPClassifier(), parameters,n_jobs=-1)
mlp_clf.fit(X_train, y_train)
print(mlp_clf.best_params_)

{'activation': 'tanh', 'hidden_layer_sizes': 14, 'max_iter': 900, 'solver': 'lbfgs'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [6]:
y_pred = mlp_clf.predict(X_test)
mlp_accuracy = accuracy_score(y_test,y_pred)
mlp_recall = recall_score(y_test,y_pred)
mlp_precision = precision_score(y_test,y_pred)
print("MLP Accuracy:",mlp_accuracy,"\nMLP Recall",mlp_recall,"\nMLP Precision",mlp_precision)
print(confusion_matrix(y_test,y_pred))

MLP Accuracy: 0.8492 
MLP Recall 0.7545239968528717 
MLP Precision 0.9365234375
[[1164   65]
 [ 312  959]]


In [7]:
#Random Forest