### import pandas as pd 
df = pd.read_csv("C:\\Users\\jayan\\OneDrive\\Documents\\finalyearproject\\url_spam_classification.csv")
df.head()

In [182]:
df = df.drop_duplicates(keep = 'first')


In [184]:
df.shape


(87581, 2)

In [186]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['is_spam'])

df = df.drop('is_spam', axis=1)

In [188]:
df.head

<bound method NDFrame.head of                                                       url  target
0       https://briefingday.us8.list-manage.com/unsubs...       1
1                                  https://www.hvper.com/       1
2                      https://briefingday.com/m/v4n3i4f3       1
3        https://briefingday.com/n/20200618/m#commentform       0
4                             https://briefingday.com/fan       1
...                                                   ...     ...
148287  https://numlock.substack.com/p/numlock-news-de...       0
148288  https://numlock.substack.com/p/numlock-news-de...       0
148290  https://www.hollywoodreporter.com/news/2020-bo...       0
148294  https://apnews.com/article/seoul-south-korea-n...       0
148297  https://www.wsj.com/articles/the-philadelphia-...       0

[87581 rows x 2 columns]>

In [190]:
import re
import urllib.parse
import math

# Feature extraction function
def extract_url_features(url):
    # Parse the URL
    parsed_url = urllib.parse.urlparse(url)
    
    features = {}
    
    # 1. url-length: Number of characters in the URL
    features['url_length'] = len(url)
    
    # 2. has_subscribe: Whether the URL contains the word 'subscribe'
    features['has_subscribe'] = int('subscribe' in url.lower())
    
    # 3. contains_hash: Whether the URL contains the hash '#' symbol
    features['contains_hash'] = int('#' in url)
    
    # 4. num_digits: The number of digits in the URL
    features['num_digits'] = len(re.findall(r'\d', url))
    
    # 5. non_https: Whether the URL uses a non-HTTPS connection
    features['non_https'] = int(parsed_url.scheme != 'https')
    
    # 6. num_words: The number of words in the URL (split by '/' and '-')
    features['num_words'] = len(re.findall(r'[\w]+', parsed_url.path))
    
    # 7. entropy: Measure of entropy (disorder/uncertainty) in the URL
    def calculate_entropy(url):
        # Frequency of each character
        prob = [float(url.count(c)) / len(url) for c in set(url)]
        # Shannon entropy formula
        entropy = -sum([p * math.log2(p) for p in prob])
        return entropy
    
    features['entropy'] = calculate_entropy(url)
    
    # 8. num_params: Number of query parameters in the URL
    query_params = urllib.parse.parse_qs(parsed_url.query)
    features['num_params'] = len(query_params)
    
    # 9. num_fragments: Number of fragments in the URL (after '#')
    features['num_fragments'] = len(parsed_url.fragment.split('&')) if parsed_url.fragment else 0
    
    # 10. num_subdomains: Number of subdomains (split by '.')
    features['num_subdomains'] = len(parsed_url.netloc.split('.')) - 2  # -2 for 'domain' and 'tld'
    
    # 11. num_%20: Number of encoded white spaces ('%20') in the URL
    features['num_%20'] = url.count('%20')
    
    # 12. num_@: Number of '@' symbols in the URL
    features['num_@'] = url.count('@')
    
    # 13. has_ip: Check if the URL has an IP address instead of a domain name
    ip_pattern = re.compile(r'(\d{1,3}\.){3}\d{1,3}')  # Pattern to match IP addresses
    features['has_ip'] = int(bool(ip_pattern.search(parsed_url.netloc)))
    
    return features


In [192]:
df['features'] = df['url'].apply(extract_url_features)


In [193]:
df.head()

Unnamed: 0,url,target,features
0,https://briefingday.us8.list-manage.com/unsubs...,1,"{'url_length': 51, 'has_subscribe': 1, 'contai..."
1,https://www.hvper.com/,1,"{'url_length': 22, 'has_subscribe': 0, 'contai..."
2,https://briefingday.com/m/v4n3i4f3,1,"{'url_length': 34, 'has_subscribe': 0, 'contai..."
3,https://briefingday.com/n/20200618/m#commentform,0,"{'url_length': 48, 'has_subscribe': 0, 'contai..."
4,https://briefingday.com/fan,1,"{'url_length': 27, 'has_subscribe': 0, 'contai..."


In [195]:
X = pd.json_normalize(df['features'])

print(X)


       url_length  has_subscribe  contains_hash  num_digits  non_https  \
0              51              1              0           1          0   
1              22              0              0           0          0   
2              34              0              0           4          0   
3              48              0              1           8          0   
4              27              0              0           0          0   
...           ...            ...            ...         ...        ...   
87576          68              0              0           6          0   
87577          77              0              0           6          0   
87578          97              0              0           6          0   
87579         113              0              0          24          0   
87580         112              0              0          11          0   

       num_words   entropy  num_params  num_fragments  num_subdomains  \
0              1  4.385195           0

In [199]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Assuming 'X' is already your feature matrix and 'y' is your target column
from sklearn.ensemble import RandomForestClassifier

X = pd.json_normalize(df['features'])

y = df['target']
# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize BaggingClassifier with DecisionTreeClassifier as the base estimator
bagging_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, random_state=42)

# Step 4: Fit the model to the training data
bagging_clf.fit(X_train, y_train)

# Step 5: Make predictions on the test data
y_pred = bagging_clf.predict(X_test)

# Step 6: Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Step 7: Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9682023177484729
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     16956
           1       0.51      0.19      0.28       561

    accuracy                           0.97     17517
   macro avg       0.74      0.59      0.63     17517
weighted avg       0.96      0.97      0.96     17517



In [7]:
# 1. Logistic Regression
from sklearn.linear_model import LogisticRegression

# 2. Support Vector Machine (SVM)
from sklearn.svm import SVC

# 3. K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

# 4. Decision Tree
from sklearn.tree import DecisionTreeClassifier

# 5. Random Forest
from sklearn.ensemble import RandomForestClassifier

# 6. Gradient Boosting Machines (GBM)
from sklearn.ensemble import GradientBoostingClassifier

# 7. XGBoost
from xgboost import XGBClassifier
from xgboost import XGBClassifier

# 8. LightGBM
from lightgbm import LGBMClassifier

# 9. CatBoost
from catboost import CatBoostClassifier

# 10. Naive Bayes
from sklearn.naive_bayes import GaussianNB

# 11. Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# 12. Quadratic Discriminant Analysis (QDA)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# 13. AdaBoost
from sklearn.ensemble import AdaBoostClassifier

# 14. Extra Trees
from sklearn.ensemble import ExtraTreesClassifier

# 15. Stacking
from sklearn.ensemble import StackingClassifier

# 16. Voting Classifier
from sklearn.ensemble import VotingClassifier

# 17. Principal Component Analysis (PCA) for dimensionality reduction
from sklearn.decomposition import PCA


In [8]:

classifiers = {}

# 1. Logistic Regression
classifiers['log_reg'] = LogisticRegression(random_state=42)

# 2. Support Vector Machine (SVM)
classifiers['svc'] = SVC(kernel='sigmoid', gamma=1.0)

# 3. K-Nearest Neighbors (KNN)
classifiers['knn'] = KNeighborsClassifier(n_neighbors=5)

# 4. Decision Tree
classifiers['decision_tree'] = DecisionTreeClassifier(random_state=42)

# 5. Random Forest
classifiers['random_forest'] = RandomForestClassifier(n_estimators=100, random_state=42)

# 6. Gradient Boosting Machines (GBM)
classifiers['gb'] = GradientBoostingClassifier(n_estimators=100, random_state=42)

# 7. XGBoost
classifiers['xgb'] = XGBClassifier(random_state=42)

# 8. LightGBM
classifiers['lgb'] = LGBMClassifier(random_state=42)

# 9. CatBoost
classifiers['catboost'] = CatBoostClassifier(random_state=42, verbose=0)

# 10. Naive Bayes
classifiers['naive_bayes'] = GaussianNB()

# 11. Linear Discriminant Analysis (LDA)
classifiers['lda'] = LinearDiscriminantAnalysis()

# 12. Quadratic Discriminant Analysis (QDA)
classifiers['qda'] = QuadraticDiscriminantAnalysis()

# 13. AdaBoost
classifiers['ada'] = AdaBoostClassifier(random_state=42)

# 14. Extra Trees
classifiers['extra_trees'] = ExtraTreesClassifier(n_estimators=100, random_state=42)




In [9]:
from sklearn.ensemble import StackingClassifier

def create_stacking_classifier():
    # Base estimators
    base_estimators = [
        ('log_reg', LogisticRegression(random_state=42)),
        ('svc', SVC(kernel='sigmoid', gamma=1.0)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42))
    ]

    # Final estimator
    final_estimator = LogisticRegression(random_state=42)

    # Stacking Classifier
    stacking_clf = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator, cv=5)

    return stacking_clf

# Example of using the function
stacking_classifier = create_stacking_classifier()


In [10]:
def create_voting_classifier():
    # Create base classifiers
    base_estimators = [
        ('log_reg', LogisticRegression(random_state=42)),
        ('svc', SVC(kernel='sigmoid', gamma=1.0, probability=True)),  # Set probability=True for soft voting
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42))
    ]

    # Create Voting Classifier (soft voting)
    voting_clf = VotingClassifier(estimators=base_estimators, voting='soft')

    return voting_clf

In [13]:
import pickle
# Create the voting classifier
voting_clf_model = create_voting_classifier()

# Save the model to a .pkl file
with open('voting_classifier.pkl', 'wb') as file:
    pickle.dump(voting_clf_model, file)

print("Model successfully pickled as 'voting_classifier.pkl'.")

Model successfully pickled as 'voting_classifier.pkl'.


In [242]:
clfs = {
    'SVC': svc,
    'Logistic Regression': LogisticRegression(random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_state=42, verbose=0),
    'Naive Bayes': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=42),
    'Stacking': create_stacking_classifier(),
    'Voting Classifier': create_voting_classifier(),
    # Assuming you have the create_stacking_classifier function defined
}

In [244]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    
    return accuracy,precision

In [247]:
accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


For  SVC
Accuracy -  0.9679739681452304
Precision -  0.0
For  Logistic Regression
Accuracy -  0.9714562995946795
Precision -  0.9295774647887324
For  KNN
Accuracy -  0.969458240566307
Precision -  0.6031746031746031
For  Decision Tree
Accuracy -  0.9537021179425701
Precision -  0.24279835390946503
For  Random Forest
Accuracy -  0.9679739681452304
Precision -  0.5
For  Gradient Boosting
Accuracy -  0.9719700862019752
Precision -  0.8977272727272727
For  XGBoost
Accuracy -  0.9726551350117029
Precision -  0.91
[LightGBM] [Info] Number of positive: 2224, number of negative: 67840
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007269 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 642
[LightGBM] [Info] Number of data points in the train set: 70064, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031742 -> init

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


For  AdaBoost
Accuracy -  0.9712850373922476
Precision -  0.8918918918918919
For  Extra Trees
Accuracy -  0.9649483359022664
Precision -  0.40148698884758366


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


For  Stacking
Accuracy -  0.9717417365987326
Precision -  0.83
For  Voting Classifier
Accuracy -  0.9717417365987326
Precision -  1.0


In [249]:
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)
performance_df

Unnamed: 0,Algorithm,Accuracy,Precision
15,Voting Classifier,0.971742,1.0
1,Logistic Regression,0.971456,0.929577
6,XGBoost,0.972655,0.91
7,LightGBM,0.972712,0.902913
5,Gradient Boosting,0.97197,0.897727
12,AdaBoost,0.971285,0.891892
8,CatBoost,0.972541,0.884615
10,LDA,0.971,0.835443
14,Stacking,0.971742,0.83
2,KNN,0.969458,0.603175


In [257]:

current_accuracy,current_precision = train_classifier(voting_classifier(), X_train,y_train,X_test,y_test)
    
print("For ",name)
print("Accuracy - ",current_accuracy)
print("Precision - ",current_precision)

[LightGBM] [Info] Number of positive: 2224, number of negative: 67840
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004550 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 642
[LightGBM] [Info] Number of data points in the train set: 70064, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031742 -> initscore=-3.417845
[LightGBM] [Info] Start training from score -3.417845
For  Voting Classifier
Accuracy -  0.9722555232060285
Precision -  0.9746835443037974


In [None]:
def voting_classifier():
    # Create base classifiers
    base_estimators = [
        ('log_reg', LogisticRegression(random_state=42)),
        ('XGBoost', XGBClassifier(random_state=42)),  # Set probability=True for soft voting
        ('LightGBM',LGBMClassifier(random_state=42)),
        ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=42))
    ]

    # Create Voting Classifier (soft voting)
    voting_clf = VotingClassifier(estimators=base_estimators, voting='hard')

    return voting_clf