In [1]:
import zipfile
import os
import pandas as pd
# Function to extract the ZIP file
def extract_zip(file_path, extract_to_folder):
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to_folder)
            print(f"Files extracted to {extract_to_folder}")
    except Exception as e:
        print(f"Error extracting the ZIP file: {e}")

# Function to read the .txt (CSV-like) file from the extracted folder and skip commented lines
def read_file_from_extracted_folder(extracted_folder):
    extracted_files = os.listdir(extracted_folder)
    print(f"Files in extracted folder: {extracted_files}")
    file_to_read = None
    for file in extracted_files:
        if file.endswith('.txt'):
            file_to_read = os.path.join(extracted_folder, file)
            break
    if file_to_read:
        try:
            df = pd.read_csv(file_to_read, comment='#', header=None)
            print(f"File read successfully from {file_to_read}")
            return df
        except Exception as e:
            print(f"Error reading the file: {e}")
            return None
    else:
        print("No readable file found in the extracted folder.")
        return None

# Function to assign column labels to the DataFrame
def assign_column_labels(df):
    df.columns = [
        'id', 'dateadded', 'url', 'url_status', 'last_online', 'threat', 
        'tags', 'urlhaus_link', 'reporter'
    ]
    return df

# Function to save DataFrame to CSV
def save_to_csv(df, output_path):
    try:
        df.to_csv(output_path, index=False)
        print(f"Data saved to {output_path}")
    except Exception as e:
        print(f"Error saving data to CSV: {e}")

# Path setup
zip_file_path = "urlhaus_data.csv"  # Replace with the actual path to the ZIP file
extracted_folder = "extracted_data"
output_csv_path = "malicious_data.csv"

# Process data
extract_zip(zip_file_path, extracted_folder)
df = read_file_from_extracted_folder(extracted_folder)
if df is not None:
    df_malicious = assign_column_labels(df)
    print(df_malicious.head())
    print(f"DataFrame Shape: {df_malicious.shape}")
    save_to_csv(df_malicious, output_csv_path)
else:
    print("Failed to read the file.")


Files extracted to extracted_data
Files in extracted folder: ['csv.txt']
File read successfully from extracted_data/csv.txt
        id            dateadded                                 url  \
0  3312255  2024-11-30 02:04:23   http://117.199.21.47:53445/bin.sh   
1  3312254  2024-11-30 02:03:23  http://117.209.28.169:33720/bin.sh   
2  3312253  2024-11-30 02:03:06  http://222.246.40.100:55760/bin.sh   
3  3312252  2024-11-30 02:02:35    http://59.91.75.160:58979/bin.sh   
4  3312250  2024-11-30 02:02:07          http://39.79.9.209:50976/i   

  url_status          last_online            threat                  tags  \
0     online  2024-11-30 02:04:23  malware_download  32-bit,elf,mips,Mozi   
1     online  2024-11-30 02:03:23  malware_download  32-bit,elf,mips,Mozi   
2     online  2024-11-30 02:03:06  malware_download   32-bit,arm,elf,Mozi   
3    offline                  NaN  malware_download  32-bit,elf,mips,Mozi   
4     online  2024-11-30 02:02:07  malware_download  32-bit,elf,

In [3]:
df_malicious.shape

(173887, 9)

In [5]:
df_malicious.columns

Index(['id', 'dateadded', 'url', 'url_status', 'last_online', 'threat', 'tags',
       'urlhaus_link', 'reporter'],
      dtype='object')

In [7]:
df_malicious.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173887 entries, 0 to 173886
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            173887 non-null  int64 
 1   dateadded     173887 non-null  object
 2   url           173887 non-null  object
 3   url_status    173887 non-null  object
 4   last_online   156697 non-null  object
 5   threat        173887 non-null  object
 6   tags          169485 non-null  object
 7   urlhaus_link  173887 non-null  object
 8   reporter      173887 non-null  object
dtypes: int64(1), object(8)
memory usage: 11.9+ MB


In [9]:
import pandas as pd

# Load the malicious dataset
malicious_data = pd.read_csv("malicious_data.csv")

# Save the updated dataset to a new file
malicious_data.to_csv("malicious_data_final.csv", index=False)

# Display the first few rows to confirm
print("Updated Malicious Dataset with Labels:")
print(malicious_data.head())


Updated Malicious Dataset with Labels:
        id            dateadded                                 url  \
0  3312255  2024-11-30 02:04:23   http://117.199.21.47:53445/bin.sh   
1  3312254  2024-11-30 02:03:23  http://117.209.28.169:33720/bin.sh   
2  3312253  2024-11-30 02:03:06  http://222.246.40.100:55760/bin.sh   
3  3312252  2024-11-30 02:02:35    http://59.91.75.160:58979/bin.sh   
4  3312250  2024-11-30 02:02:07          http://39.79.9.209:50976/i   

  url_status          last_online            threat                  tags  \
0     online  2024-11-30 02:04:23  malware_download  32-bit,elf,mips,Mozi   
1     online  2024-11-30 02:03:23  malware_download  32-bit,elf,mips,Mozi   
2     online  2024-11-30 02:03:06  malware_download   32-bit,arm,elf,Mozi   
3    offline                  NaN  malware_download  32-bit,elf,mips,Mozi   
4     online  2024-11-30 02:02:07  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link  reporter  
0  https://urlhaus

In [11]:
import pandas as pd

# Load the 'tranco_58PNN_legitimate_dataset.csv'
df_legitimate = pd.read_csv('tranco_58PNN-2.csv')

# Display the first few rows of the dataset
df_legitimate.head()

Unnamed: 0,id,dateadded,url,url_status,last_online,threat,tags,urlhaus_link,reporter
0,1,2024-11-30 02:10:35,mail.ru,online,2024-11-30 02:10:35,legitimate,none,,Tranco-58PNN
1,2,2024-11-30 02:10:35,microsoft.com,online,2024-11-30 02:10:35,legitimate,none,,Tranco-58PNN
2,3,2024-11-30 02:10:35,facebook.com,online,2024-11-30 02:10:35,legitimate,none,,Tranco-58PNN
3,4,2024-11-30 02:10:35,dzen.ru,online,2024-11-30 02:10:35,legitimate,none,,Tranco-58PNN
4,5,2024-11-30 02:10:35,apple.com,online,2024-11-30 02:10:35,legitimate,none,,Tranco-58PNN


In [13]:
df_legitimate.shape

(999999, 9)

In [15]:
df_legitimate.columns

Index(['id', 'dateadded', 'url', 'url_status', 'last_online', 'threat', 'tags',
       'urlhaus_link', 'reporter'],
      dtype='object')

In [17]:
df_legitimate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999999 entries, 0 to 999998
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            999999 non-null  int64  
 1   dateadded     999999 non-null  object 
 2   url           999999 non-null  object 
 3   url_status    999999 non-null  object 
 4   last_online   999999 non-null  object 
 5   threat        999999 non-null  object 
 6   tags          999999 non-null  object 
 7   urlhaus_link  0 non-null       float64
 8   reporter      999999 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 68.7+ MB


In [19]:
import pandas as pd

# Load the malicious dataset (with Label column already added)
malicious_data = pd.read_csv("malicious_data_final.csv")  # Replace with your file path

# Load the legitimate dataset (with Label column added as 0)
legitimate_data = pd.read_csv("tranco_58PNN-2.csv")  # Replace with your file path

# Combine the two datasets
combined_data = pd.concat([malicious_data, legitimate_data], ignore_index=True)

# Shuffle the dataset to ensure randomization
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the combined dataset to a new file
combined_data.to_csv("raw_dataset.csv", index=False)

# Display the first few rows of the combined dataset
print("Raw Dataset:")
print(combined_data.head())

Raw Dataset:
        id            dateadded                               url url_status  \
0  2857468  2024-05-20 21:19:22  http://31.222.113.214:49012/sshd     online   
1    36118  2024-11-30 02:10:35                intrepidtravel.com     online   
2   967200  2024-11-30 02:10:35                       stavki.info     online   
3   811220  2024-11-30 02:10:35                  prismatec.srv.br     online   
4    56759  2024-11-30 02:10:35                 digitalpfizer.com     online   

           last_online            threat  tags  \
0  2024-11-30 01:45:01  malware_download   elf   
1  2024-11-30 02:10:35        legitimate  none   
2  2024-11-30 02:10:35        legitimate  none   
3  2024-11-30 02:10:35        legitimate  none   
4  2024-11-30 02:10:35        legitimate  none   

                            urlhaus_link      reporter  
0  https://urlhaus.abuse.ch/url/2857468/  abus3reports  
1                                    NaN  Tranco-58PNN  
2                                 

In [20]:
import pandas as pd

# Load the dataset
file_path = "raw_dataset.csv"
df= pd.read_csv(file_path)

# Display the first few rows to understand the structure
print(df.head())
print(df.info())  # Get information about the dataset

        id            dateadded                               url url_status  \
0  2857468  2024-05-20 21:19:22  http://31.222.113.214:49012/sshd     online   
1    36118  2024-11-30 02:10:35                intrepidtravel.com     online   
2   967200  2024-11-30 02:10:35                       stavki.info     online   
3   811220  2024-11-30 02:10:35                  prismatec.srv.br     online   
4    56759  2024-11-30 02:10:35                 digitalpfizer.com     online   

           last_online            threat  tags  \
0  2024-11-30 01:45:01  malware_download   elf   
1  2024-11-30 02:10:35        legitimate  none   
2  2024-11-30 02:10:35        legitimate  none   
3  2024-11-30 02:10:35        legitimate  none   
4  2024-11-30 02:10:35        legitimate  none   

                            urlhaus_link      reporter  
0  https://urlhaus.abuse.ch/url/2857468/  abus3reports  
1                                    NaN  Tranco-58PNN  
2                                    NaN  Tranc

Steps to Address Class Imbalance
1. Load the Combined Dataset
First, load the combined dataset and inspect the class distribution.

In [22]:
import pandas as pd

# Load the raw combined dataset
df = pd.read_csv("raw_dataset.csv")

# Check class distribution
print("Class Distribution:")
print(df['threat'].value_counts())  # Ensure 'threat' or appropriate label column is present

# Optionally rename 'threat' to 'Label' for clarity
df['Label'] = df['threat'].apply(lambda x: 1 if x == 'malware_download' else 0)

Class Distribution:
threat
legitimate          999999
malware_download    173887
Name: count, dtype: int64


From result:
Phishing URLs (malicious): 173,887 samples.
Legitimate URLs: 999,999 samples.
This imbalance can cause the model to predict the majority class (legitimate) more often, leading to poor performance on phishing detection.

Steps to Handle Class Imbalance
Below are detailed steps to balance your dataset before proceeding to feature extraction.

Combine Techniques:- 
we can do downsampling

Downsample the majority class to a reasonable size.

Step 1: Prepare the Dataset
First, ensure that the dataset is loaded and labeled correctly.
Address Class Imbalance

In [30]:
import pandas as pd

# Load the raw dataset
df = pd.read_csv("raw_dataset.csv")

# Map 'threat' to binary labels: legitimate (0) and phishing (1)
df['Label'] = df['threat'].apply(lambda x: 1 if x == 'malware_download' else 0)

# Check the updated class distribution
print("Updated Class Distribution:")
print(df['Label'].value_counts())

Updated Class Distribution:
Label
0    999999
1    173887
Name: count, dtype: int64


Step 2: Downsample the Majority Class
Reduce the number of legitimate URLs (Label = 0) to a reasonable size while keeping the phishing URLs intact.

In [32]:
import pandas as pd

# Load the raw dataset
df = pd.read_csv("raw_dataset.csv")

# Map 'threat' column to binary labels: legitimate = 0, phishing = 1
df['Label'] = df['threat'].apply(lambda x: 1 if x == 'malware_download' else 0)

# Separate legitimate and phishing samples
df_legit = df[df['Label'] == 0]
df_phish = df[df['Label'] == 1]

# Downsample legitimate URLs to a reasonable size (e.g., 400,000 samples)
df_legit_downsampled = df_legit.sample(173887, random_state=42)

# Combine the downsampled legitimate data with all phishing data
df_downsampled = pd.concat([df_legit_downsampled, df_phish])

# Shuffle the dataset
df_downsampled = df_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the downsampled dataset
df_downsampled.to_csv("downsampled_dataset.csv", index=False)

# Check class distribution
print("Class Distribution After Downsampling:")
print(df_downsampled['Label'].value_counts())


Class Distribution After Downsampling:
Label
1    173887
0    173887
Name: count, dtype: int64


 Load the Downsampled Dataset

In [35]:
# Load necessary libraries
import pandas as pd

# Load the downsampled dataset
df_downsampled = pd.read_csv("downsampled_dataset.csv")

# Inspect the dataset structure
print(df_downsampled.info())

# Check class distribution
print("Class Distribution:")
print(df_downsampled['Label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347774 entries, 0 to 347773
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            347774 non-null  int64 
 1   dateadded     347774 non-null  object
 2   url           347774 non-null  object
 3   url_status    347774 non-null  object
 4   last_online   330584 non-null  object
 5   threat        347774 non-null  object
 6   tags          343372 non-null  object
 7   urlhaus_link  173887 non-null  object
 8   reporter      347774 non-null  object
 9   Label         347774 non-null  int64 
dtypes: int64(2), object(8)
memory usage: 26.5+ MB
None
Class Distribution:
Label
1    173887
0    173887
Name: count, dtype: int64


Extract Numerical Features
Extract keyA numerical features to capture patterns that distinguish phishing from legitimate URLs.

Features to Extraction

1. URL Length
Description: The total length of the URL string.
Why it's important: Phishing URLs often tend to be longer, as they try to obfuscate their true destination.

In [41]:
df_downsampled['URL_Length'] = df_downsampled['url'].apply(len)
print("\nShape of the dataframe:", df_downsampled.shape)
print(df_downsampled.head())


Shape of the dataframe: (347774, 11)
        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link  

2. URL Depth
Description: The number of slashes (/) in the URL path, indicating its depth.
Why it's important: Phishing URLs may have a deep directory structure to confuse users about the website's origin.

In [44]:
df_downsampled['URL_Depth'] = df_downsampled['url'].apply(lambda x: x.count('/'))
print("\nShape of the dataframe:", df_downsampled.shape)
print(df_downsampled.head())


Shape of the dataframe: (347774, 12)
        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link  

3. Presence of Suspicious Characters
Description: Check for suspicious characters like @, %, =, and ? in the URL.
Why it's important: Phishing URLs often contain special characters to manipulate URL parsing and mislead users.

In [47]:
df_downsampled['Have_At'] = df_downsampled['url'].apply(lambda x: 1 if '@' in x else 0)
df_downsampled['Have_Percent'] = df_downsampled['url'].apply(lambda x: 1 if '%' in x else 0)
df_downsampled['Have_Equals'] = df_downsampled['url'].apply(lambda x: 1 if '=' in x else 0)
df_downsampled['Have_QuestionMark'] = df_downsampled['url'].apply(lambda x: 1 if '?' in x else 0)
print("\nShape of the dataframe:", df_downsampled.shape)
print(df_downsampled.head())


Shape of the dataframe: (347774, 16)
        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link  

4. Number of Subdomains
Description: Count the number of subdomains in the URL (i.e., the parts before the domain name).
Why it's important: Phishing URLs often use multiple subdomains to mimic legitimate websites.

In [50]:
# Count the number of subdomains in the URL (excluding the main domain and TLD)
df_downsampled['Num_Subdomains'] = df_downsampled['url'].apply(lambda x: len(x.split('//')[1].split('.')[:-2]) if '//' in x else 0)

# Display the first few rows with the 'Num_Subdomains' feature
print("\nShape of the dataframe:", df_downsampled.shape)
print(df_downsampled.head())


Shape of the dataframe: (347774, 17)
        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link  

5. Check for Suspicious Words in the URL
Phishing URLs often contain suspicious or misleading words like "login", "secure", "bank", etc.

In [53]:
# List of suspicious keywords commonly found in phishing URLs
suspicious_keywords = ['login', 'secure', 'update', 'account', 'signin', 'verify', 'bank', 'paypal', 'free', 'confirm']

# Function to check if any suspicious keyword is in the URL
def check_suspicious_keywords(url):
    return 1 if any(keyword in url.lower() for keyword in suspicious_keywords) else 0  # Case-insensitive check

# Apply the function to the DataFrame and create the 'Has_Suspicious_Keyword' column
df_downsampled['Has_Suspicious_Keyword'] = df_downsampled['url'].apply(check_suspicious_keywords)

# Display the first few rows of the dataframe to verify
print("\nShape of the dataframe:", df_downsampled.shape)
print(df_downsampled.head())


Shape of the dataframe: (347774, 18)
        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link  

6.Number of IP Addresses in the URL
Phishing URLs sometimes use IP addresses instead of domain names. This feature helps identify those URLs.

In [56]:
import re

# Function to check if URL has an IP address
def has_ip(url):
    ip_pattern = r'(\d{1,3}\.){3}\d{1,3}'  # Matches typical IP address pattern
    return 1 if re.search(ip_pattern, url) else 0  # Return 1 if IP is found, else return 0

# Apply the function to the DataFrame and create the 'Has_IP' column
df_downsampled['Has_IP'] = df_downsampled['url'].apply(has_ip)
print("\nShape of the dataframe:", df_downsampled.shape)
# Display the first few rows of the dataframe to verify
print(df_downsampled.head())



Shape of the dataframe: (347774, 19)
        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link  

7.Query Parameters Count
Phishing URLs often use multiple query parameters to obfuscate their intent.

In [59]:
df_downsampled['Query_Parameter_Count'] = df_downsampled['url'].apply(lambda x: x.count('?') + x.count('&'))
print("\nShape of the dataframe:", df_downsampled.shape)
print(df_downsampled.head())


Shape of the dataframe: (347774, 20)
        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link  

8.Port Number: Phishing URLs often use non-standard port numbers. This feature captures any port number other than the default HTTP (80) or HTTPS (443).

In [62]:
def has_non_standard_port(url):
    # Check if URL contains a port
    if ':' in url:
        # Extract the port number
        port = url.split(':')[1].split('/')[0]
        
        # Handle case where the port is empty or invalid
        if port:
            try:
                port_num = int(port)  # Try converting to integer
                # Return 1 if the port is not 80 (HTTP) or 443 (HTTPS), otherwise return 0
                return 1 if port_num not in [80, 443] else 0
            except ValueError:
                return 0  # If invalid port, return 0
    return 0  # Return 0 if no port is found

# Apply the function to the DataFrame
df_downsampled['Has_Port'] = df_downsampled['url'].apply(has_non_standard_port)
print("\nShape of the dataframe:", df_downsampled.shape)
# Show the result
print(df_downsampled.head())



Shape of the dataframe: (347774, 21)
        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link  

9. Number of Path Segments Extraction
 This feature counts the number of segments in the path part of the URL.

In [65]:
def num_path_segments(url):
    # Extract the path part of the URL (after the domain)
    path = url.split('//')[-1].split('/')[1:]
    
    # Count the number of path segments
    return len(path)

# Apply the function to the DataFrame
df_downsampled['Num_Path_Segments'] = df_downsampled['url'].apply(num_path_segments)
print("\nShape of the dataframe:", df_downsampled.shape)
# Show the result
print(df_downsampled.head())


Shape of the dataframe: (347774, 22)
        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link  

10.Presence of URL Shorteners Extraction
This feature checks whether the URL contains a shortener domain (like bit.ly, goo.gl, tinyurl.com, etc.), which is a common tactic used by phishing websites to disguise malicious links

In [68]:
from urllib.parse import urlparse

# Function to check if a URL uses a shortener service
def has_url_shortener(url):
    """
    Function to check if a URL uses a known URL shortener service.

    Args:
    url (str): The URL to check.

    Returns:
    int: 1 if a URL shortener is detected, 0 otherwise.
    """
    shorteners = ['bit.ly', 'goo.gl', 'tinyurl.com', 't.co', 'is.gd']
    # Check if any of the shorteners are present in the domain part of the URL
    return 1 if any(shortener in url for shortener in shorteners) else 0

# Function to check if a URL uses HTTP or HTTPS
def check_http_or_https(url):
    """
    Function to check if a URL uses HTTP or HTTPS.

    Args:
    url (str): The URL to check.

    Returns:
    int: 1 if HTTPS is present, 0 if HTTP is present, -1 if no scheme.
    """
    parsed_url = urlparse(url)
    return 1 if parsed_url.scheme == 'https' else 0 if parsed_url.scheme == 'http' else -1  # -1 for no scheme

# Assuming `df_downsampled` is your DataFrame with a column 'url'
df_downsampled = df_downsampled.copy()  # Create a new DataFrame named preprocess_dataset

# Apply the functions to extract features
df_downsampled['Has_HTTPS'] = df_downsampled['url'].apply(check_http_or_https)
df_downsampled['Has_URL_Shortener'] = df_downsampled['url'].apply(has_url_shortener)

# Output results
print("\nShape of the dataframe:", balanced_dataset.shape)
print(df_downsampled[['url', 'Has_HTTPS', 'Has_URL_Shortener']].head())



NameError: name 'balanced_dataset' is not defined

In [None]:
from urllib.parse import urlparse

def check_http_or_https(url):
    """
    Function to check if a URL uses HTTP or HTTPS.

    Args:
    url (str): The URL to check.

    Returns:
    int: 1 if HTTPS is present, 0 if HTTP is present.
    """
    parsed_url = urlparse(url)
    return 1 if parsed_url.scheme == 'https' else 0 if parsed_url.scheme == 'http' else 1  # -1 for no scheme

# Apply the function to the DataFrame
df_downsampled = df_downsampled.copy()  # Create a new DataFrame named preprocess_dataset
df_downsampled['Has_HTTPS'] = df_downsampled['url'].apply(check_http_or_https)

print("\nShape of the dataframe:", balanced_dataset.shape)
# Show the result
print(df_downsampled[['url', 'Has_HTTPS']].head())


In [None]:
def extract_tld_type(url):
    suspicious_tlds = ['xyz', 'top', 'club', 'work']
    parsed_url = urlparse(url)
    domain_parts = parsed_url.netloc.split('.')
    tld = domain_parts[-1] if len(domain_parts) > 1 else ''
    return 1 if tld in suspicious_tlds else 0

df_downsampled['tld_type'] = df_downsampled['url'].apply(extract_tld_type)
print(df_downsampled.head())

In [72]:
def digit_letter_ratio(url):
    letters = sum(c.isalpha() for c in url)
    digits = sum(c.isdigit() for c in url)
    return digits / (letters + 1e-6)  # Avoid division by zero

df_downsampled['digit_letter_ratio'] = df_downsampled['url'].apply(digit_letter_ratio)
print(df_downsampled.head())

        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link      reporter  Label  ...  \
0  https:/

In [73]:
def has_encoded_chars(url):
    return 1 if '%' in url else 0

df_downsampled['has_encoded_chars'] = df_downsampled['url'].apply(has_encoded_chars)

print(df_downsampled.head())

        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link      reporter  Label  ...  \
0  https:/

In [74]:
def has_https_www_combo(url):
    return 1 if 'https://' in url and 'www.' in url else 0

df_downsampled['https_www_combo'] = df_downsampled['url'].apply(has_https_www_combo)
print(df_downsampled.head())

        id            dateadded                                      url  \
0  3230050  2024-10-11 12:23:33        http://117.209.4.108:57478/bin.sh   
1  3177228  2024-09-16 19:43:06  http://154.216.18.121/.c/loginclientbot   
2      815  2024-11-30 02:10:35                                 zoho.com   
3   792779  2024-11-30 02:10:35                            kp-velenje.si   
4  3157557  2024-09-05 06:43:05            http://119.117.99.166:41825/i   

  url_status          last_online            threat                  tags  \
0    offline  2024-10-11 12:23:33  malware_download  32-bit,elf,mips,Mozi   
1    offline  2024-09-20 07:22:27  malware_download    elf,ladvix,opendir   
2     online  2024-11-30 02:10:35        legitimate                  none   
3     online  2024-11-30 02:10:35        legitimate                  none   
4    offline  2024-09-10 05:51:25  malware_download  32-bit,elf,mips,Mozi   

                            urlhaus_link      reporter  Label  ...  \
0  https:/

In [76]:
# Save the resampled dataset to a CSV file
df_downsampled.to_csv('balanced_dataset.csv', index=False)
print("\nShape of the dataframe:", df_downsampled.shape)
print("Balanced dataset saved successfully!")


Shape of the dataframe: (347774, 27)
Balanced dataset saved successfully!


In [79]:
df_downsampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 347774 entries, 0 to 347773
Data columns (total 27 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      347774 non-null  int64  
 1   dateadded               347774 non-null  object 
 2   url                     347774 non-null  object 
 3   url_status              347774 non-null  object 
 4   last_online             330584 non-null  object 
 5   threat                  347774 non-null  object 
 6   tags                    343372 non-null  object 
 7   urlhaus_link            173887 non-null  object 
 8   reporter                347774 non-null  object 
 9   Label                   347774 non-null  int64  
 10  URL_Length              347774 non-null  int64  
 11  URL_Depth               347774 non-null  int64  
 12  Have_At                 347774 non-null  int64  
 13  Have_Percent            347774 non-null  int64  
 14  Have_Equals         