In [4]:
# Install the pandas library using pip
!pip install pandas



In [1]:
# Importing necessary libraries
# For data manipulation and analysis
import pandas as pd 
# For numerical operations
import numpy as np  

# Read the phishing dataset CSV from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer='C:/JN/dataset_phishing.csv')

# Print a plain-text preview of the first five rows
print(data.head(n=5))


                                                 url  length_url  \
0              http://www.crestonwood.com/router.php          37   
1  http://shadetreetechnology.com/V4/validation/a...          77   
2  https://support-appleld.com.secureupdate.duila...         126   
3                                 http://rgipt.ac.in          18   
4  http://www.iracing.com/tracks/gateway-motorspo...          55   

   length_hostname  ip  nb_dots  nb_hyphens  nb_at  nb_qm  nb_and  nb_or  ...  \
0               19   0        3           0      0      0       0      0  ...   
1               23   1        1           0      0      0       0      0  ...   
2               50   1        4           1      0      1       2      0  ...   
3               11   0        2           0      0      0       0      0  ...   
4               15   0        2           2      0      0       0      0  ...   

   domain_in_title  domain_with_copyright  whois_registered_domain  \
0                0                

In [2]:
# Columns kept for the analysis: the selected feature columns plus 'status'
relevant_columns = [
    'url',
    'length_url',
    'google_index',
    'nb_hyperlinks',
    'nb_subdomains',
    'domain_age',
    'https_token',
    'length_words_raw',
    'char_repeat',
    'ratio_extHyperlinks',
    'status',
]

# Project the full dataset onto the selected columns only
filtered_data = data[relevant_columns]

# Discard every row that has a missing value in any of the selected columns
cleaned_data = filtered_data.dropna(axis=0, how='any', subset=relevant_columns)

# Preview the first five rows (rendered as this cell's output)
cleaned_data.head(n=5)


Unnamed: 0,url,length_url,google_index,nb_hyperlinks,nb_subdomains,domain_age,https_token,length_words_raw,char_repeat,ratio_extHyperlinks,status
0,http://www.crestonwood.com/router.php,37,1,17,3,-1,1,4,4,0.470588,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,1,30,1,5767,1,4,4,0.033333,phishing
2,https://support-appleld.com.secureupdate.duila...,126,1,4,3,4004,0,12,2,0.0,phishing
3,http://rgipt.ac.in,18,0,149,2,-1,1,1,0,0.026846,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,0,102,2,8175,1,6,3,0.529412,legitimate


In [5]:
# Rename the 'https_token' column to 'is_http'.
# rename() returns a new frame and ignores a missing key, so re-running this
# cell is harmless.
filtered_data = filtered_data.rename(columns={'https_token': 'is_http'})

# Update relevant_columns to reflect the new column name
relevant_columns = [
    'url', 'length_url', 'google_index', 'nb_hyperlinks',
    'nb_subdomains', 'domain_age', 'is_http', 'length_words_raw',
    'char_repeat', 'ratio_extHyperlinks', 'status'
]

# Fill NaN values in 'domain_age' with the column median so those rows survive
# the dropna below.
# NOTE(review): the displayed rows contain domain_age == -1, presumably a
# sentinel for "unknown age". Those values are included in the median and are
# not treated as missing here -- confirm that this is intended.
median_domain_age = filtered_data['domain_age'].median()
filtered_data = filtered_data.assign(
    domain_age=filtered_data['domain_age'].fillna(median_domain_age)
)

# Target dtype for each column
columns_to_convert = {
    'length_url': int,
    'google_index': int,
    'nb_hyperlinks': int,
    'nb_subdomains': int,
    'domain_age': int,
    'is_http': int,
    'length_words_raw': int,
    'char_repeat': int,
    'ratio_extHyperlinks': float,
    'status': str
}

# Drop rows with missing values in critical columns BEFORE casting:
#   * astype(int) raises if a column still contains NaN, and
#   * astype(str) can turn a missing 'status' into the literal string 'nan',
#     which dropna would no longer recognise as missing.
cleaned_data = filtered_data.dropna(subset=relevant_columns)

# Cast only the columns that are actually present, to prevent KeyErrors.
# The casts are applied to cleaned_data; filtered_data keeps the dtypes it was
# loaded with.
dtypes_present = {
    column: dtype
    for column, dtype in columns_to_convert.items()
    if column in cleaned_data.columns
}
cleaned_data = cleaned_data.astype(dtypes_present)

In [6]:
# Draw a reproducible random sample of 1000 rows from the cleaned data.
# If fewer than 1000 rows are available, warn and keep every row instead.
if len(cleaned_data) < 1000:
    print("Warning: Not enough data points to sample 1000 rows.")
    sample_data = cleaned_data
else:
    sample_data = cleaned_data.sample(n=1000, random_state=42)

# Print a plain-text preview of the sampled rows
print(sample_data.head(n=5))

                                                     url  length_url  \
7529                 https://www.rga.com/about/workplace          35   
11221                             http://starasia.com.hk          22   
4889         https://www.youtube.com/watch?v=XszqITK-UEw          43   
8962                          http://www.civico1845.com/          26   
4004   http://rapidpaws.com/wp-content/we_transfer/in...          62   

       google_index  nb_hyperlinks  nb_subdomains  domain_age  is_http  \
7529              0             97              2       11039        0   
11221             0            168              2          -1        1   
4889              0             52              2        5636        0   
8962              0             39              2        1938        1   
4004              1              1              2        1853        1   

       length_words_raw  char_repeat  ratio_extHyperlinks      status  
7529                  4            3             0

In [8]:
# Report the dtype of every column in the sample
print("\nData Types:", sample_data.dtypes, sep="\n")


Data Types:
url                     object
length_url               int64
google_index             int64
nb_hyperlinks            int64
nb_subdomains            int64
domain_age               int64
is_http                  int64
length_words_raw         int64
char_repeat              int64
ratio_extHyperlinks    float64
status                  object
dtype: object


In [9]:
# Count the remaining missing values per column in the sample
print("\nMissing Values After Cleaning:", sample_data.isna().sum(), sep="\n")


Missing Values After Cleaning:
url                    0
length_url             0
google_index           0
nb_hyperlinks          0
nb_subdomains          0
domain_age             0
is_http                0
length_words_raw       0
char_repeat            0
ratio_extHyperlinks    0
status                 0
dtype: int64


In [11]:
# Write the sampled rows to disk as CSV, omitting the DataFrame index column
sample_data.to_csv(path_or_buf='C:/JN/Phishing_sample_datanew.csv', index=False)