In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Preprocessing

Load data

In [2]:
# Load the raw dataset from a CSV file in the working directory.
data_url = pd.read_csv('verified_online.csv')
# Print column names, non-null counts and dtypes to sanity-check the schema.
data_url.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47012 entries, 0 to 47011
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   phish_id           47012 non-null  int64 
 1   url                47012 non-null  object
 2   phish_detail_url   47012 non-null  object
 3   submission_time    47012 non-null  object
 4   verified           47012 non-null  object
 5   verification_time  47012 non-null  object
 6   online             47012 non-null  object
 7   target             47012 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.9+ MB


Hapus kolom data yang tidak diperlukan

In [3]:
# Keep only the raw URL column; no other column is used for feature extraction.
# Selecting the column to keep (instead of dropping the others by name) makes
# this cell safe to re-run and independent of extra columns in the source file.
# .copy() gives an independent frame so later column assignments are unambiguous.
data_url = data_url[['url']].copy()

Cek data duplikat

In [4]:
# Count rows that are exact duplicates of an earlier row (first occurrence is not counted).
data_url.duplicated().sum()

np.int64(8)

Hapus data duplikat

In [5]:
# Remove duplicate rows (keeping the first occurrence) and renumber the index
# from 0. Rebinding the name instead of mutating in place keeps the step
# chainable and avoids hidden in-place state changes between cells.
data_url = data_url.drop_duplicates().reset_index(drop=True)

Cek data null

In [6]:
# Count missing values per column.
data_url.isnull().sum()

url    0
dtype: int64

In [7]:
# Tag every row with the constant class label 1.
# NOTE(review): presumably 1 = phishing, since this file appears to contain only
# verified phishing entries — confirm against the labelling of the benign set.
data_url = data_url.assign(label=1)

In [8]:
# Preview the first five rows to confirm the url/label layout.
data_url.head()

Unnamed: 0,url,label
0,https://therozgaar.in/japan.pla/Sites/index.html,1
1,https://kortas.cfd/indexco.jp,1
2,https://lawspoint.top/jp/,1
3,https://links.truthsocial.com/link/11569358163...,1
4,http://trezurlogoi.framer.media,1


In [9]:
# Re-check row count, columns and dtypes after cleaning and labelling.
data_url.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47004 entries, 0 to 47003
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     47004 non-null  object
 1   label   47004 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 734.6+ KB


Ekstraksi Fitur

In [10]:
from urllib.parse import urlparse
import re
from datetime import datetime
import whois

In [None]:
# Domains of URL-shortening services. Each entry is a full domain name so it
# can be compared against the parsed hostname on label boundaries.
SHORTENERS = [
    'bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'is.gd',
    'ow.ly', 'buff.ly', 'bl.aq', 'cli.gs', 'tr.im'
]


def lexical_features(url):
    """Compute lexical (purely string-based) features of a URL.

    If ``url`` does not start with ``http://`` or ``https://`` (checked
    case-insensitively), ``http://`` is prepended before parsing, and all
    length/count features are computed on that normalised string.

    Parameters
    ----------
    url : str
        The URL to analyse.

    Returns
    -------
    dict
        Integer-valued features, in this key order:

        - ``panjang_url``: length of the (normalised) URL.
        - ``jumlah_titik``: number of ``.`` characters.
        - ``jumlah_strip``: number of ``-`` characters.
        - ``jumlah_digit``: number of digit characters.
        - ``ada_at``: 1 if the URL contains ``@``.
        - ``ada_https``: 1 if the scheme is ``https``.
        - ``UsingIP``: 1 if the hostname is a dotted-quad IPv4 literal.
        - ``ShortURL``: 1 if the hostname is one of ``SHORTENERS`` or a
          subdomain of one.
        - ``Redirecting//``: 1 if ``//`` occurs after the scheme separator.
        - ``HTTPSDomainURL``: 1 if the text ``https`` appears in the netloc.
        - ``AbnormalURL``: 1 if no hostname could be parsed.

    Raises
    ------
    TypeError
        If ``url`` is not a string.
    ValueError
        May propagate from ``urlparse`` for a malformed network location.
    """
    # Ensure the URL has a scheme so urlparse populates netloc. Schemes are
    # case-insensitive, so "HTTPS://..." must not get a second scheme prepended.
    if not re.match(r'^https?://', url, flags=re.IGNORECASE):
        url = 'http://' + url

    parsed = urlparse(url)
    domain = parsed.netloc
    # parsed.hostname is lower-cased and stripped of userinfo/port; drop a
    # trailing root dot so "bit.ly." compares equal to "bit.ly".
    hostname = (parsed.hostname or '').rstrip('.')

    # The scheme's own "//" sits at index 5 (http) or 6 (https); a last
    # occurrence beyond index 7 means another "//" appears later in the URL.
    last_double_slash = url.rfind('//')
    is_redirecting = 1 if last_double_slash > 7 else 0

    # The whole hostname must be an IPv4 literal; an unanchored search would
    # also flag names that merely contain one (e.g. "1.2.3.4.example.com").
    uses_ip = re.fullmatch(r'(\d{1,3}\.){3}\d{1,3}', hostname) is not None

    # Match on label boundaries: a plain substring test makes 't.co' match
    # every host that contains it, such as "microsoft.com".
    is_shortener = any(
        hostname == shortener or hostname.endswith('.' + shortener)
        for shortener in SHORTENERS
    )

    return {
        # ====== Basic features ======
        'panjang_url': len(url),
        'jumlah_titik': url.count('.'),
        'jumlah_strip': url.count('-'),
        'jumlah_digit': sum(c.isdigit() for c in url),
        'ada_at': int('@' in url),
        'ada_https': int(parsed.scheme == 'https'),
        # ====== Classic URL-based features ======
        # Host is an IP address (e.g. http://192.168.1.1/login)
        'UsingIP': int(uses_ip),

        # Host belongs to a URL-shortening service
        'ShortURL': int(is_shortener),

        # A "//" in the middle of the URL
        'Redirecting//': is_redirecting,

        # The word "https" placed inside the domain (e.g. http://https-secure-bank.com)
        'HTTPSDomainURL': int('https' in domain.lower()),

        # Abnormal URL: hostname is empty (typically a malformed URL)
        'AbnormalURL': int(not parsed.hostname)
    }


In [12]:
# Psychological features: trust/urgency keywords looked for inside the URL
SENSITIVE_WORDS = [
    'login', 'secure', 'account', 'verify', 'update',
    'bank', 'signin', 'password', 'confirm'
]

def psychological_features(url):
    """Count how many distinct SENSITIVE_WORDS occur in ``url``.

    Matching is case-insensitive and substring-based. Each keyword counts at
    most once, no matter how many times it appears.

    Returns a dict with the single key ``jumlah_kata_sensitif``.
    """
    lowered = url.lower()
    hits = 0
    for keyword in SENSITIVE_WORDS:
        if keyword in lowered:
            hits += 1
    return {'jumlah_kata_sensitif': hits}

In [13]:
def extract_features(url):
    """Return all features of ``url`` merged into one flat dict.

    Lexical features come first, followed by the psychological features; if
    both groups ever produced the same key, the psychological value would win.
    """
    return {**lexical_features(url), **psychological_features(url)}

In [14]:
# One feature dict (plus its label) per successfully processed URL.
kolom_fitur = []
# URLs whose extraction raised, stored as (url, error) pairs so that dropped
# rows stay visible instead of disappearing silently.
url_gagal = []

for url, label in zip(data_url['url'], data_url['label']):
    try:
        fitur = extract_features(url)
    except Exception as e:
        # Best-effort: skip the bad row, but remember why it was skipped.
        url_gagal.append((url, repr(e)))
        continue
    fitur['label'] = label
    kolom_fitur.append(fitur)

# Build the frame once from the collected records.
df = pd.DataFrame(kolom_fitur)

if url_gagal:
    print(f"Skipped {len(url_gagal)} of {len(data_url)} URLs during feature extraction; "
          f"first failures: {url_gagal[:5]}")

In [15]:
# Persist the extracted feature table as CSV, without the index column.
output_path = "data_phising.csv"
df.to_csv(output_path, index=False)