In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
df = pd.read_csv('/content/drive/MyDrive/URL/Kaggle_Malicious_URLs_Dataset/malicious_phish.csv')

In [None]:
import os, io, re, zipfile, csv
import pandas as pd

In [None]:
df

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement
...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing


In [None]:
df = df.rename(columns={'type': 'URL_Type_obf_Type'})

In [None]:
num_duplicates = df['url'].duplicated().sum()
print(f"Number of Duplicates: {num_duplicates}")

Number of Duplicates: 10072


In [None]:
df.drop_duplicates(subset='url', keep='first', inplace=True)

In [None]:
df.to_csv("/content/drive/MyDrive/URL/Kaggle_Malicious_URLs_Dataset/URLS.csv", index=False)

In [None]:
!pip install tldextract



In [None]:
import pandas as pd
import re
import tldextract
import os
import math
import idna
from collections import Counter
from urllib.parse import urlparse, parse_qs

def shannon_entropy(s: str) -> float:
    if not s:
        return 0.0
    counts = Counter(s)
    L = len(s)
    return -sum((c/L) * math.log2(c/L) for c in counts.values())

def get_longest_word_length(s: str) -> int:
    words = re.split(r"[^a-zA-Z]+", s or "")
    return max((len(w) for w in words if w), default=0)

def count_ldl(s: str) -> int:
    return len(re.findall(r"[A-Za-z]\d[A-Za-z]", s or ""))

def count_dld(s: str) -> int:
    return len(re.findall(r"\d[A-Za-z]\d", s or ""))

def character_continuity_rate(s: str) -> float:
    if not s:
        return 0.0
    max_l = max_d = max_s = 0
    cur_l = cur_d = cur_s = 0
    for ch in s:
        if ch.isalpha():
            cur_l += 1; max_l = max(max_l, cur_l)
            cur_d = 0; cur_s = 0
        elif ch.isdigit():
            cur_d += 1; max_d = max(max_d, cur_d)
            cur_l = 0; cur_s = 0
        else:
            cur_s += 1; max_s = max(max_s, cur_s)
            cur_l = 0; cur_d = 0
    return (max_l + max_d + max_s) / len(s)

def count_vowels(s: str) -> int:
    return sum(ch.lower() in "aeiou" for ch in s or "")

def count_consonants(s: str) -> int:
    return sum((s and ch.isalpha()) and ch.lower() not in "aeiou" for ch in s or "")

def is_executable_extension(ext: str) -> int:
    return int((ext or "").lower() == "exe")

def is_port_80(urlp) -> int:
    return int(urlp.port == 80)  # sadece explicit :80

def has_sensitive_word(url: str) -> int:
    suspicious_words = ["secure", "account", "bank", "login", "ebay", "paypal", "verify"]
    u = (url or "").lower()
    return int(any(w in u for w in suspicious_words))

def count_suspicious_special_chars(url: str) -> int:
    u = url or ""
    cnt = u.count("@")
    # '//' ama '://' olmayan
    cnt += len(re.findall(r"(?<!:)//", u))
    return cnt

def delimiter_count(s: str) -> int:
    delims = ".-_/=?&"
    return sum(ch in delims for ch in s or "")

def split_domain_tokens(domain_full: str):
    toks = re.split(r"[.-]+", domain_full or "")
    return [t for t in toks if t]

def avg_len(tokens) -> float:
    return (sum(len(t) for t in tokens) / len(tokens)) if tokens else 0.0

def longest_len(tokens) -> int:
    return max((len(t) for t in tokens), default=0)

def path_tokens(path_str: str):
    return [p for p in (path_str or "").split('/') if p]

def get_filename(path_str: str) -> str:
    toks = (path_str or "").split('/')
    return toks[-1] if toks else ''

def get_subdir(path_str: str) -> str:
    toks = (path_str or "").split('/')
    return '/'.join(toks[:-1]) if len(toks) > 1 else ''

def idna_encode(host: str) -> str:
    if not host:
        return ""
    try:
        # IPv6 köşeli parantezleri temizle
        clean = host.strip("[]")
        # IP ise IDNA gerekmez
        if re.match(r"^(?:\d{1,3}\.){3}\d{1,3}$", clean):
            return clean
        # IPv6 ise direkt dön
        if ":" in clean and not re.search(r"[a-zA-Z]", clean):
            return clean
        return idna.encode(clean).decode("ascii")
    except Exception:
        return host

###################################
# Özellik Çıkarım Fonksiyonu
###################################
def extract_features(url: str):
    url = url.strip()
    urlp = urlparse(url)

    scheme = urlp.scheme
    host = urlp.hostname or ""         # user:pass@host:port -> sadece host
    host = idna_encode(host)           # IDN normalize
    netloc = host                      # netloc yerine normalize edilmiş host'u kullanacağız
    path = (urlp.path or "").lstrip('/')
    query = urlp.query or ""

    filename = get_filename(path)
    file_ext = filename.split('.')[-1] if '.' in filename else ''
    subdir = get_subdir(path)

    # tldextract (host üzerinde çalışalım, tüm URL değil)
    ext = tldextract.extract(host)
    # domain_full: subdomain.domain.suffix (suffix yoksa sadece host'u kullan)
    if ext.suffix:
        parts = [p for p in [ext.subdomain, ext.domain, ext.suffix] if p]
        domain_full = ".".join(parts)
        domain_only = ext.domain or host
        tld_len = len(ext.suffix)
    else:
        domain_full = host
        domain_only = host
        tld_len = 0

    params = parse_qs(query, keep_blank_values=True)

    QueryLength = len(query)
    dom_tokens = split_domain_tokens(domain_full)
    domain_token_count_ = len(dom_tokens)
    path_toks = path_tokens(path)
    path_token_count_ = len(path_toks)
    avgdomaintokenlen_ = avg_len(dom_tokens)
    longdomaintokenlen_ = longest_len(dom_tokens)
    avgpathtokenlen_ = avg_len(path_toks)

    tld_ = tld_len
    charcompvowels_ = count_vowels(url)
    charcompace_ = count_consonants(url)

    ldl_url_ = count_ldl(url)
    ldl_domain_ = count_ldl(domain_full)
    ldl_path_ = count_ldl(path)
    ldl_filename_ = count_ldl(filename)
    ldl_getArg_ = count_ldl(query)

    dld_url_ = count_dld(url)
    dld_domain_ = count_dld(domain_full)
    dld_path_ = count_dld(path)
    dld_filename_ = count_dld(filename)
    dld_getArg_ = count_dld(query)

    urlLen_ = len(url)
    domainlength_ = len(domain_full)
    pathLength_ = len(path)
    subDirLen_ = len(subdir)
    fileNameLen_ = len(filename)
    fileExtLen_ = len(file_ext)
    ArgLen_ = QueryLength

    pathurlRatio_ = pathLength_ / urlLen_ if urlLen_ else 0
    ArgUrlRatio_ = ArgLen_ / urlLen_ if urlLen_ else 0
    argDomainRatio_ = ArgLen_ / domainlength_ if domainlength_ else 0
    domainUrlRatio_ = domainlength_ / urlLen_ if urlLen_ else 0
    pathDomainRatio_ = pathLength_ / domainlength_ if domainlength_ else 0
    argPathRatio_ = ArgLen_ / pathLength_ if pathLength_ else 0

    executable_ = is_executable_extension(file_ext)
    isPortEighty_ = is_port_80(urlp)

    NumberOfDotsInURL_ = url.count('.')               # orijinal ölçüm
    NumberOfDotsInHost_ = netloc.count('.')           # ek metrik (host özel)

    # IP tespiti (IPv4/IPv6)
    is_ip_addr = 0
    if re.match(r"^(?:\d{1,3}\.){3}\d{1,3}$", host) or host.startswith("[") or (":" in host and not re.search(r"[a-zA-Z]", host)):
        is_ip_addr = 1
    IsIPAddressInDomainName_ = is_ip_addr

    CharacterContinuityRate_ = character_continuity_rate(domain_full)

    longest_val = 0
    for _, vals in params.items():
        for val in vals:
            if len(val) > longest_val:
                longest_val = len(val)
    LongestVariableValue_ = longest_val

    URL_DigitCount_ = sum(ch.isdigit() for ch in url)
    host_DigitCount_ = sum(ch.isdigit() for ch in domain_full)
    Directory_DigitCount_ = sum(ch.isdigit() for ch in subdir)
    file_base = ".".join(filename.split('.')[:-1]) if '.' in filename else filename
    FileName_DigitCount_ = sum(ch.isdigit() for ch in file_base)
    Extension_DigitCount_ = sum(ch.isdigit() for ch in file_ext)
    Query_DigitCount_ = sum(ch.isdigit() for ch in query)

    URL_LetterCount_ = sum(ch.isalpha() for ch in url)
    host_LetterCount_ = sum(ch.isalpha() for ch in domain_full)
    Directory_LetterCount_ = sum(ch.isalpha() for ch in subdir)
    Filename_LetterCount_ = sum(ch.isalpha() for ch in file_base)
    Extension_LetterCount_ = sum(ch.isalpha() for ch in file_ext)
    Query_LetterCount_ = sum(ch.isalpha() for ch in query)

    LongestPathTokenLength_ = max((len(t) for t in path_toks), default=0)
    Domain_LongestWordLength_ = get_longest_word_length(domain_full)
    Path_LongestWordLength_ = get_longest_word_length(path)
    subDirectory_LongestWordLength_ = get_longest_word_length(subdir)
    Arguments_LongestWordLength_ = get_longest_word_length(query)

    URL_sensitiveWord_ = has_sensitive_word(url)

    URLQueries_variable_ = sum(len(v) for v in params.values())

    spcharUrl_ = count_suspicious_special_chars(url)
    delimeter_Domain_ = delimiter_count(domain_full)
    delimeter_path_ = delimiter_count(path)
    delimeter_Count_ = delimiter_count(url)

    NumberRate_URL_ = URL_DigitCount_ / urlLen_ if urlLen_ else 0
    NumberRate_Domain_ = host_DigitCount_ / domainlength_ if domainlength_ else 0
    NumberRate_DirectoryName_ = Directory_DigitCount_ / subDirLen_ if subDirLen_ else 0
    NumberRate_FileName_ = FileName_DigitCount_ / len(file_base) if file_base else 0
    NumberRate_Extension_ = Extension_DigitCount_ / fileExtLen_ if fileExtLen_ else 0
    NumberRate_AfterPath_ = Query_DigitCount_ / ArgLen_ if ArgLen_ else 0

    SymbolCount_URL_ = sum(not ch.isalnum() for ch in url)
    SymbolCount_Domain_ = sum(not ch.isalnum() for ch in domain_full)
    SymbolCount_DirectoryName_ = sum(not ch.isalnum() for ch in subdir)
    SymbolCount_FileName_ = sum(not ch.isalnum() for ch in file_base)
    SymbolCount_Extension_ = sum(not ch.isalnum() for ch in file_ext)
    SymbolCount_AfterPath_ = sum(not ch.isalnum() for ch in query)

    Entropy_URL_ = shannon_entropy(url)
    Entropy_Domain_ = shannon_entropy(domain_full)
    Entropy_DirectoryName_ = shannon_entropy(subdir)
    Entropy_Filename_ = shannon_entropy(file_base)
    Entropy_Extension_ = shannon_entropy(file_ext)
    Entropy_AfterPath_ = shannon_entropy(query)

    URL_Type_obf_Type = "unk"

    return [
        QueryLength,
        domain_token_count_,
        path_token_count_,
        avgdomaintokenlen_,
        longdomaintokenlen_,
        avgpathtokenlen_,
        tld_,
        charcompvowels_,
        charcompace_,
        ldl_url_,
        ldl_domain_,
        ldl_path_,
        ldl_filename_,
        ldl_getArg_,
        dld_url_,
        dld_domain_,
        dld_path_,
        dld_filename_,
        dld_getArg_,
        urlLen_,
        domainlength_,
        pathLength_,
        subDirLen_,
        fileNameLen_,
        fileExtLen_,
        ArgLen_,
        pathurlRatio_,
        ArgUrlRatio_,
        argDomainRatio_,
        domainUrlRatio_,
        pathDomainRatio_,
        argPathRatio_,
        executable_,
        isPortEighty_,
        NumberOfDotsInURL_,
        IsIPAddressInDomainName_,
        CharacterContinuityRate_,
        LongestVariableValue_,
        URL_DigitCount_,
        host_DigitCount_,
        Directory_DigitCount_,
        FileName_DigitCount_,
        Extension_DigitCount_,
        Query_DigitCount_,
        URL_LetterCount_,
        host_LetterCount_,
        Directory_LetterCount_,
        Filename_LetterCount_,
        Extension_LetterCount_,
        Query_LetterCount_,
        LongestPathTokenLength_,
        Domain_LongestWordLength_,
        Path_LongestWordLength_,
        subDirectory_LongestWordLength_,
        Arguments_LongestWordLength_,
        URL_sensitiveWord_,
        URLQueries_variable_,
        spcharUrl_,
        delimeter_Domain_,
        delimeter_path_,
        delimeter_Count_,
        NumberRate_URL_,
        NumberRate_Domain_,
        NumberRate_DirectoryName_,
        NumberRate_FileName_,
        NumberRate_Extension_,
        NumberRate_AfterPath_,
        SymbolCount_URL_,
        SymbolCount_Domain_,
        SymbolCount_DirectoryName_,
        SymbolCount_FileName_,
        SymbolCount_Extension_,
        SymbolCount_AfterPath_,
        Entropy_URL_,
        Entropy_Domain_,
        Entropy_DirectoryName_,
        Entropy_Filename_,
        Entropy_Extension_,
        Entropy_AfterPath_,
        # Ek metrik (host nokta sayısı) – istersen feature_names'e de ekleyebilirsin
        NumberOfDotsInHost_,
        URL_Type_obf_Type
    ]

feature_names = [
    "Querylength", "domain_token_count", "path_token_count", "avgdomaintokenlen", "longdomaintokenlen",
    "avgpathtokenlen", "tld", "charcompvowels", "charcompace", "ldl_url", "ldl_domain", "ldl_path",
    "ldl_filename", "ldl_getArg", "dld_url", "dld_domain", "dld_path", "dld_filename", "dld_getArg",
    "urlLen", "domainlength", "pathLength", "subDirLen", "fileNameLen", "fileExtLen", "ArgLen",
    "pathurlRatio", "ArgUrlRatio", "argDomainRatio", "domainUrlRatio", "pathDomainRatio", "argPathRatio",
    "executable", "isPortEighty", "NumberofDotsinURL", "IsIPAddressInDomainName", "CharacterContinuityRate",
    "LongestVariableValue", "URL_DigitCount", "host_DigitCount", "Directory_DigitCount", "FileName_DigitCount",
    "Extension_DigitCount", "Query_DigitCount", "URL_Letter_Count", "host_LetterCount", "Directory_LetterCount",
    "Filename_LetterCount", "Extension_LetterCount", "Query_LetterCount", "LongestPathTokenLength",
    "Domain_LongestWordLength", "Path_LongestWordLength", "subDirectory_LongestWordLength",
    "Arguments_LongestWordLength", "URL_sensitiveWord", "URLQueries_variable", "spcharUrl",
    "delimeter_Domain", "delimeter_path", "delimeter_Count", "NumberRate_URL", "NumberRate_Domain",
    "NumberRate_DirectoryName", "NumberRate_FileName", "NumberRate_Extension", "NumberRate_AfterPath",
    "SymbolCount_URL", "SymbolCount_Domain", "SymbolCount_DirectoryName", "SymbolCount_FileName",
    "SymbolCount_Extension", "SymbolCount_AfterPath", "Entropy_URL", "Entropy_Domain",
    "Entropy_DirectoryName", "Entropy_Filename", "Entropy_Extension", "Entropy_AfterPath",
    "NumberOfDotsInHost",
    "URL_Type_obf_Type"
]





In [None]:
from tqdm import tqdm

def build_features_csv(
    input_csv: str,
    output_csv: str,
    url_col: str = "url",
    tag_col: str = "URL_Type_obf_Type",
    encoding: str = "utf-8"
):
    if not os.path.exists(input_csv):
        raise FileNotFoundError(f"Girdi bulunamadı: {input_csv}")

    # Virgüllü/Tırnaklı alanlar için güvenli okuma
    df_in = pd.read_csv(input_csv, encoding=encoding)
    if url_col not in df_in.columns or tag_col not in df_in.columns:
        raise ValueError(f"Beklenen sütunlar yok: '{url_col}', '{tag_col}'. Bulunan: {list(df_in.columns)}")

    feature_rows = []
    for url, tag in tqdm(
        zip(df_in[url_col].astype(str), df_in[tag_col].astype(str)),
        total=len(df_in),
        desc="Özellik çıkarımı"
    ):
        feats = extract_features(url)
        feats[-1] = tag  # son alan etiket/tag
        feature_rows.append(feats)

    df_out = pd.DataFrame(feature_rows, columns=feature_names)
    df_out.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"Feature extraction completed. Saved to: {output_csv}")


In [None]:
!pip install -q tqdm

In [None]:
build_features_csv(
    input_csv="/content/drive/MyDrive/URL/Kaggle_Malicious_URLs_Dataset/URLS.csv",
    output_csv="/content/drive/MyDrive/URL/Kaggle_Malicious_URLs_Dataset/MANU.csv",
    url_col="url",
    tag_col="URL_Type_obf_Type"
)

Özellik çıkarımı: 100%|██████████| 641119/641119 [01:22<00:00, 7818.20it/s] 


Feature extraction completed. Saved to: /content/drive/MyDrive/URL/Kaggle_Malicious_URLs_Dataset/MANU.csv


In [None]:
df_new = pd.read_csv("/content/drive/MyDrive/URL/Kaggle_Malicious_URLs_Dataset/MANU.csv")

In [None]:
df_new

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_Extension,SymbolCount_AfterPath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_AfterPath,NumberOfDotsInHost,URL_Type_obf_Type
0,0,0,1,0.0,0,16.000000,0,4,9,0,...,0,0,3.375000,0.000000,0.000000,3.392747,1.000000,0.000000,0,phishing
1,0,0,3,0.0,0,11.000000,0,9,20,1,...,0,0,4.079143,0.000000,3.572469,2.855389,2.000000,0.000000,0,benign
2,0,0,4,0.0,0,7.000000,0,6,19,0,...,0,0,3.708093,0.000000,3.479080,-0.000000,1.584963,0.000000,0,benign
3,49,4,1,4.5,7,9.000000,2,22,41,0,...,0,9,4.660343,3.308751,0.000000,2.321928,0.918296,4.280341,2,defacement
4,194,3,1,7.0,9,9.000000,3,37,162,14,...,0,6,5.491293,3.501398,0.000000,2.321928,0.918296,5.360074,1,defacement
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641114,0,0,4,0.0,0,9.000000,0,5,16,0,...,0,0,4.355539,0.000000,4.134336,2.251629,2.000000,0.000000,0,phishing
641115,0,0,4,0.0,0,10.000000,0,11,18,0,...,0,0,4.243300,0.000000,4.257319,0.000000,0.000000,0.000000,0,phishing
641116,0,0,4,0.0,0,9.500000,0,12,21,0,...,0,0,4.147921,0.000000,4.161953,0.000000,0.000000,0.000000,0,phishing
641117,0,0,3,0.0,0,14.333333,0,18,18,0,...,0,0,4.102313,0.000000,3.458525,3.675311,0.000000,0.000000,0,phishing


In [None]:
num_duplicates = df_new.duplicated().sum()
print(f"Number of Duplicates: {num_duplicates}")

Number of Duplicates: 100452


In [None]:
df_new['URL_Type_obf_Type'].value_counts()

Unnamed: 0_level_0,count
URL_Type_obf_Type,Unnamed: 1_level_1
benign,428080
defacement,95308
phishing,94086
malware,23645
