# 01 — Data & Scraping
Goal: load the labelled URL dataset (benign, phishing, malware, defacement), run basic EDA, scrape a small HTML sample, and save the cleaned URL table.

In [28]:
import pandas as pd
import numpy as np
from pathlib import Path

In [29]:
# Data folders, given relative to the notebook's working directory.
DATA_RAW, DATA_PROC = Path("../data/raw"), Path("../data/processed")

# Create both folders (and any missing parents); nothing happens if they already exist.
for _data_dir in (DATA_RAW, DATA_PROC):
    _data_dir.mkdir(parents=True, exist_ok=True)

In [30]:
# Read the raw URL dataset from the raw-data folder; the trailing expression
# displays (rows, columns) as a quick sanity check on the load.
df = pd.read_csv(DATA_RAW.joinpath("malicious_phish.csv"))
df.shape

(651191, 2)

In [34]:
# Peek at the first five rows to see the raw columns (`url`, `type`).
df.head(n=5)

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [35]:
# Class balance: number of rows per `type`, most frequent class first.
df['type'].value_counts(sort=True)

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

In [36]:
# Integer code assigned to each class string that appears in the `type` column.
label_map = dict(
    benign=0,
    phishing=1,
    malware=2,
    defacement=3,
)

In [37]:
# Encode the string class in `type` as the integer code from `label_map`.
# Series.map returns NaN for any value that is missing from the dict, which
# would silently turn `label` into a float column with gaps, so fail loudly.
_encoded = df['type'].map(label_map)
if _encoded.isna().any():
    _unmapped = sorted(df.loc[_encoded.isna(), 'type'].astype(str).unique())
    raise ValueError(f"type values missing from label_map: {_unmapped}")
# Every row is mapped at this point, so the codes can be stored as plain integers.
df['label'] = _encoded.astype('int64')

In [38]:
# Remove the original string column `type` from the frame.
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because `type` has already been removed.
df = df.drop(columns='type', errors='ignore')

In [39]:
# Re-inspect the first five rows: `type` is gone and `label` holds the integer code.
df.head(n=5)

Unnamed: 0,url,label
0,br-icloud.com.br,1
1,mp3raid.com/music/krizz_kaliko.html,0
2,bopsecrets.org/rexroth/cr/1.htm,0
3,http://www.garage-pirenne.be/index.php?option=...,3
4,http://adventure-nicaragua.net/index.php?optio...,3


In [40]:
# Missing-value count per column (summing the null mask down the rows).
df.isna().sum(axis=0)

url      0
label    0
dtype: int64

In [47]:
# Number of rows whose `url` already appeared earlier in the frame
# (the first occurrence of each URL is not counted).
df['url'].duplicated().sum()

10072

In [48]:
# Keep only the first row seen for each URL, then renumber the index 0..n-1.
_repeat_url = df.duplicated(subset='url', keep='first')
df = df.loc[~_repeat_url].reset_index(drop=True)

In [49]:
# Summary statistics of the numeric column(s). `label` is a class code, so
# count/min/max are the meaningful entries; mean and std only hint at the skew.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,label
count,641119.0
mean,0.666491
std,1.089953
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,3.0


In [53]:
# Write the de-duplicated (url, label) table to the processed-data folder.
# The index is just a positional counter, so it is left out of the file.
df.to_parquet(DATA_PROC.joinpath('urls.parquet'), index=False)
# Display the final (rows, columns) of what was written.
df.shape

(641119, 2)

In [54]:
# Schema check: show the dtype of each column in the table that was just saved.
df.dtypes

url      object
label     int64
dtype: object