In [1]:
import pandas as pd
import os, sys

In [2]:
# Relative path to the folder holding the raw input files; file paths are
# built from it with os.path.join further down.
storage_folder = '../data/raw/'

In [3]:
# List the raw-data folder to see which input files are available.
os.listdir(storage_folder)

['cdx-00144',
 '.DS_Store',
 'cdx-00144.feather',
 'phising_urls.pkl',
 'cc-index.paths',
 '.ipynb_checkpoints']

In [3]:
import re

In [7]:
# Select the Feather file from the raw-data folder.
# A plain suffix test replaces re.search('.feather', f): as a regex the
# unescaped, unanchored '.' matches any character anywhere in the name, so
# unrelated files such as 'my_feather_notes.txt' would also have matched.
# Sorting makes the choice deterministic (os.listdir order is arbitrary), and
# an explicit error replaces the bare IndexError when nothing matches.
feather_files = sorted(f for f in os.listdir(storage_folder) if f.endswith('.feather'))
if not feather_files:
    raise FileNotFoundError('No .feather file found in {!r}'.format(storage_folder))
feather_file = feather_files[0]

In [8]:
# Show which Feather file was selected.
feather_file

'cdx-00144.feather'

In [11]:
# Full path to the selected Feather file inside the raw-data folder.
feather_path = os.path.join(storage_folder, feather_file)

In [10]:
import feather

In [12]:
# Load the Feather file into a DataFrame using the `feather` package.
df = feather.read_dataframe(feather_path)

In [13]:
# (rows, columns) of the loaded frame.
df.shape

(9921252, 2)

In [14]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ts,url
0,20191114224852,https://cartolafcmix.com/
1,20191114224624,https://cartolafcmix.com/category/cartola-da-l...
2,20191114224801,https://cartolafcmix.com/category/dicas/
3,20191114224528,https://cartolafcmix.com/como-funciona-o-banco...
4,20191114224756,https://cartolafcmix.com/como-ganhar-200-carto...


In [15]:
# Column dtypes and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9921252 entries, 0 to 9921251
Data columns (total 2 columns):
ts     object
url    object
dtypes: object(2)
memory usage: 151.4+ MB


In [17]:
# Draw a 10% random sample of the rows for quick exploratory checks.
# random_state is fixed so the sample, and every statistic computed from it,
# is reproducible after Restart & Run All.
n = int(df.shape[0] * 0.1)
mini_df = df.sample(n, random_state=42)

In [18]:
import numpy as np

In [19]:
# Binary flag: 1 if the URL contains at least one digit, else 0.
# na=False makes missing URLs evaluate to False; without it str.contains
# yields NaN for them, which np.where treats as truthy and would flag as 1.
mini_df['url_has_numbers'] = np.where(mini_df['url'].str.contains(r'\d', na=False), 1, 0)

In [20]:
# Mean of the 0/1 flag, i.e. the share of sampled URLs that contain a digit.
mini_df['url_has_numbers'].mean()

0.6421065893914577

## Phishing URLs

In [4]:
# Load the phishing URLs from the raw-data folder, reusing storage_folder
# instead of repeating the hard-coded path. The file name really is spelled
# 'phising' on disk (see the folder listing), so it is kept as-is.
# NOTE: unpickling can execute arbitrary code; only load pickles from a
# trusted source.
ph_df = pd.read_pickle(os.path.join(storage_folder, 'phising_urls.pkl'))

In [56]:
# (rows, columns) of the phishing frame.
ph_df.shape

(11339, 1)

In [57]:
# Number of phishing rows relative to the number of rows in df.
ph_df.shape[0] / df.shape[0]

0.0011429001097845312

In [58]:
# Preview the first phishing URLs.
ph_df.head()

Unnamed: 0,url
0,https://www2.amazon.co.jp.anamz-coco-nom.xyz/
1,https://www2.account-update.amazon.co.jp.7a732...
2,https://www2.account-update.amazon.co.jp.7a732...
3,http://jppost-ze.com/yue.html
4,http://jppost-za.com/yue.html


## Add labels

In [59]:
# Tag every row of the phishing frame with its class label
# (adds or overwrites the 'label' column in place).
ph_df['label'] = 'phishing'

In [60]:
# Confirm the 'label' column is present.
ph_df.head()

Unnamed: 0,url,label
0,https://www2.amazon.co.jp.anamz-coco-nom.xyz/,phishing
1,https://www2.account-update.amazon.co.jp.7a732...,phishing
2,https://www2.account-update.amazon.co.jp.7a732...,phishing
3,http://jppost-ze.com/yue.html,phishing
4,http://jppost-za.com/yue.html,phishing


In [61]:
# Tag every row of df with the 'benign' class label
# (adds or overwrites the 'label' column in place).
df['label'] = 'benign'

In [62]:
# Confirm the 'label' column is present.
df.head()

Unnamed: 0,ts,url,label
0,20191114224852,https://cartolafcmix.com/,benign
1,20191114224624,https://cartolafcmix.com/category/cartola-da-l...,benign
2,20191114224801,https://cartolafcmix.com/category/dicas/,benign
3,20191114224528,https://cartolafcmix.com/como-funciona-o-banco...,benign
4,20191114224756,https://cartolafcmix.com/como-ganhar-200-carto...,benign


In [64]:
# Drop the 'ts' column so df has the same columns as ph_df before merging.
# Rebinding instead of inplace=True, together with errors='ignore', makes the
# cell safe to re-run: the in-place version raised KeyError on a second run
# because 'ts' was already gone.
df = df.drop(columns=['ts'], errors='ignore')

## Merge DataFrames

In [65]:
# Stack the benign and phishing frames into one labelled dataset.
# df has a 0-based RangeIndex and ph_df's index also starts at 0, so a plain
# concat would leave duplicate index labels; ignore_index=True gives the
# combined frame a fresh, unique 0..n-1 index.
full_df = pd.concat([df, ph_df], ignore_index=True)

In [67]:
# Sanity check: the merged frame has exactly as many rows as its two inputs combined.
full_df.shape[0] == df.shape[0] + ph_df.shape[0]

True

In [69]:
# Features (the raw URL string) and target (the class label).
X, y = full_df['url'], full_df['label']

## Train-test split

In [68]:
from sklearn.model_selection import train_test_split

In [70]:
# Stratified 75/25 split so both parts keep the same class proportions.
# random_state is fixed because the split is written to disk below:
# without a seed every run would produce different train/test files.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)

Verify that `stratify` worked as intended (the class proportions should be the same in the full set, the training set, and the test set):

In [72]:
# Class proportions in the full label vector.
y.value_counts(normalize=True)

benign      0.998858
phishing    0.001142
Name: label, dtype: float64

In [73]:
# Class proportions in the training labels.
y_train.value_counts(normalize=True)

benign      0.998858
phishing    0.001142
Name: label, dtype: float64

In [74]:
# Class proportions in the test labels.
y_test.value_counts(normalize=True)

benign      0.998858
phishing    0.001142
Name: label, dtype: float64

In [87]:
# Number of rows in the training and test parts.
X_train.shape[0], X_test.shape[0]

(7449443, 2483148)

In [76]:
# First ten training URLs.
X_train.head(10)

971619     https://www.caseycarpetoflascruces.com/carpet-...
5935614             http://m2z.cdqydq.com/viewspace-809.html
4716360    http://apps.cccski.com/ViewPointsDetails.asp?s...
1044022    http://cashbacktool.com/Default.aspx?retailer_...
8954152              https://cerablast.com/en/ceramic-beads/
1468701    http://www.casque-motocross.com/casque-moto-3d...
2527970    https://www.catherinedoucette.com/tag/universi...
8046471      https://www.centralbistroboston.com/blank-gtqtx
9253955    https://cerrajeriaocana.com/portfolio/escalera...
2310913            http://www.catchwestbound.com/116/15.html
Name: url, dtype: object

In [77]:
# First ten training labels.
y_train.head(10)

971619     benign
5935614    benign
4716360    benign
1044022    benign
8954152    benign
1468701    benign
2527970    benign
8046471    benign
9253955    benign
2310913    benign
Name: label, dtype: object

In [82]:
# Put the training URLs and their labels side by side in one frame.
train_parts = [X_train, y_train]
train_df = pd.concat(train_parts, axis='columns')

In [83]:
# Preview the assembled training frame.
train_df.head()

Unnamed: 0,url,label
971619,https://www.caseycarpetoflascruces.com/carpet-...,benign
5935614,http://m2z.cdqydq.com/viewspace-809.html,benign
4716360,http://apps.cccski.com/ViewPointsDetails.asp?s...,benign
1044022,http://cashbacktool.com/Default.aspx?retailer_...,benign
8954152,https://cerablast.com/en/ceramic-beads/,benign


In [84]:
# Put the test URLs and their labels side by side in one frame.
test_parts = [X_test, y_test]
test_df = pd.concat(test_parts, axis='columns')

In [85]:
# Persist both splits for the downstream notebooks.
# The output folder is created first so the cell also works on a fresh
# checkout where '../data/interim/' does not exist yet.
interim_folder = '../data/interim/'
os.makedirs(interim_folder, exist_ok=True)
train_df.to_pickle(os.path.join(interim_folder, 'train_df.pkl'))
test_df.to_pickle(os.path.join(interim_folder, 'test_df.pkl'))