In [1]:
%run make_features.py -h

make_features.py

Make features from <filename> data. Loads from ../../data/interim.
Saves to original file

Usage:
    make_features.py <filename> [options]

Arguments:
    filename <file>       Filename, with extension. Data will be loaded
                            from ../../data/interim.

Options
    -h --help               Show docstring.
    -t                      Test mode.


In [2]:
%run make_features.py test_df.pkl

DEBUG:root:Loading data...


Loading data...


DEBUG:root:Creating features...


Creating features...


DEBUG:root:Saving data...


Saving data...
Script complete!
Elapsed time: 1549594920 minutes


# Trouble-shoot
run by hand

In [30]:
import re
from collections import Counter
import os
import pandas as pd
import numpy as np
import tldextract
import logging
from docopt import docopt

logging.basicConfig(level=logging.DEBUG)


def shannon_specific_entropy(X):
    N = len(X)
    c = Counter(X)
    n = len(c.keys())
    h_sum = 0
    for x, n_i in c.items():
        f_i = n_i/N
        h_sum += (f_i) * np.log2(f_i)

    h_sum = -h_sum
    return h_sum

def get_query(url):
    QUERY_REGEX = re.compile(r"\?([a-z0-9\-._~%!$&'()*+,;=:@/]*)#?")
    try:
        return re.search(QUERY_REGEX, url).group(0)
    except AttributeError:
        return None

def get_path(url):
    # note: this returns '/' if there is no path
    # source: https://www.oreilly.com/library/view/regular-expressions-cookbook/9780596802837/ch07s12.html

    # path_regex = re.compile(r"([a-z0-9\-._~%!$&'()*+,;=:@/]*)")
    path_regex = re.compile(r"^([a-z][a-z0-9+\-.]*:(//[^/?#]+)?)?([a-zA-Z0-9\-._~%!$&'()*+,;=:@/]*)")
    try:
        return re.findall(path_regex, url)[0][-1]
    except IndexError:
        return None

In [31]:
filename = 'train_df.pkl'

In [33]:
input_path = os.path.join('../../data/interim', filename)
output_path = os.path.join('../../data/processed', filename)
_
# Load data
logging.debug('Loading data...')
print('Loading data...')

df = pd.read_pickle(input_path)

logging.debug('Creating features...')
print('Creating features...')

# extract TLD parts
df['tld_extract'] = df['url'].map(tldextract.extract)
df['subdomain'] = df['tld_extract'].apply(lambda x: x.subdomain)
df['domain'] = df['tld_extract'].apply(lambda x: x.domain)
df['suffix'] = df['tld_extract'].apply(lambda x: x.suffix)
df['hostname'] = df['tld_extract'].apply(lambda x: '.'.join(x))
df['path'] = df['url'].apply(get_path)
df['query'] = df['url'].apply(get_query)


# Subdomain ind's
df['subdomain_null_ind'] = np.where(df['subdomain'] == '', 1, 0)
df['subdomain_www_ind'] = np.where(df['subdomain'] == 'www', 1, 0)

# String lengths
df['length_url'] = df['url'].map(len)
df['length_domain'] = df['domain'].map(len)
df['length_path'] = df['path'].map(len)

# "Special" characters: counts, indicators
df['domain_dot_cnt'] = df['domain'].apply(lambda s: s.count('.'))
df['url_slash_cnt'] = df['url'].apply(lambda x: x.count('/'))
df['path_dot_cnt'] = df['path'].apply(lambda x: x.count('.'))
df['hostname_dash_cnt'] = df['hostname'].apply(lambda x: x.count('-'))

digits = re.compile(r'[0-9]')
df['url_digit_cnt'] = df['url'].apply(lambda x: len(re.findall(digits, x)))

special_chars = re.compile(r"[$-_.+!*'\(\)\,]")
df['url_special_char_cnt'] = df['url'].apply(lambda x: len(re.findall(special_chars, x)))

reserved_chars = re.compile(r'[;/\?:@=&]')
df['url_reserved_char_cnt'] = df['url'].apply(lambda x: len(re.findall(reserved_chars, x)))

hex_pattern = re.compile(r"(%[0-9A-F]{2})")
df['url_hex_pattern_ind'] = df['url'].apply(lambda x:
                                            1 if re.search(hex_pattern, x)
                                            is not None else 0)

# Entropy
df['hostname_entropy'] = df['hostname'].apply(shannon_specific_entropy)
df['url_entropy'] = df['url'].apply(shannon_specific_entropy)

# Suspicious words
suspicious_words = ['php', 'abuse', 'admin', 'verification']

for word in suspicious_words:
    col_name = word + '_ind'
    df[col_name] = np.where(df['url'].str.count(word) == 0, 0, 1)

# Save data
print('Saving data...')
logging.debug('Saving data...')
df.to_pickle(file_path)

print('Script complete!')

Loading data...
Creating features...
Saving data...


OSError: [Errno 22] Invalid argument

In [34]:
df.head()

Unnamed: 0,url,label,tld_extract,subdomain,domain,suffix,hostname,path,query,subdomain_null_ind,...,url_digit_cnt,url_special_char_cnt,url_reserved_char_cnt,url_hex_pattern_ind,hostname_entropy,url_entropy,php_ind,abuse_ind,admin_ind,verification_ind
0,https://casadeldisfraz.com/en/381-disfraces-ma...,benign,"(, casadeldisfraz, com)",,casadeldisfraz,com,.casadeldisfraz.com,/en/381-disfraces-mascotas,,1,...,3,11,5,0,3.576618,4.21148,0,0,0,0
1,https://www.cbs7.com/content/news/Engineer-sen...,benign,"(www, cbs7, com)",www,cbs7,com,www.cbs7.com,/content/news/Engineer-sent-to-the-hospital-af...,,0,...,10,33,6,0,2.855389,4.536487,0,0,0,0
2,https://century21norte.com/en/propiedades/edif...,benign,"(, century21norte, com)",,century21norte,com,.century21norte.com,/en/propiedades/edificios/local-comercial-de-c...,,1,...,16,51,7,0,3.511085,4.479739,0,0,0,0
3,http://apps.cccski.com/ViewSkierDetails.asp?sk...,benign,"(apps, cccski, com)",apps,cccski,com,apps.cccski.com,/ViewSkierDetails.asp,?skierid=30834,0,...,5,17,6,0,2.973557,4.448787,0,0,0,0
4,https://ccbcessexknights.com/sports/wvball/201...,benign,"(, ccbcessexknights, com)",,ccbcessexknights,com,.ccbcessexknights.com,/sports/wvball/2018-19/teams/ccbcessex,,1,...,6,16,8,0,3.594466,4.350357,0,0,0,0


In [35]:
df.shape

(7449443, 28)

In [36]:
import feather

In [39]:
output_path = os.path.join('../../data/processed', filename.replace('.pkl', '.feather'))

In [40]:
feather.write_dataframe(df, output_path)

ArrowInvalid: ("Could not convert ExtractResult(subdomain='', domain='casadeldisfraz', suffix='com') with type ExtractResult: did not recognize Python value type when inferring an Arrow data type", 'Conversion failed for column tld_extract with type object')

In [42]:
df.drop(columns=['tld_extract'], inplace=True)

In [43]:
feather.write_dataframe(df, output_path)