In [20]:
import numpy as np
import pandas as pd
import math
import os 
import pickle

from os.path import abspath

In [27]:
# Character classes used when counting consecutive same-class characters.
# All three are lower-case/ASCII only; membership tests are case-sensitive.
numbers = '0123456789'
vowel = 'aeiou'
# Consonants are the lower-case alphabet with the vowels removed.
consonants = ''.join(
    letter for letter in 'abcdefghijklmnopqrstuvwxyz' if letter not in vowel)

def pairs_entropy(domain):
    """Count same-class adjacent character pairs and the Shannon entropy.

    Parameters
    ----------
    domain : str
        Domain name to analyse.

    Returns
    -------
    tuple
        ``(vowel_pairs, consonant_pairs, number_pairs, entropy)``.
        The three counters give the number of adjacent positions whose two
        characters both belong to the module-level ``vowel``, ``consonants``
        or ``numbers`` string respectively. Overlapping pairs are all
        counted, so ``'aaa'`` contains two vowel pairs. ``entropy`` is the
        Shannon entropy (base 2) of the character distribution, rounded to
        3 decimals; it is ``0`` for an empty string.

    Notes
    -----
    Pairs are counted on the whitespace-stripped domain, whereas the entropy
    is computed over ``domain`` exactly as received, so any surrounding
    whitespace contributes to the entropy.
    """
    count_pair_vowel = 0
    count_pair_consonants = 0
    count_pair_numbers = 0
    domain_stripped = domain.strip()
    # Walk every adjacent (current, following) pair of characters.
    for current, following in zip(domain_stripped, domain_stripped[1:]):
        if current in vowel and following in vowel:
            count_pair_vowel += 1
        if current in consonants and following in consonants:
            count_pair_consonants += 1
        if current in numbers and following in numbers:
            count_pair_numbers += 1
    # Tally every character in one pass instead of calling domain.count()
    # once per character. Keys are stored in first-seen order, so the
    # summation below visits the characters in the same order as before.
    character_frequency = {}
    for character in domain:
        character_frequency[character] = (
            character_frequency.get(character, 0) + 1)
    # Shannon entropy: -sum(p * log2(p)) over the distinct characters.
    length = float(len(domain))
    entropy = 0
    for occurrences in character_frequency.values():
        probability_character = occurrences / length
        entropy -= probability_character * math.log(probability_character, 2)
    return (count_pair_vowel, count_pair_consonants,
            count_pair_numbers, round(entropy, 3))


def lexical_features_extractor(_list, output):
    """Build the lexical feature table for a list of domains and pickle it.

    Parameters
    ----------
    _list : list of str
        Domain names, one per entry.
    output : str
        File name of the pickle, written under ``./data/features/``
        (resolved against the current working directory). The directory
        must already exist.

    Returns
    -------
    None
        The resulting DataFrame is only written to disk. Its columns are
        ``domain``, ``length``, the count and rate (count / length, rounded
        to 3 decimals) of vowels, consonants, digits and dashes, followed by
        the four values returned by ``pairs_entropy``: ``vowel_pair``,
        ``consonant_pair``, ``number_pair`` and ``entropy``.
    """
    df = pd.DataFrame(_list, columns=['domain'])
    # Character-class counts and their share of the domain length.
    # An empty domain has length 0, so its rates come out as NaN.
    df['length'] = df['domain'].str.len()
    df['vowel'] = df['domain'].str.count(r'[aeiou]')
    df['vowel_rate'] = round(df.vowel / df.length, 3)
    # Consonants are the lower-case letters that are not vowels.
    df['consonant'] = df['domain'].str.count(r'[a-z]') - df['vowel']
    df['consonant_rate'] = round(df.consonant / df.length, 3)
    df['number'] = df['domain'].str.count(r'[0-9]')
    df['num_rate'] = round(df.number / df.length, 3)
    df['dash'] = df['domain'].str.count(r'[-]')
    df['dash_rate'] = round(df.dash / df.length, 3)
    # One row of pairs_entropy output per domain, in the same row order as
    # df (which carries the default 0..n-1 index).
    np_consec = np.zeros(shape=(len(df), 4))
    for position, domain in enumerate(df['domain']):
        np_consec[position] = pairs_entropy(domain)
    consec = pd.DataFrame(
        np_consec,
        columns=['vowel_pair', 'consonant_pair', 'number_pair', 'entropy'])
    df = pd.concat([df, consec], axis=1)

    # The context manager guarantees the file is flushed and closed, even if
    # pickling fails part-way through.
    with open(abspath('./data/features/' + output), 'wb') as pickle_file:
        pickle.dump(df, pickle_file)

In [28]:
# Import lists: every line of each file becomes one entry, with its trailing
# newline removed. The context managers close the files once they are read.
with open(abspath(
        './data/lists/cleaned/legitimate_cleaned')) as legitimate_file:
    legitimates = [line.rstrip('\n') for line in legitimate_file]
with open(abspath(
        './data/lists/cleaned/malicious_cleaned')) as malicious_file:
    malicious = [line.rstrip('\n') for line in malicious_file]

In [29]:
# Extract the lexical features of the public legitimate domains; the
# extractor pickles its result under ./data/features/
lexical_features_extractor(legitimates, 'public_leg')

# Reload the pickled features as a DataFrame
legitimates = pd.read_pickle(abspath('./data/features/public_leg'))
# Discard the domain name column
del legitimates['domain']
# Put the target in the first column: class 0 for the legitimate list
legitimates.insert(0, 'class', 0)

# Same steps for the public malicious domains, labelled class 1
lexical_features_extractor(malicious, 'public_mal')
malicious = pd.read_pickle(abspath('./data/features/public_mal'))
del malicious['domain']
malicious.insert(0, 'class', 1)


# Stack both classes, discard rows with missing values and export to CSV
public_lexical = pd.concat([legitimates, malicious]).dropna()
public_lexical.to_csv('./data/features/public_lexical.csv', index=False)