In [88]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

## Loading and Prepping data

In [2]:
data = pd.read_csv('./data/train_data1.csv')

In [5]:
data.tail()

Unnamed: 0,domain,domain_type
199995,shao-shavers.myshopify.com,benign
199996,aissf.in,benign
199997,bossofthefloss.blogspot.com,benign
199998,symextech.com,benign
199999,kartell.com,benign


In [15]:
sample_data = data.sample(50000)

In [16]:
sample_data.groupby('domain_type').count()

Unnamed: 0_level_0,domain
domain_type,Unnamed: 1_level_1
benign,24965
dga,25035


In [17]:
sample_data['target'] = sample_data.domain_type.apply(lambda x: 0 if x=='benign' else 1)

In [12]:
sample_data.head()

Unnamed: 0,domain,domain_type,target
32565,rfujfnghexiywao.net,dga,1
109515,bookshelf.ucoz.ua,benign,0
42582,cyfmktvfatupvkq.net,dga,1
64018,rpofkwjededyu.ru,dga,1
166266,jantibilisim.com,benign,0


## Feature Engineering
All we have is a string and for any model having multiple features is important. Thus, in this section we will try to create features from the name of our domains. Since XGBoost needs numeric features, we need to encode the characters. I have created a dictionary of our character mappings.

In [24]:
##We might use it later
def ngrams(n, arr):
    ''' Creates ngram mappings and their counts of a given list of strings
        Returns a dictionary
    '''
    freq = {}
    for s in arr:
        for i in range(len(s)-2):
            key = s[i:i+3]
            if key in freq:
                freq[key]+=1
            else:
                freq[key]=1
    return freq

In [28]:
##Testing the above function
ngrams(3, ['facebook', 'uk'])

{'fac': 1, 'ace': 1, 'ceb': 1, 'ebo': 1, 'boo': 1, 'ook': 1}

In [29]:
sample_data['domain_list'] = sample_data.domain.apply(lambda x: ngrams(3, x.split('.')))

In [51]:
sample_data['length'] = sample_data.domain.apply(lambda x: len(x))
sample_data.head()

Unnamed: 0,domain,domain_type,target,domain_list,length
11947,klngeudrfnxa.co.uk,dga,1,"{'kln': 1, 'lng': 1, 'nge': 1, 'geu': 1, 'eud'...",18
1622,wrpnprasfgwwwaa.co.uk,dga,1,"{'wrp': 1, 'rpn': 1, 'pnp': 1, 'npr': 1, 'pra'...",21
82938,dhylqerijlprhq.biz,dga,1,"{'dhy': 1, 'hyl': 1, 'ylq': 1, 'lqe': 1, 'qer'...",18
124266,openproxy.space,benign,0,"{'ope': 1, 'pen': 1, 'enp': 1, 'npr': 1, 'pro'...",15
45673,ebxqtkfihphpe.co.uk,dga,1,"{'ebx': 1, 'bxq': 1, 'xqt': 1, 'qtk': 1, 'tkf'...",19


In [60]:
##Encoding the characters:
char_map = {chr(i+97): i for i in range(26)} ##For all the alphabets
char_map['.'] = 27
char_map['_'] = 28
char_map['-'] = 29

In [61]:
count = 30
##For all the digits
for i in range(10):
    char_map[str(i)] = count
    count+=1

In [63]:
##Padding
char_map[' '] = 40 

In [64]:
max_length = max(sample_data.length) ##length of our embedding

In [65]:
##Creates encoded vector 
def encodeDomain(domain, length):
    vector = np.full(shape= length, fill_value=29)
    for i in range(len(domain)):
        if domain[i] not in char_map:
            print('Invalid key ', domain[i])
        else:
            vector[i] = char_map[domain[i]]
    return vector

In [66]:
##Testing above function
encodeDomain('facebook.com', 40)

array([ 5,  0,  2,  4,  1, 14, 14, 10, 27,  2, 14, 12, 29, 29, 29, 29, 29,
       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
       29, 29, 29, 29, 29, 29])

In [67]:
embedding = sample_data.domain.apply(lambda x : encodeDomain(x, max_length))

## Model 1: With just the encodings

In [83]:
# split data into train and test sets
seed = 7
test_size = 0.33
X = np.array(embedding.tolist())
y = np.array(sample_data.target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

In [84]:
##Training the model
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=None)

In [86]:
##Testing the model
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

  if diff:


In [96]:
#Evaluating predictions
accuracy = accuracy_score(y_test, predictions)
f_score = f1_score(y_test, predictions)
print("Accuracy: %.3f%%" % (accuracy * 100.0))
print("F1_score: ", f_score)

Accuracy: 98.558%
F1_score:  0.9856522787557269


The accuracy and F1-score are both really high. Hence, our first version of the model is doing good.