# Logistic Regression classification model

A simple logistic regression is used to classify domains as DGA or non-DGA. This model is much faster at inference than an LSTM and serves as a baseline for comparison.


In [1]:
import json
import os

import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.preprocessing import text
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Path and file variables for saving model information.
# Forward slashes are used because Python's file APIs accept them on Windows
# as well as on POSIX; backslash-separated literals only work on Windows.
path_dir = './saved_models/'
name_m_report        = path_dir + 'logisticR_metrics_report'   # metrics report, extension added on write
name_c_report        = path_dir + 'logisticR_class_report'     # mis-classification table, extension added on write

# format and report dump switches
dump_pred_results    = 0x03                                 # bitmask switches to dump prediction results:
                                                            #       0x01: prediction metrics
                                                            #       0x02: domains' mis-classifications table
format_m_report       = 'json'                              # 'json' format only, csv doesn't fit correctly here
format_c_report       = ['json', 'csv']                     # at least one of: 'csv' or 'json'

# Category names (also used to tag rows before the labels are factorized)
name_DGA = 'DGA'
name_nonDGA = 'non-DGA'

In [3]:
# Read DGA and Cisco high confidence data.
# Both files are read without a header row (columns get integer labels), and
# the first 15 lines of the DGA feed are skipped.
# NOTE(review): the 15 skipped lines are presumably a comment preamble — confirm against the file.
# Forward-slash paths work on Windows and POSIX alike.
dga_df = pd.read_csv('../data/2018_0923/dga-feed-high.csv', header=None, skiprows=15)
cisco_df = pd.read_csv('../data/2018_0923/top-1m.csv', header=None)

In [4]:
# Preview helper for the two feeds
def display_df(dga_df_, cisco_df_):
    """Display the shape and the first rows of the DGA and Cisco frames.

    Args:
        dga_df_: DataFrame holding the DGA feed.
        cisco_df_: DataFrame holding the Cisco feed.
    """
    previews = (
        ("DGA feed sample: {}", dga_df_),
        ("Cisco feed sample: {}", cisco_df_),
    )
    for caption, frame in previews:
        display(caption.format(frame.shape))
        display(frame.head())

In [5]:
# Reduce both feeds to a single 'domain' column and tag each row with its class name

# DGA feed: keep only column 0, drop every column after it
dga_extra_cols = range(1, dga_df.shape[1])
dga_df_slim = dga_df.drop(columns=dga_extra_cols)
dga_df_slim.columns = ['domain']
dga_df_slim['dga'] = name_DGA

# Cisco feed: drop column 0 and keep what remains as the domain
cisco_df_slim = cisco_df.drop(columns=[0])
cisco_df_slim.columns = ['domain']
cisco_df_slim['dga'] = name_nonDGA

display_df(dga_df_slim, cisco_df_slim)

# Stack the two feeds (Cisco rows first) under a fresh 0..n-1 index
unified_df = pd.concat([cisco_df_slim, dga_df_slim], ignore_index=True)

# Replace the class names by integer codes; `labels` keeps the sorted unique
# names so that labels[code] recovers the original name
dga_codes, labels = pd.factorize(unified_df['dga'], sort=True)
unified_df['dga'] = dga_codes

'DGA feed sample: (381953, 2)'

Unnamed: 0,domain,dga
0,plvklpgwivery.com,DGA
1,dnuxdhcgblsgy.net,DGA
2,qjlullhfkiowp.biz,DGA
3,elkidddodxdly.ru,DGA
4,rnbfwuprlwfor.org,DGA


'Cisco feed sample: (1000000, 2)'

Unnamed: 0,domain,dga
0,netflix.com,non-DGA
1,api-global.netflix.com,non-DGA
2,prod.netflix.com,non-DGA
3,push.prod.netflix.com,non-DGA
4,google.com,non-DGA


In [6]:
# Split the unified frame into inputs (domain strings) and outputs (integer
# class codes), then hold out 20 % of the rows for testing with a fixed seed

X, Y = unified_df['domain'], unified_df['dga']
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=23,
)

In [11]:
# Logistic regression classification

TRAIN_MODEL = True                                          # Load saved model otherwise
                                                            # NOTE(review): this flag is never read in this cell; no load path is implemented here

max_seq_len = 75                                            # maximum char length of domain name

# Encode string characters to integers
encoder = text.Tokenizer(num_words=500, char_level=True)
encoder.fit_on_texts(X_train)                               # build character indices from the training domains only
X_train_tz = encoder.texts_to_sequences(X_train)

# Pad sequence where sequences are case insensitive characters encoded to
# integers from 0 to number of valid characters.
# The width comes from max_seq_len (not a repeated literal) so that training
# and test inputs are always padded to the same length.
X_train_pad = sequence.pad_sequences(X_train_tz, maxlen=max_seq_len)

# Model definition
model = LogisticRegression(tol=0.01, C=1, solver='liblinear', max_iter=1000, verbose=20)

# Training
model_hist = model.fit(X_train_pad, Y_train)

[LibLinear]

In [12]:
# Validation on test dataset
# Encode and pad the held-out domains with the fitted encoder, then score them

X_test_tz = encoder.texts_to_sequences(X_test)
X_test_pad = sequence.pad_sequences(X_test_tz, maxlen=max_seq_len)

# One probability column per class; the predicted class is the most probable column
Y_pred_prob = model.predict_proba(X_test_pad)
Y_pred = Y_pred_prob.argmax(axis=1)

acc = accuracy_score(Y_test, Y_pred)
print("Model accuracy = {:8.3f} %".format(acc*100))

Model accuracy =   86.872 %


In [13]:
# Inspect a few prediction probabilities

# One row per test domain: decoded true/predicted class names plus the
# probability the model assigned to the class it predicted
pred_table = X_test.to_frame()
pred_table.columns = ['domain']
pred_table['trueClass'] = [labels[code] for code in Y_test]
pred_table['predClass'] = [labels[code] for code in Y_pred]
pred_table['predProb'] = [probs[cls] for probs, cls in zip(Y_pred_prob, Y_pred)]

is_correct = pred_table['trueClass'] == pred_table['predClass']

print('\nCorrectly predicted Domains:')
display(pred_table[is_correct].head(10))

print('\nMis-predicted Domains:')
display(pred_table[~is_correct].head(10))


# Both fractions below are relative to the whole test set, not to one class
n_test_rows = len(pred_table)

# False positives: truly non-DGA, predicted DGA
fp_mask = (pred_table['trueClass'] == name_nonDGA) & (pred_table['predClass'] == name_DGA)
pred_table_FP = pred_table[fp_mask]
pred_FP_frac = len(pred_table_FP)/n_test_rows
print('\nPercentage of False Positives (i.e. nonDGA domains classified as DGA): {:6.4f} %'
      .format(100*pred_FP_frac))

# False negatives: truly DGA, predicted non-DGA
fn_mask = (pred_table['trueClass'] == name_DGA) & (pred_table['predClass'] == name_nonDGA)
pred_table_FN = pred_table[fn_mask]
pred_FN_frac = len(pred_table_FN)/n_test_rows
print('\nPercentage of False Negatives (i.e. DGA domains classified as nonDGA): {:6.4f} %'
      .format(100*pred_FN_frac))


Correctly predicted Domains:


Unnamed: 0,domain,trueClass,predClass,predProb
124546,ns47.domaincontrol.com,non-DGA,non-DGA,0.988869
660921,britishlibrary.typepad.co.uk,non-DGA,non-DGA,0.595031
446456,a538.casalemedia.com,non-DGA,non-DGA,0.97842
600919,ign-ar8de21s8pinm-8d3d0d118-4d8d69dgoogleplayd...,non-DGA,non-DGA,0.999997
1186650,gbggekvj.eu,DGA,DGA,0.588945
115543,ewr-66.ewr-rtb1.rfihub.com,non-DGA,non-DGA,0.84147
1360357,vsagkcaahpxrfbmqljnnxutj.com,DGA,DGA,0.541498
464912,static.bladeandsoul.com,non-DGA,non-DGA,0.946659
1097547,dlpyniywfxxp.com,DGA,DGA,0.777868
606453,trans11212.addressy.com,non-DGA,non-DGA,0.649959



Mis-predicted Domains:


Unnamed: 0,domain,trueClass,predClass,predProb
1359727,uxamuoylbidlktngprh.com,DGA,non-DGA,0.65995
1372453,snoumrqnyi.mooo.com,DGA,non-DGA,0.951096
882015,ebanking-ch1.ubs.com,non-DGA,DGA,0.549897
90390,6htb5ck86hk8i9.com,non-DGA,DGA,0.997115
266868,cdn.quizzclub.com,non-DGA,DGA,0.685064
1234281,bimyhmupgtetju.me,DGA,non-DGA,0.683658
1010085,1exf5ov1251gcf11d1v5s1af6cca.org,DGA,non-DGA,0.585229
1256521,asoqhiiugkeq.net,DGA,non-DGA,0.55342
1373227,uqmwgucn.mooo.com,DGA,non-DGA,0.968331
1058709,mtjuddabaum.pw,DGA,non-DGA,0.527847



Percentage of False Positives (i.e. nonDGA domains classified as DGA): 4.4889 %

Percentage of False Negatives (i.e. DGA domains classified as nonDGA): 8.6392 %


In [14]:
# Dump classification metrics and the FP/FN domain tables to disk

def _ensure_parent_dir(file_name):
    """Create the directory part of `file_name` when it has one and it is missing."""
    parent = os.path.dirname(file_name)
    if parent:
        os.makedirs(parent, exist_ok=True)

# accuracy, precision, recall, f1, false positive, false negative
if dump_pred_results & 0x01:
    metrics_report = classification_report(Y_test, Y_pred, target_names=labels, output_dict=True)
    metrics_report['accuracy'] = acc
    metrics_report['false positives'] = pred_FP_frac
    metrics_report['false negatives'] = pred_FN_frac

    if format_m_report == 'json':
        fileName = name_m_report + '.' + format_m_report
        _ensure_parent_dir(fileName)
        with open(fileName, 'w') as report_file:
            json.dump(metrics_report, fp=report_file)

# False Positives and False Negatives
if dump_pred_results & 0x02:
    # Tag copies rather than insert()-ing into pred_table_FP / pred_table_FN
    # themselves: a second insert of 'type' raises, so mutating the originals
    # would make this cell fail when it is re-run.
    fp_rows = pred_table_FP.copy()
    fp_rows.insert(0, 'type', 'FP')
    fn_rows = pred_table_FN.copy()
    fn_rows.insert(0, 'type', 'FN')

    # Single combined table (FP rows first) so every output format contains
    # both kinds of error. The previous DataFrame.append() call discarded its
    # result, leaving the JSON file with false positives only.
    misclassified = pd.concat([fp_rows, fn_rows])

    for extn in format_c_report:
        filePath = name_c_report + '.' + extn
        _ensure_parent_dir(filePath)
        if extn == 'csv':
            misclassified.to_csv(filePath, mode='w', index=False, header=True)
        elif extn == 'json':
            misclassified.to_json(filePath, orient='table', index=False)

### Summary:
Although training and inference with logistic regression are very fast compared to an LSTM, its accuracy lags behind substantially. This is prohibitive for production deployment: roughly 5 % of the domains (about 4.5 % of the test set above) would be falsely classified as DGA domains and hence blocked, which would not be a tolerable experience for the end user/client.