# Logistic Regression classification model

A simple logistic regression is used to classify the domains as DGA or non-DGA. This model is much faster at inference than LSTMs and serves as a baseline for comparison.


In [39]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from keras.preprocessing import sequence
from keras.preprocessing import text

In [40]:
# --- Class labels ---------------------------------------------------------
# Human-readable names of the two target categories.
name_DGA, name_nonDGA = 'DGA', 'non-DGA'

# --- Report locations -----------------------------------------------------
# Output directory and extension-less base names of the report files.
path_dir = '.\\saved_models\\'
name_m_report = path_dir + 'logisticR_metrics_report'
name_c_report = path_dir + 'logisticR_class_report'

# --- Report switches ------------------------------------------------------
# Bitmask selecting which prediction results to dump:
#   0x01 -> prediction metrics
#   0x02 -> table of mis-classified domains
dump_pred_results = 0x01 | 0x02
# Metrics report format: 'json' only, csv doesn't fit correctly here.
format_m_report = 'json'
# Classification report formats: at least one of 'csv' or 'json'.
format_c_report = ['json', 'csv']

In [41]:
# Read the DGA feed and the Cisco top-1M list (labelled non-DGA further down).
# Forward slashes keep the relative paths working on Windows as well as on
# Linux/macOS; the previous back-slash separators only resolved on Windows.
# NOTE(review): skiprows=15 presumably skips the feed's leading comment/header
# lines - confirm against the feed file.
dga_df = pd.read_csv('../data/2018_0923/dga-feed-high.csv', header=None, skiprows=15)
cisco_df = pd.read_csv('../data/2018_0923/top-1m.csv', header=None)

In [42]:
# display head
def display_df(dga_df_, cisco_df_):
    """Show the shape and the first rows of the DGA and Cisco frames.

    For each frame (DGA first, then Cisco) a caption containing the frame's
    ``shape`` is displayed, followed by ``frame.head()``.
    """
    sections = (
        ("DGA feed sample: {}", dga_df_),
        ("Cisco feed sample: {}", cisco_df_),
    )
    for caption, frame in sections:
        display(caption.format(frame.shape))
        display(frame.head())

In [43]:
# Keep only the domain column of each feed and tag every row with its class.

# DGA feed: column 0 is kept, every other column is dropped.
dga_df_slim = dga_df.drop(columns=range(1, dga_df.shape[1]))
dga_df_slim.columns = ['domain']
dga_df_slim['dga'] = name_DGA

# Cisco feed: column 0 is dropped, the remaining column becomes 'domain'.
cisco_df_slim = cisco_df.drop(columns=[0])
cisco_df_slim.columns = ['domain']
cisco_df_slim['dga'] = name_nonDGA

display_df(dga_df_slim, cisco_df_slim)

# Stack both feeds (Cisco rows first) under a fresh 0..n-1 index.
unified_df = pd.concat([cisco_df_slim, dga_df_slim], ignore_index=True)
# Replace the text labels by integer codes; sort=True orders the codes by the
# sorted label names, and `labels` keeps the code -> name lookup.
unified_df['dga'], labels = pd.factorize(unified_df['dga'], sort=True)

'DGA feed sample: (381953, 2)'

Unnamed: 0,domain,dga
0,plvklpgwivery.com,DGA
1,dnuxdhcgblsgy.net,DGA
2,qjlullhfkiowp.biz,DGA
3,elkidddodxdly.ru,DGA
4,rnbfwuprlwfor.org,DGA


'Cisco feed sample: (1000000, 2)'

Unnamed: 0,domain,dga
0,netflix.com,non-DGA
1,api-global.netflix.com,non-DGA
2,prod.netflix.com,non-DGA
3,push.prod.netflix.com,non-DGA
4,google.com,non-DGA


In [44]:
# Split the domain strings (inputs) from the integer class codes (targets),
# then hold out 20 % of the rows for testing with a fixed random seed.

X, Y = unified_df['domain'], unified_df['dga']
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=23,
)

In [45]:
# Logistic regression classification

# Intended switch: train here, load a saved model otherwise.
# NOTE(review): no visible cell branches on this flag - training always runs.
TRAIN_MODEL = True

max_seq_len = 75                                            # maximum char length of domain name

# Encode string characters to integers
encoder = text.Tokenizer(num_words=500, char_level=True)
encoder.fit_on_texts(X_train)                               # build character indices
X_train_tz = encoder.texts_to_sequences(X_train)

# Pad the integer-encoded character sequences to a common length.
# `max_seq_len` is used (instead of a second literal 75) so training data is
# always padded to the same length that report_metrics() pads test data to.
X_train_pad = sequence.pad_sequences(X_train_tz, maxlen=max_seq_len)

# Model definition
# NOTE(review): the padded character indices are fed to the model directly as
# numeric features, i.e. each index is treated as an ordinal magnitude.
model = LogisticRegression(tol=0.01, C=1, solver='liblinear', max_iter=1000, verbose=20)

# Training (fit() returns the fitted estimator itself)
model_hist = model.fit(X_train_pad, Y_train)

[LibLinear]

In [46]:
# Validation on test dataset

def report_metrics(model, labels, X_test, Y_test, name_nonDGA=name_nonDGA, name_DGA=name_DGA, verbose=True,
                   tokenizer=None, maxlen=None):
    """Evaluate a fitted classifier on raw domain names and print a summary.

    The domains are encoded and padded, class probabilities are predicted,
    and the accuracy plus the false-positive / false-negative percentages
    (both relative to the total number of test rows) are printed.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    labels : index-like lookup from integer class code to class name.
    X_test : pandas Series of domain strings.
    Y_test : iterable of integer class codes aligned with ``X_test``.
    name_nonDGA, name_DGA : class names used to pick out FP / FN rows.
    verbose : when True, also display the first 10 correctly predicted and
        the first 10 mis-predicted domains.
    tokenizer : tokenizer used to encode ``X_test``; it must be the one the
        model's training data was encoded with. Defaults to the notebook-level
        ``encoder``.
    maxlen : padded sequence length. Defaults to the notebook-level
        ``max_seq_len``.

    Returns
    -------
    numpy.ndarray with the predicted class code of every test row (arg-max of
    the predicted probabilities).
    """
    # Fall back to the notebook-level objects so existing calls are unchanged.
    if tokenizer is None:
        tokenizer = encoder
    if maxlen is None:
        maxlen = max_seq_len

    # Compute prediction probabilities of classes
    X_test_pad = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)
    Y_pred_prob = model.predict_proba(X_test_pad)
    Y_pred = np.argmax(Y_pred_prob, axis=1)
    acc = accuracy_score(Y_test, Y_pred)
    print("Model accuracy = {:8.3f} %".format(acc*100))

    # Per-domain table: true class, predicted class and the probability of the
    # predicted class (the row-wise maximum, since Y_pred is the arg-max).
    class_names = np.asarray(labels)
    pred_table = X_test.to_frame()
    pred_table.columns = ['domain']
    pred_table['trueClass'] = class_names[np.asarray(Y_test)]
    pred_table['predClass'] = class_names[Y_pred]
    pred_table['predProb'] = Y_pred_prob.max(axis=1)

    n_total = pred_table.shape[0]

    pred_table_FP = pred_table[(pred_table['trueClass'] == name_nonDGA) & (pred_table['predClass'] == name_DGA)]
    pred_FP_frac = pred_table_FP.shape[0]/n_total
    print('\nPercentage of False Positives (i.e. nonDGA domains classified as DGA): {:6.4f} %'
          .format(100*pred_FP_frac))

    pred_table_FN = pred_table[(pred_table['trueClass'] == name_DGA) & (pred_table['predClass'] == name_nonDGA)]
    pred_FN_frac = pred_table_FN.shape[0]/n_total
    print('\nPercentage of False Negatives (i.e. DGA domains classified as nonDGA): {:6.4f} %'
          .format(100*pred_FN_frac))

    if verbose:
        is_correct = pred_table['trueClass'] == pred_table['predClass']

        print('\nSample of correctly predicted Domains:')
        display(pred_table[is_correct].head(10))

        print('\nSample of mis-predicted Domains:')
        display(pred_table[~is_correct].head(10))

    return Y_pred

In [58]:
# Evaluate the baseline model on the held-out split (verbose tables included).
report_metrics(model=model, labels=labels, X_test=X_test, Y_test=Y_test)

Model accuracy =   86.872 %

Percentage of False Positives (i.e. nonDGA domains classified as DGA): 4.4889 %

Percentage of False Negatives (i.e. DGA domains classified as nonDGA): 8.6392 %

Sample of correctly predicted Domains:


Unnamed: 0,domain,trueClass,predClass,predProb
124546,ns47.domaincontrol.com,non-DGA,non-DGA,0.988869
660921,britishlibrary.typepad.co.uk,non-DGA,non-DGA,0.595031
446456,a538.casalemedia.com,non-DGA,non-DGA,0.97842
600919,ign-ar8de21s8pinm-8d3d0d118-4d8d69dgoogleplayd...,non-DGA,non-DGA,0.999997
1186650,gbggekvj.eu,DGA,DGA,0.588945
115543,ewr-66.ewr-rtb1.rfihub.com,non-DGA,non-DGA,0.84147
1360357,vsagkcaahpxrfbmqljnnxutj.com,DGA,DGA,0.541498
464912,static.bladeandsoul.com,non-DGA,non-DGA,0.946659
1097547,dlpyniywfxxp.com,DGA,DGA,0.777868
606453,trans11212.addressy.com,non-DGA,non-DGA,0.649959



Sample of mis-predicted Domains:


Unnamed: 0,domain,trueClass,predClass,predProb
1359727,uxamuoylbidlktngprh.com,DGA,non-DGA,0.65995
1372453,snoumrqnyi.mooo.com,DGA,non-DGA,0.951096
882015,ebanking-ch1.ubs.com,non-DGA,DGA,0.549897
90390,6htb5ck86hk8i9.com,non-DGA,DGA,0.997115
266868,cdn.quizzclub.com,non-DGA,DGA,0.685064
1234281,bimyhmupgtetju.me,DGA,non-DGA,0.683658
1010085,1exf5ov1251gcf11d1v5s1af6cca.org,DGA,non-DGA,0.585229
1256521,asoqhiiugkeq.net,DGA,non-DGA,0.55342
1373227,uqmwgucn.mooo.com,DGA,non-DGA,0.968331
1058709,mtjuddabaum.pw,DGA,non-DGA,0.527847


array([1, 1, 1, ..., 1, 0, 0], dtype=int64)

### Initial impression:
Although training and inference with logistic regression are very fast compared to the LSTM, its accuracy lags behind substantially. This is prohibitive for production deployment of this model: roughly 5 % of the domains would be falsely classified as DGA domains and hence blocked, which would not be a tolerable experience for the end user/client.

### Hyperparameters' optimization
In this section, `GridSearchCV` is used to search for the hyperparameter values that maximize accuracy.

In [10]:
# Grid search, part 1: L1-penalised candidates with the liblinear and saga solvers.
parall_jobs = 5                                           # number of parallel jobs to launch
print("Optimizing hyperparameters part 1")

searchParams1 = dict(
    penalty=['l1'],
    tol=[1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
    C=np.arange(1.0, 10.0, 1.0),
    solver=['liblinear', 'saga'],
)

# 3-fold cross-validated search over every combination of the grid above.
optModel1 = GridSearchCV(
    estimator=model,
    param_grid=searchParams1,
    n_jobs=parall_jobs,
    cv=3,
    verbose=0,
)
optModel1.fit(X_train_pad, Y_train)

Optimizing hyperparameters part 1
Fitting 3 folds for each of 90 candidates, totalling 270 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:  2.0min
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed: 121.9min
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed: 351.2min
[Parallel(n_jobs=5)]: Done 270 out of 270 | elapsed: 687.8min finished


[LibLinear]

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.01, verbose=20, warm_start=False),
       fit_params=None, iid='warn', n_jobs=5,
       param_grid={'penalty': ['l1'], 'tol': [0.01, 0.001, 0.0001, 1e-05, 1e-06], 'C': array([1., 2., 3., 4., 5., 6., 7., 8., 9.]), 'solver': ['liblinear', 'saga']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [12]:
# Grid search, part 2: L2-penalised candidates with the newton-cg, lbfgs and sag solvers.
print("Optimizing hyperparameters part 2")

searchParams2 = dict(
    penalty=['l2'],
    tol=[1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
    C=np.arange(1.0, 10.0, 1.0),
    solver=['newton-cg', 'lbfgs', 'sag'],
)

# 3-fold cross-validated search over every combination of the grid above.
optModel2 = GridSearchCV(
    estimator=model,
    param_grid=searchParams2,
    n_jobs=parall_jobs,
    cv=3,
    verbose=0,
)
optModel2.fit(X_train_pad, Y_train)

Optimizing hyperparameters part 2


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.01, verbose=20, warm_start=False),
       fit_params=None, iid='warn', n_jobs=5,
       param_grid={'penalty': ['l2'], 'tol': [0.01, 0.001, 0.0001, 1e-05, 1e-06], 'C': array([1., 2., 3., 4., 5., 6., 7., 8., 9.]), 'solver': ['newton-cg', 'lbfgs', 'sag']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
# Show, for each of the two searches, the best cross-validation score followed
# by the estimator settings that achieved it.

print("\nOptimization part1 best score: {} and settings: \n".format(optModel1.best_score_),
      optModel1.best_estimator_,
      sep='\n')
print("\nOptimization part2 best score: {} and settings: \n".format(optModel2.best_score_),
      optModel2.best_estimator_,
      sep='\n')


Optimization part1 best score: 0.8680146387086387 and settings: 

LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=1e-05, verbose=20, warm_start=False)

Optimization part2 best score: 0.8680227793647032 and settings: 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.01, verbose=20, warm_start=False)


### Final model evaluation:

In [48]:
# Refit on the training split with the hard-coded best settings
# (lbfgs solver, C=1.0, tol=0.01) and score on the held-out split.
modelF = LogisticRegression(
    tol=0.01,
    C=1.0,
    solver='lbfgs',
    max_iter=1000,
    verbose=0,
)

# fit() returns the fitted estimator itself.
modelF_hist = modelF.fit(X_train_pad, Y_train)

report_metrics(model=modelF, labels=labels, X_test=X_test, Y_test=Y_test, verbose=False)

Model accuracy =   86.899 %

Percentage of False Positives (i.e. nonDGA domains classified as DGA): 4.5551 %

Percentage of False Negatives (i.e. DGA domains classified as nonDGA): 8.5462 %


array([1, 1, 1, ..., 1, 0, 0], dtype=int64)

### Classify a new feed with a model trained on old feed:

An old feed from September is used to train the model (100 % training, no test split), and this model is then used to classify a newer feed from November. This exercise shows whether the model is able to adapt to temporal changes, and how it compares with a similar exercise carried out on the LSTM model, which showed **97.887 %** accuracy.

In [73]:
# Prepare dataset from a newer feed.
# Forward slashes keep the relative path working on Windows as well as on
# Linux/macOS; the previous back-slash separators only resolved on Windows.
dga_new_df = pd.read_csv('../data/2018-11-12/dga-feed-high.csv', header=None, skiprows=15)
dga_new_df_slim = dga_new_df.drop(columns=range(1, dga_new_df.shape[1]))
dga_new_df_slim.columns = ['domain']

# Class name -> integer code, using the same `labels` as the old feed.
labels_dict = {cat: i for i, cat in enumerate(labels)}

# Every row of this feed is labelled DGA, so the whole column receives the DGA
# code directly (no row-by-row apply is needed for a constant label).
dga_new_df_slim['dga'] = labels_dict[name_DGA]

print("Sample of new feed:")
display(dga_new_df_slim.head())

# Old feed: all rows are used for training (no split). New feed: test only.
X_train_old = unified_df['domain']
Y_train_old = unified_df['dga']
X_test_new  = dga_new_df_slim['domain']
Y_test_new  = dga_new_df_slim['dga']

# Character tokenizer fitted on the complete old feed; sequences are padded to
# `max_seq_len`, the same length report_metrics() pads test data to.
encoderON = text.Tokenizer(num_words=500, char_level=True)
encoderON.fit_on_texts(X_train_old)
X_train_old_tz = encoderON.texts_to_sequences(X_train_old)
X_train_old_pad = sequence.pad_sequences(X_train_old_tz, maxlen=max_seq_len)

Sample of new feed:


Unnamed: 0,domain,dga
0,frbkpjqimjibis.com,0
1,spgjjbdvddxorx.net,0
2,cjputxijcqnfky.biz,0
3,phutnpuwskdskg.ru,0
4,dbfrtoqkorceir.org,0


In [74]:
# Train on old dataset and infer on a newer dataset
modelON = LogisticRegression(tol=0.01, C=1.0, solver='lbfgs', max_iter=1000, verbose=0)
modelON_hist = modelON.fit(X_train_old_pad, Y_train_old)    # fit() returns the estimator itself

# NOTE(review): report_metrics() encodes X_test_new with the notebook-level
# `encoder` (fitted on X_train), not with `encoderON` that produced
# X_train_old_pad; the two character indices may differ - verify.
Y_pred_new = report_metrics(modelON, labels, X_test_new, Y_test_new, verbose=False)

# The new feed holds a single true class, so the class codes are pinned
# explicitly: `target_names` then always lines up with the reported rows, even
# if the predictions also happen to contain only one class.
print(classification_report(Y_test_new, Y_pred_new,
                            labels=list(range(len(labels))),
                            target_names=labels, output_dict=False))

Model accuracy =   68.960 %

Percentage of False Positives (i.e. nonDGA domains classified as DGA): 0.0000 %

Percentage of False Negatives (i.e. DGA domains classified as nonDGA): 31.0403 %


  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

         DGA       1.00      0.69      0.82    371623
     non-DGA       0.00      0.00      0.00         0

   micro avg       0.69      0.69      0.69    371623
   macro avg       0.50      0.34      0.41    371623
weighted avg       1.00      0.69      0.82    371623



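**The tuned logistic model achieves only 69 % accuracy after being trained on the old feed and used for inference on a new feed.** Because the new feed contains only DGA domains, this figure is the fraction of DGA domains that were detected (the DGA recall).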

In [14]:
# dump classification metrics and FP/FN domains - SKIPPED FOR NOW

# # accuracy, precision, recall, f1, false positive, false negative
# if dump_pred_results & 0x01:
#     metrics_report = classification_report(Y_test, Y_pred, target_names=labels, output_dict=True)
#     metrics_report['accuracy'] = acc
#     metrics_report['false positives'] = pred_FP_frac
#     metrics_report['false negatives'] = pred_FN_frac
#     
#     if format_m_report == 'json':
#         fileName = name_m_report + '.' + format_m_report
#         with open(fileName, 'w') as filePath:
#             json.dump(metrics_report, fp=filePath)
#     
# # False Positives and False Negatives
# if dump_pred_results & 0x02:
#     pred_table_FP.insert(0, 'type', 'FP')
#     pred_table_FN.insert(0, 'type', 'FN')
#     
#     for extn in format_c_report:
#         filePath = name_c_report + '.' + extn
#         if extn == 'csv':
#             pred_table_FP.to_csv(filePath, mode='w', index=False, header=True)
#             pred_table_FN.to_csv(filePath, mode='a', index=False, header=False)
#         elif extn == 'json':
#             pred_table_FP.append(pred_table_FN)
#             pred_table_FP.to_json(filePath, orient='table', index=False)