In [1]:
import pandas as pd
import numpy as np
import random
from autoencoder import Autoencoder
from preprocessing import extract_features, get_ngram_frequencies, extract_character_level_representation
import torch
import os
import re
from sklearn.model_selection import train_test_split

In [2]:
# Single seed reused below for every train_test_split call (random_state=RANDOM_SEED).
RANDOM_SEED = 33
# Seed each random number generator imported by this notebook with the same value:
# PyTorch, NumPy's legacy global generator, and the standard-library `random` module.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Importing data

## Bambenek

Obtaining data from raw .txt files

In [3]:
def extract_bambnek_csv_from_txt(txt_path):
  """Parse a raw Bambenek DGA feed text file into a DataFrame.

  Each line is matched against ``<domain>,Domain used by <malware>...``.
  The malware name is cut at the first space, comma or hyphen, so a
  description such as ``Post Tovar GOZ`` is recorded as ``Post``.
  Lines that do not match (comments, blank lines, other formats) are skipped.

  Args:
    txt_path: Path to the feed ``.txt`` file.

  Returns:
    A DataFrame with string columns ``domain`` and ``malware``, one row per
    matching line, in file order.
  """
  # Compiled once instead of being re-resolved on every line of the feed.
  line_pattern = re.compile(r'^(?P<domain>[^\,]+),Domain used by (?P<malware>[^\-, ]+)')
  domains, malwares = [], []
  with open(txt_path, 'rt') as f:
    # Iterate the file lazily; the feed does not need to be held in memory as a list.
    for line in f:
      re_obj = line_pattern.search(line)
      if re_obj is not None:
        domains.append(re_obj.group('domain'))
        malwares.append(re_obj.group('malware'))
  return pd.DataFrame({'domain': domains, 'malware': malwares})

In [4]:
# Raw Bambenek DGA feed snapshots (one text file per date), relative to the notebook directory.
PATH_TXT_BAMBNEK_03_23 = '../data/Bambenek/bambnek-dga-feed-2023_03_23.txt'
PATH_TXT_BAMBNEK_03_30 = '../data/Bambenek/bambnek-dga-feed-2023_03_30.txt'

In [5]:
# Parse both feed snapshots into (domain, malware) frames.
df_bambnek_03_23, df_bambnek_03_30 = (
    extract_bambnek_csv_from_txt(feed_path)
    for feed_path in (PATH_TXT_BAMBNEK_03_23, PATH_TXT_BAMBNEK_03_30)
)

Cleaning data:

- Removing under-represented malware DGAs
- Removing wordlist-based DGAs

In [6]:
def remove_under_represented_DGAs(df, min_instances=100):
  """Drop every row whose malware family has fewer than ``min_instances`` rows.

  Args:
    df: DataFrame with a ``malware`` column.
    min_instances: Minimum number of rows a family needs in order to be kept.

  Returns:
    A new DataFrame without the under-represented families, with a fresh
    0..n-1 index.
  """
  # Count rows per family once (previously the same groupby ran twice).
  family_sizes = df.groupby('malware').size()
  rare_families = family_sizes[family_sizes < min_instances].index
  return df.loc[~df['malware'].isin(rare_families)].reset_index(drop=True)

def remove_wordlist_based_DGAs(df, wordlist_based_dgas_list=['fosniw', 'volatile', 'beebone', 'cryptowall']):
  """Return ``df`` without the rows whose ``malware`` is in ``wordlist_based_dgas_list``.

  The default list is only read, never mutated. The result has a fresh
  0..n-1 index.
  """
  is_wordlist_based = df['malware'].isin(wordlist_based_dgas_list)
  kept_rows = df.loc[~is_wordlist_based]
  return kept_rows.reset_index(drop=True)

In [7]:
# Merge both snapshots, then drop rare families first and wordlist-based families second.
df_bambnek = remove_wordlist_based_DGAs(
    remove_under_represented_DGAs(
        pd.concat([df_bambnek_03_23, df_bambnek_03_30], ignore_index=True),
        min_instances=100,
    )
)

In [8]:
# Dropping domains without dots.
# The dot is matched literally (regex=False); the previous non-raw '\.' pattern is an
# invalid string escape sequence and makes Python emit a warning when the cell is compiled.
df_bambnek = df_bambnek[df_bambnek['domain'].str.contains('.', regex=False)].reset_index(drop=True)

  df_bambnek = df_bambnek[df_bambnek['domain'].str.contains('\.')].reset_index(drop=True)


In [9]:
# Down-sample the DGA data: keep the 45% "train" side of a split stratified by
# malware family and discard the 55% "test" side.
df_bambnek_reduced, _ = train_test_split(
    df_bambnek,
    test_size=0.55,
    random_state=RANDOM_SEED,
    stratify=df_bambnek['malware'],
)

In [10]:
# Mark every DGA domain as the positive class. `.assign` builds a new frame instead of
# writing into the object returned by train_test_split, which can raise
# SettingWithCopyWarning because that object is derived from `df_bambnek`.
df_bambnek_reduced = df_bambnek_reduced.assign(label=1)

In [11]:
# Display the labelled, down-sampled DGA frame (the rendered output elides the middle rows).
df_bambnek_reduced

Unnamed: 0,domain,malware,label
35084,haqs8lg9uknhxs5plds3lj8q.org,Post,1
353441,acgeizel.bazar,bazarbackdoor,1
659685,elnfkyutnlclemlyqwebaj.com,ramnit,1
587104,ijrspwworvnr.biz,tinba,1
761775,gjykmcqu.co,necurs,1
...,...,...,...
974793,toqiuhib.bazar,bazarbackdoor,1
679526,nruftanfhyrntmnwjvhxnvvn.com,ramnit,1
227148,njajofvxedwkoj.net,ranbyus,1
105617,iuklmffitdmn.com,tinba,1


## Majestic Million

In [12]:
PATH_MAJESTIC = '../data/Majestic/majestic_million.csv'

# Read only the 'Domain' column and only as many leading rows as there are DGA samples,
# so that both classes start with the same number of rows. Reading with usecols/nrows
# avoids loading the full CSV and yields an independent frame rather than a slice, so
# the columns added to it afterwards do not trigger SettingWithCopyWarning.
df_majestic = pd.read_csv(PATH_MAJESTIC, usecols=['Domain'], nrows=len(df_bambnek_reduced))

In [13]:
# Align the Majestic frame with the DGA frame's schema (domain, malware, label).
# rename/assign return a new frame, so nothing is written into the sliced
# `df_majestic` (which would raise SettingWithCopyWarning).
df_majestic = (
    df_majestic
    .rename({'Domain': 'domain'}, axis='columns')
    .assign(malware='benign', label=0)
)

In [14]:
# Display the benign frame after relabelling (columns: domain, malware, label).
df_majestic

Unnamed: 0,domain,malware,label
0,google.com,benign,0
1,facebook.com,benign,0
2,youtube.com,benign,0
3,twitter.com,benign,0
4,instagram.com,benign,0
...,...,...,...
499992,travelbabbo.com,benign,0
499993,8muses.com,benign,0
499994,skinnyspatula.com,benign,0
499995,whiteonwhite.co,benign,0


# Creating training, validation and test sets

In [15]:
# Stack DGA and benign rows, then drop rows that are identical across all columns
# (domain, malware and label together). The index is not reset here.
df_full = pd.concat(
    [df_bambnek_reduced, df_majestic],
    ignore_index=True,
).drop_duplicates()
df_full.shape

(938358, 3)

In [16]:
# First hold out 25% of all rows as the test set, then 25% of the remainder as the
# validation set; both splits are stratified by malware family.
df_tv, df_test = train_test_split(
    df_full,
    test_size=0.25,
    random_state=RANDOM_SEED,
    stratify=df_full['malware'],
    shuffle=True,
)
df_train, df_val = train_test_split(
    df_tv,
    test_size=0.25,
    random_state=RANDOM_SEED,
    stratify=df_tv['malware'],
    shuffle=True,
)

In [17]:
# Give each split a fresh 0..n-1 index (the shuffled splits keep df_full's labels otherwise).
df_train, df_val, df_test = (
    split_df.reset_index(drop=True) for split_df in (df_train, df_val, df_test)
)


In [18]:
# Sanity check: (rows, columns) of the train, validation and test splits.
df_train.shape, df_val.shape, df_test.shape

((527826, 3), (175942, 3), (234590, 3))

# Preprocessing data and extracting features

In [19]:
PATH_ALEXA_1M = '../data/Alexa/top-1m.csv'
COLUMN_NAMES_ALEXA = ['ranking', 'domain']

# The Alexa file has no header row, so column names are supplied explicitly;
# the ranking column is dropped after loading.
df_alexa = pd.read_csv(PATH_ALEXA_1M, names=COLUMN_NAMES_ALEXA).drop(columns=['ranking'])
df_alexa['label'] = 0
# First 100,000 rows in file order.
df_alexa_top_100k = df_alexa.iloc[:100000]

In [20]:
# Build the n-gram frequency reference from the top-100k Alexa domains; it is passed to
# every extract_features call below.
# NOTE(review): the structure returned by get_ngram_frequencies is defined in
# preprocessing.py and is not visible here.
ngram_frequencies = get_ngram_frequencies(df_alexa_top_100k['domain'])

In [21]:
# Extract features for the training split from its 'domain' column, using the
# Alexa-derived n-gram frequencies.
df_train_pp = extract_features(df_train, 'domain', ngram_frequencies)

In [22]:
# Extract features for the validation split with the same n-gram frequencies as the training split.
df_val_pp = extract_features(df_val, 'domain', ngram_frequencies)

In [23]:
# Inspect the extracted features: the original columns plus derived ones such as
# domain_len, per-character frequencies and reputation_value.
df_val_pp

Unnamed: 0,domain,malware,label,domain_len,max_len_label,max_len_continuous_int,max_len_continuous_string,special_freq,special_ratio,int_freq,...,1_freq,2_freq,3_freq,4_freq,5_freq,6_freq,7_freq,8_freq,9_freq,reputation_value
0,5lo5xwvigo2ng5l.net,shiotob/urlzone/bebloh,1,19,15,1,6,0,0.00000,4,...,0,1,0,0,3,0,0,0,0,12.349074
1,antidote.info,benign,0,13,8,0,8,0,0.00000,0,...,0,0,0,0,0,0,0,0,0,38.802500
2,topukrainianhotels.com,benign,0,22,18,0,18,0,0.00000,0,...,0,0,0,0,0,0,0,0,0,111.926983
3,teensanalquest.com,benign,0,18,14,0,14,0,0.00000,0,...,0,0,0,0,0,0,0,0,0,82.544705
4,fulitss.cn,benign,0,10,7,0,7,0,0.00000,0,...,0,0,0,0,0,0,0,0,0,25.200155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175937,taofenquan11.com,benign,0,16,12,2,10,0,0.00000,2,...,2,0,0,0,0,0,0,0,0,34.856263
175938,whsc-online.com,benign,0,15,11,0,6,1,0.06667,0,...,0,0,0,0,0,0,0,0,0,104.025468
175939,solver.com,benign,0,10,6,0,6,0,0.00000,0,...,0,0,0,0,0,0,0,0,0,45.925217
175940,csbddnvdbhjsix.su,ranbyus,1,17,14,0,14,0,0.00000,0,...,0,0,0,0,0,0,0,0,0,6.800080


In [24]:
# Extract features for the test split with the same n-gram frequencies as the training split.
df_test_pp = extract_features(df_test, 'domain', ngram_frequencies)

In [26]:
PATH_SAVE = '../data'

# exist_ok=True creates the directory only when it is missing, replacing the racy
# "check with os.path.exists, then create" sequence.
os.makedirs(PATH_SAVE, exist_ok=True)

# Persist the preprocessed splits without the DataFrame index.
df_train_pp.to_csv(f'{PATH_SAVE}/train_MajBa_preprocessed.csv', index=False)
df_val_pp.to_csv(f'{PATH_SAVE}/val_MajBa_preprocessed.csv', index=False)
df_test_pp.to_csv(f'{PATH_SAVE}/test_MajBa_preprocessed.csv', index=False)

# Extracting character level features (for baseline comparison)

In [3]:
PATH_SAVE = '../data'
# Reload only the 'domain' column of each saved split. This rebinds df_train, df_val
# and df_test to single-column frames.
df_train, df_val, df_test = (
    pd.read_csv(csv_path, usecols=['domain'])
    for csv_path in (
        f'{PATH_SAVE}/train_MajBa_preprocessed.csv',
        f'{PATH_SAVE}/val_MajBa_preprocessed.csv',
        f'{PATH_SAVE}/test_MajBa_preprocessed.csv',
    )
)

In [4]:
# Compute the character-level representation of every split with identical settings
# (max_len=256, should_remove_TLD=True).
cl_features_train, cl_features_val, cl_features_test = (
    extract_character_level_representation(split_df, domain_col='domain', max_len=256, should_remove_TLD=True)
    for split_df in (df_train, df_val, df_test)
)

In [5]:
PATH_SAVE = '../data'

# exist_ok=True creates the directory only when it is missing, replacing the racy
# "check with os.path.exists, then create" sequence.
os.makedirs(PATH_SAVE, exist_ok=True)

# Persist the character-level features of each split as .npy files.
np.save(f'{PATH_SAVE}/train_MajBa_charlevel_features.npy', cl_features_train)
np.save(f'{PATH_SAVE}/val_MajBa_charlevel_features.npy', cl_features_val)
np.save(f'{PATH_SAVE}/test_MajBa_charlevel_features.npy', cl_features_test)