In [None]:
import pandas as pd
import numpy as np
import random
from autoencoder import Autoencoder
from preprocessing import extract_features, get_ngram_frequencies, extract_character_level_representation
import torch
import os

In [None]:
RANDOM_SEED = 33

# Seed PyTorch, NumPy and Python's `random` with the same value so that
# repeated runs of the notebook draw the same random numbers.
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(RANDOM_SEED)

# Importing data

## Alexa

In [None]:
PATH_ALEXA_1M = '../data/Alexa/top-1m.csv'
COLUMN_NAMES_ALEXA = ['ranking', 'domain']

# Column names are passed explicitly, so the first line of the CSV is read as data.
# Only the domain column is kept, and every Alexa row is tagged with label 0
# (the label this notebook treats as benign).
df_alexa = (
    pd.read_csv(PATH_ALEXA_1M, names=COLUMN_NAMES_ALEXA)
    .drop(columns=['ranking'])
    .assign(label=0)
)

In [None]:
# Sanity check: (n_rows, n_columns) of the Alexa frame after dropping the ranking column.
df_alexa.shape

## AmritaDGA

In [None]:
# --- Set 1: AmritaDGA samples used for TRAINING ---------------------------
PATH_AMRITA_DGA_SET_1 = '../data/AmritaDGA/Task 1/training/training.csv'
COLUMN_NAMES_AMRITA_SET_1 = ['domain', 'label']

# Column names are supplied here, so the first line of the file is read as data.
df_amrita_set_1 = pd.read_csv(
    PATH_AMRITA_DGA_SET_1,
    names=COLUMN_NAMES_AMRITA_SET_1,
)


# --- Set 2: AmritaDGA samples used for TESTING ----------------------------
# Domains and labels are stored in two separate single-column files.
PATH_AMRITA_DGA_SET_2_DOMAINS = '../data/AmritaDGA/Task 1/testing/first testing/test1.txt'
PATH_AMRITA_DGA_SET_2_LABELS = '../data/AmritaDGA/Task 1/testing/first testing/test1label.txt'

amrita_set_2_domains = pd.read_csv(PATH_AMRITA_DGA_SET_2_DOMAINS, names=['domain'])
amrita_set_2_labels = pd.read_csv(PATH_AMRITA_DGA_SET_2_LABELS, names=['label'])

# Column-wise concat aligns on the default row index, i.e. row i of the domains
# file is paired with row i of the labels file. This assumes both files have the
# same number of rows in the same order; a mismatch would surface as NaN cells.
df_amrita_set_2 = pd.concat(
    [amrita_set_2_domains, amrita_set_2_labels],
    axis='columns',
)

In [None]:
# Displays a tuple: caption, shape of Set 1, shape of its label-1 rows, shape of its label-0 rows.
'AmritaDGA Set 1:', df_amrita_set_1.shape, df_amrita_set_1.query('label == 1').shape, df_amrita_set_1.query('label == 0').shape

In [None]:
# Displays a tuple: caption, shape of Set 2, shape of its label-1 rows, shape of its label-0 rows.
'AmritaDGA Set 2:', df_amrita_set_2.shape, df_amrita_set_2.query('label == 1').shape, df_amrita_set_2.query('label == 0').shape

## OSINT (Bambenek Consulting Feeds)

In [None]:
PATH_OSINT = '../data/OSINT/bambenek_dga_feed.txt'
COLUMN_NAMES_OSINT = ['domain', 'malware', 'date', 'link']

# The first 15 lines of the feed are skipped.
# NOTE(review): presumably these are the feed's header/comment lines - confirm against the file.
# Only the domain column is kept, and every OSINT row is tagged with label 1.
df_osint = (
    pd.read_csv(PATH_OSINT, skiprows=15, names=COLUMN_NAMES_OSINT)
    .loc[:, ['domain']]
    .assign(label=1)
)

In [None]:
# Sanity check: (n_rows, n_columns) of the OSINT frame (domain + label).
df_osint.shape

# Creating training and test sets

## Training set

In [None]:
# Alexa ----------------------------------------
# Number of leading Alexa rows reserved for computing the n-gram reputation feature.
# Named once here so the reserved slice and the start of the training slice cannot drift apart.
N_REPUTATION_SAMPLES_ALEXA = 100000
N_TRAIN_SAMPLES_ALEXA = 480000

# First 100k rows: set aside for the n-gram reputation value computation,
# kept disjoint from the training slice below.
df_alexa_top_100k = df_alexa.iloc[:N_REPUTATION_SAMPLES_ALEXA]

# Next N_TRAIN_SAMPLES_ALEXA rows (positional slice): the Alexa training samples.
_alexa_train_start = N_REPUTATION_SAMPLES_ALEXA
_alexa_train_stop = _alexa_train_start + N_TRAIN_SAMPLES_ALEXA
df_train_alexa = df_alexa.iloc[_alexa_train_start:_alexa_train_stop].reset_index(drop=True)


# AmritaDGA (Set 1) ----------------------------------------
# Split Set 1 by label: 1 -> malicious, 0 -> benign.
df_train_amrita_malicious = df_amrita_set_1.query('label == 1').reset_index(drop=True)
df_train_amrita_benign = df_amrita_set_1.query('label == 0').reset_index(drop=True)

In [None]:
# "AxAm": Alexa benign rows + AmritaDGA malicious rows.
# frac=1 with a fixed random_state yields a reproducible full shuffle.
df_train_AxAm = (
    pd.concat([df_train_alexa, df_train_amrita_malicious], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# "AmAm": AmritaDGA benign rows + the same AmritaDGA malicious rows, shuffled the same way.
df_train_AmAm = (
    pd.concat([df_train_amrita_benign, df_train_amrita_malicious], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

In [None]:
# Per-label row counts of the two training mixes, displayed together as a tuple.
df_train_AxAm.value_counts('label'), df_train_AmAm.value_counts('label')

## Testing set

In [None]:
# AmritaDGA (Set 2) --------------------------------------
N_TEST_SAMPLES_AMRITA = 9000

# Benign half of the test set: a seeded random draw from the label-0 rows of Set 2.
_amrita_set_2_benign = df_amrita_set_2.query('label == 0')
df_test_amrita_benign = (
    _amrita_set_2_benign
    .sample(n=N_TEST_SAMPLES_AMRITA, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# OSINT --------------------------------------
N_TEST_SAMPLES_OSINT = 1000

# df_osint is rebound to a shuffled copy of itself. The validation cell further
# down slices this same shuffled frame starting at N_TEST_SAMPLES_OSINT, so the
# name must keep pointing at the shuffled version. Re-running this cell on its
# own permutes the already-shuffled frame again.
df_osint = (
    df_osint
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# Malicious half of the test set: the first N_TEST_SAMPLES_OSINT shuffled rows.
df_test_osint = df_osint.iloc[:N_TEST_SAMPLES_OSINT]

In [None]:
# Test set: AmritaDGA benign rows + OSINT malicious rows, in a seeded random order.
df_test = (
    pd.concat([df_test_amrita_benign, df_test_osint], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

In [None]:
# Per-label row counts of the test set.
df_test.value_counts('label')

## Extra: Validation set

'K. H. Park, H. M. Song, J. D. Yoo, S.-Y. Hong, B. Cho, K. Kim, et al., "Unsupervised malicious domain detection with less labeling effort", Comput. Secur., vol. 116, May 2022.' does not state whether a validation set was used. However, we use a validation set here to help prevent overfitting to the training data.

In [None]:
# Alexa ----------------------------------------
N_VAL_SAMPLES_ALEXA = 100000

# Start the validation slice right after the Alexa rows already consumed by the
# n-gram reputation slice and the training slice. The offset is derived from
# those frames instead of repeating the literal 100000, so the validation rows
# stay disjoint from the training rows even if the earlier slice sizes change.
last_train_index_alexa = len(df_alexa_top_100k) + len(df_train_alexa)
_alexa_val_stop = last_train_index_alexa + N_VAL_SAMPLES_ALEXA
df_val_alexa = df_alexa.iloc[last_train_index_alexa:_alexa_val_stop].reset_index(drop=True)

# OSINT ----------------------------------------
N_VAL_SAMPLES_OSINT = 30000

# df_osint was shuffled in the test-set cell; its first N_TEST_SAMPLES_OSINT rows
# are the test samples, so validation takes the rows that immediately follow.
_osint_val_stop = N_TEST_SAMPLES_OSINT + N_VAL_SAMPLES_OSINT
df_val_osint = df_osint.iloc[N_TEST_SAMPLES_OSINT:_osint_val_stop].reset_index(drop=True)

In [None]:
# Validation set: Alexa benign rows + OSINT malicious rows, in a seeded random order.
df_val = (
    pd.concat([df_val_alexa, df_val_osint], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

In [None]:
# Per-label row counts of the validation set.
df_val.value_counts('label')

#  Preprocessing data and extracting features

In [None]:
# The n-gram frequencies are computed from the reserved top-100k Alexa domains only.
# NOTE(review): the structure of the returned object is defined in preprocessing.py - not visible here.
_reputation_domains = df_alexa_top_100k['domain']
ngram_frequencies = get_ngram_frequencies(_reputation_domains)

In [None]:
# Feature extraction for the AxAm training mix, reading domains from the 'domain' column.
# NOTE(review): extract_features lives in preprocessing.py; its output schema is not visible here.
df_train_AxAm_pp = extract_features(df_train_AxAm, 'domain', ngram_frequencies)

In [None]:
# Feature extraction for the AmAm training mix, using the same n-gram frequencies.
df_train_AmAm_pp = extract_features(df_train_AmAm, 'domain', ngram_frequencies)

In [None]:
# Feature extraction for the validation set, using the same n-gram frequencies.
df_val_pp = extract_features(df_val, 'domain', ngram_frequencies)

In [None]:
# Feature extraction for the test set, using the same n-gram frequencies.
df_test_pp = extract_features(df_test, 'domain', ngram_frequencies)

In [None]:
PATH_SAVE = '../data'

# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, replacing the separate check-then-create sequence.
os.makedirs(PATH_SAVE, exist_ok=True)

# Write each preprocessed frame to CSV without the index column.
df_train_AxAm_pp.to_csv(f'{PATH_SAVE}/train_AxAm_preprocessed.csv', index=False)
df_train_AmAm_pp.to_csv(f'{PATH_SAVE}/train_AmAm_preprocessed.csv', index=False)
df_val_pp.to_csv(f'{PATH_SAVE}/val_preprocessed.csv', index=False)
df_test_pp.to_csv(f'{PATH_SAVE}/test_preprocessed.csv', index=False)

# Extracting character level features (for baseline comparison)

In [None]:
PATH_SAVE = '../data'

# Read back only the 'domain' column of each preprocessed CSV.
# NOTE: this rebinds df_train_AxAm / df_train_AmAm / df_val / df_test to
# single-column frames; the labelled frames built earlier in the notebook are
# no longer reachable through these names after this cell runs.
_DOMAIN_ONLY = dict(usecols=['domain'])

df_train_AxAm = pd.read_csv(f'{PATH_SAVE}/train_AxAm_preprocessed.csv', **_DOMAIN_ONLY)
df_train_AmAm = pd.read_csv(f'{PATH_SAVE}/train_AmAm_preprocessed.csv', **_DOMAIN_ONLY)
df_val = pd.read_csv(f'{PATH_SAVE}/val_preprocessed.csv', **_DOMAIN_ONLY)
df_test = pd.read_csv(f'{PATH_SAVE}/test_preprocessed.csv', **_DOMAIN_ONLY)

In [None]:
# The same settings are applied to every split, so they are kept in one place.
# NOTE(review): the meaning of max_len / should_remove_TLD is defined in
# preprocessing.py - not visible here.
_CHARLEVEL_KWARGS = dict(domain_col='domain', max_len=256, should_remove_TLD=True)

cl_features_train_AxAm = extract_character_level_representation(df_train_AxAm, **_CHARLEVEL_KWARGS)
cl_features_train_AmAm = extract_character_level_representation(df_train_AmAm, **_CHARLEVEL_KWARGS)
cl_features_val = extract_character_level_representation(df_val, **_CHARLEVEL_KWARGS)
cl_features_test = extract_character_level_representation(df_test, **_CHARLEVEL_KWARGS)

In [None]:
PATH_SAVE = '../data'

# exist_ok=True creates the directory when it is missing and does nothing when
# it already exists, replacing the separate check-then-create sequence.
os.makedirs(PATH_SAVE, exist_ok=True)

# Persist the character-level features of each split as .npy files.
np.save(f'{PATH_SAVE}/train_AxAm_charlevel_features.npy', cl_features_train_AxAm)
np.save(f'{PATH_SAVE}/train_AmAm_charlevel_features.npy', cl_features_train_AmAm)
np.save(f'{PATH_SAVE}/val_charlevel_features.npy', cl_features_val)
np.save(f'{PATH_SAVE}/test_charlevel_features.npy', cl_features_test)