In [1]:
import pandas as pd
import numpy as np
import random
from autoencoder import Autoencoder
from preprocessing import extract_features, get_ngram_frequencies
import torch
import os

In [2]:
# Single seed shared by every source of randomness used in this notebook.
RANDOM_SEED = 33

# Seed PyTorch, NumPy and the stdlib RNG (in that order) with the same value.
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(RANDOM_SEED)

# Importing data

## Alexa

In [3]:
PATH_ALEXA_1M = '../data/Alexa/top-1m.csv'
COLUMN_NAMES_ALEXA = ['ranking', 'domain']

# Column names are supplied explicitly; only the domain column is kept and
# every Alexa row is tagged with label 0.
df_alexa = (
    pd.read_csv(PATH_ALEXA_1M, names=COLUMN_NAMES_ALEXA)
    .drop(columns=['ranking'])
    .assign(label=0)
)

In [4]:
# Sanity check: (rows, columns) of the labelled Alexa frame.
df_alexa.shape

(1000000, 2)

## AmritaDGA

In [5]:
# Set 1 of AmritaDGA: samples used for TRAINING (domain and label in one CSV).
PATH_AMRITA_DGA_SET_1 = '../data/AmritaDGA/Task 1/training/training.csv'
COLUMN_NAMES_AMRITA_SET_1 = ['domain', 'label']

df_amrita_set_1 = pd.read_csv(PATH_AMRITA_DGA_SET_1, names=COLUMN_NAMES_AMRITA_SET_1)


# Set 2 of AmritaDGA: samples used for TESTING. Domains and labels are stored in
# two separate files that are paired up row by row.
PATH_AMRITA_DGA_SET_2_DOMAINS = '../data/AmritaDGA/Task 1/testing/first testing/test1.txt'
PATH_AMRITA_DGA_SET_2_LABELS = '../data/AmritaDGA/Task 1/testing/first testing/test1label.txt'

amrita_set_2_domains = pd.read_csv(PATH_AMRITA_DGA_SET_2_DOMAINS, names=['domain'])
amrita_set_2_labels = pd.read_csv(PATH_AMRITA_DGA_SET_2_LABELS, names=['label'])

# pd.concat(axis=1) aligns on the index, so a length mismatch would not raise;
# it would silently pad the shorter column with NaN. Fail loudly instead.
assert len(amrita_set_2_domains) == len(amrita_set_2_labels), (
    f'AmritaDGA Set 2: {len(amrita_set_2_domains)} domains but '
    f'{len(amrita_set_2_labels)} labels'
)
df_amrita_set_2 = pd.concat([amrita_set_2_domains, amrita_set_2_labels], axis=1)

In [6]:
# Overall shape, then the shapes of the label-1 and label-0 subsets.
(
    'AmritaDGA Set 1:',
    df_amrita_set_1.shape,
    df_amrita_set_1.query('label == 1').shape,
    df_amrita_set_1.query('label == 0').shape,
)

('AmritaDGA Set 1:', (790739, 2), (135056, 2), (655683, 2))

In [7]:
# Overall shape, then the shapes of the label-1 and label-0 subsets.
(
    'AmritaDGA Set 2:',
    df_amrita_set_2.shape,
    df_amrita_set_2.query('label == 1').shape,
    df_amrita_set_2.query('label == 0').shape,
)

('AmritaDGA Set 2:', (2457407, 2), (108076, 2), (2349331, 2))

## OSINT (Bambenek Consulting Feeds)

In [8]:
PATH_OSINT = '../data/OSINT/bambenek_dga_feed.txt'
COLUMN_NAMES_OSINT = ['domain', 'malware', 'date', 'link']

# The first 15 lines of the feed are skipped (presumably a header block; verify
# if the feed layout changes). Only the domain column is kept and every row is
# tagged with label 1.
df_osint = (
    pd.read_csv(PATH_OSINT, skiprows=15, names=COLUMN_NAMES_OSINT)[['domain']]
    .assign(label=1)
)

In [9]:
# Sanity check: (rows, columns) of the labelled OSINT frame.
df_osint.shape

(580873, 2)

# Creating training and test sets

## Training set

In [10]:
# Alexa ----------------------------------------
# Number of leading Alexa rows reserved for the n-gram reputation feature.
# Kept in one place so the reference slice and the training slice cannot drift apart.
N_NGRAM_SAMPLES_ALEXA = 100000
N_TRAIN_SAMPLES_ALEXA = 480000

# Separating the top 100k to be used for the n-gram reputation value feature computation
df_alexa_top_100k = df_alexa.iloc[:N_NGRAM_SAMPLES_ALEXA]

# Obtaining the training samples from Alexa: the rows immediately after the
# n-gram reference slice, so the two slices never overlap.
df_train_alexa = df_alexa.iloc[
    N_NGRAM_SAMPLES_ALEXA:N_NGRAM_SAMPLES_ALEXA + N_TRAIN_SAMPLES_ALEXA
].reset_index(drop=True)

# Slicing past the end of a frame truncates silently; treat a short file as an error.
assert len(df_train_alexa) == N_TRAIN_SAMPLES_ALEXA, (
    f'Expected {N_TRAIN_SAMPLES_ALEXA} Alexa training rows, got {len(df_train_alexa)}'
)


# AmritaDGA (Set 1) ----------------------------------------
# Split the training file by label: 1 -> malicious, 0 -> benign.
df_train_amrita_malicious = df_amrita_set_1.query('label == 1').reset_index(drop=True)
df_train_amrita_benign = df_amrita_set_1.query('label == 0').reset_index(drop=True)

In [11]:
# AxAm: Alexa benign + AmritaDGA malicious, concatenated and then shuffled
# with the notebook-wide seed.
df_train_AxAm = (
    pd.concat([df_train_alexa, df_train_amrita_malicious], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# AmAm: AmritaDGA benign + AmritaDGA malicious, built the same way.
df_train_AmAm = (
    pd.concat([df_train_amrita_benign, df_train_amrita_malicious], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

In [12]:
# Per-label row counts of both training sets, shown side by side.
(
    df_train_AxAm.value_counts('label'),
    df_train_AmAm.value_counts('label'),
)

(label
 0    480000
 1    135056
 Name: count, dtype: int64,
 label
 0    655683
 1    135056
 Name: count, dtype: int64)

## Testing set

In [13]:
# AmritaDGA (Set 2) --------------------------------------
N_TEST_SAMPLES_AMRITA = 9000

df_test_amrita_benign = df_amrita_set_2.query('label == 0').sample(n=N_TEST_SAMPLES_AMRITA, random_state=RANDOM_SEED).reset_index(drop=True)

# OSINT --------------------------------------
N_TEST_SAMPLES_OSINT = 1000

df_test_osint = df_osint.sample(n=N_TEST_SAMPLES_OSINT, random_state=RANDOM_SEED).reset_index(drop=True)

In [14]:
# Combine the AmritaDGA benign and OSINT samples, then shuffle with the
# notebook-wide seed.
df_test = (
    pd.concat([df_test_amrita_benign, df_test_osint], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

In [15]:
# Per-label row counts of the test set.
df_test.value_counts('label')

label
0    9000
1    1000
Name: count, dtype: int64

# Preprocessing data and extracting features

In [16]:
# The n-gram reference is computed only from the reserved df_alexa_top_100k slice,
# which is disjoint from the Alexa rows used for training.
# NOTE(review): get_ngram_frequencies comes from preprocessing.py; its return type is not visible here.
ngram_frequencies = get_ngram_frequencies(df_alexa_top_100k['domain'])    

In [17]:
# Feature extraction for the AxAm training set, driven by the 'domain' column and
# the n-gram frequencies computed from the Alexa reference slice.
# NOTE(review): extract_features comes from preprocessing.py; the columns it returns are not visible here.
df_train_AxAm_pp = extract_features(df_train_AxAm, 'domain', ngram_frequencies)



In [18]:
# Feature extraction for the AmAm training set, driven by the 'domain' column and
# the n-gram frequencies computed from the Alexa reference slice.
# NOTE(review): extract_features comes from preprocessing.py; the columns it returns are not visible here.
df_train_AmAm_pp = extract_features(df_train_AmAm, 'domain', ngram_frequencies)



In [19]:
# Feature extraction for the test set, using the same n-gram frequencies as the
# training sets so all three frames share one reference.
# NOTE(review): extract_features comes from preprocessing.py; the columns it returns are not visible here.
df_test_pp = extract_features(df_test, 'domain', ngram_frequencies)



In [20]:
PATH_SAVE = '../data'

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(PATH_SAVE, exist_ok=True)

# Persist the preprocessed frames without writing the pandas index as a column.
df_train_AxAm_pp.to_csv(f'{PATH_SAVE}/train_AxAm_preprocessed.csv', index=False)
df_train_AmAm_pp.to_csv(f'{PATH_SAVE}/train_AmAm_preprocessed.csv', index=False)
df_test_pp.to_csv(f'{PATH_SAVE}/test_preprocessed.csv', index=False)