In [7]:
import itertools
from os import path
import os
import random

from ngram import NGram
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split

In [17]:
class data_cleaner:
    """Build a labelled, shuffled list of domains and write it to a TFRecord file.

    Constructing an instance runs the whole pipeline:

    1. read up to ``max_domains`` entries tagged ``Intel::DOMAIN`` from
       ``<data_dir>/bro.dat`` (label ``1``),
    2. read up to ``max_domains`` domains from ``<data_dir>/top-1m.csv``
       (label ``0``),
    3. shuffle the combined ``[domain, label]`` pairs into ``self.domains``,
    4. write one ``tf.train.SequenceExample`` per pair to
       ``<data_dir>/rnn_data.tfrecord``.

    Attributes:
        DATA_DIR: directory holding the input files and the output file.
        domains: shuffled list of ``[domain, label]`` pairs.
    """

    def __init__(self, data_dir="../../data", max_domains=50000, seed=None):
        """Load both domain lists, shuffle them and write the TFRecord file.

        Args:
            data_dir: directory containing ``bro.dat`` and ``top-1m.csv``;
                the TFRecord file is written here as well.
            max_domains: upper bound on the number of domains taken from
                *each* of the two input files.
            seed: optional seed for a reproducible shuffle. ``None`` keeps
                the previous behaviour (module-level ``random`` state).
        """
        self.DATA_DIR = data_dir

        bro_file = path.join(self.DATA_DIR, "bro.dat")
        good_urls_file = path.join(self.DATA_DIR, "top-1m.csv")
        tf_record_file = path.join(self.DATA_DIR, "rnn_data.tfrecord")

        DOMAIN_STR = 'Intel::DOMAIN'

        bad_domains = self._read_bad_domains(bro_file, DOMAIN_STR, max_domains)
        good_domains = self._read_good_domains(good_urls_file, max_domains)

        self.domains = [[d, 0] for d in good_domains] + [[d, 1] for d in bad_domains]
        if seed is None:
            random.shuffle(self.domains)
        else:
            # A private generator keeps the global random state untouched.
            random.Random(seed).shuffle(self.domains)

        # TFRecordWriter opens the path itself; opening it a second time with
        # Python's open() is unnecessary. try/finally guarantees the writer is
        # closed even if building or serialising an example fails.
        writer = tf.python_io.TFRecordWriter(tf_record_file)
        try:
            for domain, label in self.domains:
                ex = self._make_example(domain, label)
                writer.write(ex.SerializeToString())
        finally:
            writer.close()

    @staticmethod
    def _read_bad_domains(bro_file, domain_str, max_domains):
        """Return up to ``max_domains`` lower-cased indicators tagged ``domain_str``.

        The first line of the file is skipped. Every other line is stripped
        and split on tabs; lines that do not yield exactly four fields are
        ignored. The indicator is field 0 and its type is field 1.
        """
        domains = []
        with open(bro_file) as f:
            f.readline()  # first line is a comment. skip over it now
            for line in f:
                if len(domains) >= max_domains:
                    break  # cap reached; no need to scan the rest of the file
                fields = line.strip().split("\t")
                if len(fields) != 4:
                    continue
                if fields[1] == domain_str:
                    domains.append(fields[0].lower())
        return domains

    @staticmethod
    def _read_good_domains(good_urls_file, max_domains):
        """Return up to ``max_domains`` lower-cased domains from a CSV file.

        The domain is taken from the second comma-separated column of each
        line. Lines without a second column (blank or malformed) are skipped
        instead of raising ``IndexError``.
        """
        domains = []
        with open(good_urls_file) as f:
            for line in f:
                if len(domains) >= max_domains:
                    break  # cap reached; no need to scan the rest of the file
                fields = line.rstrip().split(",")
                if len(fields) < 2:
                    continue
                domains.append(fields[1].lower())
        return domains

    def _make_example(self, domain, label):
        """Wrap one domain and its label in a ``tf.train.SequenceExample``.

        The context features written are ``"length"`` (``len(domain)``),
        ``"label"`` and ``"domains"`` (the domain encoded as UTF-8 bytes).
        No sequence features are populated.
        """
        ex = tf.train.SequenceExample()

        domain_length = len(domain)
        ex.context.feature["length"].int64_list.value.append(domain_length)
        ex.context.feature["label"].int64_list.value.append(label)
        ex.context.feature["domains"].bytes_list.value.append(bytes(domain, encoding="utf8"))

        return ex

    @staticmethod
    def to_one_hot(number, length):
        """Return a list of zeros with a single ``1`` at index ``number``.

        Args:
            number: position of the hot element.
            length: requested length of the list. If ``number >= length`` the
                list is grown to ``number + 1`` so the index always fits.

        Returns:
            A list of ints containing exactly one ``1``.

        Note:
            A negative ``number`` follows normal Python list indexing, i.e.
            it counts from the end of the list.
        """
        if number >= length:
            length = number + 1
        arr = [0] * length
        arr[number] = 1

        return arr

In [18]:
# Instantiating data_cleaner runs its __init__, which reads the two input
# files and writes the TFRecord file as a side effect.
x = data_cleaner()

In [10]:
# Quick check of the one-hot helper: index 2 in a list of length 5.
data_cleaner.to_one_hot(2,5)

[0, 0, 1, 0, 0]