In [1]:
!pip install torch  



In [None]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [None]:
!mkdir -p drive
!google-drive-ocamlfuse drive

In [159]:
import torch
import numpy as np

torch.manual_seed(1);

Initial boilerplate code

In [275]:
from collections import defaultdict

# Define vocabulary dictionary
next_index = 0
def get_new_index():
  global next_index
  index = next_index
  next_index += 1
  return index

vocabulary = defaultdict(get_new_index)

class args:
    batch_size = 64   
    seq_len = 64
    train_size = 1000
    n_epochs = 1
    print_every = 20

Load the phishing domains from a file. The file is expected to contain a pickled Python list of strings.

In [172]:
import pickle
from random import shuffle, seed

seed(1)

TRAIN_FILE = "data/phishing_domains.dat"
with open(TRAIN_FILE, 'rb') as f:
    data = pickle.load(f)  
    
# Randomly sample domains
shuffle(data)
data = data[:args.train_size]

Tokenize data and fill (reverse) vocabulary

In [224]:
import itertools

all_chars = list(itertools.chain.from_iterable([list(domain.strip()) + ['<eos>'] for domain in data]))
tokens = torch.tensor([vocabulary[char] for char in all_chars])
vocab_size = len(vocabulary)
reverse_vocab = dict([(v,k) for k, v in vocabulary.items()])

Split data into batches for training

In [259]:
def batchify(data, batch_size):
    data_size = (len(tokens) // batch_size) * batch_size    
    return data[:data_size].reshape(-1, batch_size).contiguous()

def iterate_seq(data, seq_len):
    for i in range(0, len(data), seq_len):
        cur_seq_len = min(seq_len, len(data)-i-1)
        samples = data[i  : i+cur_seq_len]
        targets = data[i+1: i+cur_seq_len+1]
        yield samples, targets

train_data = batchify(tokens, args.batch_size)

Define model

In [265]:
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class RNNModel(nn.Module):
    def __init__(self, ntoken, ninp, nhid, nlayers):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers)
        self.project = nn.Linear(nhid, ntoken)
        
    def forward(self, input, hidden=None):
        embed = self.embedding(input)
        rnn_out, new_hidden = self.rnn(embed, hidden)
        return self.project(rnn_out), new_hidden
    
model = RNNModel(vocab_size, 128 ,256, 2).to(device)
optimizer = torch.optim.Adam(model.parameters())
lossfn = nn.CrossEntropyLoss()

Train model

In [None]:
def train(model, train_data):
    for epoch in range(1, args.n_epochs+1):
        print("Epoch: %d" % epoch)
        hidden = None
        for i, (samples, targets) in enumerate(iterate_seq(train_data, args.seq_len)):      
            output, hidden = model(samples, hidden)
            hidden = tuple(h.detach() for h in hidden)
            loss = lossfn(output.reshape(-1, vocab_size),
                          targets.reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()                      
            if i % args.print_every == args.print_every - 1:
                print("Loss: %.4f" % loss.item())                

train(model, train_data)

Epoch: 1
Loss: 3.2865
Loss: 3.3105
Loss: 3.3445
Loss: 3.2768
Loss: 3.3182
Loss: 3.2711
Loss: 3.3304
Loss: 3.3096
Loss: 3.2918
Loss: 3.3087
Loss: 3.3395
Loss: 3.3033
Loss: 3.2529
Loss: 3.3297
Loss: 3.3094
Loss: 3.3097
