In [68]:
!pip install torch  



In [0]:
import sys

if sys.platform == 'linux':
  !apt-get install -y -qq software-properties-common python-software-properties module-init-tools
  !add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
  !apt-get update -qq 2>&1 > /dev/null
  !apt-get -y install -qq google-drive-ocamlfuse fuse
  from google.colab import auth
  auth.authenticate_user()
  from oauth2client.client import GoogleCredentials
  creds = GoogleCredentials.get_application_default()
  import getpass
  !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
  vcode = getpass.getpass()
  !echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

  !mkdir -p data
  !google-drive-ocamlfuse data

In [0]:
import torch
import numpy as np

torch.manual_seed(1);

In [0]:
class args:
  train_size = 20000
  train_batch_size = 64   
  
  test_size = 10000
  test_batch_size = 32
  
  seq_len = 64
  n_epochs = 5
  print_every = 20

Initial boilerplate code

In [0]:
from collections import defaultdict

# Define vocabulary dictionary
next_index = 0
def get_new_index():
  global next_index
  index = next_index
  next_index += 1
  return index

vocabulary = defaultdict(get_new_index)

Load the phishing domains from a file. The file is expected to contain a pickled Python list of strings.

In [0]:
import pickle
from random import shuffle, seed

seed(1)

TRAIN_FILE = "data/phishing_domains.dat"
with open(TRAIN_FILE, 'rb') as f:
    data = pickle.load(f)  
    
# Randomly sample domains
shuffle(data)

Tokenize data and fill (reverse) vocabulary

In [0]:
import itertools

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def tokenize(data):
  chars = list(itertools.chain.from_iterable([list(domain.strip()) + ['<eos>'] for domain in data]))
  return torch.tensor([vocabulary[char] for char in chars])

  
train_tokens = tokenize(data[:args.train_size]).to(device)  
test_tokens = tokenize(data[args.train_size:args.train_size+args.test_size]).to(device)
  
vocab_size = len(vocabulary)
reverse_vocab = dict([(v,k) for k, v in vocabulary.items()])

Split data into batches for training

In [0]:
def batchify(data, batch_size):
    data_size = (len(data) // batch_size) * batch_size    
    return data[:data_size].reshape(-1, batch_size).contiguous()

def iterate_seq(data, seq_len):
    for i in range(0, len(data), seq_len):
        cur_seq_len = min(seq_len, len(data)-i-1)
        samples = data[i  : i+cur_seq_len]
        targets = data[i+1: i+cur_seq_len+1]
        yield samples, targets

train_data = batchify(train_tokens, args.train_batch_size)
test_data = batchify(test_tokens, args.test_batch_size)

Define model

In [0]:
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, ntoken, ninp, nhid, nlayers):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers)
        self.project = nn.Linear(nhid, ntoken)
        
    def forward(self, input, hidden=None):
        embed = self.embedding(input)
        rnn_out, new_hidden = self.rnn(embed, hidden)
        return self.project(rnn_out), new_hidden
    
model = RNNModel(vocab_size, 32 ,64, 2).to(device)
optimizer = torch.optim.Adam(model.parameters())
lossfn = nn.CrossEntropyLoss()

Train model

In [128]:
def evaluate(model, data):  
  # validate the model  
  total_loss = 0
  with torch.no_grad():
    hidden = None
    for samples, targets in iterate_seq(data, args.seq_len):            
      output, hidden = model(samples, hidden)
      hidden = tuple(h.detach() for h in hidden)
      loss = lossfn(output.reshape(-1, vocab_size),
                    targets.reshape(-1))
      total_loss += loss
  return total_loss / len(data)

def train(model, train_data):
    test_error = []
    
    for epoch in range(1, args.n_epochs+1):
        # train
        model.train()
        print("Epoch: %d" % epoch)
        hidden = None
        for i, (samples, targets) in enumerate(iterate_seq(train_data, args.seq_len)):      
            output, hidden = model(samples, hidden)
            hidden = tuple(h.detach() for h in hidden)
            loss = lossfn(output.reshape(-1, vocab_size),
                          targets.reshape(-1))
            print(loss)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()                      
            if i % args.print_every == args.print_every - 1:
                print("Loss: %.4f" % loss.item())     
        # performance on test set
        model.eval()
        test_error.append(evaluate(model, test_data))
    return test_error

test_error = train(model, train_data)


Epoch: 1
tensor(3.2883, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3029, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3519, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3499, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3303, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.2953, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3107, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.2695, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3164, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.2950, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.2885, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.2751, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3253, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3201, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3270, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3294, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.2798, device='cuda:0', grad_fn=<NllLos

Exception: ignored

In [126]:
test_error

[tensor(0.0521, device='cuda:0'),
 tensor(0.0521, device='cuda:0'),
 tensor(0.0521, device='cuda:0'),
 tensor(0.0521, device='cuda:0'),
 tensor(0.0521, device='cuda:0')]