In [1]:
import os
import random
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.backends import cudnn

In [3]:
from option import get_option, parser
from trainer import Trainer
from utils import save_option
import data_loader
import models

In [4]:
# Configure root logging once for the notebook: timestamped records at INFO level.
import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
#logging.getLogger().setLevel(logging.INFO)
# Module-level logger used by the training / evaluation helpers further down.
logger = logging.getLogger(__name__)

# Silence every warning category for the rest of the session.
import warnings
warnings.filterwarnings("ignore")
# NOTE(review): catch_warnings() restores the previous filter list when the
# block exits, so the filter installed inside it has no lasting effect; the
# global "ignore" filter above is what actually suppresses warnings.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)

In [5]:
# Seed every random number generator used by the notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value, in that order.
seed = 87
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed)

In [6]:
from urlnet_utils import *

In [7]:
# Preprocessing settings consumed by the urlnet_utils helpers in the cells below.
min_word_freq = 10      # frequency threshold passed to get_word_vocab when building high_freq_words
max_len_words = 200     # length passed to get_word_vocab and pad_seq_in_word
max_len_chars = 200     # length passed to char_id_x
max_len_subwords = 20   # passed to ngram_id_x / ngram_id_x_from_dict
delimit_mode = 1        # passed to get_words; its meaning is defined in urlnet_utils — TODO confirm
dev_pct = 0.0           # only referenced by a commented-out prep_train_test call

In [8]:
# Load the raw URLs and their labels (read_data comes from the urlnet_utils star import).
urls, labels = read_data("../taipei/out.csv")

# Optionally collect the words that reach the minimum frequency; the list is
# handed to ngram_id_x below. x1 itself is not used afterwards.
high_freq_words = None
if min_word_freq > 0:
    x1, word_reverse_dict = get_word_vocab(urls, max_len_words, min_word_freq) 
    high_freq_words = sorted(list(word_reverse_dict.values()))
    print("Number of words with freq >={}: {}".format(min_word_freq, len(high_freq_words)))

# Rebuild the vocabulary without a frequency threshold and derive the word-level
# and n-gram-level id sequences plus their dictionaries.
x, word_reverse_dict = get_word_vocab(urls, max_len_words)
word_x = get_words(x, word_reverse_dict, delimit_mode, urls)
ngramed_id_x, ngrams_dict, worded_id_x, words_dict = ngram_id_x(word_x, max_len_subwords, high_freq_words)
# Inverse of words_dict (value -> key).
reverse_dict = {words_dict[i]:i for i in words_dict}
# The character vocabulary is the n-gram dictionary under another name (same object).
chars_dict = ngrams_dict
chared_id_x = char_id_x(urls, chars_dict, max_len_chars)
# Label 0 is reported as "Mal" and label 1 as "Ben".
print("Overall Mal/Ben rate: {}/{}".format(np.sum(labels==0), np.sum(labels==1)))

######## balance ################
# Per-class counts: nmal counts label 0, nbeg counts label 1.
nmal = (labels==0).sum()
nbeg = (labels==1).sum()
# One less than the smaller class, so each class keeps at least one sample for
# the held-out split built below.
total = min(nmal, nbeg)-1
# argsort on a boolean mask puts the False entries first, so the first nmal
# positions are the rows where labels != 1 (and likewise for beg_idx).
# NOTE(review): this assumes labels only takes the values 0 and 1 — confirm.
mal_idx = np.argsort((labels==1))[:nmal]
beg_idx = np.argsort((labels==0))[:nbeg]

# Balanced training set: `total` rows from each class. Everything left over
# becomes the held-out set, which is therefore generally not balanced.
train = np.concatenate([mal_idx[:total], beg_idx[:total]])
test = np.concatenate([mal_idx[total:], beg_idx[total:]])
# Both splits must contain both classes.
assert len(np.unique(labels[test])) > 1
assert len(np.unique(labels[train])) > 1


########  shuffle & split  ######
# shuffle_idx = np.random.permutation(np.arange(len(labels)))
# train, test = prep_train_test(len(labels), dev_pct)
# Shuffle the index arrays in place (uses the global NumPy RNG seeded earlier).
np.random.shuffle(train)
np.random.shuffle(test)
# The held-out split is used as the validation set from here on.
y_train, y_val = labels[train], labels[test]



#x_train_char = ngramed_id_x[train]
#x_val_char = ngramed_id_x[test]
# Only the word-level inputs are materialised; the character-level variants are disabled.
x_train_word = pad_seq_in_word(worded_id_x[train], max_len_words)
x_val_word = pad_seq_in_word(worded_id_x[test], max_len_words)
#x_train_char_seq = pad_seq_in_word(chared_id_x[train], max_len_chars)
#x_val_char_seq = pad_seq_in_word(chared_id_x[test], max_len_chars)
print(x_train_word.shape, x_val_word.shape)
#print(x_train_char_seq.shape, x_val_char_seq.shape)

Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.


KeyboardInterrupt: 

In [None]:
def foo(word_x, labels):
    """Compute, or load from cache, the label-0 ratio of every word in ``words_dict``.

    For each key of the notebook-level ``words_dict`` the ratio is
    (rows whose token entry contains the word AND whose label is 0) divided by
    (rows whose token entry contains the word). "Contains" is Python's ``in``
    applied to each element of ``word_x``.

    The result is cached in ``word_mal_ratio_large.pkl`` in the working
    directory; when that file exists it is loaded and both arguments are ignored.

    Args:
        word_x: per-URL token entries (presumably a list of tokens per URL —
            TODO confirm against get_words).
        labels: per-URL labels aligned with ``word_x``.

    Returns:
        Tuple ``(word_mal_ratio, frame)``: the word -> ratio mapping and a
        DataFrame with columns ``word_mal_ratio`` and ``word``.
    """
    cache_file = "word_mal_ratio_large.pkl"

    if os.path.isfile(cache_file):
        # Cached result available: reuse it instead of recomputing.
        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # safe because the cache is produced locally by this same function.
        with open(cache_file, "rb") as fh:
            word_mal_ratio = pickle.load(fh)
    else:
        frame = pd.DataFrame({'token':word_x, "label":labels})
        word_mal_ratio = {}

        for word in tqdm(words_dict):
            # Flag the rows whose token entry contains the current word.
            frame['exist'] = frame['token'].map(lambda x:word in x)
            label0_hits = len(frame.query('exist == True & label == 0'))
            word_mal_ratio[word] = label0_hits / frame.exist.sum()

        with open(cache_file, "wb") as fh:
            pickle.dump(word_mal_ratio, fh)

    # Tabular view of the same mapping, in the mapping's iteration order.
    ratio_frame = pd.DataFrame({
        'word_mal_ratio': list(word_mal_ratio.values()),
        'word': list(word_mal_ratio),
    })
    return word_mal_ratio, ratio_frame
  

# from multiprocessing.pool import ThreadPool
# pool = ThreadPool(processes=4)

# async_result = pool.apply_async(foo, (word_x, labels))
# word_mal_ratio, tmp = async_result.get()

In [None]:
# Compute (or load from the pickle cache) the per-word ratios and their DataFrame form.
word_mal_ratio, tmp = foo(word_x, labels)

In [None]:
# Snapshot the training vocabularies so later cells can assert that building
# the evaluation sets did not modify them. Shallow copies are taken on purpose:
# binding tmp1/tmp2 to the very same dict objects would make a later
# `tmp1 == words_dict` comparison succeed even after an in-place mutation.
tmp1 = dict(words_dict)
tmp2 = dict(ngrams_dict)

In [None]:
# Build the first evaluation set, encoded with the dictionaries obtained from
# the training data (ngrams_dict / words_dict / chars_dict).
# NOTE: this rebinds urls, labels, x, word_x, ngramed_id_x, worded_id_x and
# chared_id_x, so the training-time values of those names are gone afterwards.
urls, labels = read_data("../URLNet/val_10000.txt")
x, word_reverse_dict = get_word_vocab(urls, max_len_words) 
word_x = get_words(x, word_reverse_dict, delimit_mode, urls) 
ngramed_id_x, worded_id_x = ngram_id_x_from_dict(word_x, max_len_subwords, ngrams_dict, words_dict) 
chared_id_x = char_id_x(urls, chars_dict, max_len_chars)
print("Number of testing urls: {}".format(len(labels)))

y_test = labels
#x_test_char = ngramed_id_x
# Only the word-level input is kept; the character-level variants are disabled.
x_test_word = pad_seq_in_word(worded_id_x, max_len_words)
#x_test_char_seq = pad_seq_in_word(chared_id_x, max_len_chars)
print(len(x_test_word))#, x_test_char_seq.shape, x_test_char_seq.shape)

In [None]:
# Guard: the vocabularies must still match the snapshots held in tmp1 / tmp2.
assert tmp1 == words_dict
assert tmp2 == ngrams_dict

In [None]:
# Build the second evaluation set from the URLNet training file, encoded with
# the same dictionaries as above. The printed message still says "testing".
# NOTE: rebinds urls, labels, x, word_x, ngramed_id_x, worded_id_x and chared_id_x again.
urls, labels = read_data("../URLNet/train_10000.txt")#"../URLNet/test_10000.txt"
x, word_reverse_dict = get_word_vocab(urls, max_len_words) 
word_x = get_words(x, word_reverse_dict, delimit_mode, urls) 
ngramed_id_x, worded_id_x = ngram_id_x_from_dict(word_x, max_len_subwords, ngrams_dict, words_dict)
chared_id_x = char_id_x(urls, chars_dict, max_len_chars)
print("Number of testing urls: {}".format(len(labels)))

y_test2 = labels
#x_test_char2 = ngramed_id_x
# Only the word-level input is kept; the character-level variants are disabled.
x_test_word2 = pad_seq_in_word(worded_id_x, max_len_words)
#x_test_char_seq2 = pad_seq_in_word(chared_id_x, max_len_chars)
print(len(x_test_word2))#, x_test_char_seq2.shape, x_test_char_seq2.shape)

In [None]:
# Guard: the vocabularies must still match the snapshots held in tmp1 / tmp2.
assert tmp1 == words_dict
assert tmp2 == ngrams_dict

In [None]:
# Model / optimisation hyper-parameters.
emb_dim = 32              # passed as emb_size to models.WordCNN and as the size argument of models.BiasPredictor
l2_reg_lambda = 0.0       # passed to models.WordCNN
emb_mode = 2              # NOTE(review): not referenced by any visible cell
filter_size = [3,4,5,6]   # passed as kernel_sizes to models.WordCNN
batch_size = 128          # batch size of every DataLoader
epochs = 5                # NOTE(review): the visible training loops iterate range(10), not this value
lr = 1e-3                 # learning rate

In [None]:
from torch.utils.data import Dataset, DataLoader
class mydata(Dataset):
    """Minimal map-style dataset pairing each sample in ``X`` with its target in ``Y``."""

    def __init__(self, X, Y):
        # Both containers are stored exactly as given; nothing is copied or converted.
        self.X, self.Y = X, Y

    def __len__(self):
        """Number of samples, taken from ``X``."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(sample, target)`` pair stored at ``index``."""
        sample = self.X[index]
        target = self.Y[index]
        return sample, target

# Wrap each (inputs, labels) pair in a dataset: the balanced training split,
# its held-out remainder (validation) and the two external evaluation sets.
train_dataset = mydata(x_train_word, y_train)
val_dataset = mydata(x_val_word, y_val)
test_dataset1 = mydata(x_test_word, y_test)
test_dataset2 = mydata(x_test_word2, y_test2)

# NOTE(review): the training loader is not shuffled, so every epoch sees the
# batches in the same order — confirm this is intended.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader1 = DataLoader(test_dataset1, batch_size=batch_size, shuffle=False)
test_loader2 = DataLoader(test_dataset2, batch_size=batch_size, shuffle=False)

In [None]:
def accuracy_thresh_sum(y_pred:Tensor, y_true:Tensor, thresh:float=0.5, sigmoid:bool=True):
    """Count how many thresholded predictions agree with the targets.

    Args:
        y_pred: model outputs; treated as logits when ``sigmoid`` is True,
            otherwise compared against ``thresh`` as they are.
        y_true: targets, converted with ``.byte()`` before the comparison.
        thresh: a prediction counts as positive when it is strictly greater
            than this value.
        sigmoid: apply a sigmoid to ``y_pred`` first. The flag used to be
            ignored (the sigmoid was always applied); it is honoured now.

    Returns:
        The number of matching elements as a Python float.
    """
    if sigmoid:
        # torch.sigmoid replaces the deprecated F.sigmoid.
        y_pred = torch.sigmoid(y_pred)
    return ((y_pred>thresh)==y_true.byte()).float().sum().item()

def accuracy_thresh_score(y_pred, y_true, thresh:float=0.5):
    """Fraction of thresholded predictions that equal the targets.

    Args:
        y_pred: NumPy array of scores (``.astype`` is called on the comparison result).
        y_true: array of targets compared element-wise with the 0/1 predictions.
        thresh: a score counts as positive when it is strictly greater than
            this value. The parameter used to be ignored in favour of a
            hard-coded 0.5; the default keeps that behaviour.

    Returns:
        Mean of the element-wise equality, i.e. the accuracy.
    """
    return ((y_pred>thresh).astype(int) == y_true).mean()

In [None]:
def soft_cross_entropy(inp, target, size_average=True):
    """Cross entropy between logits and soft (probability) targets.

    Args:
        inp: logits; the softmax is taken along dimension 1.
        target: soft targets with the same shape as ``inp``.
        size_average: return the mean of the per-row losses when True,
            otherwise their sum.

    Returns:
        Scalar tensor holding the reduced loss.
    """
    # dim=1 is spelled out: calling log_softmax without `dim` is deprecated and
    # emits a warning. It also matches the dimension summed over below.
    per_row = torch.sum(-target * F.log_softmax(inp, dim=1), dim=1)
    return torch.mean(per_row) if size_average else torch.sum(per_row)

class NegativeEntropyLoss(torch.nn.Module):
    """Module returning ``-sum(p * log p)`` with ``p = softmax(x, dim=1)``.

    The sum runs over every element of the input, so the result is a scalar
    that is not averaged over rows.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        probs = F.softmax(x, dim=1)
        log_probs = F.log_softmax(x, dim=1)
        return -1.0 * (probs * log_probs).sum()

def special_loss(target):
    """Mean over rows of ``sum(p * log p)`` where ``p = softmax(target, dim=1)``.

    The argument holds logits despite its name; the softmax is applied here.
    """
    probs = F.softmax(target, dim=1)
    row_sums = (probs * probs.log()).sum(1)
    return row_sums.mean()

In [None]:
# Baseline word-level CNN. The vocabulary size is len(words_dict)+1
# (presumably one extra row for a padding id — TODO confirm in models.WordCNN).
model = models.WordCNN(
            word_ngram_vocab_size = len(words_dict)+1,
            emb_size=emb_dim,
            word_seq_len=max_len_words,
            l2_reg_lambda=l2_reg_lambda,
            kernel_sizes=filter_size).cuda()

print('paramters count', sum(p.numel() for p in model.parameters()))
# Use the learning rate from the hyper-parameter cell instead of repeating the literal.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# Bare expression: the notebook renders the model's repr as the cell output.
model

In [None]:
def step(model, optimizer, loader, train=True, message=""):
    """Run one pass over ``loader`` and report loss, accuracy and AUC.

    Args:
        model: network called as ``model(batch)``; expected to emit one logit
            per sample (this is what the BCE loss below requires).
        optimizer: optimizer stepped after every batch when ``train`` is True.
        loader: iterable of ``(data, label)`` batches.
        train: True for a training pass (train mode, gradients, updates),
            False for evaluation (eval mode, no gradients).
        message: description shown on the progress bar.

    Returns:
        Dict with ``loss`` (mean of the per-batch mean BCE losses), ``acc``
        (accuracy at threshold 0.5) and ``auc`` (ROC AUC, or 0 when only one
        class is present in the labels).
    """
    loss_total = 0
    all_probs = []
    all_labels = []

    if train:
        model.train()
    else:
        model.eval()
    mode = torch.enable_grad if train else torch.no_grad

    with mode():
        for (data, label) in tqdm(loader, desc=message):
            data, label = data.long().cuda(), label.float().cuda()
            # Reshape to the label's shape rather than a bare squeeze(): a final
            # batch holding a single sample would otherwise collapse to a 0-d
            # tensor and fail the loss's shape check.
            logits = model(data).reshape(label.shape)
            loss = F.binary_cross_entropy_with_logits(logits, label, reduction="mean")
            if train:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            loss_total += loss.item()
            all_labels.append(label.detach().cpu().numpy())
            # torch.sigmoid replaces the deprecated F.sigmoid.
            all_probs.append(torch.sigmoid(logits).detach().cpu().numpy())

    loss_total /= len(loader)
    all_labels = np.hstack(all_labels)
    all_probs = np.hstack(all_probs)
    # roc_auc_score is undefined with a single class; report 0 in that case.
    auc = roc_auc_score(all_labels, all_probs) if len(np.unique(all_labels)) > 1 else 0
    acc = accuracy_thresh_score(all_probs, all_labels)

    #logger.info(message)
    logger.info('Loss     : {}'.format(loss_total))
    logger.info('Accuracy : {}'.format(acc))
    logger.info('AUC      : {}'.format(auc))

    return {'loss':loss_total, 'acc':acc, 'auc':auc}

In [None]:
# start training
# Each epoch runs one training pass followed by evaluation passes on the
# validation split and the two external sets; the metric dicts are collected
# per split in `history`.
# NOTE(review): the epoch count is the literal 10; the `epochs` setting defined
# with the other hyper-parameters is not used here.

history = {'train':[], 'valid':[], 'test1':[], 'test2':[]}
for e in tqdm(range(10), desc="Epoch"):
    train_result = step(model, optimizer, train_loader, train=True, message="Training :{}".format(e))
    valid_result = step(model, optimizer, val_loader, train=False, message="Validation :{}".format(e))
    test_result1 = step(model, optimizer, test_loader1, train=False, message="Test 1 :{}".format(e))
    test_result2 = step(model, optimizer, test_loader2, train=False, message=" Test 2:{}".format(e)) # org train
    
    history['train'].append(train_result)
    history['valid'].append(valid_result)
    history['test1'].append(test_result1)
    history['test2'].append(test_result2)

In [None]:
# Plot the per-epoch loss and AUC curves collected in `history`.
# `save` is the file-name prefix for the PNGs written to ./output; set it to an
# empty string to only display the figures.
save = "baseline-large"
if save:
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs("output", exist_ok=True)

train_loss = [l['loss'] for l in history['train']]
valid_loss = [l['loss'] for l in history['valid']]
test1_loss = [l['loss'] for l in history['test1']]
test2_loss = [l['loss'] for l in history['test2']]

train_auc = [l['auc'] for l in history['train']]
valid_auc = [l['auc'] for l in history['valid']]
test1_auc = [l['auc'] for l in history['test1']]
test2_auc = [l['auc'] for l in history['test2']]

plt.figure(figsize=(7,5))
plt.title('Loss')
plt.plot(train_loss, label='train')
plt.plot(valid_loss, label='valid')
plt.plot(test1_loss, label='test1')
plt.plot(test2_loss, label='test2')
plt.legend()
if save:
    path = os.path.join("output", save+'_loss.png')
    plt.savefig(path)
plt.show()

plt.figure(figsize=(7,5))
plt.title('AUC Score')
plt.plot(train_auc, label='train')
plt.plot(valid_auc, label='valid')
plt.plot(test1_auc, label='test1')
plt.plot(test2_auc, label='test2')
plt.legend()
if save:
    path = os.path.join("output", save+'_auc.png')
    plt.savefig(path)
plt.show()

# Adversarial training

In [None]:
# Build the supervision set used by emb_step: one (word id, soft label) pair
# per entry of word_mal_ratio.
UNKId = words_dict['<UNKNOWN>']
# Give the unknown token a ratio of 0.5 (this mutates word_mal_ratio in place).
word_mal_ratio['<UNKNOWN>'] = 0.5
# Map each word to its id, falling back to the unknown id for words missing from words_dict.
data = list(map(lambda x:words_dict.get(x, UNKId), word_mal_ratio.keys()))
# Two-component soft target per word: [ratio, 1 - ratio].
label = list(map(lambda x:np.array([x, 1-x]), word_mal_ratio.values()))

emb_ratio_dataset = mydata(data, label)
emb_ratio_loader = DataLoader(emb_ratio_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Sanity check: pull one batch and show the shape of the ids after adding a
# second axis, which is how emb_step feeds them to the model.
d, l = next(iter(emb_ratio_loader))
d.unsqueeze(1).shape

In [None]:
def emb_step(model, optimizer, train=False, iters=1):
    """Run the word-ratio objective over ``emb_ratio_loader`` and return its mean loss.

    Every batch of word ids is given a second axis, passed through ``model``
    and scored with ``soft_cross_entropy`` against the soft targets, scaled by
    a fixed weight of 0.1.

    Args:
        model: predictor called as ``model(ids)`` with ids of shape (batch, 1).
        optimizer: optimizer stepped after every batch when ``train`` is True.
        train: True to update ``model`` (gradients clipped to norm 1.0),
            False to only measure the loss.
        iters: number of full passes over the loader.

    Returns:
        The weighted loss averaged over all processed batches (0.0 when no
        batch was processed).
    """
    loader = emb_ratio_loader
    loss_total = 0
    n_batches = 0
    _lambda = 0.1

    if train:
        model.train()
    else:
        model.eval()
    mode = torch.enable_grad if train else torch.no_grad
    with mode():
        for _ in range(iters):
            for (data, label) in loader:
            #for (data, label) in tqdm(loader, desc="emb bias"):
                data, label = data.long().unsqueeze(1).cuda(), label.float().cuda()
                logits = model(data)
                loss = soft_cross_entropy(logits, label, size_average=True) * _lambda
                if train:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    optimizer.zero_grad()
                loss_total += loss.item()
                n_batches += 1

    # Each accumulated term is already a batch mean, so the average is taken
    # over the number of batches. The earlier divisor,
    # len(loader)*iters/batch_size, inflated the result by a factor of batch_size.
    return loss_total / n_batches if n_batches else 0.0

In [None]:
def adv_step(model, optimizer, loader, train=True, message=""):
    """Run one classification pass over ``loader``, then one bias-predictor pass.

    The classifier is trained or evaluated with a summed BCE loss. Afterwards
    ``emb_step`` is run once on the notebook-level ``bias_model`` /
    ``bias_optimizer`` in the same mode.

    Args:
        model: classifier called as ``model(batch)``; expected to emit one
            logit per sample.
        optimizer: optimizer for ``model``, stepped per batch when training.
        loader: iterable of ``(data, label)`` batches.
        train: True for a training pass, False for evaluation.
        message: description shown on the progress bar.

    Returns:
        Dict with ``loss`` (BCE averaged per sample), ``acc`` (accuracy at
        threshold 0.5), ``auc`` (ROC AUC, or 0 when only one class is present)
        and ``bias`` (the value returned by ``emb_step``).
    """
    loss_total = 0
    n_samples = 0
    all_probs = []
    all_labels = []

    # Put both networks in the requested mode. The bias model's train() was
    # previously referenced without being called, so it was never switched
    # back to training mode here.
    if train:
        model.train()
        bias_model.train()
    else:
        model.eval()
        bias_model.eval()
    mode = torch.enable_grad if train else torch.no_grad

    with mode():
        for (data, label) in tqdm(loader, desc=message):
            data, label = data.long().cuda(), label.float().cuda()
            # Reshape to the label's shape rather than a bare squeeze(): a final
            # batch holding a single sample would otherwise collapse to a 0-d
            # tensor and fail the loss's shape check.
            logits = model(data).reshape(label.shape)
            loss = F.binary_cross_entropy_with_logits(logits, label, reduction="sum")
            if train:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            loss_total += loss.item()
            n_samples += label.numel()
            all_labels.append(label.detach().cpu().numpy())
            # torch.sigmoid replaces the deprecated F.sigmoid.
            all_probs.append(torch.sigmoid(logits).detach().cpu().numpy())

    # The loss is accumulated with reduction="sum", so normalise by the number
    # of samples seen. The earlier divisor, len(loader)/batch_size, was not a
    # sample count.
    loss_total /= max(n_samples, 1)
    all_labels = np.hstack(all_labels)
    all_probs = np.hstack(all_probs)
    # roc_auc_score is undefined with a single class; report 0 in that case.
    auc = roc_auc_score(all_labels, all_probs) if len(np.unique(all_labels)) > 1 else 0
    acc = accuracy_thresh_score(all_probs, all_labels)

    # One pass of the word-ratio objective on the bias predictor, in the same
    # mode as the classification pass above.
    bias_loss_total = emb_step(bias_model, bias_optimizer, train, 1)
    logger.info('Loss     : {}'.format(loss_total))
    logger.info('Bias Loss: {}'.format(bias_loss_total))
    logger.info('Accuracy : {}'.format(acc))
    logger.info('AUC      : {}'.format(auc))

    return {'loss':loss_total, 'acc':acc, 'auc':auc, "bias":bias_loss_total}

In [None]:
# Second word-level CNN with the same configuration as the baseline, plus a
# bias predictor that is constructed from this model's word embedding
# (adv_model.word_emb) and the embedding size.
adv_model = models.WordCNN(
            word_ngram_vocab_size = len(words_dict)+1,
            emb_size=emb_dim,
            word_seq_len=max_len_words,
            l2_reg_lambda=l2_reg_lambda,
            kernel_sizes=filter_size).cuda()
# Use the learning rate from the hyper-parameter cell instead of repeating the literal.
adv_optimizer = torch.optim.Adam(adv_model.parameters(), lr=lr)

bias_model = models.BiasPredictor(adv_model.word_emb, emb_dim).cuda()
bias_optimizer = torch.optim.Adam(bias_model.parameters(), lr=lr)

In [None]:
# start training
# Same schedule as the baseline run, but every pass goes through adv_step,
# which also runs the bias predictor once per pass. `history` is rebound here,
# so the baseline curves are no longer available afterwards.
# NOTE(review): the epoch count is the literal 10; the `epochs` setting defined
# with the other hyper-parameters is not used here.

history = {'train':[], 'valid':[], 'test1':[], 'test2':[]}
for e in tqdm(range(10), desc="Epoch"):
    train_result = adv_step(adv_model, adv_optimizer, train_loader, train=True, message="Training :{}".format(e))
    valid_result = adv_step(adv_model, adv_optimizer, val_loader, train=False, message="Validation :{}".format(e))
    test_result1 = adv_step(adv_model, adv_optimizer, test_loader1, train=False, message="Test 1 :{}".format(e))
    test_result2 = adv_step(adv_model, adv_optimizer, test_loader2, train=False, message="Test 2 :{}".format(e))
    
    history['train'].append(train_result)
    history['valid'].append(valid_result)
    history['test1'].append(test_result1)
    history['test2'].append(test_result2)

In [None]:
# Plot the per-epoch loss, bias loss and AUC curves collected in `history`.
# `save` is the file-name prefix for the PNGs written to ./output; set it to an
# empty string to only display the figures.
save = "adv"
if save:
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs("output", exist_ok=True)

train_loss = [l['loss'] for l in history['train']]
valid_loss = [l['loss'] for l in history['valid']]
test1_loss = [l['loss'] for l in history['test1']]
test2_loss = [l['loss'] for l in history['test2']]

train_bloss = [l['bias'] for l in history['train']]
valid_bloss = [l['bias'] for l in history['valid']]
test1_bloss = [l['bias'] for l in history['test1']]
test2_bloss = [l['bias'] for l in history['test2']]

train_auc = [l['auc'] for l in history['train']]
valid_auc = [l['auc'] for l in history['valid']]
test1_auc = [l['auc'] for l in history['test1']]
test2_auc = [l['auc'] for l in history['test2']]

plt.figure(figsize=(7,5))
plt.title('Loss')
plt.plot(train_loss, label='train')
plt.plot(valid_loss, label='valid')
plt.plot(test1_loss, label='test1')
plt.plot(test2_loss, label='test2')
plt.legend()
if save:
    path = os.path.join("output", save+'_loss.png')
    plt.savefig(path)
plt.show()


plt.figure(figsize=(7,5))
plt.title('Bias Loss')
plt.plot(train_bloss, label='train')
plt.plot(valid_bloss, label='valid')
plt.plot(test1_bloss, label='test1')
plt.plot(test2_bloss, label='test2')
plt.legend()
if save:
    path = os.path.join("output", save+'_bias_loss.png')
    plt.savefig(path)
plt.show()


plt.figure(figsize=(7,5))
plt.title('AUC Score')
plt.plot(train_auc, label='train')
plt.plot(valid_auc, label='valid')
plt.plot(test1_auc, label='test1')
plt.plot(test2_auc, label='test2')
plt.legend()
if save:
    path = os.path.join("output", save+'_auc.png')
    plt.savefig(path)
plt.show()