In [1]:
import networkx as nx
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy
import numpy as np
import math
import pickle
from tqdm import tqdm_notebook
from multiprocessing import Pool

np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader
from torch.autograd import Variable
from torch.optim.lr_scheduler import ExponentialLR

In [3]:
torch.cuda.set_device(3)

## Model Declaration

In [4]:
class GRU(nn.Module):
    def __init__(self, in_dim, out_dim):
        super(GRU, self).__init__()
        
        self.in_dim = in_dim
        self.hid1 = 200
        self.hid2 = 100
        self.rnn = nn.GRU(self.in_dim, self.hid1, num_layers=3, 
                          bidirectional=True, batch_first=True, dropout=0.3)
        self.relu = nn.ReLU()
        
        self.out1 = nn.Sequential(
            nn.Linear(2*self.hid1, 64),
            nn.ReLU(),
            nn.Dropout(p=0.4),
            
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            
            nn.Linear(32, out_dim),
            nn.Sigmoid(),
        )
    
    def forward(self, x):
        rnn, hid = self.rnn(x)
        return self.out1(self.relu(rnn[:, -1]))
        
    def get_trainable_parameters(self):
        return (param for param in self.parameters() if param.requires_grad)

In [5]:
class FocalLoss2(nn.Module):
    def __init__(self, alpha=0.01, gamma_pos=3, gamma_neg=2, logits=False, reduce=True):
        super(FocalLoss2, self).__init__()
        self.alpha = alpha
        self.gamma_pos = gamma_pos
        self.gamma_neg = gamma_neg
        self.logits = logits
        self.reduce = reduce
    
    def forward(self, inputs, targets):
        if self.logits:
            BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduce=False)
        else:
            BCE_loss = F.binary_cross_entropy(inputs, targets, reduce=False)
        pt = torch.exp(-BCE_loss)
        gamma_diff = self.gamma_pos - self.gamma_neg
        F_loss_pos = self.alpha * targets * (1-pt)**self.gamma_pos * BCE_loss
        F_loss_pos = torch.mean(pt)**(-gamma_diff) * F_loss_pos
        F_loss_neg = self.alpha * (1 - targets) * (1-pt)**self.gamma_neg * BCE_loss
        F_loss = F_loss_pos + F_loss_neg
        
        avg_F_loss_pos = torch.sum(F_loss_pos) / torch.sum(targets)
        avg_F_loss_neg = torch.sum(F_loss_neg) / torch.sum(1-targets)
        
        if self.reduce:
            return torch.mean(F_loss), avg_F_loss_pos, avg_F_loss_neg
        else:
            return F_loss, F_loss_pos, F_loss_neg

## Parameters Settings

In [6]:
#
# Classifier
# ---------------------
## focal loss
alpha = 1
gamma_pos = 6
gamma_neg = 2
learn_rate = 1e-5
grad_clip = 1

#
# VAT
# ---------------------
vat_xi = 1e-6
vat_eps_pos = 1
vat_eps_neg = 0.01
vat_ip = 1

#
# Training process
# ---------------------
train_batch_size = 128
test_batch_size = 256

max_epochs = 100

## Data Preparation

In [7]:
train_data = np.load('../datasets/Training_data_heter.npz', allow_pickle=True)
test_data  = np.load('../datasets/Testing_data_heter.npz',  allow_pickle=True)

training_data, training_label, training_announce, training_FILTER = train_data['arr_0'], train_data['arr_1'], train_data['arr_2'], train_data['arr_3']
testing_data,  testing_label,  testing_announce,  testing_FILTER = test_data['arr_0'], test_data['arr_1'], test_data['arr_2'], test_data['arr_3']

X_train = training_data#[(training_announce == 1) & (training_FILTER == 0 )]
y_train = training_label#[(training_announce == 1) & (training_FILTER == 0 )]

X_test  = testing_data#[(testing_announce == 1) & (testing_FILTER == 0 )]
y_test  = testing_label#[(testing_announce == 1) & (testing_FILTER == 0 )]

In [51]:
len(test_data)/7

68927.0

In [50]:
test_data[0][0].shape

torch.Size([3, 661])

In [8]:
# data = np.load('GRUArray_and_label_for_NewEmbedding_heter_superv_recur_focal_logisticMF.npz', allow_pickle=True)

# GPUArray = data['arr_0']
# label = data['arr_1']

# GPUArray = GPUArray[-1033905:,:,:]
# label = label[-1033905:]

# X_train, X_test, y_train, y_test = train_test_split(GPUArray, label, random_state=42)
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
y_train = torch.FloatTensor(y_train)
y_test = torch.FloatTensor(y_test)

train_data = []
for i in range(len(X_train)):
    train_data.append((X_train[i], y_train[i]))
    
test_data = []
for i in range(len(X_test)):
    test_data.append((X_test[i], y_test[i]))

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=test_batch_size)

In [9]:
classifier = GRU(in_dim=X_train.shape[2], out_dim=2).cuda()
focal_loss = FocalLoss2(alpha, gamma_pos, gamma_neg)
# optim_clsfr = optim.Adam(filter(lambda p: p.requires_grad, classifier.parameters()), 
#                          lr=learn_rate)

### Predict

In [10]:
import matplotlib.pyplot as plt

In [90]:
def plotting(pred_y_list, label_list, PATH):
    ###########################################################
    plt.ylabel('Count(log)')
    plt.xlabel('Score')
    plt.yscale('log')
    
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.spines['left'].set_color('none')
    ax.spines['bottom'].set_color('none')
    
    plt.grid(color = '#9999CC')
    plt.hist(np.array(pred_y_list)[np.where(np.array(label_list) == 0)], 
             bins=[n/200 for n in range(50, 150)], 
             label='Negative',
             color='#598987')
    plt.hist(np.array(pred_y_list)[np.where(np.array(label_list) == 1)], 
             bins=[n/200 for n in range(50, 150)], 
             label='Positive', 
             color='#FFD000')
    plt.legend(loc='upper right')
    plt.savefig("{}_test_1.jpg".format(PATH.split('/')[-1]), dpi=1000, quality=100)
    plt.show()
    ###########################################################
    plt.ylabel('Log Count')
    plt.xlabel('Score')
    plt.yscale('log')
    
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    ax.spines['left'].set_color('none')
    ax.spines['bottom'].set_color('none')
    
    plt.grid(color = '#9999CC')
    plt.hist(np.array(pred_y_list)[np.where(np.array(label_list) == 0)], 
             bins=[n/2000 for n in range(530, 600)], 
             label='Negative', 
             color='#598987')
    plt.hist(np.array(pred_y_list)[np.where(np.array(label_list) == 1)], 
             bins=[n/2000 for n in range(530, 600)], 
             label='Positive', 
             color='#FFD000')
    plt.legend(loc='upper right')
    plt.savefig("{}_test_2.jpg".format(PATH.split('/')[-1]), dpi=1000, quality=100)
    plt.show()

def func(PATH, thres):
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, classifier.parameters()), lr=learn_rate)

    model = GRU(in_dim=X_train.shape[2], out_dim=2).cuda()
    checkpoint = torch.load(PATH)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']

    model.eval()

    label_list = []
    pred_y_list = []

    for batch_idx, (data, target) in tqdm_notebook(enumerate(test_dataloader)):
        if data.size()[0] != test_dataloader.batch_size:
            continue
        data, target = Variable(data.cuda()), Variable(target.cuda())
        # Update classifier

        pred_y = model(data).squeeze(-1)
        pred_y = torch.nn.functional.softmax(pred_y, dim=1)[:, 1]

        label_list += list(target.cpu().detach().numpy())
        pred_y_list += list(pred_y.cpu().detach().numpy())
    
    # thres = sorted(np.array(pred_y_list)[np.where(np.array(label_list) == 0)])[int(len(pred_y_list)*0.997)]
    
    print("Testing Treshold: {}".format(np.min(np.array(pred_y_list)[np.where(np.array(label_list) == 1)])))
    print("Total Positve: {}".format(len(np.where(np.array(label_list) == 1)[0])))
    print("Total Candidate: {}".format(np.sum(pred_y_list >= np.min(np.array(pred_y_list)[np.where(np.array(label_list) == 1)]))))
    print("Negative Mean: {}".format(np.array(pred_y_list)[np.where(np.array(label_list) == 0)].mean()))
    print("Negative Variance: {}".format(np.array(pred_y_list)[np.where(np.array(label_list) == 0)].std()))
    print("Negative Q997: {}".format(sorted(np.array(pred_y_list)[np.where(np.array(label_list) == 0)])[int(len(pred_y_list)*0.997)]))
    print("Prec: {}".format(np.sum(np.array(label_list)[np.where(np.array(pred_y_list) > thres)])/np.sum(np.array(pred_y_list) > thres) ))
    
    #     plotting(pred_y_list, label_list, PATH)
    # plotting(pred_y_list, label_list, PATH)

In [91]:
PATH = 'saved_models/VATGRU_heter_clsfr_xi6_eps03_focal42_BestAUC_1'
func(PATH, 0.26920807361602783)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Testing Treshold: 0.2693917751312256
Total Positve: 46
Total Candidate: 1372
Negative Mean: 0.2689572870731354
Negative Variance: 0.0004295199760235846
Negative Q997: 0.269335001707077
Prec: 0.02804878048780488


In [92]:
PATH = 'saved_models/VATGRU_heter_clsfr_xi6_eps03_focal42_BestAUC_2'
func(PATH, 0.26898202300071716)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Testing Treshold: 0.2690467834472656
Total Positve: 46
Total Candidate: 1315
Negative Mean: 0.268947571516037
Negative Variance: 0.0001918014750117436
Negative Q997: 0.26901817321777344
Prec: 0.027577937649880094


In [93]:
PATH = 'saved_models/VATGRU_heter_clsfr_xi6_eps03_focal42_BestAUC_3'
func(PATH, 0.26902028918266296)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Testing Treshold: 0.2691945731639862
Total Positve: 46
Total Candidate: 1254
Negative Mean: 0.26895132660865784
Negative Variance: 0.00034796789987012744
Negative Q997: 0.26911211013793945
Prec: 0.028065893837705917


In [94]:
PATH = 'saved_models/VATGRU_heter_clsfr_xi6_eps03_focal42_BestAUC_4'
func(PATH, 0.2690264582633972)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Testing Treshold: 0.2692015469074249
Total Positve: 46
Total Candidate: 1164
Negative Mean: 0.26894959807395935
Negative Variance: 0.0002595408004708588
Negative Q997: 0.26909157633781433
Prec: 0.0283775447254781


In [95]:
PATH = 'saved_models/VATGRU_heter_clsfr_xi6_eps03_focal42_BestAUC_5'
func(PATH, 0.2690237760543823)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Testing Treshold: 0.269040584564209
Total Positve: 46
Total Candidate: 1520
Negative Mean: 0.2689470648765564
Negative Variance: 0.00015061203157529235
Negative Q997: 0.269059419631958
Prec: 0.02857142857142857


1:
    Training Treshold: 0.3381294740905762
    Testing Treshold: 0.2693917751312256
    Total Positve: 46
    Total Candidate: 1372  
    Precision: 3.36%

2: 
    Training Treshold: 0.38438619511032107
    Testing Treshold: 0.2690467834472656
    Total Positve: 46
    Total Candidate: 1315
    Precision: 3.50%

3:
    Training Treshold: 0.3429013325443268
    Testing Treshold: 0.2691945731639862
    Total Positve: 46
    Total Candidate: 1254
    Precision: 3.67%

4:
    Training Treshold: 0.33618862764739993
    Testing Treshold: 0.2692015469074249
    Total Positve: 46
    Total Candidate: 1164
    Precision: 3.94%

5:
    Training Treshold: 0.31790391938591006
    Testing Treshold: 0.269040584564209
    Total Positve: 46
    Total Candidate: 1520
    Precision: 3.03%