In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import scipy.sparse as sp
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, precision_score, recall_score
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from tqdm import tqdm

import os
os.path.join('../')

import parser
import dataset

import datetime
from datetime import timedelta

from custom_parser import get_parser

import numpy as np 
import pandas as pd 
import torch
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from torch_geometric.utils import from_networkx, to_undirected
from torch_geometric.data import Data, DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook, trange
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from collections import defaultdict
import random
from xgboost import XGBClassifier
%config Completer.use_jedi = False


In [46]:
# Load the transaction table from disk.
data = dataset.Tdata(path='../../tdata.csv')

# Build the experiment configuration from an explicit argv list so the
# notebook does not depend on the kernel's real command line.
# NOTE: binding `parser` here rebinds the name introduced by `import parser`
# in the import cell.
parser = get_parser()
args = parser.parse_args(args=[
    "--data", "real-t",
    "--sampling", "xgb",
    "--mode", "scratch",
    "--train_from", "20140101",
    "--test_from", "20170101",
    "--test_length", "365",
    "--valid_length", "180",
    "--initial_inspection_rate", "3",
    "--final_inspection_rate", "10",
])

In [47]:
# Unpack the parsed arguments into notebook-level names.
seed = args.seed
epochs = args.epoch
dim = args.dim
lr = args.lr
weight_decay = args.l2
initial_inspection_rate = args.initial_inspection_rate
inspection_rate_option = args.inspection_plan
mode = args.mode
train_begin = args.train_from 
test_begin = args.test_from
test_length = args.test_length
valid_length = args.valid_length
chosen_data = args.data
numWeeks = args.numweeks
semi_supervised = args.semi_supervised
save = args.save
gpu_id = args.device

# Seed every random number generator the notebook imports so that a
# Restart & Run All reproduces the same results. Python's own `random`
# module is imported in the import cell and was previously left unseeded.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

# Derive the split boundaries. The begin dates are "YYYYMMDD" strings that are
# sliced into year / month / day.
train_start_day = datetime.date(int(train_begin[:4]), int(train_begin[4:6]), int(train_begin[6:8]))
test_start_day = datetime.date(int(test_begin[:4]), int(test_begin[4:6]), int(test_begin[6:8]))
# `test_length` / `valid_length` are rebound from a day count to a timedelta
# (presumably the parser yields ints -- confirm in custom_parser).
test_length = timedelta(days=test_length)    
test_end_day = test_start_day + test_length
valid_length = timedelta(days=valid_length)
# The validation window is the `valid_length` span ending at the test start.
valid_start_day = test_start_day - valid_length

# Split the dataset on those boundaries, then run feature engineering.
data.split(train_start_day, valid_start_day, test_start_day, test_end_day, valid_length, test_length, args)
data.featureEngineering()

Data size:
Train labeled: (40475, 41), Train unlabeled: (1308679, 41), Valid labeled: (437124, 41), Valid unlabeled: (0, 13), Test: (858180, 41)
Checking label distribution
Training: 0.07383529661466624
Validation: 0.07495764097746672
Testing: 0.0957648251549135


In [51]:
# `sys` is not imported by the import cell at the top of the notebook, so
# import it here; otherwise this cell raises NameError on a fresh kernel.
import sys

# Put the graph_sage directory on the module search path. The membership check
# keeps the cell idempotent, so re-running it does not add duplicate entries.
if '../graph_sage' not in sys.path:
    sys.path.append('../graph_sage')

# NOTE(review): wildcard imports hide where names come from. Presumably
# GraphData, find_best_threshold and metrics (used in later cells) are defined
# in these modules -- confirm and switch to explicit imports.
from utils import *
from pygData_util import *

In [52]:
# Column names passed to GraphData through its `categories` argument.
categories = [
    "importer.id",
    "HS6",
]

# Build the graph wrapper around the split dataset; with use_xgb=True the
# constructor reports "Training XGBoost model..." (see the cell output).
gdata = GraphData(data, use_xgb=True, categories=categories)

Training XGBoost model...


In [53]:
# Pick the decision threshold on the labeled validation split.
best_thresh, best_auc = find_best_threshold(
    gdata.xgb, data.dfvalidx_lab, data.valid_cls_label
)

# Report on the validation split. The last column of predict_proba is used as
# the score. NOTE: despite its name, `xgb_test_pred` holds validation scores
# here; it is overwritten with test scores below.
xgb_test_pred = gdata.xgb.predict_proba(data.dfvalidx_lab)[:, -1]
overall_f1, auc, pr, re, f, rev = metrics(
    xgb_test_pred, data.valid_cls_label, data.valid_reg_label, best_thresh
)

print("-" * 50)

# Report on the test split, reusing the threshold chosen on validation.
xgb_test_pred = gdata.xgb.predict_proba(data.dftestx)[:, -1]
overall_f1, auc, pr, re, f, rev = metrics(
    xgb_test_pred, data.test_cls_label, data.test_reg_label, best_thresh
)

Checking top 1% suspicious transactions: 4371
Precision: 0.6287, Recall: 0.0902, Revenue: 0.1119
Checking top 2% suspicious transactions: 8742
Precision: 0.5420, Recall: 0.1554, Revenue: 0.1860
Checking top 5% suspicious transactions: 21857
Precision: 0.3742, Recall: 0.2683, Revenue: 0.3006
Checking top 10% suspicious transactions: 43712
Precision: 0.2683, Recall: 0.3848, Revenue: 0.4311
--------------------------------------------------
Checking top 1% suspicious transactions: 8581
Precision: 0.5809, Recall: 0.0665, Revenue: 0.0778
Checking top 2% suspicious transactions: 17164
Precision: 0.4996, Recall: 0.1143, Revenue: 0.1364
Checking top 5% suspicious transactions: 42909
Precision: 0.3674, Recall: 0.2102, Revenue: 0.2498
Checking top 10% suspicious transactions: 85818
Precision: 0.2769, Recall: 0.3168, Revenue: 0.3690


In [54]:
# Graph data for the labeled training stage, with the node indices returned
# by get_AttNode attached as `node_idx`.
stage = "train_lab"
trainLab_data = gdata.get_data(stage)
trainLab_data.node_idx = train_nodeidx = torch.tensor(gdata.get_AttNode(stage))

In [55]:
# Graph data for the unlabeled training stage, with the node indices returned
# by get_AttNode attached as `node_idx`.
stage = "train_unlab"
unlab_data = gdata.get_data(stage)
unlab_data.node_idx = unlab_nodeidx = torch.tensor(gdata.get_AttNode(stage))

In [56]:
# Graph data for the validation stage, with the node indices returned by
# get_AttNode attached as `node_idx`.
stage = "valid"
valid_data = gdata.get_data(stage)
valid_data.node_idx = valid_nodeidx = torch.tensor(gdata.get_AttNode(stage))

In [58]:
# Display the labeled-training graph object; as the last expression in the
# cell, its repr (field names and sizes) is rendered as the cell output.
trainLab_data

Data(edge_attr=[80950], edge_index=[2, 161900], edge_label=[161900], node_idx=[40475], rev=[52368], x=[52368, 100], y=[52368])

# Model

In [3]:
class LabelPredictor(nn.Module):
    """Score a pair of embeddings with a single linear layer and a sigmoid.

    The two embeddings are combined into three element-wise interaction
    terms (absolute difference, sum and product), concatenated along the
    last dimension, and projected to one value that is squashed into (0, 1).
    All three terms are symmetric, so swapping the inputs gives the same score.
    """

    def __init__(self, in_channels):
        """Create the projection layer.

        Args:
            in_channels: size of the last dimension of each input embedding.
                The linear layer takes ``3 * in_channels`` features because
                three interaction terms are concatenated.
        """
        super().__init__()
        self.linear = nn.Linear(3 * in_channels, 1)

    def forward(self, emb_a, emb_b):
        """Return the sigmoid score for the pair ``(emb_a, emb_b)``.

        Args:
            emb_a: first embedding tensor, last dimension ``in_channels``.
            emb_b: second embedding tensor, compatible with ``emb_a`` for
                element-wise arithmetic.

        Returns:
            Tensor with a last dimension of size 1 holding values in (0, 1).
        """
        interactions = (
            (emb_a - emb_b).abs(),  # absolute difference
            emb_a + emb_b,          # sum
            emb_a * emb_b,          # element-wise product
        )
        features = torch.cat(interactions, dim=-1)
        return self.linear(features).sigmoid()
 

In [None]:
# class GraphSage()