In [None]:
import unittest
import pytest
import numpy as np
from os.path import join
import sys
from s2and.data import ANDData
from s2and.autofeaturizer import FeaturizationInfo, many_pairs_featurize, featurize, preprocess_features
from s2and.consts import LARGE_INTEGER
from s2and.eval import pairwise_eval, cluster_eval
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from collections import Counter

In [None]:
# Name of the benchmark to load.
# Supported datasets: aminer, arnetminer, kisti, pubmed, qian, zbmath
dataset_name = "qian"
# Root directory that holds this dataset's files; point it at your local copy.
parent_dir = f"/{dataset_name}/"
# Every input path is built with os.path.join so that parent_dir works with or
# without a trailing slash (the word2vec / idf paths used plain string
# concatenation before, which silently produced a wrong path without the slash).
dataset = ANDData(
    signatures=join(parent_dir, f"{dataset_name}_signatures.json"),
    papers=join(parent_dir, f"{dataset_name}_papers.json"),
    mode="train",
    specter_embeddings=join(parent_dir, f"{dataset_name}_specter.pickle"),
    clusters=join(parent_dir, f"{dataset_name}_clusters.json"),
    block_type="s2",
    name=dataset_name,
    n_jobs=8,
    preprocess=True,
    load_name_counts=False,
    train_pairs_size=200000,
    val_pairs_size=20000,
    test_pairs_size=20000,
    W2Vmodels=join(parent_dir, f"{dataset_name}_word2vec"),
    idf_counts=join(parent_dir, f"{dataset_name}_counts"),
)


In [None]:
# Names of the pairwise feature groups requested from the featurizer.
features_to_use = [
    "affiliation_similarity",
    "coauthor_similarity",
    "venue_similarity",
    "journal_similarity",
    "title_similarity",
    "reference_authors_similarity",
    "reference_titles_similarity",
    "reference_journals_similarity",
    "year_diff",
    "misc_features",
]
featurization_info = FeaturizationInfo(features_to_use=features_to_use)

# Build the three featurized splits from the dataset (cached results are
# reused because use_cache=True).
train, val, test = featurize(
    dataset,
    featurization_info,
    n_jobs=8,
    use_cache=True,
    all_pair=False,
)


In [None]:
# Preprocess the training split first; the `params` it returns are passed to
# the test-phase call so both splits go through the same preprocessing.
X_train, y_train, _, params = preprocess_features(train, phase='train')
X_test, y_test, _, _ = preprocess_features(test, phase='test', params=params)
from imblearn.under_sampling import RandomUnderSampler
# Randomly under-sample the training pairs (fixed seed for repeatability).
under_sampler = RandomUnderSampler(random_state=42)
X_train, y_train = under_sampler.fit_resample(X_train, y_train)
# The counts below describe the *training* labels after resampling (the
# message previously said "Testing", which mislabelled the output).
print(f"Training target statistics: {Counter(y_train)}")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import itertools
from sklearn.metrics import roc_curve, auc, f1_score, precision_recall_curve, average_precision_score
import copy
class LR(nn.Module):
    """Small binary classifier: Linear -> BatchNorm -> ReLU -> Dropout -> Linear -> Sigmoid.

    Args:
        x_dim: number of input features per sample.
    """

    def __init__(self, x_dim):
        super().__init__()
        hidden_units = 32
        # Sub-modules are created in the order they are applied in forward().
        self.linear1 = nn.Linear(x_dim, hidden_units)
        self.bn = nn.BatchNorm1d(hidden_units)
        self.relu = nn.ReLU()
        self.dp = nn.Dropout(0.15)
        self.linear2 = nn.Linear(hidden_units, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, x_dim) input to sigmoid outputs of shape (batch, 1)."""
        hidden = self.dp(self.relu(self.bn(self.linear1(x))))
        return self.sigmoid(self.linear2(hidden))

class feature_selection(nn.Module):
    """Learnable selector that picks one column per feature group via Gumbel-Softmax.

    Args:
        groups: sequence of ``[start, stop]`` column ranges (stop exclusive),
            e.g. ``[[0, 6], [6, 11]]``. For group ``i`` a parameter named
            ``fs_group<i>`` with ``stop - start`` entries, all initialised to
            1e-3, is registered and used as that group's selection logits.
    """

    def __init__(self, groups):
        super().__init__()
        self.groups = groups
        for idx, bounds in enumerate(groups):
            width = bounds[1] - bounds[0]
            logits = nn.Parameter(1e-3 * torch.ones(width), requires_grad=True)
            self.register_parameter('fs_group%d' % idx, logits)
        # One output column is produced per group.
        self.x_dim = len(groups)

    def forward(self, x):
        """Return a (batch, len(groups)) tensor with one sampled column per group.

        A fresh hard (one-hot in the forward pass) Gumbel-Softmax sample is
        drawn for every group on every call, so the output is stochastic.
        """
        picked = [
            torch.mm(
                x[:, bounds[0]:bounds[1]],
                F.gumbel_softmax(logits, tau=0.1, hard=True).unsqueeze(1),
            )
            # Parameters are yielded in registration order, i.e. group order.
            for bounds, logits in zip(self.groups, self.parameters())
        ]
        return torch.cat(picked, dim=1)

    def infer(self, x, manual):
        """Return the columns of ``x`` listed in ``manual`` (no sampling involved)."""
        return x[:, manual]

from sklearn.model_selection import train_test_split

# [start, stop) column ranges of the feature groups; the selector learns to
# pick one column out of each range.
low_high_group = [[0,6],[6,11],[11,17],[17,23],[23,30],[30,34],[34,38],[38,42],[42,46]]
# Number of leading columns covered by the groups (46); derived from the group
# table so the cut-off cannot drift out of sync with it.
n_grouped_columns = low_high_group[-1][1]

# Indices of the columns that are NOT covered by any group. This must be
# computed before the matrices are truncated below; computing it afterwards
# (as before) always yields an empty list. The indices refer to the
# un-truncated feature matrix.
misc_feature = np.arange(n_grouped_columns, X_train.shape[1]).tolist()

# Keep only the grouped columns, then hold out 10% of the training pairs for
# validation. A fixed random_state makes the split reproducible.
X_train = X_train[:, :n_grouped_columns]
X_test = X_test[:, :n_grouped_columns]
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# One randomly chosen column index per group (drawn from NumPy's global,
# unseeded RNG).
random_feature = [np.random.randint(low,high) for low,high in low_high_group]

fs = feature_selection(low_high_group)
# The classifiers see one selected column per group, so their input width is
# the number of groups (9 here) rather than a hard-coded constant.
model = LR(len(low_high_group))
proxy_model = LR(len(low_high_group))
# Start the proxy from exactly the same weights as the prediction model.
proxy_model.load_state_dict(model.state_dict())

def train():
    """Jointly train the Gumbel-Softmax feature selector and the classifier.

    Reads the module-level ``X_train``/``y_train``, ``X_val``/``y_val``,
    ``X_test``/``y_test`` arrays and updates the module-level ``fs``,
    ``model`` and ``proxy_model`` in place. Each training step does:

    1. copy ``model`` into ``proxy_model`` and take one SGD step on the
       training batch with the selector frozen;
    2. update the selector logits on a validation batch through the
       just-updated proxy;
    3. update ``model`` on the training batch with a fresh selector sample.

    After every epoch the first group's selector logits and the test ROC-AUC
    are printed.

    NOTE(review): defining this function rebinds the name ``train``, which an
    earlier cell uses for the featurized training split; re-running that
    earlier preprocessing cell afterwards needs the featurize cell re-run first.

    Returns:
        tuple: ``(model, aucres)`` where ``aucres`` is the test ROC-AUC of the
        last epoch (``None`` if no epoch was run).
    """
    torch.manual_seed(11)
    trainset = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
    valset = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val))
    testset = TensorDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))
    train_loader = DataLoader(trainset, batch_size=100, shuffle=True)
    val_loader = DataLoader(valset, batch_size=10, shuffle=True)

    epoch = 5
    loss_fn = nn.BCELoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=2e-4)
    optimizer_proxy = torch.optim.SGD(proxy_model.parameters(), lr=0.01, momentum=0.9, weight_decay=2e-4)
    optimizer_fs = torch.optim.SGD(fs.parameters(), lr=0.01, momentum=0.9, weight_decay=2e-4)

    # MultiStepLR compares milestones against an integer epoch counter, so the
    # fractional values used before (2.5, 3.75, 4.5) could never trigger.
    milestones = [int(fraction * epoch) for fraction in (0.5, 0.75, 0.9)]
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=0.1)

    aucres = None
    for i in range(epoch):
        model.train()
        proxy_model.train()
        # The loop targets deliberately do not reuse the names of the
        # module-level arrays: the old `global y_val` + `y_val` loop target
        # overwrote the validation labels with the last mini-batch.
        # zip() stops at the shorter loader.
        for (x, y), (x_val_batch, y_val_batch) in zip(train_loader, val_loader):
            proxy_model.load_state_dict(model.state_dict())
            model.zero_grad()
            proxy_model.zero_grad()

            # update proxy (selector frozen: no gradient flows into fs)
            with torch.no_grad():
                x_fs = fs(x)
            y_pred = proxy_model(x_fs)
            # squeeze(1) keeps the batch dimension even for a batch of one
            loss = loss_fn(y_pred.squeeze(1), y)
            loss.backward()
            optimizer_proxy.step()

            # update Gumbel-Softmax
            # Clear stale selector gradients first; they were never zeroed
            # before and therefore accumulated over all previous steps.
            optimizer_fs.zero_grad()
            x_val_fs = fs(x_val_batch)
            y_pred = proxy_model(x_val_fs)
            loss = loss_fn(y_pred.squeeze(1), y_val_batch)
            loss.backward()
            optimizer_fs.step()

            # update prediction model
            # The selector is only trained on validation batches, so no
            # gradient needs to be tracked through it here.
            with torch.no_grad():
                x_fs = fs(x)
            y_pred = model(x_fs)
            loss = loss_fn(y_pred.squeeze(1), y)
            loss.backward()
            optimizer.step()

        # Advance the learning-rate schedule once per epoch (it was created
        # but never stepped before, so the rate never decayed).
        lr_scheduler.step()

        print(fs.state_dict()['fs_group0'])

        model.eval()
        # Evaluation needs no autograd graph. Note that fs() still draws a
        # random Gumbel-Softmax sample here, so the AUC is stochastic.
        with torch.no_grad():
            pred = model(fs(testset.tensors[0])).numpy().ravel()
        # np.int was removed in NumPy 1.24; the builtin int is the replacement.
        fpr, tpr, threshold = roc_curve(np.asarray(y_test, dtype=int), pred)
        aucres = auc(fpr, tpr)
        print("Epoch:{}  Auc:{}".format(i, aucres))

    return model, aucres
# Run the training loop; train() returns the (model, aucres) tuple, which the
# notebook shows as this cell's result.
train()