# 1. Preprocess
This section is adapted from MAGNN (https://github.com/cynricfu/MAGNN).

Before running, you should:

1. Download the required `glove.6B.50d.txt` file from the MAGNN repository,
   and download `DBLP4057_GAT_with_idx.mat` from HAN (https://github.com/Jhy1993/HAN).

    Save both files to `./data`.

2. Install nltk and download the nltk data packages used by this notebook (see the cell below).
 

In [None]:
import nltk

# Behind a proxy, or to pick packages interactively, uncomment one of these:
# nltk.set_proxy('http://127.0.0.1:1080')
# nltk.download_gui()

# NLTK data needed by the cells below:
#   wordnet, omw-1.4 -> WordNet data for WordNetLemmatizer
#   stopwords        -> nltk.corpus.stopwords
#   punkt, punkt_tab -> word_tokenize; recent NLTK releases load 'punkt_tab'
#                       instead of 'punkt', so both are fetched
for resource in ('wordnet', 'omw-1.4', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [1]:
import pathlib
import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd
import pickle
import networkx as nx 
import utils.preprocess
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stopwords
from utils.data import load_glove_vectors
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Number of node types in the graph (authors, papers, terms, conferences).
num_ntypes = 4
# Directory holding the raw DBLP text tables.
read_perfix = './data/DBLP/'
# Directory that receives every preprocessed artifact.
save_prefix = './data/DBLP_processed/'

In [5]:
def _read_tsv(file_name, columns, encoding='utf-8'):
    """Load one header-less, tab-separated table from ``read_perfix``.

    Empty fields are kept as empty strings (``keep_default_na=False``)
    instead of being converted to NaN.
    """
    return pd.read_csv(
        read_perfix + file_name,
        sep='\t',
        header=None,
        names=columns,
        keep_default_na=False,
        encoding=encoding,
    )


author_label = _read_tsv('author_label.txt', ['author_id', 'label', 'author_name'])
paper_author = _read_tsv('paper_author.txt', ['paper_id', 'author_id'])
paper_conf = _read_tsv('paper_conf.txt', ['paper_id', 'conf_id'])
paper_term = _read_tsv('paper_term.txt', ['paper_id', 'term_id'])
# paper.txt is the only table read as cp1252 rather than utf-8.
papers = _read_tsv('paper.txt', ['paper_id', 'paper_title'], encoding='cp1252')
terms = _read_tsv('term.txt', ['term_id', 'term'])
confs = _read_tsv('conf.txt', ['conf_id', 'conf'])

In [6]:
# Dimensionality of the pretrained GloVe vectors; also the width of the term
# feature matrix built further down.
glove_dim = 50
# Lookup of pretrained vectors provided by the project's utils.data helper.
# It is later queried by term string (``.get(term, ...)`` / ``['the']``).
glove_vectors = load_glove_vectors(dim=glove_dim)

Loading GloVe pretrained word vectors
Done. 400000 words loaded!


In [7]:
# filter out all nodes that are not associated with labeled authors
def _restrict(frame, column, allowed):
    """Return the rows of ``frame`` whose ``column`` is in ``allowed``, re-indexed from 0."""
    return frame[frame[column].isin(allowed)].reset_index(drop=True)


# Authorship edges are kept only when the author has a label.
labeled_authors = author_label['author_id'].to_list()
paper_author = _restrict(paper_author, 'author_id', labeled_authors)

# Papers (and their conference / term edges) survive only if a labeled author wrote them.
valid_papers = paper_author['paper_id'].unique()
papers = _restrict(papers, 'paper_id', valid_papers)
paper_conf = _restrict(paper_conf, 'paper_id', valid_papers)
paper_term = _restrict(paper_term, 'paper_id', valid_papers)

# Terms survive only if one of the remaining papers uses them.
valid_terms = paper_term['term_id'].unique()
terms = _restrict(terms, 'term_id', valid_terms)

In [10]:
# term lemmatization and grouping
lemmatizer = WordNetLemmatizer()

# Lemma of every term, in table order.
lemma_list = [lemmatizer.lemmatize(term) for term in terms['term']]

# All terms sharing a lemma are represented by the id of the first term
# (in table order) that produced that lemma.
lemma_id_mapping = {}
lemma_id_list = [
    lemma_id_mapping.setdefault(lemma, term_id)
    for term_id, lemma in zip(terms['term_id'], lemma_list)
]
terms['lemma'] = lemma_list
terms['lemma_id'] = lemma_id_list

print(terms)
print(paper_term)

# Re-point every paper-term edge at the representative id of its term's lemma.
# A plain dict lookup is used on purpose: an edge whose term is missing from
# ``terms`` raises KeyError instead of silently becoming NaN.
term_lemma_mapping = dict(zip(terms['term_id'], terms['lemma_id']))
lemma_id_list = [term_lemma_mapping[term_id] for term_id in paper_term['term_id']]
paper_term['lemma_id'] = lemma_id_list

# Collapse both tables onto the lemma ids; merging terms can create duplicate
# rows, which are dropped.
paper_term = (
    paper_term[['paper_id', 'lemma_id']]
    .rename(columns={'lemma_id': 'term_id'})
    .drop_duplicates()
)
terms = (
    terms[['lemma_id', 'lemma']]
    .rename(columns={'lemma_id': 'term_id', 'lemma': 'term'})
    .drop_duplicates()
)

      term_id         term        lemma  lemma_id
0           1          the          the         1
1           2    automatic    automatic         2
2           3  acquisition  acquisition         3
3           4           of           of         4
4           5        proof        proof         5
...       ...          ...          ...       ...
8893    13567      gapprox      gapprox     13567
8894    13568       poetry       poetry     13568
8895    13569       estmax       estmax     13569
8896    13570        zonal        zonal     13570
8897    13571    fractures     fracture     13571

[8898 rows x 4 columns]
        paper_id  term_id
0           7601        7
1           7601        8
2           7601        9
3           7601       10
4           7601       11
...          ...      ...
114268    654269      580
114269    654269      723
114270    654269      902
114271    654269     1653
114272    654269     2229

[114273 rows x 2 columns]


In [11]:
# filter out stopwords from terms
# Combined stopword list: scikit-learn's English set plus NLTK's English list.
stopwords = sklearn_stopwords.union(nltk_stopwords.words('english'))

# Rows of the term table whose text is a stopword.
is_stopword = terms['term'].isin(stopwords)
stopword_id_list = terms.loc[is_stopword, 'term_id'].to_list()

# Drop the stopword terms and every paper-term edge that points at one.
uses_stopword = paper_term['term_id'].isin(stopword_id_list)
paper_term = paper_term[~uses_stopword].reset_index(drop=True)
terms = terms[~is_stopword].reset_index(drop=True)

In [14]:
# Order every node table by its id and renumber the index from 0, so that a
# row's index can serve as its position within its node type.
author_label, papers, terms, confs = (
    frame.sort_values(id_column).reset_index(drop=True)
    for frame, id_column in (
        (author_label, 'author_id'),
        (papers, 'paper_id'),
        (terms, 'term_id'),
        (confs, 'conf_id'),
    )
)

In [15]:
# extract labels of authors
# author_label was sorted by author_id above, so labels[i] is the label of the
# author that receives node index i in the adjacency matrix built below.
labels = author_label['label'].to_numpy()
print(labels)

[2 2 3 ... 0 0 0]


In [16]:
# build the adjacency matrix for the graph consisting of authors, papers, terms and conferences
# 0 for authors, 1 for papers, 2 for terms, 3 for conferences
# Nodes are numbered type by type: authors first, then papers, terms, conferences.
paper_offset = len(author_label)
term_offset = paper_offset + len(papers)
conf_offset = term_offset + len(terms)
dim = conf_offset + len(confs)

# type_mask[k] is the node type of global node index k.
type_mask = np.zeros((dim), dtype=int)
type_mask[paper_offset:term_offset] = 1
type_mask[term_offset:conf_offset] = 2
type_mask[conf_offset:] = 3

# Raw id -> global node index. Each table was sorted and re-indexed from 0
# above, so row position plus the type offset is the node index.
author_id_mapping = dict(zip(author_label['author_id'], range(paper_offset)))
paper_id_mapping = dict(zip(papers['paper_id'], range(paper_offset, term_offset)))
term_id_mapping = dict(zip(terms['term_id'], range(term_offset, conf_offset)))
conf_id_mapping = dict(zip(confs['conf_id'], range(conf_offset, dim)))


def _link(matrix, edges, other_column, other_mapping):
    """Mark every (paper, other) pair of ``edges`` as an undirected edge in ``matrix``.

    Ids are translated with plain dict lookups, so an id that is missing from
    a mapping raises KeyError.
    """
    paper_idx = np.array([paper_id_mapping[key] for key in edges['paper_id']], dtype=np.intp)
    other_idx = np.array([other_mapping[key] for key in edges[other_column]], dtype=np.intp)
    matrix[paper_idx, other_idx] = 1
    matrix[other_idx, paper_idx] = 1


# NOTE: this is a dense dim x dim integer matrix; with the 26128 nodes of the
# recorded run that is several GB at 8 bytes per entry.
adjM = np.zeros((dim, dim), dtype=int)
_link(adjM, paper_author, 'author_id', author_id_mapping)
_link(adjM, paper_term, 'term_id', term_id_mapping)
_link(adjM, paper_conf, 'conf_id', conf_id_mapping)
print(adjM)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [17]:
# use HAN paper's preprocessed data as the features of authors (https://github.com/Jhy1993/HAN)
mat = scipy.io.loadmat('data/DBLP4057_GAT_with_idx.mat')
# Pair each feature row with the author id at the same position of
# ``labeled_authors`` (file order of author_label.txt), then reorder by author
# id so the rows line up with the sorted author table.
# NOTE(review): assumes mat['features'] is stored in that file order - confirm.
paired = sorted(zip(labeled_authors, mat['features']), key=lambda pair: pair[0])
dense_author_features = np.array([feature_row for _, feature_row in paired])
features_author = scipy.sparse.csr_matrix(dense_author_features)

In [18]:
# use bag-of-words representation of paper titles as the features of papers
class LemmaTokenizer:
    """Callable tokenizer that splits a document and lemmatizes each token."""

    def __init__(self):
        # One lemmatizer instance is reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the WordNet lemma of every ``word_tokenize`` token of ``doc``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))
# ``stopwords`` is a frozenset, but scikit-learn documents ``stop_words`` as
# 'english', a list, or None, and its parameter validation rejects other
# types. Pass a sorted list: same words, accepted type.
# Tokens seen in fewer than two titles are dropped (min_df=2).
vectorizer = CountVectorizer(min_df=2, stop_words=sorted(stopwords), tokenizer=LemmaTokenizer())
features_paper = vectorizer.fit_transform(papers['paper_title'].values)

  % sorted(inconsistent)


In [19]:
# use pretrained GloVe vectors as the features of terms
# One row per term, in term-table order. A term without a pretrained vector
# falls back to the vector of 'the'.
features_term = np.zeros((len(terms), glove_dim))
for row_idx, term_text in terms['term'].items():
    features_term[row_idx] = glove_vectors.get(term_text, glove_vectors['the'])

## save dataset

In [None]:
# author_label.to_csv(save_prefix + 'author_label.csv')
# papers.to_csv(save_prefix + 'papers.csv')
# terms.to_csv(save_prefix + 'terms.csv')
# confs.to_csv(save_prefix + 'confs.csv')

# 2. Schema instances

We search for schema instances in the cleaned dataset.

In [None]:
# Decomposition plan for the schema. The integers inside the 'main' lists are
# node-type ids (0 author, 1 paper, 2 term, 3 conference).
# NOTE(review): the exact meaning of the nesting and of the 'to' key is defined
# by utils.preprocess.plan_transform_v2 - confirm there.
plan = {0: {0: {'main': [0, 1, 3]}}, 1: {0: {'main': [1, 2], 'to': 0}}}  # output of Schema decomposition

# Convert the plan into the schema object consumed by the search cell below.
schema = utils.preprocess.plan_transform_v2(plan)
schema
# schema = {'stem': [0, 1, 3], 'branch': {0: [1, 2]}}  # or decompose manually

Prepare some operators (adj, type_mask, prefix_operator)

In [26]:
import scipy.sparse

# Sparse (list-of-lists) view of the dense adjacency matrix.
adj = scipy.sparse.lil_matrix(adjM)

# Number of nodes of each type, and the total node count.
raw_dims = [(type_mask == ntype).sum() for ntype in range(num_ntypes)]
dim = sum(raw_dims)
print(raw_dims)
print(dim)

# prefix_operator[t] is the first global index of node type t and the last
# entry is the total: [0, n0, n0+n1, ...], i.e. a running sum of raw_dims
# with a leading zero.
prefix_operator = np.concatenate(([0], np.cumsum(raw_dims))).astype(int)
print(prefix_operator)

[4057, 14328, 7723, 20]
26128
[    0  4057 18385 26108 26128]


## search instances

In [None]:
# Enumerate the instances of each schema chain in the graph, then combine them
# into schema subgraphs.
# NOTE(review): both helpers live in utils.preprocess; their exact return
# types are not visible here. The code below relies on ``subgraphs`` having a
# ``columns`` attribute (DataFrame-like).
chain_intances = utils.preprocess.get_intances(adj, type_mask, schema, prefix_operator)
#print(chain_intances)
subgraphs = utils.preprocess.get_schema_subgraphs_parallel(schema, chain_intances)
# Put the columns into sorted label order.
subgraphs = subgraphs[subgraphs.columns.sort_values()]
print('=======done=======')


In [36]:
# Convert the subgraph table to a plain ndarray. np.asarray yields the same
# array as ``.values`` for a DataFrame and is a no-op on an ndarray, so
# re-running this cell does not fail.
subgraphs = np.asarray(subgraphs)

## save

### split

In [35]:
# subgraphs train/validation/test splits
# The split is drawn over node indices, not over subgraphs: 10% of the nodes
# are held out for test and 2.5% of the rest for validation. A subgraph goes
# to test if it contains any test node, otherwise to validation if it contains
# any validation node, otherwise to train. The subgraph shares are therefore
# much larger than the node-level test_size values.
rand_seed = 33333333
train_val_idx, test_idx = train_test_split(np.arange(len(adjM)), test_size=0.1, random_state=rand_seed)

# Rows (subgraphs) that touch at least one held-out test node.
test_mask = np.isin(subgraphs, test_idx).any(axis=1)
subgraphs_test = subgraphs[test_mask]
subgraphs_tr_val = subgraphs[~test_mask]
print(subgraphs_test.shape[0]/len(subgraphs))  # share of subgraphs in test (about 30% in the recorded run)

train_idx, val_idx = train_test_split(train_val_idx, test_size=0.025, random_state=rand_seed)

# Among the remaining rows, those that touch at least one validation node.
val_mask = np.isin(subgraphs_tr_val, val_idx).any(axis=1)
subgraphs_val = subgraphs_tr_val[val_mask]
subgraphs_train = subgraphs_tr_val[~val_mask]
print(subgraphs_val.shape[0]/len(subgraphs))  # share of subgraphs in validation (about 10% in the recorded run)
print(len(subgraphs_train)/len(subgraphs))  # share of subgraphs in train (about 60% in the recorded run)

0.3030669656076822
0.09650154405854731
0.6004314903337704


In [None]:
# Create the output directory first: none of the writers below creates
# missing directories, so a fresh checkout would fail on the first save.
pathlib.Path(save_prefix).mkdir(parents=True, exist_ok=True)

np.savez(save_prefix + 'subgraphs_train_val_test.npz',
         subgraphs_train=subgraphs_train,
         subgraphs_val=subgraphs_val,
         subgraphs_test=subgraphs_test) # subgraph train&val&test
# save data
# NOTE(review): if ``schema`` is a dict (as the commented manual example
# suggests), np.save stores it as a pickled object array and loading it
# requires np.load(..., allow_pickle=True) - confirm against the loader.
np.save(save_prefix + 'schema.npy', schema) # schema
# type prefix
np.save(save_prefix + 'prefix_operator.npy',prefix_operator)
# subgraph table
np.save(save_prefix + 'subgraphs.npy', subgraphs)
# all nodes adjacency matrix
scipy.sparse.save_npz(save_prefix + 'adjM.npz', scipy.sparse.csr_matrix(adjM))
# all nodes (authors, papers, terms and conferences) features
# currently only have features of authors, papers and terms
scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(0), features_author)
scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(1), features_paper)
np.save(save_prefix + 'features_{}.npy'.format(2), features_term)
# all nodes (authors, papers, terms and conferences) type labels
np.save(save_prefix + 'node_types.npy', type_mask)
# author labels
np.save(save_prefix + 'labels.npy', labels)
