In [1]:
from HLab.hmd.text import get_window
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import numpy as np
import dgl
from HLab.hmd import Utilities as Util
from HLab.hmd.preprocessing import *
from HLab.hmd.text import *
from sklearn.model_selection import train_test_split
import torch.nn as nn
from dgl.nn import GraphConv
import torch.nn.functional as F

In [2]:
# Load the 'test' subset of 20 Newsgroups as (raw texts, integer labels).
# The dataset is stored under the directory returned by Util.get_data_path.
data, labels = fetch_20newsgroups(
    subset='test',
    return_X_y=True,
    data_home=Util.get_data_path('20newsgroups'),
)

In [3]:
# Show the first raw post followed by its label, one per line.
print(data[0], labels[0], sep="\n")

From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)
Subject: Need info on 88-89 Bonneville
Organization: University at Buffalo
Lines: 10
News-Software: VAX/VMS VNEWS 1.41
Nntp-Posting-Host: ubvmsd.cc.buffalo.edu


 I am a little confused on all of the models of the 88-89 bonnevilles.
I have heard of the LE SE LSE SSE SSEI. Could someone tell me the
differences are far as features or performance. I am also curious to
know what the book value is for prefereably the 89 model. And how much
less than book value can you usually get them for. In other words how
much are they in demand this time of year. I have heard that the mid-spring
early summer is the best time to buy.

			Neil Gandler

7


In [4]:
# Parallel accumulators: position k in each list describes edge k
# (source node id, destination node id, edge weight).
edges_src, edges_dst, edge_features = [], [], []

In [5]:
# Text-cleaning pipeline; handlers are registered in the order listed here.
preprocessor = StringPreprocessing()
for handler in (
    ToLowerCase(),
    RemoveWhiteSpace(),
    RemovePunctuation(),
    EnglishTokenizer(),
):
    preprocessor.add_handler(handler)

# Run every raw document through the pipeline.
corpus = list(map(preprocessor.execute, data))

In [6]:
# Fit tf-idf on the cleaned corpus; every run of non-whitespace characters is a token.
vectorizer = TfidfVectorizer(token_pattern=r"\S+")
tfidf_vec = vectorizer.fit_transform(corpus)
lexicon = vectorizer.vocabulary_  # token -> column index in tfidf_vec

# Node layout used by the cells below: document nodes come first, word nodes
# follow, so word column c becomes node id doc_nodes + c.
doc_nodes, word_nodes = len(corpus), len(lexicon)
num_nodes = doc_nodes + word_nodes

print(doc_nodes, word_nodes, sep="\n")

7532
98479


In [7]:
# One document -> word edge per non-zero tf-idf entry, weighted by that entry.
for doc_id, doc_row in tqdm(enumerate(tfidf_vec), desc="generate tfidf edge"):
    for word_col, tfidf_weight in zip(doc_row.indices, doc_row.data):
        word_id = doc_nodes + word_col  # word nodes are numbered after the documents
        edges_src.append(doc_id)
        edges_dst.append(word_id)
        edge_features.append(tfidf_weight)

generate tfidf edge: 7532it [00:01, 7005.86it/s]


In [8]:
# Co-occurrence statistics from get_window, then PMI-weighted word pairs.
# NOTE(review): 20 is presumably the sliding-window size and threshold=0 the
# minimum PMI kept -- confirm against HLab.hmd.text.
window_stats = get_window(corpus, 20)
word_window_freq, word_pair_count, windows_count = window_stats
pmi_edge_lst = count_pmi(
    word_window_freq,
    word_pair_count,
    windows_count,
    threshold=0,
)

Split by window: 100%|██████████| 7532/7532 [01:01<00:00, 122.93it/s]
Calculate pmi between words: 100%|██████████| 10220817/10220817 [00:08<00:00, 1196824.40it/s]


In [9]:
# One word -> word edge per PMI entry; each entry is read as (word, word, weight).
for pmi_edge in pmi_edge_lst:
    first_word, second_word, pmi_score = pmi_edge[0], pmi_edge[1], pmi_edge[2]
    # Map vocabulary columns to node ids (word nodes follow the document nodes).
    edges_src.append(doc_nodes + lexicon[first_word])
    edges_dst.append(doc_nodes + lexicon[second_word])
    edge_features.append(pmi_score)



In [10]:
# Shift every document label up by one so that id 0 stays free for word nodes.
# Caution: this rebinds `labels`, so running the cell twice shifts twice.
labels = [class_id + 1 for class_id in labels]

In [11]:
# Sanity check: number of labels, document nodes and word nodes.
print(len(labels), doc_nodes, word_nodes, sep="\n")

7532
7532
98479


In [12]:
# Word nodes all receive label 0; append them after the document labels so the
# label list lines up with the node numbering (documents first, then words).
word_labels = [0 for _ in range(word_nodes)]
labels = [*labels, *word_labels]
print(len(labels), labels[-20:], sep="\n")

106011
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [13]:
# Convert both endpoint lists to tensors (via NumPy) for dgl.graph.
# From here on edges_src / edges_dst are tensors, not Python lists.
edges_src, edges_dst = (
    torch.from_numpy(np.array(endpoint_ids))
    for endpoint_ids in (edges_src, edges_dst)
)

In [14]:
# Total node count (word nodes plus document nodes); same value as computed
# alongside the tf-idf matrix.
num_nodes = word_nodes + doc_nodes

In [15]:
# Build the DGL graph from the (source, destination) tensors; num_nodes is passed
# explicitly so the node count does not depend on which ids appear in the edges.
edge_endpoints = (edges_src, edges_dst)
graph = dgl.graph(edge_endpoints, num_nodes=num_nodes)

In [16]:
# Inner call: add the reverse of every edge (turn the graph into an undirected one).
# Outer call: add a self loop on every node (the "+ identity matrix" step).
graph = dgl.add_self_loop(dgl.add_reverse_edges(graph))

In [17]:
# Attach one label per node (documents first, then words) to the graph.
label_array = np.array(labels)
graph.ndata["labels"] = torch.from_numpy(label_array)

In [18]:
import scipy.sparse as sp


def normalize(adj):
    """Symmetrically normalize an adjacency matrix: D^-1/2 * A * D^-1/2.

    D is the diagonal matrix of row sums of ``adj``. No identity matrix is
    added here; the caller is expected to have added self connections already.

    Arguments:
        adj (scipy.sparse matrix): Unnormalized adjacency matrix.

    Returns:
        scipy.sparse matrix: Normalized adjacency matrix. The sparse format is
        whatever SciPy's matrix product yields, not necessarily COO.
    """
    # Row sums as a flat float vector (adj.sum(1) may come back 2-D).
    rowsum = np.asarray(adj.sum(1), dtype=np.float64).flatten()

    # Rows that sum to zero keep a scaling factor of 0. Computing the power only
    # on the non-zero entries avoids the divide-by-zero RuntimeWarning that
    # np.power(0, -0.5) would emit.
    d_inv_sqrt = np.zeros_like(rowsum)
    nonzero = rowsum != 0
    d_inv_sqrt[nonzero] = np.power(rowsum[nonzero], -0.5)

    # ~D^-1/2 in the GCN paper
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt)



def normalize_pygcn(adj):
    """Row-normalize an adjacency matrix: D^-1 * A.

    This variant is proposed in https://github.com/tkipf/pygcn .
    Refer https://github.com/tkipf/pygcn/issues/11 for the author's comment.
    No identity matrix is added here; the caller is expected to have added
    self connections already.

    Arguments:
        adj (scipy.sparse matrix): Unnormalized adjacency matrix. Integer and
            floating-point matrices are both accepted.

    Returns:
        scipy.sparse matrix: Normalized adjacency matrix. The sparse format is
        whatever SciPy's matrix product yields, not necessarily COO.
    """
    # Cast to float first: NumPy refuses to raise integer arrays to a negative
    # integer power, so an integer-typed adjacency matrix used to fail here.
    rowsum = np.asarray(adj.sum(1), dtype=np.float64).flatten()

    # Rows that sum to zero keep a scaling factor of 0; inverting only the
    # non-zero entries avoids a divide-by-zero RuntimeWarning.
    rowsum_inv = np.zeros_like(rowsum)
    nonzero = rowsum != 0
    rowsum_inv[nonzero] = np.power(rowsum[nonzero], -1.0)

    # ~D^-1 in the GCN paper
    d_tilde = sp.diags(rowsum_inv)
    return d_tilde.dot(adj)

In [19]:
# shape = (num_nodes, num_nodes)
# indices = torch.from_numpy(np.vstack((edges_src, edges_dst)).astype(np.int64))
# values = torch.FloatTensor(edge_features)
# shape = torch.Size(shape)

# features = torch.sparse_coo_tensor(indices, values, shape)
# print(type(features))
# # graph.ndata["features"] = normalize(features)


from scipy.sparse import coo_matrix

# Build the weighted adjacency matrix from the collected edges plus one self
# loop of weight 1 per node (the "eye" term), then normalize it.
#
# edges_src / edges_dst were converted to torch tensors in an earlier cell, so
# they cannot be grown with .append(); and num_nodes is an int, so
# range(len(num_nodes)) raised TypeError. Concatenating NumPy arrays avoids both
# problems and leaves the shared edge lists untouched, so re-running this cell
# gives the same result.
self_loop_ids = np.arange(num_nodes, dtype=np.int64)
adj_rows = np.concatenate([np.asarray(edges_src, dtype=np.int64), self_loop_ids])
adj_cols = np.concatenate([np.asarray(edges_dst, dtype=np.int64), self_loop_ids])
adj_vals = np.concatenate(
    [np.asarray(edge_features, dtype=np.float64), np.ones(num_nodes, dtype=np.float64)]
)

adj = coo_matrix((adj_vals, (adj_rows, adj_cols)), shape=(num_nodes, num_nodes))


adj_norm = normalize(adj)

: 

In [19]:
# Show the cleaned text and the (shifted) label of document i.
i = 0
print(corpus[i], labels[i], sep="\n")

from v064mb9kubvmsdccbuffaloedu neil b gandler subject need info on 8889 bonneville organization university at buffalo lines 10 newssoftware vaxvms vnews 141 nntppostinghost ubvmsdccbuffaloedu i am a little confused on all of the models of the 8889 bonnevilles i have heard of the le se lse sse ssei could someone tell me the differences are far as features or performance i am also curious to know what the book value is for prefereably the 89 model and how much less than book value can you usually get them for in other words how much are they in demand this time of year i have heard that the midspring early summer is the best time to buy neil gandler
8


In [32]:
# Spot check: compare one entry of row i of `features` with the tf-idf row of
# document 0 printed below.
# NOTE(review): `features` is not assigned by any visible cell (its only
# construction is commented out earlier), so on a fresh kernel this line is
# expected to raise NameError -- confirm where `features` should come from.
dat_i = features[i]
# for idx, val in enumerate(dat_i):
#     if val.numpy() != 0:
#         print(idx)

# 85778 is one of the vocabulary columns listed for document 0 by the print
# below; adding doc_nodes turns it into the corresponding word-node index.
print(dat_i[85778+doc_nodes])
    
print(tfidf_vec[0])

tensor(0.1082)
  (0, 22275)	0.07703137119748274
  (0, 19618)	0.06618728473856625
  (0, 85778)	0.10818219952536623
  (0, 34111)	0.08827121802476652
  (0, 59883)	0.177224866723644
  (0, 87975)	0.026082547669486207
  (0, 97657)	0.06894060223817204
  (0, 88690)	0.09831481888634297
  (0, 88280)	0.029295757202467628
  (0, 30743)	0.11361619870204591
  (0, 88190)	0.037868017944496825
  (0, 95971)	0.0826568359989083
  (0, 67904)	0.045579393315896714
  (0, 46908)	0.04498579100579172
  (0, 88037)	0.04650826076532019
  (0, 40900)	0.04419955617778666
  (0, 92332)	0.08395122624587281
  (0, 97856)	0.03039177515100854
  (0, 23273)	0.0357510300495787
  (0, 87952)	0.04646600901917184
  (0, 53349)	0.07191621527809072
  (0, 62913)	0.10529879385363713
  (0, 45385)	0.0892523736246828
  (0, 15540)	0.022718123388412535
  (0, 61342)	0.08972224993144326
  :	:
  (0, 15187)	0.10126610859075477
  (0, 45972)	0.09481744138039236
  (0, 90813)	0.142835899691619
  (0, 65423)	0.03517416052146206
  (0, 2380)	0.0997962996

In [None]:
# Split node ids (and their labels) into train / test / validation sets.
# First split: 67% train vs 33% hold-out. Second split: the hold-out is divided
# again, 67% test vs 33% validation. Both use the same fixed random_state.
# NOTE(review): node_indices covers word nodes as well as documents -- confirm
# that word nodes (label 0) are meant to take part in the splits.
node_indices = list(range(num_nodes))

x_train, holdout_idx, y_train, holdout_lbl = train_test_split(
    node_indices, labels, test_size=0.33, random_state=42
)
x_test, x_val, y_test, y_val = train_test_split(
    holdout_idx, holdout_lbl, test_size=0.33, random_state=42
)

In [None]:
# Three independent all-False boolean masks, one entry per node.
train_mask, val_mask, test_mask = (
    torch.zeros(num_nodes, dtype=torch.bool) for _ in range(3)
)

In [None]:
# Mark the nodes of each split in its mask. Each mask must be indexed with the
# node ids of its own split: indexing an all-False mask with itself selects
# nothing, which left the validation and test masks empty.
train_mask[x_train] = True
val_mask[x_val] = True
test_mask[x_test] = True


# Store the masks on the graph so the training loop can read them from ndata.
graph.ndata["train_mask"] = train_mask
graph.ndata["val_mask"] = val_mask
graph.ndata["test_mask"] = test_mask

In [None]:
class GCN(nn.Module):
    """Two stacked GraphConv layers with a ReLU in between.

    Arguments:
        in_feats: Size of each input node feature vector.
        h_feats: Size of the hidden representation produced by the first layer.
        num_classes: Size of the output produced by the second layer.
    """

    def __init__(self, in_feats, h_feats, num_classes):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        """Return the second layer's output for graph ``g`` and features ``in_feat``."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)


In [None]:
def train(g, model, epochs=100, lr=0.02, log_every=5):
    """Train ``model`` on graph ``g`` with Adam and print progress.

    Reads "features", "labels", "train_mask", "val_mask" and "test_mask" from
    ``g.ndata``. The loss is cross-entropy over the training nodes only.
    Accuracies are computed from the logits of the forward pass, i.e. before
    that epoch's parameter update.

    Arguments:
        g: Graph object exposing the ``ndata`` entries listed above; it is
            passed to ``model`` unchanged.
        model: Module called as ``model(g, features)`` that returns one row of
            class logits per node.
        epochs (int): Number of full-graph training steps. Defaults to 100.
        lr (float): Adam learning rate. Defaults to 0.02.
        log_every (int): Print a progress line every ``log_every`` epochs;
            0 or None disables printing. Defaults to 5.

    Returns:
        None. Progress is reported through ``print`` only.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_val_acc = 0
    best_test_acc = 0

    features = g.ndata["features"]
    labels = g.ndata["labels"]
    train_mask = g.ndata["train_mask"]
    val_mask = g.ndata["val_mask"]
    test_mask = g.ndata["test_mask"]
    for e in range(epochs):
        # Forward
        logits = model(g, features)

        # Compute prediction
        pred = logits.argmax(1)

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        # Compute accuracy on training/validation/test
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Guard against log_every == 0 / None so logging can be switched off.
        if log_every and e % log_every == 0:
            print(
                f"In epoch {e}, loss: {loss:.3f}, val acc: {val_acc:.3f} (best {best_val_acc:.3f}), test acc: {test_acc:.3f} (best {best_test_acc:.3f})"
            )

In [None]:
# `labels` already contains the word class 0 next to the shifted document
# classes, so the number of classes is the largest label id plus one. The former
# len(set(labels)) + 1 counted the word class twice and allocated an output unit
# that no label ever uses.
num_classes = int(max(labels)) + 1
# NOTE(review): graph.ndata["features"] is not assigned by any visible cell --
# confirm it is set before this cell runs.
model = GCN(graph.ndata["features"].shape[1], 200, num_classes)

In [None]:
# Run the training loop on the assembled graph.
train(g=graph, model=model)