In [1]:
from HLab.hmd.text import get_window
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import numpy as np
import dgl
from HLab.hmd import Utilities as Util
from HLab.hmd.preprocessing import *
from HLab.hmd.text import *
from sklearn.model_selection import train_test_split
import torch.nn as nn
from dgl.nn import GraphConv
import torch.nn.functional as F

In [2]:
data, labels = fetch_20newsgroups(
        data_home=Util.get_data_path('20newsgroups'),
        # categories=['alt.atheism', 'comp.graphics'],
        subset='all',
        return_X_y=True
    )

In [3]:
print(data[0])
print(labels[0])

From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


10


In [4]:
edges_src = []
edges_dst = []
edge_features = []

In [5]:
preprocessor = StringPreprocessing()
preprocessor.add_handler(ToLowerCase())
preprocessor.add_handler(RemoveWhiteSpace())
preprocessor.add_handler(RemovePunctuation())
preprocessor.add_handler(EnglishTokenizer())

corpus = [preprocessor.execute(d) for d in data]

In [6]:
vectorizer = TfidfVectorizer(token_pattern=r"\S+")
tfidf_vec = vectorizer.fit_transform(corpus)
lexicon = vectorizer.vocabulary_

doc_nodes = len(corpus)
word_nodes = len(lexicon)
num_nodes = doc_nodes + word_nodes

print(doc_nodes)
print(word_nodes)

18846
187434


In [7]:
for idx, row in tqdm(enumerate(tfidf_vec), desc="generate tfidf edge"):
    for col_ind, value in zip(row.indices, row.data):
        edges_src.append(idx) # doc_id
        edges_dst.append(doc_nodes + col_ind) # word_id
        edge_features.append(value)

generate tfidf edge: 18846it [00:02, 6951.17it/s]


In [8]:
# print(len(edges_src))
# print(len(edges_dst))
# print(len(edge_features))

In [9]:
word_window_freq, word_pair_count, windows_count = get_window(corpus, 20)
pmi_edge_lst = count_pmi(word_window_freq, word_pair_count, windows_count, threshold=0)

Split by window: 100%|██████████| 18846/18846 [05:09<00:00, 60.86it/s]  
Calculate pmi between words: 100%|██████████| 20661602/20661602 [00:46<00:00, 447209.34it/s] 


In [10]:
for edge_item in pmi_edge_lst:    
    w1_idx = doc_nodes + lexicon[edge_item[0]]
    w2_idx = doc_nodes + lexicon[edge_item[1]]
    edges_src.append(w1_idx) # word_1
    edges_dst.append(w2_idx) # word_2
    edge_features.append(edge_item[2])


In [11]:
import scipy.sparse as sp


# adj = sp.coo_matrix(
#     (
#         np.ones(len(edge_features)), 
#         (edges_src, edges_dst)
#     ),
#     shape=(num_nodes, num_nodes),
#     dtype=np.float32
# )

g = dgl.graph(
    (torch.tensor(edges_src), torch.tensor(edges_dst))
)

print(g)

Graph(num_nodes=206280, num_edges=17994990,
      ndata_schemes={}
      edata_schemes={})


In [12]:
g = dgl.add_reverse_edges(g) # chuyển về đồ thị vô hướng
g = dgl.add_self_loop(g) # + eye matrix

In [13]:
features = sp.coo_matrix(
    (edge_features, (edges_src, edges_dst)), 
    shape=(num_nodes, num_nodes)
)

In [14]:
labels = [lbl + 1 for lbl in labels]
word_labels = [0] * word_nodes
labels = labels + word_labels

In [15]:
values = features.data
indices = np.vstack((features.row, features.col))
i = torch.LongTensor(indices)
v = torch.FloatTensor(values)
shape = features.shape


g.ndata["features"] = torch.sparse_coo_tensor(i, v, torch.Size(shape))
g.ndata["label"] = torch.LongTensor(labels)

In [16]:
node_indices = [i for i in range(doc_nodes)]

x_train, x_test, y_train, y_test = train_test_split(node_indices, node_indices, test_size=0.33, random_state=42, shuffle=False)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.33, random_state=42, shuffle=False)    

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[x_train] = True
val_mask[x_val] = True
test_mask[x_test] = True

g.ndata["train_mask"] = train_mask
g.ndata["val_mask"] = val_mask
g.ndata["test_mask"] = test_mask

num_classes = len(set(labels)) + 1

In [17]:
class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.dropout(h)
        h = self.conv2(g, h)
        return h



def normalize(adj):
    """ normalize adjacency matrix with normalization-trick that is faithful to
    the original paper.

    Arguments:
        a (scipy.sparse.coo_matrix): Unnormalied adjacency matrix

    Returns:
        scipy.sparse.coo_matrix: Normalized adjacency matrix
    """
    # no need to add identity matrix because self connection has already been added
    # a += sp.eye(a.shape[0])
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    # ~D in the GCN paper
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt)



def normalize_pygcn(adj):
    """ normalize adjacency matrix with normalization-trick. This variant
    is proposed in https://github.com/tkipf/pygcn .
    Refer https://github.com/tkipf/pygcn/issues/11 for the author's comment.

    Arguments:
        a (scipy.sparse.coo_matrix): Unnormalied adjacency matrix

    Returns:
        scipy.sparse.coo_matrix: Normalized adjacency matrix
    """
    # no need to add identity matrix because self connection has already been added
    # a += sp.eye(a.shape[0])
    rowsum = np.array(adj.sum(1))
    rowsum_inv = np.power(rowsum, -1).flatten()
    rowsum_inv[np.isinf(rowsum_inv)] = 0.
    # ~D in the GCN paper
    d_tilde = sp.diags(rowsum_inv)
    return d_tilde.dot(adj)


def train(g, model):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.02)
    best_val_acc = 0
    best_test_acc = 0

    features = g.ndata["features"]
    labels = g.ndata["label"]
    train_mask = g.ndata["train_mask"]
    val_mask = g.ndata["val_mask"]
    test_mask = g.ndata["test_mask"]
    for e in range(200):
        # Forward
        logits = model(g, features)

        # Compute prediction
        pred = logits.argmax(1)
       
        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        # Compute accuracy on training/validation/test
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 5 == 0:
            print(
                f"In epoch {e}, loss: {loss:.3f}, val acc: {val_acc:.3f} (best {best_val_acc:.3f}), test acc: {test_acc:.3f} (best {best_test_acc:.3f})"
            )

In [18]:
model = GCN(g.ndata["features"].shape[1], 200, num_classes)
train(g, model)

In epoch 0, loss: 3.091, val acc: 0.049 (best 0.049), test acc: 0.051 (best 0.051)
In epoch 5, loss: 20.484, val acc: 0.086 (best 0.112), test acc: 0.085 (best 0.120)
In epoch 10, loss: 19.225, val acc: 0.079 (best 0.112), test acc: 0.074 (best 0.120)
In epoch 15, loss: 2.845, val acc: 0.108 (best 0.112), test acc: 0.109 (best 0.120)
In epoch 20, loss: 2.728, val acc: 0.193 (best 0.193), test acc: 0.207 (best 0.207)
In epoch 25, loss: 3.151, val acc: 0.189 (best 0.216), test acc: 0.194 (best 0.228)
In epoch 30, loss: 2.270, val acc: 0.230 (best 0.303), test acc: 0.247 (best 0.320)
In epoch 35, loss: 1.804, val acc: 0.450 (best 0.450), test acc: 0.441 (best 0.441)
In epoch 40, loss: 1.511, val acc: 0.469 (best 0.500), test acc: 0.460 (best 0.523)


KeyboardInterrupt: 