In [1]:
from HLab.hmd.text import get_window
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import numpy as np
import dgl
from HLab.hmd import Utilities as Util
from HLab.hmd.preprocessing import *
from HLab.hmd.text import *
from sklearn.model_selection import train_test_split
import torch.nn as nn
from dgl.nn import GraphConv
import torch.nn.functional as F

In [2]:
data, labels = fetch_20newsgroups(
        data_home=Util.get_data_path('20newsgroups'),
        subset='test',
        return_X_y=True
    )

In [3]:
edges_src = []
edges_dst = []
edge_features = []

In [4]:
preprocessor = StringPreprocessing()
preprocessor.add_handler(ToLowerCase())
preprocessor.add_handler(RemoveWhiteSpace())
preprocessor.add_handler(RemovePunctuation())
preprocessor.add_handler(EnglishTokenizer())

corpus = [preprocessor.execute(d) for d in data]

In [5]:
vectorizer = TfidfVectorizer(token_pattern=r"\S+")
tfidf_vec = vectorizer.fit_transform(corpus)
lexicon = vectorizer.vocabulary_

doc_nodes = len(corpus)
word_nodes = len(lexicon)
num_nodes = doc_nodes + word_nodes

print(doc_nodes)
print(word_nodes)

7532
98479


In [6]:
for idx, row in tqdm(enumerate(tfidf_vec), desc="generate tfidf edge"):
    for col_ind, value in zip(row.indices, row.data):
        edges_src.append(idx) # doc_id
        edges_dst.append(doc_nodes + col_ind) # word_id
        edge_features.append(value)

generate tfidf edge: 7532it [00:01, 7033.26it/s]


In [7]:
word_window_freq, word_pair_count, windows_count = get_window(corpus, 20)
pmi_edge_lst = count_pmi(word_window_freq, word_pair_count, windows_count, threshold=0)

Split by window: 100%|██████████| 7532/7532 [01:00<00:00, 124.49it/s]
Calculate pmi between words: 100%|██████████| 10150194/10150194 [00:10<00:00, 1001949.75it/s]


In [8]:
for edge_item in pmi_edge_lst:    
    w1_idx = doc_nodes + lexicon[edge_item[0]]
    w2_idx = doc_nodes + lexicon[edge_item[1]]
    edges_src.append(w1_idx) # word_1
    edges_dst.append(w2_idx) # word_2
    edge_features.append(edge_item[2])


In [9]:
labels = [lbl + 1 for lbl in labels]

In [10]:
print(len(labels))
print(doc_nodes)
print(word_nodes)

7532
7532
98479


In [11]:
word_labels = [0] * word_nodes
labels = labels + word_labels
print(len(labels))
print(labels[-20:])

106011
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [12]:
edges_src = torch.from_numpy(np.array(edges_src))
edges_dst = torch.from_numpy(np.array(edges_dst))

In [13]:
num_nodes = doc_nodes + word_nodes

In [14]:
graph = dgl.graph(
    (edges_src, edges_dst), num_nodes=num_nodes
)

In [15]:
graph = dgl.add_reverse_edges(graph) # chuyển về đồ thị vô hướng
graph = dgl.add_self_loop(graph) # + eye matrix

In [16]:
graph.ndata["labels"] = torch.from_numpy(np.array(labels))

In [19]:
print(graph.num_nodes())

106011


In [18]:
import scipy.sparse as sp

wMatrix = sp.csr_matrix((edge_features, (np.array(edges_src), edges_dst)), shape=(graph.num_nodes(), graph.num_nodes()))
print(wMatrix)

0.07703137119748274


In [None]:
# row = list(range(num_nodes))
# col = list(range(num_nodes))
# value = [1.] * num_nodes
# shape = (num_nodes, num_nodes)
# indices = torch.from_numpy(np.vstack((row, col)).astype(np.int64))
# values = torch.FloatTensor(value)
# shape = torch.Size(shape)

# graph.ndata["features"] =  torch.sparse_coo_tensor(indices, values, shape)

In [None]:
node_indices = [i for i in range(num_nodes)]

x_train, x_test, y_train, y_test = train_test_split(node_indices, labels, test_size=0.33, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.33, random_state=42)

In [None]:
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

In [None]:
train_mask[x_train] = True
val_mask[val_mask] = True
test_mask[test_mask] = True


graph.ndata["train_mask"] = train_mask
graph.ndata["val_mask"] = val_mask
graph.ndata["test_mask"] = test_mask

In [None]:
class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h


In [None]:
def train(g, model):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    best_val_acc = 0
    best_test_acc = 0

    features = g.ndata["features"]
    labels = g.ndata["labels"]
    train_mask = g.ndata["train_mask"]
    val_mask = g.ndata["val_mask"]
    test_mask = g.ndata["test_mask"]
    for e in range(100):
        # Forward
        logits = model(g, features)

        # Compute prediction
        pred = logits.argmax(1)

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        # Compute accuracy on training/validation/test
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 5 == 0:
            print(
                f"In epoch {e}, loss: {loss:.3f}, val acc: {val_acc:.3f} (best {best_val_acc:.3f}), test acc: {test_acc:.3f} (best {best_test_acc:.3f})"
            )

In [None]:
num_classes = len(set(labels)) + 1
model = GCN(graph.ndata["features"].shape[1], 16, num_classes)

In [None]:
train(graph, model)

In epoch 0, loss: 3.091, val acc: nan (best 0.000), test acc: nan (best 0.000)
In epoch 5, loss: 2.874, val acc: nan (best 0.000), test acc: nan (best 0.000)
In epoch 10, loss: 2.560, val acc: nan (best 0.000), test acc: nan (best 0.000)
In epoch 15, loss: 2.159, val acc: nan (best 0.000), test acc: nan (best 0.000)
In epoch 20, loss: 1.710, val acc: nan (best 0.000), test acc: nan (best 0.000)
In epoch 25, loss: 1.286, val acc: nan (best 0.000), test acc: nan (best 0.000)
In epoch 30, loss: 0.963, val acc: nan (best 0.000), test acc: nan (best 0.000)


KeyboardInterrupt: 