In [1]:
import re
import os
import numpy as np
import pandas as pd

#### Read Dataset
---

In [2]:
dataset = 'mr'
# Drop Jupyter's checkpoint folder from the listing when it is present.
# list.remove() would raise ValueError on a checkout where the folder does
# not exist, so filter instead of removing.
files = [
    name for name in os.listdir('data/{}'.format(dataset))
    if name != '.ipynb_checkpoints'
]

In [3]:
# read mr dataset

def clean_str(s):
    """Decode a raw bytes line as Latin-1 and delete every CR+LF pair in it.

    A lone line feed (no preceding carriage return) is left in place.
    """
    return re.sub(r'\r\n', '', s.decode('latin-1'))
    
def read_dataset(dataset, file):
    """Read ``data/<dataset>/<file>`` and parse it line by line.

    Every line is passed through ``clean_str``. When the file name contains
    ``'label'`` each line is converted to ``int``; otherwise each line is
    split on whitespace into a list of tokens.

    Returns:
        A list with one entry per line (ints or token lists).
    """
    path = 'data/{}/{}'.format(dataset, file)
    with open(path, 'rb') as handle:
        raw_lines = handle.readlines()

    cleaned = [clean_str(line) for line in raw_lines]
    if 'label' in file:
        return [int(line) for line in cleaned]
    return [line.split() for line in cleaned]
 

In [4]:
# Load the whitespace-tokenised texts and the integer labels of each split.
text_train, label_train = [
    read_dataset('mr', name) for name in ('text_train.txt', 'label_train.txt')
]

text_test, label_test = [
    read_dataset('mr', name) for name in ('text_test.txt', 'label_test.txt')
]

In [5]:
# Re-join every token list into one space-separated string per document.
text_train_str = [" ".join(tokens) for tokens in text_train]
text_test_str = [" ".join(tokens) for tokens in text_test]

# One frame per split: the joined text next to its label.
train_df = pd.DataFrame({'text': text_train_str, 'label': label_train})
test_df = pd.DataFrame({'text': text_test_str, 'label': label_test})

# Persist both splits as tab-separated files.
train_df.to_csv('data/mr/train_df.tsv', sep='\t', index_label=False)
test_df.to_csv('data/mr/test_df.tsv', sep='\t', index_label=False)

In [6]:
# Tag every row with the split it belongs to. `assign` returns new frames, so
# re-running this cell does not keep mutating train_df / test_df, and both
# frames receive the mask columns in the same order. Mis-aligned columns are
# what makes pd.concat sort them and warn that it "will change to not sort by
# default".
train_tagged = train_df.assign(train_mask=True, test_mask=False)
test_tagged = test_df.assign(train_mask=False, test_mask=True)

# Stack train on top of test with a fresh 0..n-1 index.
all_df = pd.concat([train_tagged, test_tagged], axis=0, ignore_index=True)

# String ids "doc_id_0", "doc_id_1", ... following the concatenation order.
all_df.insert(0, 'doc_id', 'doc_id_' + all_df.index.astype(str))

# Pin the column order explicitly so the exported file does not depend on the
# pandas version's concat-sorting behaviour.
all_df = all_df[['doc_id', 'label', 'test_mask', 'text', 'train_mask']]

# Make sure the target folder exists before writing (fresh checkouts lack it).
os.makedirs('data/mr/raw', exist_ok=True)
all_df.to_csv('data/mr/raw/all_df_mask.tsv', index_label=False, sep='\t')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [7]:
import itertools
import pandas as pd
import numpy as np

class Vocab(object):
  """Bidirectional mapping between tokens and integer indices.

  Args:
    L: either a list of token lists (one list per document) or a flat
      sequence of tokens. Only the nested form gets the ``"<unk>"`` entry,
      which is used as the fallback for out-of-vocabulary words.
    doc_ids: optional sequence of extra entries placed *before* the sorted
      tokens, so they receive indices ``0 .. len(doc_ids) - 1``.

  Attributes (set in ``__init__``):
    token_counts: single-column DataFrame of token frequencies, indexed by
      token and sorted by token.
    vocab: list of entries; position in the list is the entry's index.
    w2i / i2w: dicts mapping entry -> index and index -> entry.
  """

  # Placeholder entry that out-of-vocabulary words are mapped to.
  UNK = "<unk>"

  def __init__(self, L, doc_ids=None):
    # Guard the L[0] probe so an empty corpus does not raise IndexError.
    nested = len(L) > 0 and isinstance(L[0], list)
    if nested:
      tokens = list(itertools.chain.from_iterable(L))
      self.token_counts = pd.Series(tokens).value_counts().to_frame().sort_index(ascending=True)
      self.vocab = [self.UNK] + self.token_counts.index.to_list()
    else:
      self.token_counts = pd.Series(L).value_counts().to_frame().sort_index(ascending=True)
      self.vocab = self.token_counts.index.to_list()

    # De-duplicate and order deterministically.
    self.vocab = sorted(set(self.vocab))
    if doc_ids is not None:
      # list() lets callers pass any sequence (e.g. a numpy array) safely.
      self.vocab = list(doc_ids) + self.vocab

    self.w2i = dict(zip(self.vocab, range(len(self.vocab))))
    self.i2w = dict(zip(range(len(self.vocab)), self.vocab))

  @staticmethod
  def _stack_rows(rows):
    """Pack per-document rows into an array.

    Equal-length rows become a regular 2-D array. Rows of differing length
    become a 1-D object array holding one list per document, because
    ``np.array`` refuses ragged input on recent NumPy releases.
    """
    rows = list(rows)
    if len(set(len(row) for row in rows)) <= 1:
      return np.array(rows)
    packed = np.empty(len(rows), dtype=object)
    for position, row in enumerate(rows):
      packed[position] = row
    return packed

  def map_words2index(self, L):
    """Map a sequence of words to indices.

    Unknown words map to the ``"<unk>"`` index when the vocabulary has that
    entry; otherwise an unknown word raises ``KeyError``.
    """
    unk_index = self.w2i.get(self.UNK)
    if unk_index is None:
      return [self.w2i[word] for word in L]
    return [self.w2i.get(word, unk_index) for word in L]

  def map_index2words(self, L):
    """Map a sequence of indices back to their vocabulary entries."""
    return [self.i2w[index] for index in L]

  def map_dataset_words2index(self, L):
    """Apply ``map_words2index`` to every document in ``L``."""
    return self._stack_rows(map(self.map_words2index, L))

  def map_dataset_index2words(self, L):
    """Apply ``map_index2words`` to every document in ``L``."""
    return self._stack_rows(map(self.map_index2words, L))

  def get_counts(self):
    """Return the token-frequency DataFrame (the instance's own object, not a copy)."""
    return self.token_counts

In [8]:
# Build the vocabulary from the training token lists, then encode every
# training document as a sequence of vocabulary indices. The %time magic
# reports how long each statement takes.
%time vocab = Vocab(text_train)
%time train_x = vocab.map_dataset_words2index(text_train)

CPU times: user 82.8 ms, sys: 0 ns, total: 82.8 ms
Wall time: 82.5 ms
CPU times: user 67.4 ms, sys: 0 ns, total: 67.4 ms
Wall time: 67.2 ms


In [9]:
# get_counts() returns the Vocab's own frame, so sort into a new object
# instead of in place; otherwise this cell silently reorders vocab.token_counts.
# The single count column is addressed by position because its label differs
# between pandas versions (0 on older releases, "count" from pandas 2.0).
counts = vocab.get_counts()
counts = counts.sort_values(counts.columns[0])
counts.head()

Unnamed: 0,0
kirsten,1
mid-range,1
mid-section,1
mid-seventies,1
mid-to-low,1


#### Create Dataset

In [10]:
# Display the directory listing gathered in the `files` variable.
files

['train_df.tsv',
 'label_train.txt',
 'raw',
 'all_df_masks',
 'label_test.txt',
 'text_test.txt',
 'text_all.txt',
 'processed',
 'test_df.tsv',
 'text_train.txt']

In [162]:
import torch
import torch.nn as nn
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.utils import to_undirected, is_undirected

# Width of the per-node feature vectors produced by the nn.Embedding in MRDataset.process.
EMBED_DIM = 100

class MRDataset(InMemoryDataset):
    """Single-graph dataset that links each document node to its word nodes.

    ``process`` reads the raw ``all_df_mask.tsv`` file (columns used:
    ``doc_id``, ``text``, ``label``, ``train_mask``) and stores one ``Data``
    object with:

    * ``x``: randomly initialised features of width ``EMBED_DIM`` per node,
    * ``edge_index``: undirected document <-> word edges,
    * ``y``: document labels followed by ``-1`` for every non-document node,
    * ``train_mask`` / ``test_mask``: boolean masks, ``False`` on word nodes.

    Node indices are ``Vocab`` indices. The document ids are passed as
    ``doc_ids`` so they occupy the first indices, in file order.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super(MRDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return ['all_df_mask.tsv']

    @property
    def processed_file_names(self):
        return ['mr_train.pt']

    def process(self):
        df = pd.read_csv(self.raw_paths[0], sep="\t")

        texts = [text.split() for text in df.text.values]
        doc_ids = df.doc_id.values.tolist()
        labels = df.label.values
        is_train_dict = dict(zip(doc_ids, df.train_mask.values))
        vocab = Vocab(texts, doc_ids=doc_ids)

        # Node indices are exactly the vocabulary indices (see vocab.i2w).
        n_nodes = len(vocab.w2i)
        self.embed = nn.Embedding(n_nodes, EMBED_DIM)
        # Detach: the features are stored as plain data, so the saved tensor
        # should not drag the embedding's autograd graph along with it.
        nodes = self.embed(torch.arange(n_nodes)).detach()  # (n_nodes, EMBED_DIM)

        # One (document, word) edge per distinct word of a document. The set
        # removes repeats when a word occurs several times in one document.
        # Documents are encoded one by one, so no ragged array is ever built.
        edges = set()
        for doc_id, tokens in zip(doc_ids, texts):
            doc_idx = vocab.w2i[doc_id]
            edges.update((doc_idx, word_idx) for word_idx in vocab.map_words2index(tokens))

        # sorted() gives a deterministic edge order before symmetrisation.
        edge_index = torch.tensor(sorted(edges), dtype=torch.long).t().contiguous()
        edge_index = to_undirected(edge_index)

        # Masks: only document nodes can be selected; word nodes are False in both.
        train_mask = []
        test_mask = []
        for node_idx in range(n_nodes):
            is_train = is_train_dict.get(vocab.i2w[node_idx])
            is_doc = is_train is not None
            train_mask.append(is_doc and bool(is_train))
            test_mask.append(is_doc and not is_train)

        # Pad the label vector with -1 for the nodes that follow the documents.
        padding = np.full(n_nodes - len(labels), -1, dtype=np.int64)
        labels = torch.tensor(np.concatenate((labels, padding)), dtype=torch.long)

        graph = Data(x=nodes, y=labels, edge_index=edge_index)
        graph.train_mask = torch.tensor(train_mask)
        graph.test_mask = torch.tensor(test_mask)

        data, slices = self.collate([graph])
        torch.save((data, slices), self.processed_paths[0])
   

In [163]:
# Scratch check: a list of (a, b) tuples converts to a tensor with one row per tuple.
torch.tensor([(1,2), (3,2)])

tensor([[1, 2],
        [3, 2]])

In [164]:
# Preview the first rows of the combined train/test frame.
all_df.head()

Unnamed: 0,doc_id,label,test_mask,text,train_mask
0,doc_id_0,1,False,'moore is like a progressive bull in a china s...,True
1,doc_id_1,1,False,idiotic and ugly .,True
2,doc_id_2,1,False,even if the naipaul original remains the real ...,True
3,doc_id_3,1,False,"the movie is amateurish , but it's a minor tre...",True
4,doc_id_4,1,False,some people march to the beat of a different d...,True


In [165]:
import collections

In [166]:
# Build (or load the cached) graph dataset rooted at data/mr and time it.
# NOTE: this rebinds `dataset`, which earlier in the notebook held the string 'mr';
# the Net class below reads this MRDataset object through the same global name.
%time dataset = MRDataset('data/mr')

Processing...
Done!
CPU times: user 1.97 s, sys: 106 ms, total: 2.07 s
Wall time: 1.76 s


### Net

In [167]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(nn.Module):
    """Two-layer graph convolutional network returning per-node log-probabilities.

    Args:
        num_features: input feature width. Defaults to
            ``dataset.num_node_features`` of the notebook-level ``dataset``.
        hidden_dim: output width of the first ``GCNConv`` layer (default 50).
        num_classes: number of output classes. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.

    Calling ``Net()`` with no arguments builds the same layers as before; the
    arguments only remove the hard dependency on the global ``dataset``.
    """

    def __init__(self, num_features=None, hidden_dim=50, num_classes=None):
        super(Net, self).__init__()
        # Fall back to the notebook-level dataset only when sizes are not given.
        if num_features is None:
            num_features = dataset.num_node_features
        if num_classes is None:
            num_classes = dataset.num_classes
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)

    def forward(self, data):
        """Run both convolutions over ``data.x`` / ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

### Training

In [172]:
import time

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

start = time.time()
for epoch in range(200):
    # Re-enter training mode every epoch, because the periodic accuracy
    # check below switches the model to eval mode.
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 50 == 49:
        # Score with dropout disabled and without building an autograd graph;
        # scoring in train mode makes the reported accuracy noisy and pessimistic.
        model.eval()
        with torch.no_grad():
            pred = model(data).argmax(dim=1)
        correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
        # Accuracy is measured on the nodes selected by test_mask.
        valid_acc = correct / data.test_mask.sum().item()
        print("epoch {}, loss = {:.4f}, valid_acc = {:.4f}, time taken: {:.2f}".format(epoch, loss.item(), valid_acc, time.time()-start))
        # Restart the timer so each report covers only the last 50 epochs.
        start = time.time()

KeyboardInterrupt: 

In [None]:
# Final accuracy on the nodes selected by test_mask. Inference runs in eval
# mode (dropout off) and under no_grad so no autograd graph is built.
model.eval()
with torch.no_grad():
    pred = model(data).argmax(dim=1)
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Accuracy: {:.4f}'.format(acc))