In [1]:
import re
import os
import numpy as np
import pandas as pd

#### Read Dataset
---

In [2]:
dataset = 'mr'
files = os.listdir('data/{}'.format(dataset))
files.remove('.ipynb_checkpoints')

In [3]:
# read mr dataset

def clean_str(s):
    s = s.decode('latin-1')
    s = re.sub(r'\r\n', '', s)
    return s
    
def read_dataset(dataset, file):
    with open('data/{}/{}'.format(dataset, file), 'rb') as f:
        result = f.readlines()
        
    result = map(clean_str, result)
    if 'label' in file:
        result = list(map(int, result))
    else:
        result = list(map(lambda x: x.split(), result))
    return result
 

In [4]:
text_train = read_dataset('mr', 'text_train.txt')
label_train = read_dataset('mr', 'label_train.txt')

text_test = read_dataset('mr', 'text_test.txt')
label_test = read_dataset('mr', 'label_test.txt')

In [5]:
text_train_str = list(map(lambda x: " ".join(x), text_train))
text_test_str = list(map(lambda x: " ".join(x), text_test))

train_df = pd.DataFrame({'text': text_train_str, 'label': label_train})
test_df = pd.DataFrame({'text': text_test_str, 'label': label_test})

train_df.to_csv('data/mr/train_df.tsv', index_label=False, sep='\t')
test_df.to_csv('data/mr/test_df.tsv', index_label=False, sep='\t')

In [6]:
train_df['train_mask'] = True
train_df['test_mask'] = False
test_df['test_mask'] = True
test_df['train_mask'] = False

all_df = pd.concat([train_df, test_df], axis=0)
all_df.reset_index(drop=True, inplace=True)
all_df.reset_index(inplace=True)
all_df.columns = ["doc_id"] + list(all_df.columns)[1:]
all_df.doc_id = all_df.doc_id.astype(str)
all_df.doc_id = 'doc_id_' + all_df.doc_id
all_df.to_csv('data/mr/raw/all_df_mask.tsv', index_label=False, sep='\t')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [7]:
import itertools
import pandas as pd
import numpy as np

class Vocab(object):
  
  def __init__(self, L):
    if isinstance(L[0], list):
      tokens = list(itertools.chain(*L))
      self.token_counts = pd.Series(tokens).value_counts().to_frame().sort_index(ascending=True)
      self.vocab = ["unk"] + self.token_counts.index.to_list()
    else:
      tokens = self.token_counts = pd.Series(L).value_counts().to_frame().sort_index(ascending=True)
      self.vocab = self.token_counts.index.to_list()
    self.w2i = dict(zip(self.vocab, range(len(self.vocab))))
    self.i2w = dict(zip(range(len(self.vocab)), self.vocab))

  def map_words2index(self, L):
    return list(map(lambda x: self.w2i[x] if x in self.w2i else self.w2i['unk'], L))

  def map_index2words(self, L):
    return list(map(lambda x: self.i2w[x], L))

  def map_dataset_words2index(self, L):
    return np.array(list(map(self.map_words2index, L)))

  def map_dataset_index2words(self, L):
    return np.array(list(map(self.map_index2words, L)))

In [8]:
%time vocab = Vocab(text_train + text_test)
%time train_x = vocab.map_dataset_words2index(text_train)
%time test_x = vocab.map_dataset_words2index(text_test)

print(len(vocab.vocab))
train_x_ = vocab.map_dataset_index2words(train_x)
print(" ".join(train_x_[0]))

label_vocab = Vocab(label_train + label_test)
label_vocab.w2i

CPU times: user 97.1 ms, sys: 4.1 ms, total: 101 ms
Wall time: 99.7 ms
CPU times: user 60.6 ms, sys: 8.07 ms, total: 68.6 ms
Wall time: 68.3 ms
CPU times: user 28 ms, sys: 7.8 ms, total: 35.8 ms
Wall time: 35.9 ms
21402
'moore is like a progressive bull in a china shop , a provocateur crashing into ideas and special-interest groups as he slaps together his own brand of liberalism . '


{0: 0, 1: 1}

In [9]:
import torch
from torch import nn, LongTensor
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torch.nn.functional as F
from IPython.core.debugger import set_trace


class MyDataset(Dataset):

  def __init__(self, X, Y=None):
    self.X = X
    self.Y = Y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    if self.Y is not None:
      return (self.X[idx], self.Y[idx])
    return (self.X[idx], None)

def pad(seq, seq_lengths, pad_after=True):
  max_seq_len = max(seq_lengths)
  seq_tensor = Variable(torch.zeros((len(seq), max_seq_len))).long()
  # pad input tensor
  for idx, seq in enumerate(seq):
    seq_len = seq_lengths[idx]
    if pad_after:
      seq_tensor[idx, :seq_len] = LongTensor(np.asarray(seq).astype(int))
    else: 
      # pad before
      seq_tensor[idx, max_seq_len-seq_len:] = LongTensor(np.asarray(seq).astype(int))
  return seq_tensor

def batchify(data):
  X, Y = tuple(map(list, zip(*data)))
  seq_lengths = LongTensor([len(x) for x in X])
  X = pad(X, seq_lengths, pad_after=True)
  Y = LongTensor(Y)
  return X, Y

def batchify_test(data):
  X, Y = tuple(map(list, zip(*data)))
  seq_lengths = LongTensor([len(x) for x in X])
  X = pad(X, seq_lengths, pad_after=True)
  return X, Y


train = MyDataset(train_x, label_train)
valid = MyDataset(test_x, label_test)
test = MyDataset(test_x)

In [10]:
train_loader = DataLoader(train, batch_size=64, shuffle=True, collate_fn=batchify)
valid_loader = DataLoader(valid, batch_size=64, shuffle=False, collate_fn=batchify)
test_loader = DataLoader(test, batch_size=64, shuffle=False, collate_fn=batchify_test)

In [11]:
from torch import nn, LongTensor, Tensor
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torch.optim as optim
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [12]:
import time
def accuracy(preds, y):
  return (np.array(preds) == np.array(y)).astype(int).mean()


def train_epoch(epoch, model, optimizer, criterion):
  model.train()
  train_loss, n_data = 0, 0
  start = time.time()
  preds = []
  labels = []
  for i, (x, y) in enumerate(train_loader):
    n_data += x.size()[0]
    labels.extend(y.tolist())
    if is_cuda: x, y = x.cuda(), y.cuda()
    optimizer.zero_grad()
    out = model(x)
    preds.extend(out.argmax(axis=1).tolist())
    loss = criterion(out, y)
    loss.backward()
    if grad_clip: torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    train_loss += loss
    if i % print_iter == print_iter - 1:
      model, valid_preds, valid_labels, valid_loss = validate(model, criterion)
      print("""epoch {} - batch [{}/{}] - train loss: {:.2f} - acc: {:.3f} - valid loss : {:.2f} - acc : {:.3f} time taken: {:.2f}""".format(epoch, i, 
            len(train_loader), train_loss/(i+1),
            accuracy(preds, labels), valid_loss, accuracy(valid_preds, valid_labels),
            time.time()-start), flush=True)
      
      model.train()
      start = time.time()
      train_loss = 0

  # end of epoch
  model, valid_preds, valid_labels, valid_loss = validate(model, criterion)
  print("""epoch {} - batch [{}/{}] - train loss: {:.2f} - acc: {:.3f} - valid loss : {:.2f} - acc : {:.3f} time taken: {:.2f}""".format(epoch, i, 
        len(train_loader), train_loss/(i+1),
        accuracy(preds, labels), valid_loss, accuracy(valid_preds, valid_labels),
        time.time()-start), flush=True)
  return model

def learning_rate_decay(optimizer):
  for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * 0.1
  return optimizer

def training(model, epoches, lr, wd, return_model=False):
  if torch.cuda.is_available():
    model.cuda()
  optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
  criterion = nn.CrossEntropyLoss()
  for ep in range(epoches):
    model = train_epoch(ep, model, optimizer, criterion)
    optimizer = learning_rate_decay(optimizer)
  if return_model:
      return model

def validate(model, criterion):
  model.eval()
  valid_loss = 0
  preds, labels = [], []
  for i, (x, y) in enumerate(valid_loader):
    labels.extend(y.tolist())
    if torch.cuda.is_available(): x, y = x.cuda(), y.cuda()
    out = model(x)
    loss = criterion(out, y)
    preds.extend(out.argmax(axis=1).tolist())
    valid_loss += loss
  return model, preds, labels, valid_loss/(i+1)
    
def predict(model, loader):
  model.eval()
  preds, labels = [], []
  for i, (x, _) in enumerate(loader):
    if torch.cuda.is_available(): x = x.cuda()
    out = model(x)
    preds.extend(out.argmax(axis=1).tolist())
  return preds


In [13]:
from torch.nn.utils import weight_norm

class LSTM_clf(nn.Module):

  def __init__(self, embed_dim, hidden_dim, vocab_size, out_size, 
               layers=1, bidirectional=False):
    super(LSTM_clf, self).__init__()
    self.word_embedding = nn.Embedding(vocab_size, embed_dim)
    self.net = nn.LSTM(embed_dim, hidden_dim,  num_layers=layers, 
                       bidirectional=bidirectional, dropout=0.5)
    self.relu = nn.ReLU()
    self.bn = nn.BatchNorm1d(hidden_dim * (int(bidirectional) + 1))
    self.linear = nn.Linear(hidden_dim * (int(bidirectional) + 1), out_size)

  def forward(self, x):
    out = self.word_embedding(x)
    out = self.net(out)[0]
    out = self.relu(out).transpose(1,2)
    out = F.max_pool1d(out, out.size()[2]).squeeze()
    out = self.linear(self.bn(out))
    return out

class DCNN_block(nn.Module):
  
  def __init__(self, embed_dim, hidden_dim, kernel_size, dilations=None,
               dropout=0.2):
    super(DCNN_block, self).__init__()
    self.conv1 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, dilation=1))
    self.conv2 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, dilation=2))
    self.conv3 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, dilation=4))
    self.net = nn.Sequential(self.conv1, nn.ReLU(), nn.Dropout(dropout),
                             self.conv2, nn.ReLU(), nn.Dropout(dropout), 
                             self.conv3, nn.ReLU(), nn.Dropout(dropout))
  
  def forward(self, x):
    # N x C x L
    return self.net(x)

class DCNN_rez_block(nn.Module):
  
  def __init__(self, embed_dim, hidden_dim, kernel_size, dilations=None,
               dropout=0.2):
    super(DCNN_rez_block, self).__init__()
    self.conv1 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, 
                                       padding=(kernel_size-1)*1, dilation=1))
    self.conv2 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, 
                                       padding=(kernel_size-1)*2, dilation=2))
    self.conv3 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, 
                                       padding=(kernel_size-1)*4, dilation=4))

    self.relu1 = nn.ReLU()
    self.relu2 = nn.ReLU()
    self.relu3 = nn.ReLU()

    self.do1 = nn.Dropout(dropout)
    self.do2 = nn.Dropout(dropout)
    self.do3 = nn.Dropout(dropout)
  
  def forward(self, x):
    # N x C x L
    seq_len = x.size()[2]
    out = self.do1(self.relu1(self.conv1(x)))[:, :, -seq_len:]
    out = out + self.do2(self.relu2(self.conv2(x)))[:, :, -seq_len:]
    out = out + self.do3(self.relu3(self.conv3(x)))[:, :, -seq_len:]
    return out


class DCNN(nn.Module):

  def __init__(self, embed_dim, hidden_dim, vocab_size, out_size, 
               kernel_size, dilations=None, rez_block=True, 
               dropout=0.2):
    super(DCNN, self).__init__()
    self.word_embedding = nn.Embedding(vocab_size, embed_dim)
    if rez_block: 
      self.net = DCNN_rez_block(embed_dim, hidden_dim, kernel_size, dilations, dropout)
    else:
      self.net = DCNN_block(embed_dim, hidden_dim, kernel_size, dilations, dropout)
    self.bn = nn.BatchNorm1d(hidden_dim)
    self.do = nn.Dropout(dropout)
    self.linear = nn.Linear(hidden_dim, out_size)

  def forward(self, x):
    out = self.word_embedding(x)
    out = self.net(out.transpose(1,2))
    out = F.max_pool1d(out, out.size()[2]).squeeze()
    out = self.linear(self.do(self.bn(out)))
    return out


class DDCNN(nn.Module):
  # Dilated and Dense CNN
  def __init__(self, embed_dim, hidden_dim, vocab_size, out_size, 
               kernel_size, dilations=None, rez_block=True, 
               dropout=0.2):
    super(DDCNN, self).__init__()
    self.word_embedding = nn.Embedding(vocab_size, embed_dim)
    if rez_block: 
      self.dcnn = DCNN_rez_block(embed_dim, hidden_dim, kernel_size, dilations, dropout)
    else:
      self.dcnn = DCNN_block(embed_dim, hidden_dim, kernel_size, dilations, dropout)

    self.do1 = nn.Dropout(dropout)
    self.do2 = nn.Dropout(dropout)
    self.do3 = nn.Dropout(dropout)
    self.cnn1 = weight_norm(nn.Conv1d(embed_dim, int(hidden_dim//3), 4, padding=3, dilation=1))
    self.cnn2 = weight_norm(nn.Conv1d(embed_dim, int(hidden_dim//3), 6, padding=5, dilation=1))
    self.cnn3 = weight_norm(nn.Conv1d(embed_dim, int(hidden_dim//3), 8, padding=7, dilation=1))
    
    self.bn = nn.BatchNorm1d(hidden_dim*2)
    self.do = nn.Dropout(dropout)
    self.linear = nn.Linear(hidden_dim*2, out_size)

  def cnn(self, x):
    out1 = F.relu(self.cnn1(self.do1(x)))
    out2 = F.relu(self.cnn2(self.do2(x)))
    out3 = F.relu(self.cnn3(self.do3(x)))
    outs = []
    for o in [out1, out2, out3]:
      outs.append(F.max_pool1d(o, o.size()[2]).squeeze())
    out = torch.cat(outs, 1)
    return out

  def forward(self, x):
    out = self.word_embedding(x).transpose(1,2)
    dcnn_out = self.dcnn(out)
    cnn_out = self.cnn(out)
    dcnn_out = F.max_pool1d(dcnn_out, dcnn_out.size()[2]).squeeze()
    out = self.linear(self.do(self.bn(torch.cat((dcnn_out,cnn_out), 1))))
    return out

In [14]:
torch.manual_seed(1)
bs = 512
n_class = 16
epochs = 3
lstm_hidden = 300
cnn_hidden = 300
embed_dim = 300
layers = 2
kernel_size = 3
vocab_size = len(vocab.vocab)
is_cuda = torch.cuda.is_available()
lr = 0.002
grad_clip = 1
print_iter = 500
lstm1 = LSTM_clf(embed_dim, lstm_hidden, vocab_size, n_class, layers)
dcnn1 = DCNN(embed_dim, cnn_hidden, vocab_size, n_class, 3, 
             rez_block=False, dropout=0.2)
dcnn_rez1 = DCNN(embed_dim, cnn_hidden, vocab_size, n_class, 5, 
                 rez_block=True, dropout=0.2)
ddcnn_rez1 = DDCNN(embed_dim, 300, vocab_size, n_class, 5, 
                 rez_block=True, dropout=0.2)

In [37]:
%time training(lstm1, 20, 2e-3, 1e-4)

epoch 0 - batch [111/112] - train loss: 0.70 - acc: 0.558 - valid loss : 0.73 - acc : 0.499 time taken: 5.58
epoch 1 - batch [111/112] - train loss: 0.68 - acc: 0.584 - valid loss : 0.68 - acc : 0.567 time taken: 6.06
epoch 2 - batch [111/112] - train loss: 0.67 - acc: 0.588 - valid loss : 0.68 - acc : 0.578 time taken: 6.25
epoch 3 - batch [111/112] - train loss: 0.67 - acc: 0.595 - valid loss : 0.68 - acc : 0.579 time taken: 5.97
epoch 4 - batch [111/112] - train loss: 0.67 - acc: 0.592 - valid loss : 0.68 - acc : 0.579 time taken: 5.57
epoch 5 - batch [111/112] - train loss: 0.67 - acc: 0.588 - valid loss : 0.68 - acc : 0.578 time taken: 5.56
epoch 6 - batch [111/112] - train loss: 0.67 - acc: 0.599 - valid loss : 0.68 - acc : 0.578 time taken: 5.57
epoch 7 - batch [111/112] - train loss: 0.67 - acc: 0.600 - valid loss : 0.68 - acc : 0.573 time taken: 5.56
epoch 8 - batch [111/112] - train loss: 0.67 - acc: 0.595 - valid loss : 0.68 - acc : 0.579 time taken: 5.57
epoch 9 - batch [11

In [38]:
%time training(dcnn1, 20, 2e-3, 1e-4)

epoch 0 - batch [111/112] - train loss: 1.44 - acc: 0.470 - valid loss : 0.75 - acc : 0.505 time taken: 4.36
epoch 1 - batch [111/112] - train loss: 0.66 - acc: 0.609 - valid loss : 0.67 - acc : 0.591 time taken: 4.79
epoch 2 - batch [111/112] - train loss: 0.62 - acc: 0.655 - valid loss : 0.66 - acc : 0.618 time taken: 4.64
epoch 3 - batch [111/112] - train loss: 0.62 - acc: 0.657 - valid loss : 0.66 - acc : 0.624 time taken: 4.14
epoch 4 - batch [111/112] - train loss: 0.62 - acc: 0.652 - valid loss : 0.65 - acc : 0.623 time taken: 4.18
epoch 5 - batch [111/112] - train loss: 0.62 - acc: 0.659 - valid loss : 0.66 - acc : 0.623 time taken: 4.11
epoch 6 - batch [111/112] - train loss: 0.62 - acc: 0.668 - valid loss : 0.66 - acc : 0.622 time taken: 3.96
epoch 7 - batch [111/112] - train loss: 0.62 - acc: 0.655 - valid loss : 0.66 - acc : 0.616 time taken: 4.83
epoch 8 - batch [111/112] - train loss: 0.62 - acc: 0.666 - valid loss : 0.66 - acc : 0.620 time taken: 4.76
epoch 9 - batch [11

In [39]:
%time training(dcnn_rez1, 20, 2e-3, 1e-4)

epoch 0 - batch [111/112] - train loss: 1.48 - acc: 0.528 - valid loss : 0.67 - acc : 0.610 time taken: 4.95
epoch 1 - batch [111/112] - train loss: 0.53 - acc: 0.768 - valid loss : 0.65 - acc : 0.634 time taken: 4.96
epoch 2 - batch [111/112] - train loss: 0.47 - acc: 0.810 - valid loss : 0.64 - acc : 0.635 time taken: 4.97
epoch 3 - batch [111/112] - train loss: 0.46 - acc: 0.814 - valid loss : 0.65 - acc : 0.634 time taken: 4.96
epoch 4 - batch [111/112] - train loss: 0.45 - acc: 0.819 - valid loss : 0.64 - acc : 0.636 time taken: 4.97
epoch 5 - batch [111/112] - train loss: 0.45 - acc: 0.816 - valid loss : 0.64 - acc : 0.634 time taken: 4.97
epoch 6 - batch [111/112] - train loss: 0.46 - acc: 0.822 - valid loss : 0.64 - acc : 0.633 time taken: 5.09
epoch 7 - batch [111/112] - train loss: 0.45 - acc: 0.824 - valid loss : 0.65 - acc : 0.633 time taken: 5.01
epoch 8 - batch [111/112] - train loss: 0.46 - acc: 0.817 - valid loss : 0.65 - acc : 0.635 time taken: 5.19
epoch 9 - batch [11

In [40]:
%time training(ddcnn_rez1, 20, 2e-3, 1e-4)

epoch 0 - batch [111/112] - train loss: 1.34 - acc: 0.539 - valid loss : 0.70 - acc : 0.628 time taken: 6.98
epoch 1 - batch [111/112] - train loss: 0.46 - acc: 0.806 - valid loss : 0.63 - acc : 0.654 time taken: 7.05
epoch 2 - batch [111/112] - train loss: 0.36 - acc: 0.864 - valid loss : 0.64 - acc : 0.655 time taken: 6.98
epoch 3 - batch [111/112] - train loss: 0.36 - acc: 0.875 - valid loss : 0.63 - acc : 0.658 time taken: 7.06
epoch 4 - batch [111/112] - train loss: 0.36 - acc: 0.868 - valid loss : 0.63 - acc : 0.658 time taken: 6.99
epoch 5 - batch [111/112] - train loss: 0.35 - acc: 0.878 - valid loss : 0.63 - acc : 0.657 time taken: 7.05
epoch 6 - batch [111/112] - train loss: 0.35 - acc: 0.870 - valid loss : 0.63 - acc : 0.654 time taken: 7.01
epoch 7 - batch [111/112] - train loss: 0.36 - acc: 0.875 - valid loss : 0.64 - acc : 0.658 time taken: 7.04
epoch 8 - batch [111/112] - train loss: 0.36 - acc: 0.876 - valid loss : 0.63 - acc : 0.656 time taken: 7.13
epoch 9 - batch [11

## GCN
---

In [16]:
import itertools
import pandas as pd
import numpy as np

class Vocab(object):
  
  def __init__(self, L, doc_ids=None):    
    if isinstance(L[0], list):
      tokens = list(itertools.chain(*L))
      self.token_counts = pd.Series(tokens).value_counts().to_frame().sort_index(ascending=True)
      self.vocab = ["<unk>"] + self.token_counts.index.to_list()
    else:
      tokens = self.token_counts = pd.Series(L).value_counts().to_frame().sort_index(ascending=True)
      self.vocab = self.token_counts.index.to_list()
    self.vocab = list(set(self.vocab))
    self.vocab = sorted(self.vocab)
    if doc_ids is not None:
      self.vocab = doc_ids + self.vocab 
    
    self.w2i = dict(zip(self.vocab, range(len(self.vocab))))
    self.i2w = dict(zip(range(len(self.vocab)), self.vocab))

  def map_words2index(self, L):
    return list(map(lambda x: self.w2i[x] if x in self.w2i else self.w2i['unk'], L))

  def map_index2words(self, L):
    return list(map(lambda x: self.i2w[x], L))

  def map_dataset_words2index(self, L):
    return np.array(list(map(self.map_words2index, L)))

  def map_dataset_index2words(self, L):
    return np.array(list(map(self.map_index2words, L)))

  def get_counts(self):
    return self.token_counts

In [17]:
%time vocab = Vocab(text_train)
%time train_x = vocab.map_dataset_words2index(text_train)

CPU times: user 78.6 ms, sys: 458 µs, total: 79.1 ms
Wall time: 77.6 ms
CPU times: user 65 ms, sys: 152 µs, total: 65.1 ms
Wall time: 64.9 ms


In [18]:
counts = vocab.get_counts()
counts.sort_values(0, inplace=True)
counts.head()

Unnamed: 0,0
kirsten,1
mid-range,1
mid-section,1
mid-seventies,1
mid-to-low,1


### Create Dataset

In [19]:
files

['train_df.tsv',
 'label_train.txt',
 'raw',
 'all_df_masks',
 'label_test.txt',
 'text_test.txt',
 'text_all.txt',
 'processed',
 'test_df.tsv',
 'text_train.txt']

In [25]:
import torch
import torch.nn as nn
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.utils import to_undirected, is_undirected

EMBED_DIM = 300

class MRDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(MRDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])
        
    @property
    def raw_file_names(self):
        return ['all_df_mask.tsv']
    
    @property
    def processed_file_names(self):
        return ['mr_train.pt']
    
    def process(self):
        df = pd.read_csv(self.raw_paths[0], sep="\t")
        
        texts = list(map(lambda x: x.split(), df.text.values))
        doc_ids = df.doc_id.values
        labels = df.label.values
        doc_label_dict = dict(zip(doc_ids, labels))
        is_train_dict = dict(zip(doc_ids, df.train_mask.values))
        vocab = Vocab(texts, doc_ids=df.doc_id.values.tolist())
        text_int = vocab.map_dataset_words2index(texts)
        
        # nodes_idx mapping will be the same as vocab.i2w
        n_nodes = len(vocab.w2i)
        self.embed = nn.Embedding(n_nodes, EMBED_DIM)
        nodes = self.embed(torch.tensor(range(n_nodes)))  # (vocab_size, EMBED_DIM)
        edge_index = []
        for i, row in df.iterrows():
            doc_id = vocab.w2i[row["doc_id"]]
            edges_ = zip([doc_id for _ in range(len(text_int[i]))], text_int[i])
            edge_index.extend(list(edges_))

        edge_index = list(set(edge_index))  # a word may occur mult times in a doc
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_index = to_undirected(edge_index)

        # masks:
        train_mask = []
        test_mask = []
        for i in range(n_nodes):
            w = vocab.i2w[i]
            if w in doc_label_dict:
                is_train = is_train_dict[w]
                train_mask.append(is_train)
                test_mask.append(not is_train)
            else:
                train_mask.append(False)
                test_mask.append(False)
        labels = np.concatenate((labels, np.array([-1 for i in range(n_nodes-len(labels))])))
        labels = torch.tensor(labels, dtype=torch.long)
        
        data_list = [Data(x=nodes, y=labels, edge_index=edge_index)]
        data_list[0].train_mask = torch.tensor(train_mask)
        data_list[0].test_mask = torch.tensor(test_mask)
        
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
   

In [26]:
torch.tensor([(1,2), (3,2)])

tensor([[1, 2],
        [3, 2]])

In [27]:
all_df.head()

Unnamed: 0,doc_id,label,test_mask,text,train_mask
0,doc_id_0,1,False,'moore is like a progressive bull in a china s...,True
1,doc_id_1,1,False,idiotic and ugly .,True
2,doc_id_2,1,False,even if the naipaul original remains the real ...,True
3,doc_id_3,1,False,"the movie is amateurish , but it's a minor tre...",True
4,doc_id_4,1,False,some people march to the beat of a different d...,True


In [28]:
import collections

In [29]:
%time dataset = MRDataset('data/mr')

CPU times: user 3.55 ms, sys: 16.7 ms, total: 20.3 ms
Wall time: 18.1 ms


### Net

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(nn.Module):
    def __init__(self, hidden_dim):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, dataset.num_classes)
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

### Training

In [34]:
import time

In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(100).to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005) #, weight_decay=5e-4)

model.train()
start = time.time()
for epoch in range(2000):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 100 == 99:
        _, pred = model(data).max(dim=1)
        train_correct = float(pred[data.train_mask].eq(data.y[data.train_mask]).sum().item())
        train_acc = train_correct / data.train_mask.sum().item()
        valid_correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
        valid_acc = valid_correct / data.test_mask.sum().item()
        
        print("epoch {}, loss = {:.4f}, train_acc = {:.4f}, valid_acc = {:.4f}, time taken: {:.2f}".format(epoch, loss, train_acc, valid_acc, time.time()-start))
        start = time.time()

epoch 99, loss = 0.5432, train_acc = 0.7383, valid_acc = 0.5675, time taken: 4.04
epoch 199, loss = 0.4964, train_acc = 0.7577, valid_acc = 0.5521, time taken: 4.05
epoch 299, loss = 0.4750, train_acc = 0.7804, valid_acc = 0.5594, time taken: 4.20
epoch 399, loss = 0.4558, train_acc = 0.7860, valid_acc = 0.5627, time taken: 4.09
epoch 499, loss = 0.4447, train_acc = 0.7860, valid_acc = 0.5760, time taken: 3.99
epoch 599, loss = 0.4365, train_acc = 0.8029, valid_acc = 0.5619, time taken: 4.03
epoch 699, loss = 0.4366, train_acc = 0.8091, valid_acc = 0.5672, time taken: 4.17
epoch 799, loss = 0.4151, train_acc = 0.8143, valid_acc = 0.5701, time taken: 4.02
epoch 899, loss = 0.4292, train_acc = 0.8206, valid_acc = 0.5672, time taken: 3.96
epoch 999, loss = 0.4090, train_acc = 0.8213, valid_acc = 0.5726, time taken: 4.01
epoch 1099, loss = 0.4053, train_acc = 0.8271, valid_acc = 0.5777, time taken: 3.98
epoch 1199, loss = 0.3920, train_acc = 0.8275, valid_acc = 0.5664, time taken: 4.09
epo

In [42]:
model.eval()
_, pred = model(data).max(dim=1)
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Accuracy: {:.4f}'.format(acc))

Accuracy: 0.5968
