In [1]:
import gluonnlp as nlp
import mxnet as mx
from mxnet import ndarray as nd
from mxnet import autograd
import tqdm
import numpy as np

%load_ext autoreload
%autoreload 2

In [2]:
# Load the pretrained 'bert_12_768_12' model (classifier and decoder disabled)
# together with the vocabulary it was trained with.
model, vocab = nlp.model.get_model(
    'bert_12_768_12',
    dataset_name='book_corpus_wiki_en_uncased',
    use_classifier=False,
    use_decoder=False,
)
tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=512, pair=False, pad=False)

# Smoke test: push a single sentence through the model as a batch of size one.
sample = transform(['Hello World'])
words = mx.nd.array([sample[0]])
valid_len = mx.nd.array([sample[1]])
segments = mx.nd.array([sample[2]])
seq_encoding, cls_encoding = model(words, segments, valid_len)

## AG-NEWS

In [3]:
def read_input_data(filename):
    """Read a labelled text file and return its labels and texts.

    Every non-blank line is split on its first comma. The label is the last
    character of the stripped first field converted to ``int`` (so only
    single-digit labels are supported); the text is the stripped remainder
    of the line. Blank lines are skipped.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``(labels, data)`` tuple of two equally long lists, in file order:
        the integer labels and the corresponding text strings.

    Raises:
        ValueError: If the last character of a label field is not a digit.
        IndexError: If a non-blank line contains no comma or has an empty
            label field.
    """
    labels = []
    data = []
    # The with-block guarantees the file is closed, even if a line fails to parse.
    with open(filename, 'r') as input_file:
        for line in input_file:
            # A blank line (typically a trailing newline) has no label to index into.
            if not line.strip():
                continue
            tokens = line.split(',', 1)
            labels.append(int(tokens[0].strip()[-1]))
            data.append(tokens[1].strip())
    return labels, data


In [14]:
# Point this at "ag_news.train" instead to encode the training split.
ag_news_file = "../../data/nlp_datasets/ag_news.test"
labels, data = read_input_data(ag_news_file)

In [15]:
%%time
# Encode the texts one at a time (batch of size one) and keep the second
# model output, converted to a NumPy array, for each of them.
encodings = []

with autograd.predict_mode():
    for text in tqdm.tqdm(data):
        sample = transform([text])
        words = mx.nd.array([sample[0]])
        valid_len = mx.nd.array([sample[1]])
        segments = mx.nd.array([sample[2]])
        _, cls_encoding = model(words, segments, valid_len)
        encodings.append(cls_encoding.detach().asnumpy())

100%|██████████| 7600/7600 [11:31<00:00, 11.00it/s]

CPU times: user 47min 22s, sys: 1min 5s, total: 48min 27s
Wall time: 11min 31s





In [16]:
# Join the per-sample encodings into one array along axis 0.
# The isinstance guard keeps the cell safe to re-run: once `encodings` is already an
# ndarray, concatenating it again would silently flatten it into a 1-D vector.
if isinstance(encodings, list):
    encodings = np.concatenate(encodings, axis=0)

In [17]:
# Shift the labels read from the file down by one (presumably to make them zero-based).
# Only the raw Python list is converted, so re-running the cell cannot subtract one twice.
if isinstance(labels, list):
    labels = np.array(labels) - 1

In [18]:
# Both saves below are disabled.
# NOTE(review): the "Logistic Regression" section loads ag_news_train.npz and
# ag_news_test.npz, so presumably the call matching the split encoded above was
# uncommented and run once per split -- confirm before relying on a fresh run.

# np.savez("../../data/nlp_datasets/ag_news_train.npz",
#         encodings=encodings,
#         labels=labels)

# np.savez("../../data/nlp_datasets/ag_news_test.npz",
#         encodings=encodings,
#         labels=labels)

In [19]:
# Shape check of the stacked encodings; the recorded run shows (7600, 768).
encodings.shape

(7600, 768)

## Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
# np.load on an .npz archive returns an NpzFile that keeps the file open and reads
# arrays on access; the with-blocks pull out the arrays and then close the archive.
with np.load("../../data/nlp_datasets/ag_news_train.npz") as train_data:
    x_train, y_train = train_data["encodings"], train_data["labels"]

with np.load("../../data/nlp_datasets/ag_news_test.npz") as test_data:
    x_test, y_test = test_data["encodings"], test_data["labels"]

In [6]:
# Append the labels as one extra trailing column so that rows and labels stay
# aligned when the rows are reordered.
label_column = y_train[:, None]
x = np.hstack([x_train, label_column])

In [7]:
# Pick a random subset of the training rows with a fixed seed so the experiment is
# repeatable. A private RandomState leaves NumPy's global random state alone, and
# indexing through a permutation (instead of shuffling `x` in place) means re-running
# this cell selects the same rows again.
n_labeled = 400
subset_rng = np.random.RandomState(0)
subset_rows = subset_rng.permutation(len(x))[:n_labeled]
x_train = x[subset_rows, :-1]  # every column but the last: the encodings
y_train = x[subset_rows, -1]   # last column: the labels stacked onto the encodings

In [8]:
# Fit a LogisticRegression with default settings on the sampled training subset.
logreg = LogisticRegression()
# fit() is kept as the last expression of the cell, so the estimator repr is displayed.
logreg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
# Predict a label for every row of the test encodings.
y_pred = logreg.predict(x_test)

In [10]:
# Accuracy: fraction of test predictions that equal the true label.
(y_pred == y_test).mean()

0.8076315789473684

## SSL Labels

In [14]:
import os
import torch
import numpy as np
import sys 

In [15]:
# np.load on an .npz archive returns an NpzFile that keeps the file open and reads
# arrays on access; the with-block pulls out the arrays and then closes the archive.
with np.load("../../data/nlp_datasets/ag_news_train.npz") as train_data:
    x_train, y_train = train_data["encodings"], train_data["labels"]

In [16]:
# Build class-balanced labeled/unlabeled index splits of the training set.
base_label_dir = os.path.abspath("../../data/labels/agnews/")
print("AGNews: label_dir", base_label_dir)

n_classes = 4
n_splits = 10

# Number of labeled examples to select per class.
# NOTE(review): with per_class=100 the directory is "400_balanced_labels", while the
# loader cell further down reads ".../4000_balanced_labels/0.npz" -- confirm which
# per_class value produced the files actually in use.
for per_class in [100]:
    # The directory name carries the total number of labeled examples.
    label_dir = os.path.join(base_label_dir, str(n_classes * per_class)+"_balanced_labels")
    os.makedirs(label_dir, exist_ok=True)

    # Generate n_splits splits; each one is seeded with its own index.
    for i in range(n_splits):
        np.random.seed(i)

        indices = np.arange(len(y_train))
        np.random.shuffle(indices)
        # Builtin bool: the np.bool alias was deprecated in NumPy 1.20 and removed in 1.24.
        mask = np.zeros(indices.shape[0], dtype=bool)
        labels = y_train
        for j in range(n_classes):
            # Positions, in shuffled order, of the first per_class examples of class j.
            mask[np.where(labels[indices] == j)[0][:per_class]] = True
        labeled_indices=indices[mask]
        # Saving is disabled below, so labeled_indices is currently computed but not stored.
#         np.savez(os.path.join(label_dir, str(i)),
#                  labeled_indices=indices[mask],
#                  unlabeled_indices=indices[~mask])


AGNews: label_dir /home/izmailovpavel/Documents/Projects/flow_ssl/data/labels/agnews


In [3]:
from flow_ssl.data import make_ssl_data_loaders
from flow_ssl.data import make_sup_data_loaders
from flow_ssl.data import NO_LABEL
from flow_ssl.data import TransformTwice
from torchvision import transforms

In [4]:
# Build the AG News loaders from split 0 of the 4000-label balanced set.
# NOTE(review): the positional arguments are forwarded unchanged; their meaning is
# defined by flow_ssl.data.make_ssl_data_loaders -- confirm against its signature.
data_dir = "../../data/nlp_datasets/"
split_file = "../../data/labels/agnews/4000_balanced_labels/0.npz"
half_of_64 = 64 // 2
trainloader, testloader, _ = make_ssl_data_loaders(
    data_dir,
    split_file,
    half_of_64,
    half_of_64,
    4,
    None,
    None,
    use_validation=False,
    dataset="ag_news",
)

Num classes 10
Labeled data:  4000
Unlabeled data: 116000


In [7]:
# Probe of the dataset length; the recorded run of this cell raised NotImplementedError.
len(trainloader.dataset)

NotImplementedError: 

In [6]:
# Probe of the first dataset item; the recorded run of this cell raised NotImplementedError.
trainloader.dataset[0]

NotImplementedError: 

In [5]:
# Number of training labels equal to each of the class ids 0..3.
train_labels = trainloader.dataset.train_labels
[(train_labels == class_id).sum() for class_id in range(4)]

[tensor(1000), tensor(1000), tensor(1000), tensor(1000)]