In [1]:
from pprint import pprint
from argparse import ArgumentParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from collections import Counter

# from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize

import os
import joblib
import logging
import numpy as np
import yaml
import random
import gc
import multiprocessing as mp
from multiprocessing import cpu_count
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
from bert.tokenization.bert_tokenization import FullTokenizer

In [2]:
# Constants
# Path to the YAML run configuration; the main cell opens it and parses it with yaml.safe_load.
config_path = '../config/20news.yaml'

In [10]:
# Universal Sentence Encoder loaded from TF Hub; the USE embedding cell calls USE_EMBED directly.
# NOTE(review): the saved output shows this cell was interrupted (KeyboardInterrupt), which
# leaves USE_EMBED undefined and makes the USE embedding cell fail with NameError.
USE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4" # updated the url to match tf2 requirements
USE_EMBED = hub.load(USE_MODULE_URL)

KeyboardInterrupt: 

In [10]:
# BERT module loaded from TF Hub; get_BERT feeds token ids to its 'tokens' signature.
BERT_URL = 'https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1'
BERT_EMBED = hub.load(BERT_URL)

In [11]:
# for tokenization vocab file
# get_BERT only reads this layer's vocab asset path to build the tokenizer; the embeddings
# themselves come from BERT_EMBED.
# NOTE(review): this hub handle differs from BERT_URL — confirm both models share the same vocabulary.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3",
                            trainable=False)

In [3]:
# ELMo module loaded from TF Hub; get_ELMO calls its 'default' signature.
ELMO_URL = 'https://tfhub.dev/google/elmo/3'
ELMO_EMBED = hub.load(ELMO_URL)

In [4]:
def load_from_path(df_path, rand=False, rand_seed=4079):
    """Read a CSV file into a DataFrame, optionally shuffling its rows.

    Args:
        df_path: Location of the CSV file, handed straight to ``pd.read_csv``.
        rand: When truthy, the rows are shuffled before being returned.
        rand_seed: ``random_state`` passed to ``shuffle`` so the shuffled order
            is reproducible.

    Returns:
        The loaded (and, if requested, shuffled) DataFrame.
    """
    frame = pd.read_csv(df_path)
    return shuffle(frame, random_state=rand_seed) if rand else frame

In [5]:
def load_df(path):
    """Load a dataset CSV in shuffled order and normalise its column dtypes.

    The file is read through ``load_from_path`` with shuffling enabled (default
    seed). The ``id`` and ``cat`` columns are converted to pandas categoricals
    and ``doc`` is converted to ``str``.

    Args:
        path: Location of the CSV file; it must provide ``id``, ``cat`` and
            ``doc`` columns.

    Returns:
        The shuffled DataFrame with the converted columns.
    """
    df = load_from_path(path, rand=True)
    for column, dtype in (('id', 'category'), ('cat', 'category'), ('doc', str)):
        df[column] = df[column].astype(dtype)
    return df
# end def

In [6]:
def get_BERT(col_series, max_seq_len=16, batch_size=None):
    """Embed documents with the TF Hub BERT module held in ``BERT_EMBED``.

    Each document is tokenized, wrapped as ``[CLS] ... [SEP]``, truncated so
    that the special tokens still fit into ``max_seq_len`` positions, and
    zero-padded up to ``max_seq_len`` before being passed to the module's
    ``'tokens'`` signature.

    Args:
        col_series: Iterable of document strings (e.g. a pandas Series or a
            list such as ``['New Delhi is the capital of India']``).
        max_seq_len: Fixed number of token positions per document, special
            tokens included. Defaults to 16, the value previously hard-coded
            in this function.
        batch_size: Number of documents sent to the model per call. ``None``
            (default) sends all documents in a single call. A positive integer
            bounds the size of each forward pass; the per-batch outputs are
            concatenated along the first axis.

    Returns:
        Tuple ``(pooled_output, sequence_output)`` from the module output:
        ``pooled_output`` is the embedding of each whole sequence and
        ``sequence_output`` is the embedding of each token.

    Raises:
        ValueError: If ``max_seq_len`` is smaller than 2 (no room for
            ``[CLS]`` and ``[SEP]``) or ``batch_size`` is not positive.
    """
    if max_seq_len < 2:
        raise ValueError('max_seq_len must be at least 2 to hold [CLS] and [SEP]')
    if batch_size is not None and batch_size < 1:
        raise ValueError('batch_size must be a positive integer or None')

    def _create_tokenizer(vocab_file, do_lower_case=True):
        return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    # end def

    # The vocab file ships as an asset of the hub KerasLayer; reuse it for tokenization.
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    tokenizer = _create_tokenizer(vocab_file, do_lower_case=True)

    def _convert_sentence_to_bert(sentence, tokenizer, max_seq_len):
        """Return (input_ids, input_mask, segment_ids), each of length max_seq_len."""
        tokens = ['[CLS]']
        tokens.extend(tokenizer.tokenize(sentence))
        # Leave one position free for the closing [SEP] token.
        tokens = tokens[:max_seq_len - 1]
        tokens.append('[SEP]')

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = [0] * len(tokens)

        # Zero-pad all three sequences up to max_seq_len.
        zero_mask = [0] * (max_seq_len - len(tokens))
        input_ids.extend(zero_mask)
        input_mask.extend(zero_mask)
        segment_ids.extend(zero_mask)
        return input_ids, input_mask, segment_ids
    # end def

    def _convert_sentences_to_bert(sentences, tokenizer, max_seq_len=128):
        """Encode every sentence and collect the three inputs in parallel lists."""
        all_input_ids = []
        all_input_mask = []
        all_segment_ids = []

        for sentence in sentences:
            input_ids, input_mask, segment_ids = _convert_sentence_to_bert(sentence, tokenizer, max_seq_len)
            all_input_ids.append(input_ids)
            all_input_mask.append(input_mask)
            all_segment_ids.append(segment_ids)

        return all_input_ids, all_input_mask, all_segment_ids
    # end def

    pprint('Converting to BERT....')

    input_ids_vals, input_mask_vals, segment_ids_vals = _convert_sentences_to_bert(col_series, tokenizer, max_seq_len)

    # Convert each input once and reuse the tensors below.
    input_ids = tf.convert_to_tensor(input_ids_vals)
    input_mask = tf.convert_to_tensor(input_mask_vals)
    segment_ids = tf.convert_to_tensor(segment_ids_vals)
    print(tf.shape(input_mask))

    embed = BERT_EMBED.signatures['tokens']

    if batch_size is None:
        bert_outputs = embed(input_ids=input_ids,
                             input_mask=input_mask,
                             segment_ids=segment_ids)
        return (bert_outputs['pooled_output'], bert_outputs['sequence_output'])

    # Batched mode: run the model on slices and stitch the outputs back together.
    pooled_chunks = []
    seq_chunks = []
    for start in range(0, len(input_ids_vals), batch_size):
        stop = start + batch_size
        chunk_outputs = embed(input_ids=input_ids[start:stop],
                              input_mask=input_mask[start:stop],
                              segment_ids=segment_ids[start:stop])
        pooled_chunks.append(chunk_outputs['pooled_output'])
        seq_chunks.append(chunk_outputs['sequence_output'])
    return (tf.concat(pooled_chunks, axis=0), tf.concat(seq_chunks, axis=0))
# end def

In [7]:
def get_ELMO(col_series):
    """Embed documents with the TF Hub ELMo module held in ``ELMO_EMBED``.

    Every document is tokenized with ``word_tokenize``, cut to at most 128
    tokens and re-joined with single spaces before being passed to the
    module's ``'default'`` signature.

    Args:
        col_series: pandas Series of document strings (``.apply`` and
            ``.values`` are used on it).

    Returns:
        Tuple ``(pooled_embeddings, seq_embeddings)``: the ``'default'`` and
        ``'elmo'`` entries of the module output, in that order.
    """
    def _restric_len(sentence, max_seq_len=128):
        """Keep only the first ``max_seq_len`` tokens of ``sentence``."""
        tokens = word_tokenize(sentence)
        return ' '.join(tokens[:max_seq_len])
    # end def

    pprint('Converting to ELMO....')
    max_seq_len = 128
    col_series = col_series.apply(lambda x: _restric_len(x, max_seq_len))
    pprint(col_series)

    # Run the model once and read both outputs from the same result,
    # instead of paying for a second forward pass on identical input.
    elmo_outputs = ELMO_EMBED.signatures['default'](text=tf.convert_to_tensor(col_series.values))
    pooled_embeddings = elmo_outputs['default']
    seq_embeddings = elmo_outputs['elmo']
    return (pooled_embeddings, seq_embeddings)
# end def

In [8]:
# main
# Load the run configuration, read the train/test CSVs and mark a fixed share
# of every class in the training set as "labeled".
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)
# end with
pprint('=' * 20 + 'Configs' + '=' * 20)
pprint(config['train'])

train_df = load_df(config['train'])
test_df = load_df(config['test'])

train_df['labeled'] = 0
#### add x% of EACH CLASS in the train_df to L
cat_count = Counter(train_df['cat'])
random.seed(config['seed'])  # makes the per-class sampling below reproducible
n_train = train_df.shape[0]
ratio = []
for k, v in cat_count.items():
    # Key each entry by the category itself; dict(k=...) would store the literal key 'k'.
    ratio.append({k: v / n_train})
    cat_id = list(train_df[train_df['cat'] == k]['id'].values)
    rand_id = random.sample(cat_id, int(config['percent'] * v))  # x% currently 10%
    train_df.loc[train_df['id'].isin(rand_id), 'labeled'] = 1
# end for

# Split into the labeled (L) and unlabeled (U) parts of the training set.
l_train_df = train_df.loc[train_df['labeled'] == 1]
u_train_df = train_df.loc[train_df['labeled'] == 0]
pprint('LABELED has {} data'.format(l_train_df.shape[0]))
pprint('UNLABELED has {} data'.format(u_train_df.shape[0]))

'../data/20news/train.csv'
'LABELED has 1123 data'
'UNLABELED has 10191 data'


In [None]:
#embed all documents with doc2vec
# NOTE(review): the `Doc2Vec` import in the imports cell is commented out, so this cell
# raises NameError unless Doc2Vec is brought into scope first.
pprint('=' * 20 + 'Embedding with doc2vec' + '=' * 20)
model = Doc2Vec.load(config['embed']['doc2vec_path'])


def _doc2vec_matrix(d2v, docs):
    """Infer one vector per whitespace-tokenised document and stack them into an array."""
    return np.array([d2v.infer_vector(text.strip().split()) for text in docs])


l_train_doc2vec = _doc2vec_matrix(model, l_train_df['doc'].values)
if len(u_train_df) > 0:
    u_train_doc2vec = _doc2vec_matrix(model, u_train_df['doc'].values)
test_doc2vec = _doc2vec_matrix(model, test_df['doc'].values)

pprint(f'DOC2VEC: Labeled training documents embedded into {l_train_doc2vec.shape} dimensions')
if len(u_train_df) > 0:
    pprint(f'DOC2VEC: Unlabeled training documents embedded into {u_train_doc2vec.shape} dimensions')

# Drop the model (and the cell-local helper) before collecting garbage.
del model, _doc2vec_matrix
gc.collect()



In [60]:
#### embed all documents with tfidf
pprint('=' * 20 + 'Embedding with tfidf' + '=' * 20)
ngram_range = (1, 3)
vectorizer = TfidfVectorizer(
    max_features=10000,
    sublinear_tf=True,
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    analyzer='word',
    token_pattern=r'\w{3,}',
    ngram_range=ngram_range,
    dtype=np.float32,
    norm='l2',
    min_df=3,
    max_df=.9
    )
# Fit on all training documents, regardless of labeled or unlabeled.
# Only the fitted vocabulary/idf is needed here, so use fit() rather than
# fit_transform(), whose transformed matrix would be built and thrown away.
vectorizer.fit(train_df['doc'])

# Dense arrays: one row per document, one column per retained n-gram.
l_train_tfidf_text = vectorizer.transform(l_train_df['doc']).toarray()
if u_train_df.shape[0] > 0:
    u_train_tfidf_text = vectorizer.transform(u_train_df['doc']).toarray()
test_tfidf_text = vectorizer.transform(test_df['doc']).toarray()

pprint('TFIDF: Labeled training documents embedded into {} dimensions'.format(l_train_tfidf_text.shape))
if u_train_df.shape[0] > 0:
    pprint('TFIDF: Unlabeled training documents embedded into {} dimensions'.format(u_train_tfidf_text.shape))

'TFIDF: Labeled training documents embedded into (1123, 10000) dimensions'
'TFIDF: Unlabeled training documents embedded into (10191, 10000) dimensions'


In [59]:
#### embed all documents with USE
# Requires USE_EMBED from the model-loading cell. Each split is embedded in a
# single call and converted to a NumPy array.
pprint('=' * 20 + 'USE' + '=' * 20)

l_train_use_text = USE_EMBED(l_train_df['doc']).numpy()
if len(u_train_df) > 0:
    u_train_use_text = USE_EMBED(u_train_df['doc']).numpy()
test_use_text = USE_EMBED(test_df['doc']).numpy()

pprint(f'USE: Labeled training documents embedded into {l_train_use_text.shape} dimensions')
if len(u_train_df) > 0:
    pprint(f'USE: Unlabeled training documents embedded into {u_train_use_text.shape} dimensions')



NameError: name 'USE_EMBED' is not defined

In [28]:
#### embed all documents with BERT
# NOTE(review): set_floatx changes the Keras default float type for the rest of the
# kernel session; whether it has any effect on the already-loaded hub module is not
# visible here — confirm it is still needed.
tf.keras.backend.set_floatx('float16')
pprint('=' * 20 + 'BERT' + '=' * 20)
# Labeled split: pooled (per-document) and sequence (per-token) outputs, still as TF tensors.
l_train_pooledbert_text, l_train_seqbert_text = get_BERT(l_train_df['doc'])

'Converting to BERT....'
tf.Tensor([1123   16], shape=(2,), dtype=int32)


In [29]:
with tf.device("/cpu:0"): # pinned to CPU: the run failed for lack of memory resources otherwise
    # Unlabeled split is embedded only when it is non-empty; the names stay undefined otherwise.
    if u_train_df.shape[0] > 0:
        u_train_pooledbert_text, u_train_seqbert_text = get_BERT(u_train_df['doc'])

'Converting to BERT....'
tf.Tensor([10191    16], shape=(2,), dtype=int32)


In [30]:
# Test split, computed on CPU like the unlabeled split.
with tf.device("/cpu:0"):
    test_pooledbert_text, test_seqbert_text = get_BERT(test_df['doc'])

'Converting to BERT....'
tf.Tensor([7532   16], shape=(2,), dtype=int32)


In [31]:
# Convert the BERT outputs from TF tensors to NumPy arrays for serialisation.
l_train_pooledbert_text = l_train_pooledbert_text.numpy()
l_train_seqbert_text = l_train_seqbert_text.numpy()
# The unlabeled embeddings only exist when there is unlabeled data; use the same
# guard as the cell that creates them to avoid a NameError.
if u_train_df.shape[0] > 0:
    u_train_pooledbert_text = u_train_pooledbert_text.numpy()
    u_train_seqbert_text = u_train_seqbert_text.numpy()
test_pooledbert_text = test_pooledbert_text.numpy()
test_seqbert_text = test_seqbert_text.numpy()

In [None]:
#### embed all documents with ELMO
# Config threading params
# TensorFlow rejects thread-pool changes once its runtime has been initialised
# (which loading/running the hub models earlier in the notebook does), so treat
# this tuning as best-effort instead of aborting the cell.
try:
    tf.config.threading.set_intra_op_parallelism_threads(2)
    tf.config.threading.set_inter_op_parallelism_threads(2)
except RuntimeError as err:
    pprint('Could not change TF threading settings: {}'.format(err))

with tf.device('/CPU:0'):
    pprint('=' * 20 + 'ELMO' + '=' * 20)
    l_train_pooledelmo_text, l_train_seqelmo_text = get_ELMO(l_train_df['doc'])
    if u_train_df.shape[0] > 0:
        u_train_pooledelmo_text, u_train_seqelmo_text = get_ELMO(u_train_df['doc'])
    # get_ELMO takes only the document series; the stray `session` argument
    # (an undefined name) has been dropped.
    test_pooledelmo_text, test_seqelmo_text = get_ELMO(test_df['doc'])
#end with

pprint('POOLEDELMO: Labeled training documents embedded into {} dimensions'.format(l_train_pooledelmo_text.shape))
if u_train_df.shape[0] > 0:
    pprint('POOLEDELMO: Unlabeled training documents embedded into {} dimensions'.format(u_train_pooledelmo_text.shape))

pprint('SEQELMO: Labeled training documents embedded into {} dimensions'.format(l_train_seqelmo_text.shape))
if u_train_df.shape[0] > 0:
    pprint('SEQELMO: Unlabeled training documents embedded into {} dimensions'.format(u_train_seqelmo_text.shape))

In [32]:
#### binarize targets
# The binarizer is fitted on every training label (labeled and unlabeled), then
# applied to each split separately.
lb = LabelBinarizer()
lb.fit(train_df['cat'].values)
l_train_cat_bin = lb.transform(l_train_df['cat'].values)
if len(u_train_df) > 0:
    u_train_cat_bin = lb.transform(u_train_df['cat'].values)
pprint(f'Binarized Classes: {lb.classes_}')
test_cat_bin = lb.transform(test_df['cat'].values)

#### encode targets
# Same procedure with an integer label encoder.
le = LabelEncoder()
le.fit(train_df['cat'].values)
l_train_cat_en = le.transform(l_train_df['cat'].values)
if len(u_train_df) > 0:
    u_train_cat_en = le.transform(u_train_df['cat'].values)
pprint(f'Encoded Classes: {le.classes_}')
test_cat_en = le.transform(test_df['cat'].values)

("Binarized Classes: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'\n"
 " 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'\n"
 " 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'\n"
 " 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'\n"
 " 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'\n"
 " 'talk.politics.misc' 'talk.religion.misc']")
("Encoded Classes: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'\n"
 " 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'\n"
 " 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'\n"
 " 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'\n"
 " 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'\n"
 " 'talk.politics.misc' 'talk.religion.misc']")


In [34]:
#### Save all embedded documents
def _build_records(embeddings, cat_bin, cat_en, ids):
    """Combine per-document embeddings and labels into one dict per document.

    Args:
        embeddings: Mapping of output key (e.g. ``'pooledbert'``) to an
            indexable holding one embedding per document.
        cat_bin: Binarized labels, one row per document; its length drives
            the number of records.
        cat_en: Integer-encoded labels, aligned with ``cat_bin``.
        ids: Document ids, aligned with ``cat_bin``.

    Returns:
        List of dicts holding the embedding keys followed by ``cat_bin``,
        ``cat_en`` and ``id``.
    """
    records = []
    for i, label in enumerate(cat_bin):
        record = {name: matrix[i] for name, matrix in embeddings.items()}
        record.update(cat_bin=label, cat_en=cat_en[i], id=ids[i])
        records.append(record)
    return records


# To store further embeddings (tfidf, doc2vec, use, pooledelmo, seqelmo, ...),
# add them to the dict passed as first argument, e.g. tfidf=l_train_tfidf_text.

#### save labeled train data to output path
if config['labeled_train_out']:
    l_train_data = _build_records(
        dict(pooledbert=l_train_pooledbert_text,
             seqbert=l_train_seqbert_text),
        l_train_cat_bin,
        l_train_cat_en,
        l_train_df['id'].values)
    joblib.dump(
        l_train_data,
        config['labeled_train_out'],
        compress=3)
# end if

#### save unlabeled train data to output path
if config['unlabeled_train_out'] and u_train_df.shape[0] > 0:
    u_train_data = _build_records(
        dict(pooledbert=u_train_pooledbert_text,
             seqbert=u_train_seqbert_text),
        u_train_cat_bin,
        u_train_cat_en,
        u_train_df['id'].values)
    joblib.dump(
        u_train_data,
        config['unlabeled_train_out'],
        compress=3)
# end if

#### save test data to output path
if config['test_out']:
    test_data = _build_records(
        dict(pooledbert=test_pooledbert_text,
             seqbert=test_seqbert_text),
        test_cat_bin,
        test_cat_en,
        test_df['id'].values)
    joblib.dump(
        test_data,
        config['test_out'],
        compress=3)
# end if

#### save encoder to output path
if config['encoder_out']:
    joblib.dump(
        le,
        config['encoder_out'],
        compress=3)
# end if

#### save binarizer to output path
if config['binarizer_out']:
    joblib.dump(
        lb,
        config['binarizer_out'],
        compress=3)
# end if

[-0.7676421  -0.46616325 -0.80833274  0.6734823   0.33734703 -0.27703196
  0.60597163  0.5224088  -0.8149108  -0.9998391  -0.4292998   0.9429371
  0.9663594   0.3679517   0.5756684  -0.8142145  -0.59017307 -0.43645248
  0.42589492 -0.14058141  0.55344635  0.99997294 -0.32688245  0.4266491
  0.50261784  0.9815946  -0.82330453  0.7125289   0.9123911   0.56843674
 -0.592455    0.49326822 -0.98284304 -0.20813778 -0.89326036 -0.9788771
  0.38495365 -0.26637498 -0.10426681 -0.01124939 -0.4199149   0.45385128
  0.9997956  -0.06899013  0.46134266 -0.3337812  -0.9999993   0.28515524
 -0.6364738   0.81394595  0.869247    0.88619417  0.35010263  0.4629448
  0.59282386 -0.44816735  0.02279369  0.38343647 -0.2338827  -0.55296415
 -0.62468624  0.627178   -0.8014105  -0.75086355  0.950226    0.5828002
 -0.49729773 -0.3209368  -0.10257295 -0.12500489  0.6877629   0.23662205
  0.01854715 -0.74639153  0.7118426   0.27826643 -0.68398416  1.
 -0.72101104 -0.9403801   0.8814371   0.6914073   0.685079   -0.

(300,)


In [35]:
# Sanity check: shape of the per-token BERT embedding stored for one test record.
test_data[20]['seqbert'].shape

(16, 768)