In [2]:
from pprint import pprint
from argparse import ArgumentParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from keras.preprocessing import sequence
from keras.preprocessing import text
from collections import Counter
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize

import os
import joblib
import logging
import numpy as np
import yaml
import random
import gc
import multiprocessing as mp
from multiprocessing import cpu_count
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
import tokenization

In [3]:
# Constants
# Location of the YAML experiment configuration that the main cell opens and
# parses with yaml.safe_load. The path is relative, so it is resolved against
# the kernel's current working directory.
config_path = '../config/20news.yaml'

In [4]:
def load_from_path(df_path, rand=False, rand_seed=4079):
    """Read a CSV file into a DataFrame, optionally shuffling its rows.

    Args:
        df_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.
        rand: When truthy, the rows are shuffled before being returned.
        rand_seed: ``random_state`` handed to ``shuffle`` so that the
            permutation is repeatable.

    Returns:
        The loaded DataFrame, in file order when ``rand`` is falsy and in
        shuffled order otherwise.
    """
    frame = pd.read_csv(df_path)
    if not rand:
        # Nothing else to do: hand back the rows exactly as read.
        return frame
    return shuffle(frame, random_state=rand_seed)

In [5]:
def load_df(path):
    """Load a shuffled dataset CSV and normalise its column dtypes.

    The file is read through ``load_from_path`` with shuffling enabled (using
    that function's default seed). The ``id`` and ``cat`` columns are cast to
    pandas categoricals and ``doc`` is cast to ``str``.

    Args:
        path: Location of the CSV file; it must contain ``id``, ``cat`` and
            ``doc`` columns.

    Returns:
        The shuffled DataFrame with the three columns cast as described.
    """
    column_dtypes = {'id': 'category', 'cat': 'category', 'doc': str}
    return load_from_path(path, rand=True).astype(column_dtypes)
# end def

In [6]:
# main
# Parse the YAML experiment configuration found at config_path.
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)
# end with
pprint('=' * 20 + 'Configs' + '=' * 20)
pprint(config['train'])

# Load (and shuffle) the train and test sets named in the configuration.
train_df = load_df(config['train'])
test_df = load_df(config['test'])

# Every training row starts out unlabeled (0); the loop below flips a
# per-class random sample to labeled (1).
train_df['labeled'] = 0
#### add x% of EACH CLASS in the train_df to L
cat_count = Counter(train_df['cat'])
random.seed(config['seed'])  # fixes the per-class sampling below
n_train = train_df.shape[0]  # loop-invariant, so computed once
ratio = []
for k, v in cat_count.items():
    # Record this class's share of the training set, keyed by the category
    # itself. dict(k=...) would have used the literal string 'k' as the key
    # for every class.
    ratio.append({k: v / n_train})
    cat_id = list(train_df[train_df['cat'] == k]['id'].values)
    # x% = config['percent']; int() truncates the per-class sample size.
    rand_id = random.sample(cat_id, int(config['percent'] * v))
    train_df.loc[train_df['id'].isin(rand_id), 'labeled'] = 1
# end for

# Split the training frame into its labeled (L) and unlabeled (U) parts.
l_train_df = train_df.loc[train_df['labeled'] == 1]
u_train_df = train_df.loc[train_df['labeled'] == 0]
pprint('LABELED has {} data'.format(l_train_df.shape[0]))
pprint('UNLABELED has {} data'.format(u_train_df.shape[0]))

'../data/20news/train.csv'
'LABELED has 1123 data'
'UNLABELED has 10191 data'


In [7]:
#embed all documents with doc2vec
def _infer_doc_vectors(d2v_model, docs):
    """Infer one vector per document with ``d2v_model`` and stack them.

    Each document string is stripped and split on whitespace before being
    passed to ``infer_vector``; the results are wrapped in a numpy array.
    """
    return np.array([d2v_model.infer_vector(text.strip().split()) for text in docs])


pprint('=' * 20 + 'Embedding with doc2vec' + '=' * 20)
model = Doc2Vec.load(config['embed']['doc2vec_path'])
has_unlabeled = len(u_train_df) > 0

# Embed in the order labeled train -> unlabeled train -> test.
l_train_doc2vec = _infer_doc_vectors(model, l_train_df['doc'].values)
if has_unlabeled:
    u_train_doc2vec = _infer_doc_vectors(model, u_train_df['doc'].values)
test_doc2vec = _infer_doc_vectors(model, test_df['doc'].values)

pprint('DOC2VEC: Labeled training documents embedded into {} dimensions'.format(l_train_doc2vec.shape))
if has_unlabeled:
    pprint('DOC2VEC: Unlabeled training documents embedded into {} dimensions'.format(u_train_doc2vec.shape))

# The model is not needed once every split is embedded; release it.
del model
gc.collect()

'DOC2VEC: Labeled training documents embedded into (1123, 300) dimensions'
'DOC2VEC: Unlabeled training documents embedded into (10191, 300) dimensions'


22

In [8]:
# Both target transformers are fitted on the categories of the whole training
# frame and then applied to the labeled, unlabeled (when present) and test splits.
unlabeled_present = u_train_df.shape[0] != 0
train_cats = train_df['cat'].values

#### binarize train target
lb = LabelBinarizer()
lb.fit(train_cats)
l_train_cat_bin = lb.transform(l_train_df['cat'].values)
if unlabeled_present:
    u_train_cat_bin = lb.transform(u_train_df['cat'].values)
pprint('Binarized Classes: {}'.format(lb.classes_))
#### binarize test target
test_cat_bin = lb.transform(test_df['cat'].values)

#### encode train target
le = LabelEncoder()
le.fit(train_cats)
l_train_cat_en = le.transform(l_train_df['cat'].values)
if unlabeled_present:
    u_train_cat_en = le.transform(u_train_df['cat'].values)
pprint('Encoded Classes: {}'.format(le.classes_))
#### encode test target
test_cat_en = le.transform(test_df['cat'].values)

("Binarized Classes: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'\n"
 " 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'\n"
 " 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'\n"
 " 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'\n"
 " 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'\n"
 " 'talk.politics.misc' 'talk.religion.misc']")
("Encoded Classes: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'\n"
 " 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'\n"
 " 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'\n"
 " 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'\n"
 " 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'\n"
 " 'talk.politics.misc' 'talk.religion.misc']")


In [11]:
#### Save all embedded documents
def _build_records(doc2vec, cat_bin, cat_en, ids):
    """Bundle the per-document arrays of one split into a list of dicts.

    Args:
        doc2vec: Indexable of doc2vec vectors, one per document.
        cat_bin: Iterable of binarized targets; its length drives the loop.
        cat_en: Indexable of integer-encoded targets.
        ids: Indexable of document ids.

    Returns:
        One dict per document with the keys ``doc2vec``, ``cat_bin``,
        ``cat_en`` and ``id``.
    """
    # Only the doc2vec embedding is stored. The other embeddings (fasttext,
    # tfidf, use, pooled/seq bert, biobert, elmo) are currently disabled; add
    # a key here if one of them is reintroduced.
    return [
        dict(
            doc2vec=doc2vec[i],
            cat_bin=label,
            cat_en=cat_en[i],
            id=ids[i])
        for i, label in enumerate(cat_bin)]


#### save labeled train data to output path
if config['labeled_train_out']:
    l_train_data = _build_records(
        l_train_doc2vec,
        l_train_cat_bin,
        l_train_cat_en,
        l_train_df['id'].values)
    joblib.dump(
        l_train_data,
        config['labeled_train_out'],
        compress=3)
# end if

#### save unlabeled train data to output path
# The unlabeled arrays only exist when the unlabeled split is non-empty.
if config['unlabeled_train_out'] and u_train_df.shape[0] > 0:
    u_train_data = _build_records(
        u_train_doc2vec,
        u_train_cat_bin,
        u_train_cat_en,
        u_train_df['id'].values)
    joblib.dump(
        u_train_data,
        config['unlabeled_train_out'],
        compress=3)
# end if

#### save test data to output path
if config['test_out']:
    test_data = _build_records(
        test_doc2vec,
        test_cat_bin,
        test_cat_en,
        test_df['id'].values)
    joblib.dump(
        test_data,
        config['test_out'],
        compress=3)
# end if

#### save encoder to output path
if config['encoder_out']:
    joblib.dump(
        le,
        config['encoder_out'],
        compress=3)
# end if

#### save binarizer to output path
if config['binarizer_out']:
    joblib.dump(
        lb,
        config['binarizer_out'],
        compress=3)
# end if

10191