In [1]:
# download livedoor news corpus

In [2]:
# Create the corpus and model folders, then download the livedoor news corpus archive.
!mkdir -p ./data/corpus/livedoor
!mkdir -p ./data/model
!wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
# -C unpacks the archive into the corpus folder instead of the working directory.
!tar -zxf ldcc-20140209.tar.gz -C ./data/corpus/livedoor

livedoor_news  livedoor_sentences_nva.pkl


In [None]:
# Root folders used by the later cells; both are created by the `mkdir -p`
# commands in the download cell.
corpus_dir = './data/corpus/livedoor/'
# Was './data/moel' (typo): that folder is never created, so saving models
# into it would fail.
model_dir = './data/model'

In [32]:
# library

import os
import gc
import pandas as pd
import numpy as np

# NLP
import re
import MeCab
# Shared tagger used by the tokenizer functions below. The trailing comment
# keeps an optional user-dictionary flag (-u) for reference.
mecab = MeCab.Tagger (r"-Ochasen") #  -u ./dict/qiita.dic
# NOTE(review): parsing an empty string right after construction looks like the
# usual mecab-python3 warm-up workaround for parseToNode() returning broken
# surfaces — confirm it is still needed with the installed version.
mecab.parse("")

# SCDV
from gensim.models import Word2Vec
from gensim.models import FastText
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture

# ML
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [6]:
# japanese plot setting

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# TrueType font used for plot text (this cell is the "japanese plot setting").
# NOTE(review): absolute, system-specific path — confirm the font is installed
# at this location on the machine running the notebook.
font_path = "/usr/share/fonts/truetype/takao-gothic/TakaoGothic.ttf"
font_prop = FontProperties(fname=font_path)
# Use the family name read from the font file as matplotlib's default family.
mpl.rcParams['font.family'] = font_prop.get_name()

In [7]:
# cloudpickle

import cloudpickle

def load_from_pkl( fpath ):
    """Read a file and deserialize its content with cloudpickle.

    Parameters
    ----------
    fpath : str
        Path of the pickle file to read.

    Returns
    -------
    object
        The object stored in the file.

    Notes
    -----
    Unpickling can execute arbitrary code: only load files produced by this
    notebook (or another trusted source).
    """
    # The context manager closes the handle even when deserialization raises;
    # previously the file was opened and never closed.
    with open(fpath, 'rb') as frb:
        return cloudpickle.loads(frb.read())

def save_as_pkl( obj, fpath ):
    """Serialize ``obj`` with cloudpickle and write it to ``fpath``.

    Parameters
    ----------
    obj : object
        Object to serialize.
    fpath : str
        Destination path; an existing file is overwritten.

    Returns
    -------
    None
    """
    # Closing the handle deterministically guarantees the bytes are flushed to
    # disk before the function returns; previously the file was left open.
    with open(fpath, 'wb') as fwb:
        fwb.write(cloudpickle.dumps(obj))
    return

In [8]:
# tokenize

def get_tokens( text, ):
    """Split ``text`` with the module-level MeCab tagger.

    Returns the ``surface`` of every node produced by ``mecab.parseToNode``,
    in order, without any filtering.
    """
    def walk(node):
        # Follow the linked list of nodes until it runs out.
        while node:
            yield node
            node = node.next

    return [node.surface for node in walk(mecab.parseToNode(text))]

def get_tokens_pos( text, target_pos ):
    """Split ``text`` with MeCab and keep only tokens of selected parts of speech.

    A node is kept when the first comma-separated field of its ``feature``
    string satisfies ``field in target_pos``. ``target_pos`` is therefore
    expected to be a collection of POS names; with a plain string the ``in``
    test becomes a substring match.
    """
    kept = []
    node = mecab.parseToNode( text )
    while node:
        # Only the leading field is needed, so stop splitting after it.
        major_pos = node.feature.split(',', 1)[0]
        if major_pos in target_pos:
            kept.append( node.surface )
        node = node.next
    return kept

In [26]:
class SCDV(object):
    """Document vectors built from word embeddings and a soft clustering of them.

    Every vocabulary word gets a "word-topic" vector: its embedding repeated
    once per cluster, each copy scaled by the word's membership probability for
    that cluster and by the word's IDF. A document vector is the L2-normalised
    sum of the word-topic vectors of its tokens; entries whose magnitude falls
    below a threshold learnt in :meth:`train` are set to zero.

    Parameters
    ----------
    w2v_model :
        Trained embedding model. The code reads ``vector_size``,
        ``wv.vectors``, ``wv.index2word`` and ``wv[word]`` from it.
    sc_model :
        Soft-clustering estimator exposing ``n_components``, ``fit``,
        ``predict`` and ``predict_proba``. It is fitted in place on the
        embedding matrix by the constructor.
    sparse_percentage : float
        Multiplier applied to the averaged extreme values to obtain the
        sparsification threshold.
    """

    def __init__(self, w2v_model, sc_model, sparse_percentage):

        # values
        self.w2v_model = w2v_model
        self.num_clusters = sc_model.n_components
        self.w2v_vector_size = w2v_model.vector_size
        # Running sums of the per-document minimum / maximum, filled by train().
        self.min_no = .0
        self.max_no = .0
        self.sparse_percentage = sparse_percentage

        # apply soft clustering to embedding vectors
        self.w2v_vectors = w2v_model.wv.vectors
        idx, idx_proba = self._soft_clustering( sc_model, self.w2v_vectors )
        vocabulary = w2v_model.wv.index2word
        # word -> most likely cluster index
        self.word_centroid_map = dict( zip( vocabulary, idx ) )
        # word -> array of membership probabilities, one per cluster
        self.word_centroid_prob_map = dict( zip( vocabulary, idx_proba ) )

        return

    def precompute_word_topic_vector( self, sentences ):
        """Compute IDF weights from ``sentences`` and build the word-topic vectors.

        Must be called before :meth:`train` / :meth:`infer_vector`, which read
        ``self.wv``.

        Parameters
        ----------
        sentences : list of list of str
            Tokenised documents used to fit the IDF weights.
        """
        # compute idf values
        self.featurenames, self.word_idf_dict = self._compute_idf_values( sentences )

        # compute word topic vectors
        self.wv = self._get_probability_word_vectors( self.w2v_model,
                                                      self.num_clusters,
                                                      self.w2v_vector_size,
                                                      self.featurenames,
                                                      self.word_centroid_map,
                                                      self.word_centroid_prob_map,
                                                      self.word_idf_dict)
        return

    def train( self, sentences ):
        """Build document vectors and learn the sparsification threshold.

        Parameters
        ----------
        sentences : list of list of str
            Tokenised training documents.

        Returns
        -------
        numpy.ndarray
            float32 matrix of shape
            ``(len(sentences), num_clusters * w2v_vector_size)``.

        Raises
        ------
        ZeroDivisionError
            If ``sentences`` is empty.
        """
        doc_num = len(sentences)

        # Start from a clean slate: without this reset a second call to train()
        # kept adding to the previous sums and inflated the threshold.
        self.min_no = .0
        self.max_no = .0

        X = self._document_matrix( sentences, train=True )

        # Threshold = sparse_percentage * mean of |average max| and |average min|.
        thres = (abs( self.max_no / float( doc_num ) ) + abs( self.min_no / float( doc_num ) )) / 2
        self.sparse_thres = thres * self.sparse_percentage

        # Zero out every entry whose magnitude is below the threshold.
        X[abs(X) < self.sparse_thres] = 0

        return X

    def infer_vector( self, sentences ):
        """Build document vectors using the threshold learnt by :meth:`train`.

        Parameters
        ----------
        sentences : list of list of str
            Tokenised documents.

        Returns
        -------
        numpy.ndarray
            float32 matrix of shape
            ``(len(sentences), num_clusters * w2v_vector_size)``.
        """
        X = self._document_matrix( sentences, train=False )

        # Zero out every entry whose magnitude is below the trained threshold.
        X[abs(X) < self.sparse_thres] = 0

        return X

    def _document_matrix( self, sentences, train ):
        """Stack the normalised (not yet sparsified) vector of each document."""
        X = np.zeros( (len(sentences), self.num_clusters * self.w2v_vector_size), dtype="float32" )
        for row, tokens in enumerate( sentences ):
            X[row] = self._create_cluster_vector_and_gwbowv( self.wv, tokens,
                                                             self.word_centroid_map,
                                                             self.word_centroid_prob_map,
                                                             self.w2v_vector_size,
                                                             self.word_idf_dict,
                                                             self.featurenames,
                                                             self.num_clusters,
                                                             train=train )
        return X

    def _create_cluster_vector_and_gwbowv( self, prob_wordvecs, wordlist, word_centroid_map, word_centroid_prob_map, dimension, word_idf_dict, featurenames, num_centroids, train=False):
        """Return the L2-normalised sum of the word-topic vectors of ``wordlist``.

        Words missing from ``word_centroid_map`` are skipped. When ``train`` is
        true, the vector's minimum and maximum are added to ``self.min_no`` /
        ``self.max_no``. Several parameters are accepted but not used here.
        """
        bag_of_centroids = np.zeros( num_centroids * dimension, dtype="float32" )
        for word in wordlist:
            if word not in word_centroid_map: continue
            bag_of_centroids += prob_wordvecs[word]

        norm = np.sqrt(np.einsum('...i,...i', bag_of_centroids, bag_of_centroids))
        if norm != 0: bag_of_centroids /= norm

        # to make feature vector sparse, make note of minimum and maximum values.
        if train:
            # ndarray reductions instead of the Python builtins, which walk the
            # array element by element; float() keeps the sums plain Python floats.
            self.min_no += float(bag_of_centroids.min())
            self.max_no += float(bag_of_centroids.max())

        return bag_of_centroids

    @staticmethod
    def _soft_clustering( sc_model, word_vectors):
        """Fit ``sc_model`` on ``word_vectors``; return (hard labels, membership probabilities)."""
        sc_model.fit(word_vectors)
        idx = sc_model.predict(word_vectors)
        idx_proba = sc_model.predict_proba(word_vectors)

        return (idx, idx_proba)

    @staticmethod
    def _compute_idf_values( sentences ):
        """Fit a TF-IDF vectorizer on pre-tokenised documents.

        Returns
        -------
        (list, dict)
            The vocabulary terms and a ``term -> idf`` mapping.
        """
        # Identity tokenizer / preprocessor: the documents are already token lists.
        tfv = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, dtype=np.float32)
        # Only the fitted vocabulary and IDF weights are needed, not the matrix.
        tfv.fit(sentences)

        # get_feature_names() no longer exists in current scikit-learn; prefer
        # its replacement when present and fall back for older versions.
        if hasattr(tfv, "get_feature_names_out"):
            featurenames = list(tfv.get_feature_names_out())
        else:
            featurenames = tfv.get_feature_names()

        # Public accessor instead of the private `_tfidf` attribute.
        idf_dict = dict( zip( featurenames, tfv.idf_ ) )

        return featurenames, idf_dict

    @staticmethod
    def _get_probability_word_vectors( w2v_model, num_clusters, num_features, featurenames, word_centroid_map, word_centroid_prob_map, word_idf_dict):
        """Build the word-topic vector of every word in ``word_centroid_map``.

        Slot ``c`` of a word's vector holds
        ``embedding * P(cluster c | word) * idf(word)``. Words without an IDF
        entry keep an all-zero vector.
        """
        prob_wordvecs = {}
        for word in word_centroid_map:
            word_vec = np.zeros( num_clusters * num_features, dtype="float32" )
            prob_wordvecs[word] = word_vec
            # The IDF lookup does not depend on the cluster, so test it once
            # per word instead of once per cluster.
            if word not in word_idf_dict: continue
            embedding = w2v_model.wv[word]
            cluster_probs = word_centroid_prob_map[word]
            idf = word_idf_dict[word]
            for c_idx in range(num_clusters):
                word_vec[c_idx*num_features:(c_idx+1)*num_features] = embedding * cluster_probs[c_idx] * idf

        return prob_wordvecs

In [29]:
# Read every article under <corpus_dir>/text/<category>/ and label it with its
# category (the sub-folder name).
text_dir = '%s/text' % corpus_dir
files = os.listdir(text_dir)
# os.listdir() returns entries in arbitrary order. Sorting makes the document
# order reproducible, which matters because `texts` / `y` must stay aligned
# with the tokenised sentences cached on disk by a later cell.
# NOTE: token pickles cached under a different document order must be deleted
# and regenerated.
class_list = sorted( f for f in files if os.path.isdir(os.path.join(text_dir, f)) )

skipline = 2  # number of leading lines dropped from each article
y = []
texts = []
for c in class_list:
    c_dir = '%s/%s' % (text_dir, c)
    for file in sorted( os.listdir( c_dir ) ):
        if file == 'LICENSE.txt': continue
        # Explicit encoding so the result does not depend on the platform default.
        # NOTE(review): assumes the corpus files are UTF-8 — confirm.
        with open( '%s/%s' % ( c_dir, file ), encoding='utf-8' ) as f:
            # next(f, None) tolerates files shorter than `skipline` lines.
            for _ in range( skipline ): next(f, None)
            text = f.read()
            texts.append( text )
            y.append( c )
y = np.array(y)
doc_num = len( texts )

# Input values (search space)
- tokenization
  - 名詞のみ (nouns only), 内容語 (content words: 名詞, 形容詞, 動詞 = nouns, adjectives, verbs), all
- w2v model
  - model
    - word2vec, fastText
  - feature_dim
    - 50, 100, 200
- soft clustering (GMM)
  - cluster_num
    - 10, 20, 30, 40, 50
- sparse percentage
  - 0.1, 0.2, ..., 0.5

In [49]:
# Candidate values for each tuned parameter.
# Tokenisation entries are (short name used in file names, POS tags to keep).
tokenize_list = [
    ('n', ['名詞']),
    ('nva', ['名詞', '動詞', '形容詞']),
    ('all', 'all'),
]
word_embedding_list = ['word2vec', 'fastText']
num_features_list = [50, 100, 200]  # embedding dimensionality
num_clusters_list = list(range(10, 60, 10))
sparse_percentage_list = [tenths / 10 for tenths in range(1, 6)]

def objective(trial):
    """Optuna objective: 5-fold cross-validated error of a LightGBM classifier on SCDV features.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the tokenisation, embedding type, embedding size,
        number of GMM clusters and sparsity percentage.

    Returns
    -------
    float
        ``1 - mean accuracy`` over the five validation folds.

    Raises
    ------
    ValueError
        If the sampled embedding name is neither 'word2vec' nor 'fastText'.
    """
    # parameters
    token_type = trial.suggest_categorical('token_type', tokenize_list)
    word_embedding = trial.suggest_categorical('word_embedding', word_embedding_list)
    num_features = trial.suggest_categorical('num_features', num_features_list)
    num_clusters = trial.suggest_categorical('num_clusters', num_clusters_list)
    sparse_percentage = trial.suggest_categorical('sparse_percentage', sparse_percentage_list)

    # Read from the same folders the tokenising / embedding cells write to.
    # The previous hard-coded '/data/corpus/...' and '/data/model/...' paths did
    # not match `corpus_dir` / `model_dir`.
    token_name = token_type[0]
    sentences = load_from_pkl( '%s/livedoor_sentences_%s.pkl' % (corpus_dir, token_name) )
    model_path = '%s/livedoor_%s_%s_%s.model' % ( model_dir, word_embedding, num_features, token_name )

    if word_embedding == 'word2vec':
        w2v_model = Word2Vec.load( model_path )
    elif word_embedding == 'fastText':
        w2v_model = FastText.load( model_path )
    else:
        # Fail loudly instead of hitting an unbound `w2v_model` further down.
        raise ValueError('unknown word embedding: %r' % (word_embedding,))

    # set gmm (seeded so that a trial's score is reproducible)
    gmm = GaussianMixture(n_components=num_clusters, covariance_type="tied", init_params='kmeans', max_iter=50, random_state=777)

    # learn (5-fold cv)
    acc_score_cv = []
    kf = StratifiedKFold(n_splits=5, random_state=777, shuffle=True)
    for train, val in kf.split(sentences, y):

        sentences_train = [ sentences[i] for i in train ]
        sentences_val = [ sentences[i] for i in val ]

        # train scdv (the SCDV constructor fits `gmm` on the embedding matrix)
        scdv_model = SCDV(w2v_model=w2v_model, sc_model=gmm, sparse_percentage = sparse_percentage )
        scdv_model.precompute_word_topic_vector(sentences_train)
        X_train = scdv_model.train(sentences_train)

        # learn model
        clf = lgb.LGBMClassifier(n_estimators=10, objective="multiclass")
        clf.fit(X_train, y[train])

        # test scdv
        X_val = scdv_model.infer_vector(sentences_val)

        # predict validation and score (arguments in y_true, y_pred order)
        preds = clf.predict(X_val)
        acc_score_cv.append( accuracy_score( y[val], preds ) )

    # cv score; the study minimises the returned value, hence 1 - accuracy
    mean = sum( acc_score_cv ) / len( acc_score_cv )
    return 1.0 - mean

In [34]:
# tokenize

# Tokenise the corpus once per tokenisation scheme and cache the result on disk.
for token_type in tokenize_list:

    target_pkl = '%s/livedoor_sentences_%s.pkl' % (corpus_dir, token_type[0])
    if not os.path.exists( target_pkl ):
        # No cached file yet: tokenise every article and store the result.
        print( token_type )

        if token_type[0] != 'all':
            # keep only the parts of speech listed for this scheme
            sentences = [ get_tokens_pos(text, token_type[1]) for text in texts ]
        else:
            # keep every token
            sentences = [ get_tokens(text) for text in texts ]

        save_as_pkl( sentences, target_pkl )

In [44]:
# learn word embeddings (word2vec and fastText)

# static
min_word_count = 10 # Minimum word count
context = 10 # Context window size

# Make sure the output folder exists before any model is saved into it.
os.makedirs( model_dir, exist_ok=True )

for token_type in tokenize_list:

    # Tokenised corpus for this scheme: loaded lazily and at most once, instead
    # of once per model, and not at all when every model is already cached.
    sentences = None

    for word_embedding in word_embedding_list:
        for num_features in num_features_list:

            target_model = '%s/livedoor_%s_%s_%s.model' % ( model_dir, word_embedding, num_features, token_type[0] )
            if os.path.exists( target_model ): continue
            print( word_embedding, num_features, token_type[0] )

            if word_embedding == 'word2vec':
                model = Word2Vec(size=num_features, sg=1, workers=7, window=context, min_count=min_word_count)
            elif word_embedding == 'fastText':
                model = FastText(size=num_features, sg=1, workers=7, window=context, min_count=min_word_count)
            else:
                # Avoid silently reusing the model of the previous iteration.
                raise ValueError('unknown word embedding: %r' % (word_embedding,))

            if sentences is None:
                sentences = load_from_pkl( '%s/livedoor_sentences_%s.pkl' % (corpus_dir, token_type[0]) )

            model.build_vocab( sentences )
            model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

            # save model
            model.save( target_model )

In [None]:
import warnings

# Suppress every warning from here on.
warnings.simplefilter('ignore')

# Create a study with default settings, evaluate the objective ten times and
# report the best parameter set found.
study = optuna.create_study()
study.optimize(objective, n_trials=10)
print('params:', study.best_params)

[I 2019-03-13 06:23:32,889] Finished a trial resulted in value: 0.172383289679928. Current best value is 0.172383289679928 with parameters: {'token_type': ('nva', ['名詞', '動詞', '形容詞']), 'word_embedding': 'word2vec', 'num_features': 200, 'num_clusters': 30, 'sparse_percentage': 0.2}.
[I 2019-03-13 06:25:52,753] Finished a trial resulted in value: 0.2608897833270084. Current best value is 0.172383289679928 with parameters: {'token_type': ('nva', ['名詞', '動詞', '形容詞']), 'word_embedding': 'word2vec', 'num_features': 200, 'num_clusters': 30, 'sparse_percentage': 0.2}.
[I 2019-03-13 06:30:56,624] Finished a trial resulted in value: 0.13302737885803295. Current best value is 0.13302737885803295 with parameters: {'token_type': ('all', 'all'), 'word_embedding': 'word2vec', 'num_features': 100, 'num_clusters': 50, 'sparse_percentage': 0.1}.
