In [1]:
# download livedoor news corpus

In [1]:
!mkdir -p ./data/corpus/livedoor
!mkdir -p ./data/model
!wget https://www.rondhuit.com/download/ldcc-20140209.tar.gz
!tar -zxf ldcc-20140209.tar.gz -C ./data/corpus/livedoor

--2019-03-13 07:06:20--  https://www.rondhuit.com/download/ldcc-20140209.tar.gz
Resolving www.rondhuit.com (www.rondhuit.com)... 59.106.19.174
Connecting to www.rondhuit.com (www.rondhuit.com)|59.106.19.174|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8855190 (8.4M) [application/x-gzip]
Saving to: ‘ldcc-20140209.tar.gz.1’


2019-03-13 07:06:21 (13.4 MB/s) - ‘ldcc-20140209.tar.gz.1’ saved [8855190/8855190]



In [9]:
# Directory the livedoor archive is extracted into by the download cell.
corpus_dir = './data/corpus/livedoor/'
# Directory for trained embedding models; this is the directory the download cell creates
# with `mkdir -p ./data/model` (the previous value './data/moel' pointed at a path that is never created).
model_dir = './data/model'

In [3]:
# library

import os
import gc
import pandas as pd
import numpy as np

# NLP
import re
import MeCab
# Module-level MeCab tagger shared by the tokenizer helpers (get_tokens / get_tokens_pos).
# The trailing "-u ..." note shows how a user dictionary could be supplied; it is not passed here.
mecab = MeCab.Tagger (r"-Ochasen") #  -u ./dict/qiita.dic
# NOTE(review): parsing an empty string once looks like the common mecab-python3 workaround
# that keeps parseToNode() from returning nodes with broken surfaces -- confirm before removing.
mecab.parse("")

# SCDV
from gensim.models import Word2Vec
from gensim.models import FastText
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture

# ML
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [4]:
# japanese plot setting

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Load a Japanese-capable font from an explicit file so that Japanese labels can be rendered in plots.
# NOTE(review): this absolute path presumably only exists on hosts with the Takao fonts installed
# in this location -- confirm, or make it configurable, before running elsewhere.
font_path = "/usr/share/fonts/truetype/takao-gothic/TakaoGothic.ttf"
font_prop = FontProperties(fname=font_path)
# Use the loaded font's family name as matplotlib's default font family.
mpl.rcParams['font.family'] = font_prop.get_name()

In [5]:
# cloudpickle

import cloudpickle

def load_from_pkl( fpath ):
    """Load and return the object stored in the pickle file at ``fpath``.

    The file is read in binary mode and closed as soon as its content has been read
    (the previous version left the handle open until garbage collection).

    WARNING: unpickling can execute arbitrary code; only load files you created yourself.
    """
    with open(fpath, 'rb') as frb:
        return cloudpickle.loads(frb.read())

def save_as_pkl( obj, fpath ):
    """Serialize ``obj`` with cloudpickle and write it to ``fpath``.

    An existing file at ``fpath`` is overwritten. The file is closed (and therefore flushed)
    before the function returns, so the data can be read back immediately. Returns ``None``.
    """
    with open( fpath, 'wb') as fwb:
        fwb.write(cloudpickle.dumps(obj))

In [6]:
# tokenize

def get_tokens( text, ):
    """Split ``text`` into surface-form tokens using the module-level ``mecab`` tagger.

    Nodes with an empty surface are skipped. MeCab's node list is expected to begin and end
    with BOS/EOS sentinel nodes whose surface is empty; previously these ended up in the
    result as '' tokens.

    Returns a list of token strings in sentence order (empty list if nothing was tokenized).
    """
    tokens = []
    node = mecab.parseToNode( text )
    while node:
        if node.surface:
            tokens.append( node.surface )
        node = node.next
    return tokens

def get_tokens_pos( text, target_pos ):
    """Tokenize ``text`` with the module-level ``mecab`` tagger, keeping selected parts of speech.

    A node is kept when the first comma-separated field of its feature string is contained
    in ``target_pos``. Returns the surfaces of the kept nodes in sentence order.
    """
    selected = []
    node = mecab.parseToNode( text )
    while node:
        major_pos, _, _ = node.feature.partition(',')
        if major_pos in target_pos:
            selected.append( node.surface )
        node = node.next
    return selected

In [7]:
class SCDV(object):
    """Document vectors built from word embeddings, soft clustering and idf weights (SCDV).

    Typical usage::

        scdv = SCDV(w2v_model, sc_model, sparse_percentage)
        scdv.precompute_word_topic_vector(train_sentences)
        X_train = scdv.train(train_sentences)
        X_val = scdv.infer_vector(val_sentences)

    Parameters
    ----------
    w2v_model:
        Trained embedding model. It must expose ``vector_size``, ``wv.vectors``, the vocabulary
        list (``wv.index_to_key`` or the older ``wv.index2word``) and ``wv[word]`` lookup.
    sc_model:
        Soft-clustering model exposing ``n_components``, ``fit``, ``predict`` and
        ``predict_proba``. NOTE: it is fitted on the embedding vectors inside the constructor,
        so a model object shared between instances is re-fitted every time.
    sparse_percentage:
        the threshold percentage for making it sparse
    """

    def __init__(self, w2v_model, sc_model, sparse_percentage):

        # values
        self.w2v_model = w2v_model
        self.num_clusters = sc_model.n_components
        self.w2v_vector_size = w2v_model.vector_size
        self.min_no = .0  # running sum of per-document minimum components (see train)
        self.max_no = .0  # running sum of per-document maximum components (see train)
        self.sparse_percentage = sparse_percentage

        # Vocabulary in the same order as the rows of ``wv.vectors``.
        # Newer gensim exposes it as ``index_to_key``; older releases as ``index2word``.
        vocab = getattr( w2v_model.wv, 'index_to_key', None )
        if vocab is None:
            vocab = w2v_model.wv.index2word

        # apply soft clustering to embedding vectors
        self.w2v_vectors = w2v_model.wv.vectors
        idx, idx_proba = self._soft_clustering( sc_model, self.w2v_vectors )
        # word -> most likely cluster index
        self.word_centroid_map = dict( zip( vocab, idx ) )
        # word -> array with the probability of belonging to each cluster
        self.word_centroid_prob_map = dict( zip( vocab, idx_proba ) )

    def precompute_word_topic_vector( self, sentences ):
        """Compute idf weights from ``sentences`` and build the per-word topic vectors.

        ``sentences`` is an iterable of token lists. Must be called before ``train`` or
        ``infer_vector``; it sets ``featurenames``, ``word_idf_dict`` and ``wv``.
        """
        # compute idf values
        self.featurenames, self.word_idf_dict = self._compute_idf_values( sentences )

        # compute word topic vectors
        self.wv = self._get_probability_word_vectors( self.w2v_model,
                                                      self.num_clusters,
                                                      self.w2v_vector_size,
                                                      self.featurenames,
                                                      self.word_centroid_map,
                                                      self.word_centroid_prob_map,
                                                      self.word_idf_dict)

    def train( self, sentences ):
        """Build document vectors for ``sentences`` and fit the sparsity threshold.

        Returns a float32 array of shape ``(len(sentences), num_clusters * vector_size)`` in
        which every component whose magnitude is below ``self.sparse_thres`` is set to zero.
        The threshold is derived from the mean per-document minimum and maximum components
        and stored for later use by ``infer_vector``.
        """
        # values
        doc_num = len(sentences)

        # Restart the accumulators so that calling train() more than once on the same
        # instance does not inflate the threshold with sums from earlier calls.
        self.min_no = .0
        self.max_no = .0

        # get document vector
        X = np.zeros( (doc_num, self.num_clusters*self.w2v_vector_size), dtype="float32")
        for idx, tokens in enumerate( sentences ):
            X[idx] = self._create_cluster_vector_and_gwbowv(self.wv, tokens, self.word_centroid_map, self.word_centroid_prob_map, self.w2v_vector_size, self.word_idf_dict, self.featurenames, self.num_clusters, train=True)

        # get the threshold value for making it sparse (guard against an empty corpus).
        if doc_num > 0:
            thres = (abs( self.max_no / float( doc_num ) ) + abs( self.min_no / float( doc_num ) )) / 2
        else:
            thres = .0
        self.sparse_thres = thres * self.sparse_percentage

        # Make values of matrices which are less than threshold to zero.
        X[abs(X) < self.sparse_thres] = 0

        return X

    def infer_vector( self, sentences ):
        """Build document vectors for ``sentences`` using the threshold fitted by ``train``.

        Returns a float32 array of shape ``(len(sentences), num_clusters * vector_size)``.
        ``train`` must have been called first, because it sets ``self.sparse_thres``.
        """
        # values
        doc_num = len(sentences)

        # get document vector
        X = np.zeros( (doc_num, self.num_clusters*self.w2v_vector_size), dtype="float32")
        for idx, tokens in enumerate( sentences ):
            X[idx] = self._create_cluster_vector_and_gwbowv(self.wv, tokens, self.word_centroid_map, self.word_centroid_prob_map, self.w2v_vector_size, self.word_idf_dict, self.featurenames, self.num_clusters, train=False)

        # Make values of matrices which are less than threshold to zero.
        X[abs(X) < self.sparse_thres] = 0

        return X

    def _create_cluster_vector_and_gwbowv( self, prob_wordvecs, wordlist, word_centroid_map, word_centroid_prob_map, dimension, word_idf_dict, featurenames, num_centroids, train=False):
        """Sum the topic vectors of the known words in ``wordlist`` and L2-normalize the sum.

        Words missing from ``word_centroid_map`` are ignored. When ``train`` is true, the
        minimum and maximum component of the resulting vector are added to
        ``self.min_no`` / ``self.max_no``. Returns a float32 vector of length
        ``num_centroids * dimension`` (all zeros when no word is known).
        """
        bag_of_centroids = np.zeros( num_centroids * dimension, dtype="float32" )
        for word in wordlist:
            if word not in word_centroid_map: continue
            bag_of_centroids += prob_wordvecs[word]

        norm = np.sqrt(np.einsum('...i,...i', bag_of_centroids, bag_of_centroids))
        if norm != 0: bag_of_centroids /= norm

        # to make feature vector sparse, make note of minimum and maximum values.
        if train:
            self.min_no += bag_of_centroids.min()
            self.max_no += bag_of_centroids.max()

        return bag_of_centroids

    @staticmethod
    def _soft_clustering( sc_model, word_vectors):
        """Fit ``sc_model`` on ``word_vectors``; return ``(hard assignments, membership probabilities)``."""
        sc_model.fit(word_vectors)
        idx = sc_model.predict(word_vectors)
        idx_proba = sc_model.predict_proba(word_vectors)

        return (idx, idx_proba)

    @staticmethod
    def _compute_idf_values( sentences ):
        """Fit a TfidfVectorizer on pre-tokenized ``sentences``.

        Returns ``(featurenames, idf_dict)``: the list of vocabulary terms and a dict mapping
        each term to its idf weight.
        """
        # The identity tokenizer / preprocessor pass the token lists through unchanged.
        tfv = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x, dtype=np.float32)
        tfv.fit(sentences)
        # get_feature_names() was replaced by get_feature_names_out() in newer scikit-learn;
        # support both so the class works across versions.
        if hasattr(tfv, 'get_feature_names_out'):
            featurenames = tfv.get_feature_names_out().tolist()
        else:
            featurenames = tfv.get_feature_names()
        idf_dict = dict( zip( featurenames, tfv.idf_ ) )

        return featurenames, idf_dict

    @staticmethod
    def _get_probability_word_vectors( w2v_model, num_clusters, num_features, featurenames, word_centroid_map, word_centroid_prob_map, word_idf_dict):
        """Build the per-word topic vectors.

        For every word in ``word_centroid_map`` the result holds a float32 vector of length
        ``num_clusters * num_features``: the concatenation, over clusters, of
        ``embedding * P(cluster | word) * idf(word)``. Words without an idf weight keep an
        all-zero vector.
        """
        prob_wordvecs = {}
        for word in word_centroid_map:
            word_vec = np.zeros( num_clusters * num_features, dtype="float32" )
            prob_wordvecs[word] = word_vec
            # The idf lookup does not depend on the cluster, so check it once per word.
            if word not in word_idf_dict: continue
            embedding = w2v_model.wv[word]
            cluster_probs = word_centroid_prob_map[word]
            idf = word_idf_dict[word]
            for c_idx in range(num_clusters):
                word_vec[c_idx*num_features:(c_idx+1)*num_features] = embedding * cluster_probs[c_idx] * idf

        return prob_wordvecs

In [10]:
# Load the corpus: every sub-directory of <corpus_dir>/text is treated as one class and
# every file inside it (except LICENSE.txt) as one document of that class.
text_dir = '%s/text' % corpus_dir
files = os.listdir(text_dir)
# os.listdir() returns entries in arbitrary order; sort so that the order of texts / y is
# reproducible between runs. NOTE: a token cache created with a different document order
# must be deleted and regenerated, otherwise it will not line up with y.
class_list = sorted( f for f in files if os.path.isdir(os.path.join(text_dir, f)) )

skipline = 2 # number of leading lines dropped from each file (presumably metadata -- confirm)
y = []
texts = []
for c in class_list:
    c_dir = '%s/%s' % (text_dir, c)
    for file in sorted( os.listdir( c_dir ) ):
        if file == 'LICENSE.txt': continue
        # Read explicitly as UTF-8 instead of relying on the platform's default encoding.
        # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
        with open( '%s/%s' % ( c_dir, file ), encoding='utf-8' ) as f:
            # next(f, None) tolerates files shorter than `skipline` lines.
            for _ in range( skipline ): next(f, None)
            text = f.read()
        texts.append( text )
        y.append( c )
y = np.array(y)
doc_num = len( texts )

In [11]:
# tokenize (cached)
# token_type = (label used in cache/model file names, parts of speech to keep);
# the label 'all' means "keep every token".
token_type = ('nva',['名詞', '動詞', '形容詞'])

target_pkl = '%s/livedoor_sentences_%s.pkl' % (corpus_dir, token_type[0])
if not os.path.exists( target_pkl ):
    # No cache yet: tokenize every document and store the result for later runs.
    if token_type[0] != 'all':
        sentences = [ get_tokens_pos(doc, token_type[1]) for doc in texts ]
    else:
        sentences = [ get_tokens(doc) for doc in texts ]
    save_as_pkl( sentences, target_pkl )
else:
    # Reuse the token lists written by an earlier run.
    sentences = load_from_pkl( target_pkl )

In [None]:
# learn fastText (or word2vec), reusing a previously saved model when one exists

min_word_count = 10 # Minimum word count
context = 10 # Context window size
word_embedding = 'fastText' # 'word2vec' or 'fastText'
num_features = 100 # fastText embedding dim

target_model = '%s/livedoor_%s_%s_%s.model' % ( model_dir, word_embedding, num_features, token_type[0] )

# Resolve the embedding class once and fail fast on an unsupported name, instead of
# leaving w2v_model undefined.
embedding_classes = {'word2vec': Word2Vec, 'fastText': FastText}
if word_embedding not in embedding_classes:
    raise ValueError( 'unsupported word_embedding: %r' % (word_embedding,) )
embedding_cls = embedding_classes[word_embedding]

if os.path.exists( target_model ):
    w2v_model = embedding_cls.load( target_model )

else:
    print( word_embedding, num_features, token_type[0] )
    # NOTE: `size=` is the keyword used by gensim < 4; gensim >= 4 renamed it to `vector_size=`.
    w2v_model = embedding_cls(size=num_features, sg=1, workers=7, window=context, min_count=min_word_count)

    sentences = load_from_pkl( '%s/livedoor_sentences_%s.pkl' % (corpus_dir, token_type[0]) )
    w2v_model.build_vocab( sentences )
    # corpus_count / epochs must come from the model being trained
    # (the previous code referenced an undefined name `model`).
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)

    # save model (make sure the target directory exists first)
    os.makedirs( model_dir, exist_ok=True )
    w2v_model.save( target_model )

In [15]:
# GMM

num_clusters = 30 # cluster num of GMM clustering
# random_state is fixed so that the (k-means initialised) clustering, and therefore the
# document vectors and scores derived from it, are reproducible between runs.
gmm =  GaussianMixture(n_components=num_clusters, covariance_type="tied", init_params='kmeans', max_iter=50, random_state=777)

In [18]:
# learn (5-fold cv)

acc_score_cv = []
kf = StratifiedKFold(n_splits=5, random_state=777, shuffle=True)
for idx, (train, val) in enumerate( kf.split(sentences, y) ):
    
    print( "fold -", idx )
    sentences_train = [ sentences[i] for i in train ]
    sentences_val = [ sentences[i] for i in val ]

    # train scdv (idf weights and the sparsity threshold come from the training fold only;
    # note that the SCDV constructor re-fits `gmm` on the embedding vectors)
    scdv_model = SCDV(w2v_model=w2v_model, sc_model=gmm, sparse_percentage = 0.04 )
    scdv_model.precompute_word_topic_vector(sentences_train)
    X_train = scdv_model.train(sentences_train)
    
    # learn model
    clf = lgb.LGBMClassifier(n_estimators=10, objective="multiclass")
    clf.fit(X_train, y[train])
    
    # test scdv
    X_val = scdv_model.infer_vector(sentences_val)

    # predict validation and score; arguments are passed as (y_true, y_pred), the same
    # order used for classification_report below
    preds = clf.predict(X_val)
    acc_cv = accuracy_score( y[val], preds )
    acc_score_cv.append( acc_cv )
    
    # classification report
    print( classification_report(y[val], preds) )

# cv score: mean accuracy over the folds
mean = sum( acc_score_cv ) / len( acc_score_cv )

fold - 0
                precision    recall  f1-score   support

dokujo-tsushin       0.80      0.84      0.82       174
  it-life-hack       0.87      0.90      0.88       174
 kaden-channel       0.86      0.88      0.87       173
livedoor-homme       0.86      0.62      0.72       103
   movie-enter       0.87      0.94      0.91       174
        peachy       0.78      0.73      0.75       169
          smax       0.96      0.97      0.96       174
  sports-watch       0.87      0.94      0.91       180
    topic-news       0.86      0.82      0.84       154

     micro avg       0.86      0.86      0.86      1475
     macro avg       0.86      0.85      0.85      1475
  weighted avg       0.86      0.86      0.86      1475

fold - 1
                precision    recall  f1-score   support

dokujo-tsushin       0.83      0.83      0.83       174
  it-life-hack       0.91      0.87      0.89       174
 kaden-channel       0.85      0.90      0.87       173
livedoor-homme       0.90 

In [19]:
# Show the mean accuracy over the cross-validation folds computed in the previous cell.
mean

0.8654796218792262