In [29]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from transformers import BertJapaneseTokenizer, BertModel
import torch
from sudachipy import dictionary
from sudachipy import tokenizer
import pickle
import fasttext
import fasttext.util

# クラスタリングの類似度を評価する関数

In [45]:
def get_pair_label(cluster):
  labels = []
  for i0, v0 in enumerate(cluster):
    for i1, v1 in enumerate(cluster):
      if i1<=i0: continue
      labels.append(v0==v1)
  return labels

def cluster_similarity(correct_cluster, test_cluster):
  correct_pairs = get_pair_label(correct_cluster)
  test_pairs = get_pair_label(test_cluster)
  combined_pairs = [(v0,v1) for v0, v1 in zip(correct_pairs, test_pairs)]

  correct_true, correct_false = correct_pairs.count(True), correct_pairs.count(False)
  test_true, test_false = test_pairs.count(True), test_pairs.count(False)
  true_positive = combined_pairs.count((True, True))
  false_positive = combined_pairs.count((False, True))
  true_negative = combined_pairs.count((False, False))
  false_negative = combined_pairs.count((True, False))

  scores = {
    "ct_cf_tt_tf": (correct_true, correct_false, test_true, test_false)
    , "tp_fp_tn_fn": (true_positive, false_positive, true_negative, false_negative)
    , "precision": precision_score(correct_pairs, test_pairs)
    , "recall": recall_score(correct_pairs, test_pairs)
    , "f1": f1_score(correct_pairs, test_pairs)
    , "accuracy": accuracy_score(correct_pairs, test_pairs)
  }
  return scores    

In [46]:
cluster_similarity([0,0,0,1,1,1],[1,0,0,0,0,2])

{'ct_cf_tt_tf': (6, 9, 6, 9),
 'tp_fp_tn_fn': (2, 4, 5, 4),
 'precision': 0.3333333333333333,
 'recall': 0.3333333333333333,
 'f1': 0.3333333333333333,
 'accuracy': 0.4666666666666667}

# データの読み込み

In [4]:
FILE_PATH = "text/titles.csv"

In [5]:
# テストデータの読み込み
df = pd.read_csv(FILE_PATH)
# 検証を素早くできるようにテストデータ数を制限
train_df, test_df = train_test_split(df, train_size=0.9, random_state = 0, shuffle=True, stratify=df["category"])
# indexをリセット
train_df, test_df = train_df.reset_index(drop=True), test_df.reset_index(drop=True)

print("all")
print(df["category"].value_counts())
print("")
print("train")
print(train_df["category"].value_counts())
print("")
print("test")
print(test_df["category"].value_counts())


all
sports-watch      900
dokujo-tsushin    870
it-life-hack      870
movie-enter       870
smax              870
kaden-channel     864
peachy            842
topic-news        770
livedoor-homme    511
Name: category, dtype: int64

train
sports-watch      810
smax              783
it-life-hack      783
dokujo-tsushin    783
movie-enter       783
kaden-channel     777
peachy            758
topic-news        693
livedoor-homme    460
Name: category, dtype: int64

test
sports-watch      90
kaden-channel     87
smax              87
dokujo-tsushin    87
movie-enter       87
it-life-hack      87
peachy            84
topic-news        77
livedoor-homme    51
Name: category, dtype: int64


# Embedding取得のための関数の定義

## TF-IDF

In [33]:
# tfidf
def ngram_tfidf(texts, *, ngram_range = (3,3)):
  vectorizer = TfidfVectorizer(
                    analyzer="char"
                    , ngram_range=ngram_range
                    , max_df=0.9
                    , min_df = 5)
  return vectorizer.fit_transform(texts)

def word_tfidf(texts, *, ngram_range = (1,1)):
  tokenizer_obj = dictionary.Dictionary(dict="full").create()
  mode = tokenizer.Tokenizer.SplitMode.A
  wakachi_texts = [" ".join([m.surface() for m in tokenizer_obj.tokenize(text, mode)]) for text in texts]
  vectorizer = TfidfVectorizer(
    analyzer = "word"
    , ngram_range = ngram_range
    , max_df = 0.9
    , min_df = 5
  )
  return vectorizer.fit_transform(wakachi_texts)

## FastText

In [22]:
def fasttext_vector(texts, *, model=None, model_path = "fasttext/cc.ja.300.bin"):
  ft = model or fasttext.load_model(model_path)
  tokenizer_obj = dictionary.Dictionary(dict="full").create()
  mode = tokenizer.Tokenizer.SplitMode.A
  vectors = []
  for text in texts:
    tokens = tokenizer_obj.tokenize(text)
    words = [token.surface() for token in tokens]
    vec = ft.get_word_vector(words[0])
    for w in words[1:]:
      vec += ft.get_word_vector(w)
    mean_vec = vec / len(words)
    vectors.append(mean_vec)
  return vectors

## Sentence-Bert

In [27]:
# 参考 https://qiita.com/sonoisa/items/1df94d0a98cd4f209051
class SentenceBertJapanese:
    def __init__(self, model_name_or_path, device=None):
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)
        self.model = BertModel.from_pretrained(model_name_or_path)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(device)

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


    def encode(self, sentences, batch_size=8):
        all_embeddings = []
        iterator = range(0, len(sentences), batch_size)
        for batch_idx in iterator:
            batch = sentences[batch_idx:batch_idx + batch_size]

            encoded_input = self.tokenizer.batch_encode_plus(batch, padding="longest", 
                                           truncation=True, return_tensors="pt").to(self.device)
            model_output = self.model(**encoded_input)
            sentence_embeddings = self._mean_pooling(model_output, encoded_input["attention_mask"]).to('cpu')

            all_embeddings.extend(sentence_embeddings)

        # return torch.stack(all_embeddings).numpy()
        return torch.stack(all_embeddings)

def sentencebert(texts, *, model=None):
    MODEL_NAME = "sonoisa/sentence-bert-base-ja-mean-tokens-v2"  # <- v2です。
    model = model or SentenceBertJapanese(MODEL_NAME)
    sentence_embeddings = model.encode(texts, batch_size=8)
    return sentence_embeddings.detach().numpy()

# Clustering用の関数

In [26]:
# k-meansでクラスタ分析。とりあえず3つのグループに分けてみる
def kmeans_clustering(vectors, *, n_clusters=9):
  km_model = KMeans(n_clusters=n_clusters, random_state = 0)
  km_model.fit(vectors)
  return km_model.labels_


# Calculate Similarity

## ngram, tfidf, kmeans

In [47]:
# ngram tfidf, kmeans
X = ngram_tfidf(test_df["title"])
test_labels = kmeans_clustering(X)
cluster_similarity(test_df["category"], test_labels)

{'ct_cf_tt_tf': (30397, 240819, 172013, 99203),
 'tp_fp_tn_fn': (21785, 150228, 90591, 8612),
 'precision': 0.12664740455663234,
 'recall': 0.7166825673586209,
 'f1': 0.21525616323304184,
 'accuracy': 0.4143413367942894}

## word, tfidf, kmeans

In [48]:
# word tfidf, kmeans
X = word_tfidf(test_df["title"])
test_labels = kmeans_clustering(X)
cluster_similarity(test_df["category"], test_labels)

{'ct_cf_tt_tf': (30397, 240819, 85461, 185755),
 'tp_fp_tn_fn': (12802, 72659, 168160, 17595),
 'precision': 0.14979932366810592,
 'recall': 0.4211599828930487,
 'f1': 0.22099466588409952,
 'accuracy': 0.667224647513421}

## fasttext, kmeans

In [49]:
#ft = fasttext.load_model('fasttext/cc.ja.300.bin')
vectors = fasttext_vector(test_df["title"], model = ft)
test_labels = kmeans_clustering(vectors)
cluster_similarity(test_df["category"], test_labels)

{'ct_cf_tt_tf': (30397, 240819, 33096, 238120),
 'tp_fp_tn_fn': (6382, 26714, 214105, 24015),
 'precision': 0.19283297075175249,
 'recall': 0.20995492976280555,
 'f1': 0.2010300348069866,
 'accuracy': 0.812957200165182}

## sentencebert, kmeans

In [50]:
# sentence bert, kmeans
embedding_binary_path = "embedding/sentencebert_embedding.pickle"
if Path(embedding_binary_path).exists():
  with open(embedding_binary_path, "rb") as f:
    sentence_embeddings = pickle.load(f)
else:
  sentence_embeddings = sentencebert(test_df["title"])
  with open("embedding/sentencebert_embedding.pickle", "wb") as f:
    pickle.dump(sentence_embeddings.detach().numpy(), f)

test_labels = kmeans_clustering(sentence_embeddings)
cluster_similarity(test_df["category"], test_labels)

{'ct_cf_tt_tf': (30397, 240819, 32532, 238684),
 'tp_fp_tn_fn': (8420, 24112, 216707, 21977),
 'precision': 0.25882208287224884,
 'recall': 0.277001019837484,
 'f1': 0.2676031718285687,
 'accuracy': 0.8300653353784437}

## 参考：教師あり（トリグラム、ナイーブベイズ）

# References

[SudachiPy](https://github.com/WorksApplications/SudachiPy/blob/develop/docs/tutorial.md)
