# キーフレーズ抽出アルゴリズムの比較
評判のいい SGRank、TextRank、sCAKE、RAKE を比較し、どのアルゴリズムが配信の特徴を最もよく表現できているかを確認する。

In [1]:
# Prerequisite (run once in a shell): python -m spacy download ja_core_news_md

In [2]:
import textacy

# Load the Japanese spaCy pipeline through textacy; the "ja_core_news_md"
# model must already be installed (see the download command in the first cell).
ja = textacy.load_spacy_lang("ja_core_news_md")

In [3]:
import pandas as pd

# NOTE(review): `texts` is not referenced in the visible cells — confirm it is
# still needed.
texts = []

# Relative path to the folder the example CSV file names are appended to.
datafolder = "../../../Data/Transcription_raw/"
def get_text(filename):
    """Load a CSV and join its ``text`` column into a single string.

    Args:
        filename: Path (or any source accepted by ``pandas.read_csv``) of a
            CSV file containing a ``text`` column.

    Returns:
        All non-missing ``text`` cells, in file order, joined by ``"。 "``.
        An empty string when no row carries any text.

    Raises:
        KeyError: If the CSV has no ``text`` column.
    """
    df = pd.read_csv(filename)
    # Empty cells are read as NaN (a float) and would make ``str.join`` raise
    # TypeError, so drop them and make sure every remaining cell is a str.
    segments = df["text"].dropna().astype(str)
    return "。 ".join(segments)

# Paths of the example CSV files, each built by prefixing `datafolder`.
example_files = [
    datafolder + name
    for name in ("15-4.csv", "2718-4.csv", "983-4.csv", "2084-4.csv")
]

In [4]:
# Reference: https://qiita.com/python_walker/items/fbc8c3205d01129e6afc
def extract_phrases_spacy(text, method, topn=5):
    """Extract key phrases from ``text`` with a textacy keyterm algorithm.

    The text is turned into a spaCy doc using the module-level ``ja``
    pipeline before being handed to ``method``.

    Args:
        text: Raw document text.
        method: Keyterm extraction callable, invoked as
            ``method(doc, normalize="lemma", topn=topn)`` and expected to
            produce ``(phrase, score)`` pairs.
        topn: Number of top-ranked phrases to request from ``method``.
            Defaults to 5.

    Returns:
        A ``(scores, keywords)`` tuple of two parallel lists, in the order
        produced by ``method``. Both lists are empty when nothing is found.
    """
    doc = textacy.make_spacy_doc(text, lang=ja)
    # Materialize once so the result can be traversed twice below.
    ranked = list(method(doc, normalize="lemma", topn=topn))

    keywords = [phrase for phrase, _ in ranked]
    scores = [score for _, score in ranked]

    return scores, keywords

from katodb.Analyzing.rake_ja import Tokenizer, JapaneseRake
def extract_phrases_Rake(
    text,
    rawargs='-r "C:/Program Files/MeCab/etc/mecabrc" -u "C:/Program Files/MeCab/dic/unidic_kato.dic"',
):
    """Extract ranked key phrases from ``text`` with ``JapaneseRake``.

    Args:
        text: Raw document text.
        rawargs: Argument string forwarded to ``Tokenizer(rawargs=...)``.
            The default points at a local Windows MeCab installation; pass a
            different string on machines where those paths do not exist.

    Returns:
        A tuple of two parallel lists holding, respectively, the first and
        the second element of every entry returned by
        ``get_ranked_phrases_with_scores()`` — presumably scores followed by
        phrases; TODO confirm against the ``rake_ja`` implementation.
        Both lists are empty when no phrase is extracted.
    """
    tokenizer = Tokenizer(rawargs=rawargs)
    tokens = tokenizer.tokenize(text)

    rake = JapaneseRake()
    rake.extract_keywords_from_text(tokens)
    ranked = rake.get_ranked_phrases_with_scores()

    # Comprehensions over an empty result already give two empty lists, so no
    # separate empty-case branch is required.
    return [entry[0] for entry in ranked], [entry[1] for entry in ranked]

In [5]:
from textacy.extract.keyterms import sgrank, textrank, scake
def compare(filename):
    """Print the key phrases each extractor finds for one CSV file.

    The file is loaded with ``get_text``; for every algorithm its label is
    printed, followed by the result tuple returned by the matching
    extraction helper.

    Args:
        filename: Path of the CSV file to analyse.
    """
    text = get_text(filename)

    # The three textacy algorithms share the same helper and call signature.
    spacy_methods = (
        ("sgrank", sgrank),
        ("textrank", textrank),
        ("scake", scake),
    )
    for label, method in spacy_methods:
        print(label)
        print(extract_phrases_spacy(text, method))

    # RAKE goes through its own tokenizer-based helper.
    print("rake")
    print(extract_phrases_Rake(text))

In [6]:
# Run all four extractors on the first example file and print their results.
compare(example_files[0])

sgrank
([0.13092996671099022, 0.08020904672246566, 0.07369578639718175, 0.06613293930590235, 0.05446491115501354], ['ニコ 生', '早送り 中', '一般 会員', 'タイム シフト', '諸 事情'])
textrank
([0.009262058050988557, 0.008053122373890404, 0.007554256077632982, 0.007521877679988214, 0.00709541809149689], ['プレミアマー 一般 見る', '面白い タス ねえ', 'ない', '観る 方法 ねえ', 'ねえ もん'])
scake
([341.3173858262519, 237.0615813735101, 237.0424204064287, 27.693291249605036, 17.677565883137802], ['早送り 中', '有給 中', '村田 中', '別', '今 コメント'])
rake
([37.84249084249084, 23.223443223443226, 22.627164502164504, 21.5974025974026, 18.636904761904763, 18.636904761904763, 17.125, 16.0, 15.835497835497836, 15.67798867798868, 15.412878787878789, 15.0, 14.75857475857476, 14.333333333333332, 13.675824175824177, 12.7, 11.547619047619047, 10.835497835497836, 10.50915750915751, 10.214285714285715, 9.923076923076923, 9.0, 9.0, 8.711538461538462, 8.5, 8.333333333333332, 8.25, 8.2, 8.033333333333333, 8.005952380952381, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 7.95454545454