In [1]:
import numpy as np
import pandas as pd
import MeCab
import ipadic
from tqdm.notebook import tqdm
from argparse import Namespace
import os
import gensim.corpora


In [4]:
args = Namespace(data_csv="/home/hc/[NII-IDR] 楽天市場データ/sample/sample_from_raw.csv",
                 stopword_dir="/home/hc/paper_new//stopword")

### Preprocessing

In [18]:
#把评论论内容分词后写入wakati_review.txt
# text = data_sub['レビュー内容'].to_numpy()
# with open("wakati_review.txt", "w") as fp:
#     for line in tqdm(text):
#         fp.write(wakati.parse(line))

In [7]:
def extract_hinshi(text, hinshi:str):
    """
    品词, 抽出所有文本对应的品词 text 为pandas review dataframe to numpy。 返回text中所以对应的hinshi
    """
    kigo = set()
    CHASEN_ARGS = r' -F "%m\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n"'
    CHASEN_ARGS += r' -U "%m\t%m\t%m\t%F-[0,1,2,3]\t\t\n"'
    wakati = MeCab.Tagger(ipadic.MECAB_ARGS + CHASEN_ARGS)
    for line in text:
        wakati_text = wakati.parse(line).split("\n")
        for token in wakati_text:
            token = token.split("\t")
            if token[0] != "" and token[0] != "EOS":
                if hinshi in token[3]:
                    kigo.add(token[0])
    return kigo

data_sub = pd.read_csv("/home/hc/[NII-IDR] 楽天市場データ/sample/sample_from_raw.csv")['レビュー内容']
text = data_sub.to_numpy()
kigo = extract_hinshi(text,"副詞")
with open(os.path.join(args.stopword_dir,"副詞.txt"), "w") as fp:
    for line in kigo:
        fp.write(line)
        fp.write("\n")

### Write wakati

### Vocabulary class

In [21]:
class Vocabulary:

    def __init__(self):

        self.token_to_idx = {}
        self.idx_to_token = {}
    def add_token(self, token):
        try:
            index = self.token_to_idx[token]
        except KeyError:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
    def add_many(self, tokens:list): #1d list of token
        for token in tokens:
            self.add_token(token)
    def add_corpus(self, doc:list): #2d list of list
        for sent in doc:
            self.add_many(sent)

    def __len__(self):
        return len(self.token_to_idx)

    def __str__(self):
        return "<Vocabulary> size is {}".format(len(self.token_to_idx))

### LDA

In [22]:

def lemma(text:list): #1d list
    CHASEN_ARGS = r' -F "%m\t%f[7]\t%f[6]\t%F-[0,1,2,3]\t%f[4]\t%f[5]\n"'
    CHASEN_ARGS += r' -U "%m\t%m\t%m\t%F-[0,1,2,3]\t\t\n"'
    wakati = MeCab.Tagger(ipadic.MECAB_ARGS + CHASEN_ARGS)
    temp_text = "".join(text)
    wakati_text = wakati.parse(temp_text).split("\n")
    dic_t = {}
    for token in wakati_text:
        token = token.split("\t")
        if token[0] != "" and token[0] != "EOS":
            dic_t[token[0]] = token[2]
    for idx in range(len(text)):
        if text[idx] in dic_t:
            text[idx] = dic_t[text[idx]]
    return text #list of lemmalization

def wakati_tolist(text:list, stopword:list=[]): #list of str
    #convert list of str to list of list, removing stopword and \n

    wakati = MeCab.Tagger("-Owakati")
    for line_idx in range(len(text)):
        line = wakati.parse(text[line_idx])
        line = line.replace("\n", "")
        line_list = line.split()
        line_list = [token for token in line_list if token not in stopword] #1d list
        line_list = lemma(line_list)
        text[line_idx] = line_list
    return text #list of list



In [23]:
stopwords = []
for stopword_file in os.listdir("stopword"):
    with open(os.path.join("stopword",stopword_file),"r") as fp:
        stopword = [line.replace("\n","") for line in fp.readlines()]
    stopwords += stopword

data = pd.read_csv("/home/hc/[NII-IDR] 楽天市場データ/sample/sample_from_raw2.csv")
text = data['レビュー内容'].to_list()

In [24]:
data_gr1 = data[data['参考になった数']>=1]


In [25]:
text = data_gr1['レビュー内容'].to_list()


In [26]:

text = wakati_tolist(text,stopword=stopwords) #list of list
id2word = gensim.corpora.Dictionary(text)
print(len(id2word))
doc_term_matrix = [id2word.doc2bow(t) for t in text]

lda_model = gensim.models.LdaMulticore(corpus=doc_term_matrix,id2word=id2word,num_topics=6,iterations=400 )
from gensim.models.coherencemodel import CoherenceModel

33852


In [27]:
# from gensim.models.coherencemodel import CoherenceModel
# import matplotlib.pyplot as plt
# num_t = []
# c_score = []
# for num_topic in tqdm(range(3,20)):
#     lda_model = gensim.models.LdaModel(corpus=doc_term_matrix,id2word=id2word,num_topics=num_topic)
#     cm = CoherenceModel(model=lda_model,corpus=doc_term_matrix,texts=text,dictionary=id2word)
#     c_score.append(cm.get_coherence())
#     num_t.append(num_topic)


# plt.plot(num_t,c_score)


In [29]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
lda_prepare = gensimvis.prepare(lda_model,doc_term_matrix,id2word)
pyLDAvis.save_html(lda_prepare,"lda1.html")

In [30]:
from pprint import pprint
pprint(lda_model.print_topics())

[(0,
  '0.007*"用" + 0.007*"思っ" + 0.005*"値段" + 0.005*"満足" + 0.004*"使い" + 0.004*"サイズ" '
  '+ 0.003*"見" + 0.003*"時" + 0.003*"ありがとう" + 0.003*"だけ"'),
 (1,
  '0.005*"使っ" + 0.005*"色" + 0.004*"日" + 0.004*"けど" + 0.004*"良かっ" + 0.004*"使い" '
  '+ 0.004*"目" + 0.004*"感" + 0.004*"入っ" + 0.003*"もう"'),
 (2,
  '0.006*"サイズ" + 0.005*"満足" + 0.005*"み" + 0.004*"色" + 0.004*"しっかり" + '
  '0.004*"使い" + 0.004*"生地" + 0.004*"見" + 0.004*"着" + 0.003*"入っ"'),
 (3,
  '0.005*"良かっ" + 0.005*"だけ" + 0.005*"入っ" + 0.005*"注文" + 0.004*"使い" + '
  '0.004*"サイズ" + 0.004*"時" + 0.004*"思っ" + 0.004*"すぐ" + 0.004*"み"'),
 (4,
  '0.007*"色" + 0.006*"サイズ" + 0.004*"注文" + 0.004*"時" + 0.004*"着" + 0.004*"思っ" + '
  '0.004*"使っ" + 0.004*"値段" + 0.004*"さん" + 0.004*"日"'),
 (5,
  '0.012*"サイズ" + 0.004*"用" + 0.004*"見" + 0.004*"日" + 0.003*"満足" + 0.003*"買っ" + '
  '0.003*"思っ" + 0.003*"感" + 0.003*"み" + 0.003*"時"')]


### Record

stopword 44412
+puncts 44351
