# 重要語と、その極性を表示
ストップワードの調整・適用もここでする。

# 必要なもの
* ID, POPULATION_IDをセット
* Stopwordsフォルダーに「ID.txt」ファイルを作成し、ストップワードをスペース区切りで入力。  
このストップワードが合致したテキストはこれ以降反映されなくなる。  
ひとまずストップワードなしで算出される重要語が「TFIDFによる重要語の算出」の章で表示されるので、それを見ながらストップワードを調整する


In [1]:
ID = "pikachu"

POPULATION_ID = "pokemon"

#分析対象テキストファイル
target_texts_file = "Responses/%s.txt"%ID

#ストップワード
stopwords_file = "Stopwords/%s.txt"%ID

#TFIDFモデル
tfidf_model_file = "Models/Tfidf/%s-from-%s.model"%(ID, POPULATION_ID)

#分析モデル
classifer_model_file = "Models/Classifers/%s-from-%s.model"%(ID, POPULATION_ID)

#ストップワード抜きの形態素解析・極性分析完了したデータの保存先
without_stopwords_text_file = "Progresses/NonStopword/%s-from-%s.txt"%(ID, POPULATION_ID)

#極性保存するファイル
sentiment_file = "Sentiment/%s-from-%s.txt"%(ID, POPULATION_ID)

#内部的に品詞を区別する区切り文字
TOKEN_DIVIDER = "<334>"

## TFIDFによる重要語の列挙

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import MeCab
import pandas as pd

### テキストを取り込み

In [3]:
df_target = pd.read_table(target_texts_file)

df_target.head(5)

Unnamed: 0,date,link_index,number,content
0,2016/02/23|21/35/00,60432,7,初ポケモンがピカチュウバージョンでひいこら言いながら進めてたこと ピカチュウが主力なの...
1,2008/02/25|00/58/00,26640,154,ダークシンジとピカチュウが来たらいよいよもって 懐かしいコテ勢揃いだな
2,2008/02/25|01/00/00,26640,159,>>152 ハングル文字ですね 日本版と通信できるのでしょうか >>154 シンジ...
3,2008/02/25|01/00/00,26640,160,>>154 また懐かしい名前を
4,2008/02/25|01/03/00,26640,164,>>159 多分出来るだろう


### 10で計算したTFIDFモデルをサルベージ

In [4]:
import pickle
from copy import deepcopy

#TFIDFモデル復元用
#tokenizerをすり替え
#本ノートでは生テキストではなく形態素解析後のテキストが渡される
def tokenize(text):
    return text.split(TOKEN_DIVIDER)

with open(tfidf_model_file, "rb") as f:
    tfidf_model = pickle.load(f)

#対象（名詞・形容詞）を見せる用
tfidf_model_showing = deepcopy(tfidf_model)
tfidf_model_showing.ngram = (1,1)
tfidf_model_showing.tokenizer = tokenize

#極性を予測する用
tfidf_model_sentiment = deepcopy(tfidf_model)
tfidf_model_sentiment.tokenizer = tokenize

### 品詞を抽出

#### 形態素解析して原形をリスト化

#### 名詞・形容詞原形に限定

In [5]:
#求める品詞
WANTED_WORD = ["名詞", "形容詞"]

#返り値：[全原形], [求める品詞（原形)]
def get_target_tokens(text):
    output_all = []
    output_wanted = []
    
    tagger = MeCab.Tagger()
    node = tagger.parseToNode(text)

    while node:
        features = node.feature.split(",")

        #求める品詞なら
        if features[0] in WANTED_WORD:
            if len(features) >= 8:
                #原形を記録
                output_wanted.append(features[7])
            else:
                output_wanted.append(node.surface)

        #全品詞
        if len(features) >= 8:
            #原形を記録
            output_all.append(features[7])
        else:
            output_all.append(node.surface)

        node = node.next
    
    return TOKEN_DIVIDER.join(output_all), TOKEN_DIVIDER.join(output_wanted)

get_target_tokens("オレンジ今日も食べてみたけどまだ酸っぱくて泣いた")

('*<334>オレンジ-orange<334>今日<334>も<334>食べる<334>て<334>見る<334>た<334>けれど<334>未だ<334>酸っぱい<334>て<334>泣く<334>た<334>*',
 'オレンジ-orange<334>今日<334>酸っぱい')

### TFIDFを計算


#### テキスト群を原形に分解

In [6]:
df_target["disassembled_all"] = ""
df_target["disassembled_target"] = ""

for index, row in df_target.iterrows():
    df_target.at[index, "disassembled_all"], df_target.at[index, "disassembled_target"] = get_target_tokens(row["content"])

df_target.head(5)

Unnamed: 0,date,link_index,number,content,disassembled_all,disassembled_target
0,2016/02/23|21/35/00,60432,7,初ポケモンがピカチュウバージョンでひいこら言いながら進めてたこと ピカチュウが主力なの...,*<334>初<334>ポケモン<334>が<334>ピカチュウ<334>バージョン-ver...,初<334>ポケモン<334>ピカチュウ<334>バージョン-version<334>事<3...
1,2008/02/25|00/58/00,26640,154,ダークシンジとピカチュウが来たらいよいよもって 懐かしいコテ勢揃いだな,*<334>ダーク-外国<334>シンジ<334>と<334>ピカチュウ<334>が<334...,ダーク-外国<334>シンジ<334>ピカチュウ<334>懐かしい<334>小手<334>勢揃い
2,2008/02/25|01/00/00,26640,159,>>152 ハングル文字ですね 日本版と通信できるのでしょうか >>154 シンジ...,*<334>＞<334>＞<334>152<334>ハングル<334>文字<334>です<3...,152<334>ハングル<334>文字<334>日本<334>版<334>通信<334>15...
3,2008/02/25|01/00/00,26640,160,>>154 また懐かしい名前を,*<334>＞<334>＞<334>154<334>又<334>懐かしい<334>名前<33...,154<334>懐かしい<334>名前
4,2008/02/25|01/03/00,26640,164,>>159 多分出来るだろう,*<334>＞<334>＞<334>159<334>多分<334>出来る<334>だ<334>*,159


#### 各文書のTFIDFの平均値でランキング付け

In [7]:
#上記のテキストを分解
target_tfidf_spycy = tfidf_model_showing.transform(df_target["disassembled_target"])
target_tfidf = target_tfidf_spycy.toarray()

target_tfidf.shape

(25482, 35244)

In [8]:
#pandas行列に
df_tfidf = pd.DataFrame(target_tfidf, columns=tfidf_model.get_feature_names_out())

df_tfidf.head(5)

Unnamed: 0,"""",""".1",""" ヽ",""".2",',"' """,' ',' ' ',' a,' a `),...,￣ ｜ ￣,￣ ￣,￣ ￣ <\,￣ ￣.1,￣ ￣ ヽ,￣ ￣ ／,￣ ￣ ＼,￣ ￣ ｀,￣ ￣ ｜,￣ ￣ ￣
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#平均を算出
df_tfidf_mean = df_tfidf.mean(axis=0).sort_values(ascending=False)

df_tfidf_mean.head(100)

ピカチュウ      0.131934
無い         0.023863
ポケモン       0.022629
サトシ        0.022071
良い         0.019599
             ...   
手          0.003000
ピカチュウ 版    0.002995
子供         0.002987
初代         0.002979
ボルテッカー     0.002973
Length: 100, dtype: float64

### ストップワードを反映
ここでストップワードの調整をする

#### ストップワードを取り込み

In [11]:
with open(stopwords_file, "r", encoding="utf-8") as f:
    stopwords = f.read().split("\n")

stopwords

['']

#### 除外

In [12]:
import numpy as np

In [13]:
#ストップワードのないものを除外

def has_stopword(text_list):
    for word in stopwords:
        if word in text_list.split(TOKEN_DIVIDER):
            return True
    
    return False

#df_target[has_stopword(df_target["disassembled"]) == False]
df_non_stopwords = np.empty((0, df_target.shape[1]))
for _, row in df_target.iterrows():
    if has_stopword(row["disassembled_all"]) == False:
        df_non_stopwords = np.vstack([df_non_stopwords, row.values])

df_non_stopwords = pd.DataFrame(df_non_stopwords, columns=df_target.columns)

df_non_stopwords.shape

(25482, 6)

#### TFIDFを計算

In [14]:
target_tfidf_spycy = tfidf_model_showing.transform(df_non_stopwords["disassembled_target"])
target_tfidf = target_tfidf_spycy.toarray()

#pandas行列に
df_tfidf = pd.DataFrame(target_tfidf, columns=tfidf_model_showing.get_feature_names_out())

#平均を算出
df_tfidf_mean = df_tfidf.mean(axis=0).sort_values(ascending=False)

#並べ替え
pd.set_option('display.max_rows', 100)
df_tfidf_mean.head(100)


ピカチュウ            0.131934
無い               0.023863
ポケモン             0.022629
サトシ              0.022071
良い               0.019599
事                0.014171
可愛い              0.011501
電気               0.011012
ライチュウ            0.010042
ｗ                0.009813
アニメ-animation    0.008825
1                0.008302
進化               0.008241
今                0.007273
方                0.007207
時                0.007171
後                0.007028
2                0.006991
人                0.006744
自分               0.006619
ポッチャマ            0.006605
一                0.006336
奴                0.006234
人気               0.006024
欲しい              0.005921
気                0.005791
物                0.005754
登録               0.005730
技                0.005381
ピカチュウ 可愛い        0.005323
主人               0.005206
ピチュー             0.005198
版                0.005197
強い               0.005131
前                0.004996
所                0.004933
10               0.004899
光                0.004860
他           

## 重要語の極性分析

### 全体の極性

### 分類モデルをサルベージ

In [15]:
with open(classifer_model_file, "rb") as f:
    classifer = pickle.load(f)

### 極性をとる

In [16]:
def predict(disassembled):
    if len(disassembled) == 0:
        return []

    #TFIDF行列へ変換
    df_tfidf = tfidf_model_sentiment.transform(disassembled)
    df_tfidf = df_tfidf.toarray()

    #分類器に食わせる
    output = classifer.predict(df_tfidf)

    return output

In [17]:
df_non_stopwords["Sentiment"] = predict(df_non_stopwords["disassembled_all"])

df_non_stopwords.head(5)

Unnamed: 0,date,link_index,number,content,disassembled_all,disassembled_target,Sentiment
0,2016/02/23|21/35/00,60432,7,初ポケモンがピカチュウバージョンでひいこら言いながら進めてたこと ピカチュウが主力なの...,*<334>初<334>ポケモン<334>が<334>ピカチュウ<334>バージョン-ver...,初<334>ポケモン<334>ピカチュウ<334>バージョン-version<334>事<3...,neg
1,2008/02/25|00/58/00,26640,154,ダークシンジとピカチュウが来たらいよいよもって 懐かしいコテ勢揃いだな,*<334>ダーク-外国<334>シンジ<334>と<334>ピカチュウ<334>が<334...,ダーク-外国<334>シンジ<334>ピカチュウ<334>懐かしい<334>小手<334>勢揃い,pos
2,2008/02/25|01/00/00,26640,159,>>152 ハングル文字ですね 日本版と通信できるのでしょうか >>154 シンジ...,*<334>＞<334>＞<334>152<334>ハングル<334>文字<334>です<3...,152<334>ハングル<334>文字<334>日本<334>版<334>通信<334>15...,neu
3,2008/02/25|01/00/00,26640,160,>>154 また懐かしい名前を,*<334>＞<334>＞<334>154<334>又<334>懐かしい<334>名前<33...,154<334>懐かしい<334>名前,neu
4,2008/02/25|01/03/00,26640,164,>>159 多分出来るだろう,*<334>＞<334>＞<334>159<334>多分<334>出来る<334>だ<334>*,159,neu


### 全体の極性

In [18]:
cnt = 0
score = 0
for _, row in df_non_stopwords.iterrows():
    if row["Sentiment"] == "pos":
        score += 1
    elif row["Sentiment"] == "neg":
        score -= 1
    cnt += 1

score /= cnt

print(score)

-0.06537948355702064


### 単語ごと

In [19]:
#分析する数
WANTED = 100

#keywordについての極性スコアを分析
#返り値：極性スコア、個数
def get_sentiment(keyword):
    count = 0
    score = 0

    #keywordを含む行を走査
    for _, row in df_non_stopwords.iterrows():
        if keyword in row["disassembled_all"].split(TOKEN_DIVIDER):
            count += 1
            if row["Sentiment"] == "pos":
                score += 1
            elif row["Sentiment"] == "neg":
                score -= 1

    #平均極性スコアを算出
    if count == 0:
        score = 0
    else:
        score /= count

    return score, count

#[Token, TFIDF, Sentiment, Counts]
df_sentiment = np.empty((0,4))
cnt = 0
for index, value in df_tfidf_mean.items():
    score, count = get_sentiment(index)
    row = np.array([index, value, score, count])
    df_sentiment = np.vstack([df_sentiment, row])
    cnt += 1
    if cnt > WANTED:
        break

df_sentiment = pd.DataFrame(df_sentiment, columns=["Token", "TFIDF", "Sentiment", "Counts"])

df_sentiment.shape

(101, 4)

### 表示

In [20]:
import seaborn as sns

In [21]:
df_sentiment.style.background_gradient(cmap="vlag_r", axis=0, subset="Sentiment")

Unnamed: 0,Token,TFIDF,Sentiment,Counts
0,ピカチュウ,0.1319338529763873,-0.0337690631808278,16524
1,無い,0.0238634873448613,-0.3322214809873249,4497
2,ポケモン,0.022628751429606,-0.0884476534296028,3878
3,サトシ,0.0220710444097711,0.0587473002159827,2315
4,良い,0.0195988872710852,0.0041456914421083,3377
5,事,0.0141709579951171,-0.0863279636513837,2421
6,可愛い,0.0115010434347002,0.2307692307692307,1079
7,電気,0.0110120017302476,-0.0025773195876288,1164
8,ライチュウ,0.0100422823574713,-0.1085450346420323,866
9,ｗ,0.0098127463626356,-0.0642927794263105,1011


### 保存

In [22]:
df_sentiment.to_csv(sentiment_file, sep="\t", index=False)

df_non_stopwords.to_csv(without_stopwords_text_file, sep="\t", index=False)