# 重要語と、その極性を表示
ストップワードの調整・適用もここでする。

In [52]:
ID = "mimikkyu"

POPULATION_ID = "pokemon"

#分析対象テキストファイル
target_texts_file = "Responses/%s.txt"%ID

#ストップワード
stopwords_file = "Stopwords/%s.txt"%ID

#TFIDFモデル
tfidf_model_file = "Models/Tfidf/%s-from-%s.model"%(ID, POPULATION_ID)

#分析モデル
classifer_model_file = "Models/Classifers/%s-from-%s.model"%(ID, POPULATION_ID)

#ストップワード抜きの形態素解析・極性分析完了したデータの保存先
without_stopwords_text_file = "Progresses/NonStopword/%s-from-%s.txt"%(ID, POPULATION_ID)

#極性保存するファイル
sentiment_file = "Sentiment/%s-from-%s.txt"%(ID, POPULATION_ID)

#内部的に品詞を区別する区切り文字
TOKEN_DIVIDER = "<334>"

## TFIDFによる重要語の列挙

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
import MeCab
import pandas as pd

### テキストを取り込み

In [15]:
df_target = pd.read_table(target_texts_file)

df_target.head(5)

Unnamed: 0,date,link_index,number,content
0,2017/01/10|18/43/00,57557,4,いちおつまり やっとミミッキュ理想でた <\br>
1,2017/01/10|19/04/00,57557,78,ミミッキュて生き残るの？ <\br>
2,2017/01/10|19/39/00,57557,198,テテフよりミミッキュのほうが嫌い <\br>
3,2017/01/10|19/42/00,57557,202,てかあれだなまじで過去作ない人辛いなこれ ハッサムにはたきおとす持たせられないとかミミッ...
4,2017/01/10|19/53/00,57557,224,最近のろいミミッキュ見るけど何対面意識してるんだろ <\br>


### 10で計算したTFIDFモデルをサルベージ

In [16]:
import pickle
from copy import deepcopy

#TFIDFモデル復元用
#tokenizerをすり替え
#本ノートでは生テキストではなく形態素解析後のテキストが渡される
def tokenize(text):
    return text.split(TOKEN_DIVIDER)

with open(tfidf_model_file, "rb") as f:
    tfidf_model = pickle.load(f)

#対象（名詞・形容詞）を見せる用
tfidf_model_showing = deepcopy(tfidf_model)
tfidf_model_showing.ngram = (1,1)
tfidf_model_showing.tokenizer = tokenize

#極性を予測する用
tfidf_model_sentiment = deepcopy(tfidf_model)
tfidf_model_sentiment.tokenizer = tokenize

### 品詞を抽出

#### 形態素解析して原形をリスト化

#### 名詞・形容詞原形に限定

In [17]:
#求める品詞
WANTED_WORD = ["名詞", "形容詞"]

#返り値：[全原形], [求める品詞（原形)]
def get_target_tokens(text):
    output_all = []
    output_wanted = []
    
    tagger = MeCab.Tagger()
    node = tagger.parseToNode(text)

    while node:
        features = node.feature.split(",")

        #求める品詞なら
        if features[0] in WANTED_WORD:
            if len(features) >= 8:
                #原形を記録
                output_wanted.append(features[7])
            else:
                output_wanted.append(node.surface)

        #全品詞
        if len(features) >= 8:
            #原形を記録
            output_all.append(features[7])
        else:
            output_all.append(node.surface)

        node = node.next
    
    return TOKEN_DIVIDER.join(output_all), TOKEN_DIVIDER.join(output_wanted)

get_target_tokens("オレンジ今日も食べてみたけどまだ酸っぱくて泣いた")

('*<334>オレンジ-orange<334>今日<334>も<334>食べる<334>て<334>見る<334>た<334>けれど<334>未だ<334>酸っぱい<334>て<334>泣く<334>た<334>*',
 'オレンジ-orange<334>今日<334>酸っぱい')

### TFIDFを計算


#### テキスト群を原形に分解

In [18]:
df_target["disassembled_all"] = ""
df_target["disassembled_target"] = ""

for index, row in df_target.iterrows():
    df_target.at[index, "disassembled_all"], df_target.at[index, "disassembled_target"] = get_target_tokens(row["content"])

df_target.head(5)

Unnamed: 0,date,link_index,number,content,disassembled_all,disassembled_target
0,2017/01/10|18/43/00,57557,4,いちおつまり やっとミミッキュ理想でた <\br>,*<334>一<334>おー<334>詰まり<334>漸と<334>ミミッキュ<334>理想...,一<334>ミミッキュ<334>理想<334>br
1,2017/01/10|19/04/00,57557,78,ミミッキュて生き残るの？ <\br>,*<334>ミミ<334>っ<334>きゅっ<334>って<334>生き残る<334>の<3...,ミミ<334>br
2,2017/01/10|19/39/00,57557,198,テテフよりミミッキュのほうが嫌い <\br>,*<334>ティー-tea<334>テフ-teff<334>より<334>ミミッキュ<334...,ティー-tea<334>テフ-teff<334>ミミッキュ<334>方<334>嫌い<334>br
3,2017/01/10|19/42/00,57557,202,てかあれだなまじで過去作ない人辛いなこれ ハッサムにはたきおとす持たせられないとかミミッ...,*<334>って<334>か<334>彼れ<334>だ<334>なまじ<334>で<334>...,過去<334>無い<334>人<334>辛い<334>ハッサム<334>ミミ<334>視野<...
4,2017/01/10|19/53/00,57557,224,最近のろいミミッキュ見るけど何対面意識してるんだろ <\br>,*<334>最近<334>鈍い<334>ミミ<334>っ<334>きゅっ<334>見る<33...,最近<334>鈍い<334>ミミ<334>対面<334>意識<334>br


#### 各文書のTFIDFの平均値でランキング付け

In [19]:
#上記のテキストを分解
target_tfidf_spycy = tfidf_model_showing.transform(df_target["disassembled_target"])
target_tfidf = target_tfidf_spycy.toarray()

target_tfidf.shape

(41002, 17797)

In [20]:
#pandas行列に
df_tfidf = pd.DataFrame(target_tfidf, columns=tfidf_model.get_feature_names_out())

df_tfidf.head(5)



Unnamed: 0,"""",""" ヽ",""".1",',"' """,' ',' ' ',' a,' a `),' i,...,￣ ｜,￣ ｜.1,￣ ￣,￣ ￣ <\,￣ ￣.1,￣ ￣ ／,￣ ￣ ＼,￣ ￣ ｀,￣ ￣ ｜,￣ ￣ ￣
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
#平均を算出
df_tfidf_mean = df_tfidf.mean(axis=0).sort_values(ascending=False)

df_tfidf_mean.head(100)

br         0.144607
交換         0.087241
ミミッキュ      0.034431
ミミ         0.028871
v          0.028666
             ...   
メガ-mega    0.003891
丸          0.003885
箇所         0.003861
容器         0.003775
アイ-eye     0.003767
Length: 100, dtype: float64

### ストップワードを反映
ここでストップワードの調整をする

#### ストップワードを取り込み

In [22]:
with open(stopwords_file, "r", encoding="utf-8") as f:
    stopwords = f.read().split()

stopwords

['交換', 'FC', '求', 'FC', 'フレコ', 'ありがとう', 'TN', '了解', 'お願い', 'tn', '確認', '個体']

#### 除外

In [23]:
import numpy as np

In [26]:
#ストップワードのないものを除外

def has_stopword(text_list):
    for word in stopwords:
        if word in text_list.split(TOKEN_DIVIDER):
            return True
    
    return False

#df_target[has_stopword(df_target["disassembled"]) == False]
df_non_stopwords = np.empty(df_target.shape)
for _, row in df_target.iterrows():
    if has_stopword(row["disassembled_all"]) == False:
        df_non_stopwords = np.vstack([df_non_stopwords, row.values])

df_non_stopwords = pd.DataFrame(df_non_stopwords, columns=df_target.columns)

df_non_stopwords.shape

(25787, 6)

#### TFIDFを計算

In [27]:
target_tfidf_spycy = tfidf_model_showing.transform(df_non_stopwords["disassembled_target"])
target_tfidf = target_tfidf_spycy.toarray()

#pandas行列に
df_tfidf = pd.DataFrame(target_tfidf, columns=tfidf_model_showing.get_feature_names_out())

#平均を算出
df_tfidf_mean = df_tfidf.mean(axis=0).sort_values(ascending=False)

#並べ替え
pd.set_option('display.max_rows', 100)
df_tfidf_mean.head(100)




br               0.187560
ミミッキュ            0.046337
ミミ               0.039874
良い               0.020403
無い               0.017407
希望               0.017402
登録               0.015226
夢                0.014357
v                0.013506
今                0.012285
遺伝               0.009669
強い               0.009301
準備               0.009176
4                0.009000
意地               0.008971
孵化               0.008194
4 v              0.007939
申し訳              0.007832
飴                0.007509
2                0.007434
z                0.007386
襷                0.007362
方                0.007362
次第               0.007081
1                0.006939
5                0.006515
ポケモン             0.006497
欲しい              0.006460
宜しい              0.006371
零                0.006069
ボール-ball         0.006007
特性               0.005999
事                0.005971
パワー-power        0.005914
5 v              0.005776
文字               0.005752
bp               0.005557
技                0.005549
ロコン         

## 重要語の極性分析

### 全体の極性

### 分類モデルをサルベージ

In [31]:
with open(classifer_model_file, "rb") as f:
    classifer = pickle.load(f)

### 極性をとる

In [32]:
def predict(disassembled):
    if len(disassembled) == 0:
        return []

    #TFIDF行列へ変換
    df_tfidf = tfidf_model_sentiment.transform(disassembled)
    df_tfidf = df_tfidf.toarray()

    #分類器に食わせる
    output = classifer.predict(df_tfidf)

    return output

In [36]:
df_non_stopwords["Sentiment"] = predict(df_non_stopwords["disassembled_all"])

df_non_stopwords.head(5)

Unnamed: 0,date,link_index,number,content,disassembled_all,disassembled_target,Sentiment
0,2017/01/10|18/43/00,57557,4,いちおつまり やっとミミッキュ理想でた <\br>,*<334>一<334>おー<334>詰まり<334>漸と<334>ミミッキュ<334>理想...,一<334>ミミッキュ<334>理想<334>br,pos
1,2017/01/10|19/04/00,57557,78,ミミッキュて生き残るの？ <\br>,*<334>ミミ<334>っ<334>きゅっ<334>って<334>生き残る<334>の<3...,ミミ<334>br,neg
2,2017/01/10|19/39/00,57557,198,テテフよりミミッキュのほうが嫌い <\br>,*<334>ティー-tea<334>テフ-teff<334>より<334>ミミッキュ<334...,ティー-tea<334>テフ-teff<334>ミミッキュ<334>方<334>嫌い<334>br,neg
3,2017/01/10|19/42/00,57557,202,てかあれだなまじで過去作ない人辛いなこれ ハッサムにはたきおとす持たせられないとかミミッ...,*<334>って<334>か<334>彼れ<334>だ<334>なまじ<334>で<334>...,過去<334>無い<334>人<334>辛い<334>ハッサム<334>ミミ<334>視野<...,neg
4,2017/01/10|19/53/00,57557,224,最近のろいミミッキュ見るけど何対面意識してるんだろ <\br>,*<334>最近<334>鈍い<334>ミミ<334>っ<334>きゅっ<334>見る<33...,最近<334>鈍い<334>ミミ<334>対面<334>意識<334>br,neg


### 全体の極性

In [47]:
cnt = 0
score = 0
for _, row in df_non_stopwords.iterrows():
    if row["Sentiment"] == "pos":
        score += 1
    else:
        score -= 1
    cnt += 1

score /= cnt

print(score)

0.14104005894442936


### 単語ごと

In [40]:
#分析する数
WANTED = 100

#keywordについての極性スコアを分析
#返り値：極性スコア、個数
def get_sentiment(keyword):
    count = 0
    score = 0

    #keywordを含む行を走査
    for _, row in df_non_stopwords.iterrows():
        if keyword in row["disassembled_all"].split(TOKEN_DIVIDER):
            count += 1
            if row["Sentiment"] == "pos":
                score += 1
            else:
                score -= 1

    #平均極性スコアを算出
    if count == 0:
        score = 0
    else:
        score /= count

    return score, count

#[Token, TFIDF, Sentiment, Counts]
df_sentiment = np.empty((0,4))
cnt = 0
for index, value in df_tfidf_mean.items():
    score, count = get_sentiment(index)
    row = np.array([index, value, score, count])
    df_sentiment = np.vstack([df_sentiment, row])
    cnt += 1
    if cnt > WANTED:
        break

df_sentiment = pd.DataFrame(df_sentiment, columns=["Token", "TFIDF", "Sentiment", "Counts"])

df_sentiment.shape

(101, 4)

### 表示

In [41]:
import seaborn as sns

In [46]:
df_sentiment.style.background_gradient(cmap="vlag_r", axis=0, subset="Sentiment")

Unnamed: 0,Token,TFIDF,Sentiment,Counts
0,br,0.1875603051548277,0.1499136170881105,25468
1,ミミッキュ,0.04633694365117,-0.1726837755545889,4598
2,ミミ,0.0398736782907424,-0.2432732316227462,3605
3,良い,0.0204031464923853,0.131124591693887,2143
4,無い,0.0174073814921998,-0.5075690115761353,2246
5,希望,0.017402168391053,0.5393939393939394,990
6,登録,0.0152260030147009,0.7878315132605305,641
7,夢,0.0143569102658966,0.1534603811434303,997
8,v,0.0135058739876286,0.3233532934131736,334
9,今,0.012285038047131,0.1608910891089109,808


### 保存

In [53]:
df_sentiment.to_csv(sentiment_file, sep="\t", index=False)

df_non_stopwords.to_csv(without_stopwords_text_file, sep="\t", index=False)