# 11で出した重要語と共起する単語を算出

In [29]:
ID = "mimikkyu"

POPULATION_ID = "pokemon"

#10の途中経過
without_stopwords_text_file = "Progresses/NonStopword/%s-from-%s.txt"%(ID, POPULATION_ID)

#10の結果
sentiment_file = "Sentiment/%s-from-%s.txt"%(ID, POPULATION_ID)

##TFIDFモデル
tfidf_model_file = "Models/Tfidf/%s-from-%s.model"%(ID, POPULATION_ID)

#内部的に品詞を区別する区切り文字
TOKEN_DIVIDER = "<334>"

In [30]:
#ここだけのストップワード
#投稿そのものを除外するのではなく、関連単語から除外する
local_stopwords = ["ミミッキュ", "br", "ミミ"]

## 読み込み

In [31]:
import pandas as pd
import seaborn as sns

In [32]:
df_main = pd.read_table(without_stopwords_text_file)

#本ノートでは生テキストではなく形態素解析後のテキストが渡される
def tokenize(text):
    return text.split(TOKEN_DIVIDER)

df_main.head(2)

Unnamed: 0,date,link_index,number,content,disassembled_all,disassembled_target,Sentiment
0,2017/01/10|18/43/00,57557,4,いちおつまり やっとミミッキュ理想でた <\br>,*<334>一<334>おー<334>詰まり<334>漸と<334>ミミッキュ<334>理想...,一<334>ミミッキュ<334>理想<334>br,pos
1,2017/01/10|19/04/00,57557,78,ミミッキュて生き残るの？ <\br>,*<334>ミミ<334>っ<334>きゅっ<334>って<334>生き残る<334>の<3...,ミミ<334>br,neg


In [33]:
df_sentiment = pd.read_table(sentiment_file)

df_sentiment.head(2)

Unnamed: 0,Token,TFIDF,Sentiment,Counts
0,br,0.18756,0.149914,25468
1,ミミッキュ,0.046337,-0.172684,4598


## 単純な共起回数

### 単語ベクトルの用意

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


In [35]:
vectorizer = CountVectorizer(tokenizer=tokenize, stop_words=local_stopwords, ngram_range=(1,1), min_df=20)
vectorizer = vectorizer.fit(df_main["disassembled_target"])



In [36]:
#keywordに共起する回数の多いものを数える
def get_freq(keyword):
    #keywordを含む行を走査
    df_keyword = np.empty((0, df_main.shape[1]))
    for _, row in df_main.iterrows():
        if keyword in tokenize(row["disassembled_all"]):
            df_keyword = np.vstack([df_keyword, row.values])

    #pandas行列に
    df_keyword = pd.DataFrame(df_keyword, columns=df_main.columns)

    #単語ベクトルの回数数える
    freq_spycy = vectorizer.transform(df_keyword["disassembled_target"])
    freq_df = pd.DataFrame(freq_spycy.toarray(), columns=vectorizer.get_feature_names_out())
    freq_sum = freq_df.sum(axis=0)
    
    #単語名と結び付け
    freq_sum = freq_sum.sort_values(ascending=False)

    return freq_sum

get_freq("良い").head(5)

良い    2308
無い     326
v      222
夢      194
方      190
dtype: int64

In [37]:
df_sentiment["CoOccurre-FREQ"] = ""
for index, row in df_sentiment.iterrows():
    freq = get_freq(row["Token"])
    end = 6
    if len(freq) <= 1:
        end = 1
    elif len(freq) < 6:
        end = len(freq)

    if end != 1:
        cooc = ", ".join(list(freq.iloc[1:end].index))
        df_sentiment.at[index, "CoOccurre-FREQ"] = cooc

pd.set_option('display.max_rows', 100)
df_sentiment.head(100).style.background_gradient(cmap="vlag_r", axis=0, subset="Sentiment")

Unnamed: 0,Token,TFIDF,Sentiment,Counts,CoOccurre-FREQ
0,br,0.18756,0.149914,25468,"良い, v, 夢, 希望, 4"
1,ミミッキュ,0.046337,-0.172684,4598,"良い, 強い, z, 1, 襷"
2,ミミ,0.039874,-0.243273,3605,"良い, 強い, ポケモン, 2, 1"
3,良い,0.020403,0.131125,2143,"無い, v, 夢, 方, ポケモン"
4,無い,0.017407,-0.507569,2246,"良い, 事, 強い, ポケモン, 申し訳"
5,希望,0.017402,0.539394,990,"夢, v, 意地, 4, 良い"
6,登録,0.015226,0.787832,641,"次第, 今, 申請, 良い, 無い"
7,夢,0.014357,0.15346,997,"v, 4, 遺伝, 意地, 良い"
8,v,0.013506,0.323353,334,"4, 5, 遺伝, 意地, 夢"
9,今,0.012285,0.160891,808,"無い, 良い, 環境, 孵化, ポケモン"


## 分析対象内

## 母集団に対するTFIDF

### 母集団TFIDFモデルをサルベージ

In [38]:
import pickle
from copy import deepcopy

with open(tfidf_model_file, "rb") as f:
    tfidf_model = pickle.load(f)

#対象（名詞・形容詞）を見せる用
tfidf_model_showing = deepcopy(tfidf_model)
tfidf_model_showing.ngram = (1,1)
tfidf_model_showing.tokenizer = tokenize

### TFIDFを計算

In [39]:
#keywordに共起する回数の多いものを数える
def get_tfidf(keyword):
    #keywordを含む行を走査
    df_keyword = np.empty((0, df_main.shape[1]))
    for _, row in df_main.iterrows():
        if keyword in tokenize(row["disassembled_all"]):
            df_keyword = np.vstack([df_keyword, row.values])

    #pandas行列に
    df_keyword = pd.DataFrame(df_keyword, columns=df_main.columns)

    if df_keyword.shape[0] == 0:
        return pd.Series()

    #TFIDF計算
    tfidf_spycy = tfidf_model_showing.transform(df_keyword["disassembled_target"])
    tfidf_df = pd.DataFrame(tfidf_spycy.toarray(), columns=tfidf_model_showing.get_feature_names_out())
    tfidf_mean = tfidf_df.mean(axis=0)
    
    #単語名と結び付け
    tfidf_mean = tfidf_mean.sort_values(ascending=False)

    #ストップワードの削除
    for word in local_stopwords:
        tfidf_mean = tfidf_mean.drop(word)

    return tfidf_mean

get_tfidf("良い").head(5)

良い            0.245514
夢             0.024792
v             0.022091
良い ブイ-buoy    0.021629
ブイ-buoy       0.021437
dtype: float64

In [41]:
df_sentiment["CoOccurre-TFIDF"] = ""
for index, row in df_sentiment.iterrows():
    freq = get_tfidf(row["Token"])
    end = 6
    if len(freq) <= 1:
        end = 1
    elif len(freq) < 6:
        end = len(freq)

    if end != 1:
        cooc = ", ".join(list(freq.iloc[1:end].index))
        df_sentiment.at[index, "CoOccurre-TFIDF"] = cooc

pd.set_option('display.max_rows', 200)
df_sentiment.head(200).style.background_gradient(cmap="vlag_r", axis=0, subset="Sentiment")

  return pd.Series()
  return pd.Series()
  return pd.Series()
  return pd.Series()


Unnamed: 0,Token,TFIDF,Sentiment,Counts,CoOccurre-FREQ,CoOccurre-TFIDF
0,br,0.18756,0.149914,25468,"良い, v, 夢, 希望, 4","希望, 無い, 登録, 夢, v"
1,ミミッキュ,0.046337,-0.172684,4598,"良い, 強い, z, 1, 襷","良い, 強い, z, 襷, トリル-trill"
2,ミミ,0.039874,-0.243273,3605,"良い, 強い, ポケモン, 2, 1","無い, 良い, 襷, 対策, z"
3,良い,0.020403,0.131125,2143,"無い, v, 夢, 方, ポケモン","夢, v, 良い ブイ-buoy, ブイ-buoy, 無い"
4,無い,0.017407,-0.507569,2246,"良い, 事, 強い, ポケモン, 申し訳","申し訳, 申し訳 無い, 良い, 事, 強い"
5,希望,0.017402,0.539394,990,"夢, v, 意地, 4, 良い","夢, v, 意地, 4 v, 遺伝"
6,登録,0.015226,0.787832,641,"次第, 今, 申請, 良い, 無い","次第, 今, フレンド-friend, 待機, 準備"
7,夢,0.014357,0.15346,997,"v, 4, 遺伝, 意地, 良い","v, 遺伝, 希望, ロコン, 4 v"
8,v,0.013506,0.323353,334,"4, 5, 遺伝, 意地, 夢","4 v, 4, 5 v, 5, 遺伝"
9,今,0.012285,0.160891,808,"無い, 良い, 環境, 孵化, ポケモン","登録, 孵化, 無い, 申し訳, 環境"
