# 11で出した重要語と共起する単語を算出

## 必要なもの
* ID, POPULATION_IDのセット
* local_stopwordsのセット

In [None]:
ID = ""

POPULATION_ID = ""

# Intermediate output of #10
without_stopwords_text_file = f"Progresses/NonStopword/{ID}-from-{POPULATION_ID}.txt"

# Result of #10
sentiment_file = f"Sentiment/{ID}-from-{POPULATION_ID}.txt"

# TF-IDF model
tfidf_model_file = f"Models/Tfidf/{ID}-from-{POPULATION_ID}.model"

# Delimiter used internally to tell parts of speech (tokens) apart;
# tokenize() splits the analysed text on it.
TOKEN_DIVIDER = "<334>"

In [None]:
# Stopwords that apply only to this notebook.
# They do not exclude whole posts; they are only removed from the list of
# related (co-occurring) words.
local_stopwords = [
    "ミミッキュ",
    "br",
    "ミミ",
]

## 読み込み

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
df_main = pd.read_table(without_stopwords_text_file)


def tokenize(text):
    """Split already-analysed text into its tokens.

    This notebook is handed morphologically analysed text rather than raw
    text, so tokenising is only a split on ``TOKEN_DIVIDER``.
    """
    tokens = text.split(TOKEN_DIVIDER)
    return tokens


df_main.head(2)

In [None]:
# Load the sentiment table (the result file of #10). Later cells read its
# "Token" and "Sentiment" columns.
df_sentiment = pd.read_table(sentiment_file)

# Preview the first two rows.
df_sentiment.head(2)

## 単純な共起回数

### 単語ベクトルの用意

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


In [None]:
# Term counter over the pre-tokenised target column: unigrams only, local
# stopwords excluded, min_df=20. fit() returns the estimator itself, so the
# fitted vectorizer is bound directly.
vectorizer = CountVectorizer(
    tokenizer=tokenize,
    stop_words=local_stopwords,
    ngram_range=(1, 1),
    min_df=20,
).fit(df_main["disassembled_target"])

In [None]:
def get_freq(keyword):
    """Count the terms that appear in rows containing *keyword*.

    A row of ``df_main`` is selected when ``keyword`` is one of the tokens of
    its ``disassembled_all`` column. The fitted ``vectorizer`` then counts the
    terms in the ``disassembled_target`` column of the selected rows.

    Args:
        keyword: Token to look for (compared exactly against the tokens).

    Returns:
        pandas.Series indexed by term, holding the summed counts sorted in
        descending order with ``local_stopwords`` removed. The series is
        empty when no row contains ``keyword`` (same convention as
        ``get_tfidf``).
    """
    # Select matching rows with a boolean mask; growing an array row by row
    # with np.vstack copies the accumulated data on every match.
    has_keyword = (
        df_main["disassembled_all"]
        .map(lambda text: keyword in tokenize(text))
        .astype(bool)
    )
    df_keyword = df_main[has_keyword]

    # Without matching rows every count would be zero and the ranking below
    # would be meaningless.
    if df_keyword.empty:
        return pd.Series(dtype="int64")

    # Sum the sparse count matrix column-wise instead of densifying it first.
    counts = vectorizer.transform(df_keyword["disassembled_target"])
    freq_sum = pd.Series(
        np.asarray(counts.sum(axis=0)).ravel(),
        index=vectorizer.get_feature_names_out(),
    )

    freq_sum = freq_sum.sort_values(ascending=False)

    # Remove stopwords; labels missing from the vocabulary are skipped.
    return freq_sum.drop(local_stopwords, errors="ignore")


get_freq("良い").head(5)

In [None]:
# For every sentiment token, store up to five words that co-occur with it
# most often, joined by ", ". Tokens with no co-occurring word keep "".
df_sentiment["CoOccurre-FREQ"] = ""
for index, row in df_sentiment.iterrows():
    token = row["Token"]
    freq = get_freq(token)

    # Remove the token itself by label rather than skipping the first entry:
    # the token is not guaranteed to rank first (it may be missing from the
    # vocabulary or be outranked), in which case a positional skip would drop
    # the best co-occurring word. scikit-learn vectorizers lowercase by
    # default, so the lowercased form is removed as well.
    freq = freq.drop(list({token, str(token).lower()}), errors="ignore")

    # Words that never appear alongside the token are not co-occurrences.
    freq = freq[freq > 0]

    df_sentiment.at[index, "CoOccurre-FREQ"] = ", ".join(freq.index[:5])

pd.set_option('display.max_rows', 100)
df_sentiment.head(100).style.background_gradient(cmap="vlag_r", axis=0, subset="Sentiment")

## 分析対象内

## 母集団に対するTFIDF

### 母集団TFIDFモデルをサルベージ

In [None]:
import pickle
from copy import deepcopy

# NOTE(security): pickle.load can execute arbitrary code while loading.
# Only load model files that you produced yourself or otherwise trust.
with open(tfidf_model_file, "rb") as f:
    tfidf_model = pickle.load(f)

# Separate copy used to show the target words (nouns / adjectives), so the
# loaded model itself is left untouched.
tfidf_model_showing = deepcopy(tfidf_model)
# Restrict the copy to unigrams. scikit-learn vectorizers read ``ngram_range``;
# the attribute previously set here (``ngram``) is not a vectorizer parameter,
# so the assignment had no effect.
# NOTE(review): assumes the pickled object is a scikit-learn TfidfVectorizer
# (it is used via transform / get_feature_names_out / tokenizer) - confirm.
tfidf_model_showing.ngram_range = (1, 1)
# The text is already tokenised, so split on the divider instead of
# re-analysing it.
tfidf_model_showing.tokenizer = tokenize

### TFIDFを計算

In [None]:
def get_tfidf(keyword):
    """Average the TF-IDF weight of each term over rows containing *keyword*.

    A row of ``df_main`` is selected when ``keyword`` is one of the tokens of
    its ``disassembled_all`` column. ``tfidf_model_showing`` then scores the
    ``disassembled_target`` column of the selected rows.

    Args:
        keyword: Token to look for (compared exactly against the tokens).

    Returns:
        pandas.Series indexed by term, holding the mean TF-IDF weight sorted
        in descending order with ``local_stopwords`` removed. The series is
        empty when no row contains ``keyword``.
    """
    # Select matching rows with a boolean mask; growing an array row by row
    # with np.vstack copies the accumulated data on every match.
    has_keyword = (
        df_main["disassembled_all"]
        .map(lambda text: keyword in tokenize(text))
        .astype(bool)
    )
    df_keyword = df_main[has_keyword]

    if df_keyword.empty:
        # Explicit dtype: a bare pd.Series() warns on pandas 1.x and yields an
        # object-dtype series on pandas 2.x.
        return pd.Series(dtype="float64")

    # Average the sparse TF-IDF matrix column-wise instead of densifying it.
    weights = tfidf_model_showing.transform(df_keyword["disassembled_target"])
    tfidf_mean = pd.Series(
        np.asarray(weights.mean(axis=0)).ravel(),
        index=tfidf_model_showing.get_feature_names_out(),
    )

    tfidf_mean = tfidf_mean.sort_values(ascending=False)

    # Remove stopwords; labels missing from the vocabulary are skipped.
    return tfidf_mean.drop(local_stopwords, errors="ignore")


get_tfidf("良い").head(5)

In [None]:
# For every sentiment token, store up to five words with the highest mean
# TF-IDF among the rows containing it, joined by ", ". Tokens with no such
# word keep "".
df_sentiment["CoOccurre-TFIDF"] = ""
for index, row in df_sentiment.iterrows():
    token = row["Token"]
    scores = get_tfidf(token)

    # Remove the token itself by label rather than skipping the first entry:
    # its mean TF-IDF is not guaranteed to be the highest, so a positional
    # skip could drop the best related word and keep the token itself.
    # The lowercased form is removed too, in case the model lowercases its
    # input (the scikit-learn default) - NOTE(review): confirm for this model.
    scores = scores.drop(list({token, str(token).lower()}), errors="ignore")

    # Terms with zero weight never appear alongside the token.
    scores = scores[scores > 0]

    df_sentiment.at[index, "CoOccurre-TFIDF"] = ", ".join(scores.index[:5])

pd.set_option('display.max_rows', 200)
df_sentiment.head(200).style.background_gradient(cmap="vlag_r", axis=0, subset="Sentiment", vmin=-1, vmax=1)