# 時期ごとの極性・共起変遷を追う

In [1]:
ID = "mimikkyu"

POPULATION_ID = "pokemon"

PERIOD_ID = "year"

#分析対象の単語ファイル
keywords_file = "Keywords/Timelapse/%s-from-%s.txt"%(ID, POPULATION_ID)

#ストップワード抜きの形態素解析・極性分析完了したデータ
without_stopwords_text_file = "Progresses/NonStopword/%s-from-%s.txt"%(ID, POPULATION_ID)

##TFIDFモデル
tfidf_model_file = "Models/Tfidf/%s-from-%s.model"%(ID, POPULATION_ID)

#保存するファイル
sentiment_timelapse_file = "Sentiment/Timelapse/%s-from-%s-by-%s.txt"%(ID, POPULATION_ID, PERIOD_ID)

#内部的に品詞を区別する区切り文字
TOKEN_DIVIDER = "<334>"

In [2]:
#共起キーワードのストップワード（ここだけ）
local_stopwords = ["br", "ミミッキュ", "ミミ"]

## 

## 与えられた時期にインデックスを割り振る
分けたい時期にあわせて書き換える

In [3]:
from datetime import datetime

In [4]:
#これを書き換える
#万が一 -1 が出てしまうとエラーが出てしまうので注意（上限はない）
def get_time_index(time: datetime):
    #2016年から一年ごとに割り振る

    years_passed = time.year - 2016
    
    if years_passed >= 0:
        return years_passed
    else:
        return 0

#保持しているデータの形式からdatetime型に変換
def convert_time(text: str):
    return datetime.strptime(text, "%Y/%m/%d|%H/%M/%S")

#最大インデックスを算出
MAX_TIME_INDEX = get_time_index(datetime.now())

## データ読み込み

In [5]:
import pandas as pd
import numpy as np

### 処理済みデータ

In [6]:
df_main = pd.read_table(without_stopwords_text_file)

df_main.head(2)

Unnamed: 0,date,link_index,number,content,disassembled_all,disassembled_target,Sentiment
0,2017/01/10|18/43/00,57557,4,いちおつまり やっとミミッキュ理想でた <\br>,*<334>一<334>おー<334>詰まり<334>漸と<334>ミミッキュ<334>理想...,一<334>ミミッキュ<334>理想<334>br,pos
1,2017/01/10|19/04/00,57557,78,ミミッキュて生き残るの？ <\br>,*<334>ミミ<334>っ<334>きゅっ<334>って<334>生き残る<334>の<3...,ミミ<334>br,neg


### 本ノート用キーワード

In [7]:
with open(keywords_file, "r", encoding="utf-8") as f:
    keywords = f.read().split()

keywords

['z', '襷', '対策', '剣舞', '鬼火', '多い', 'レート-rate', '化けの皮']

### 母集団からのTFIDFモデルをサルベージ

In [8]:
import pickle
from copy import deepcopy

def tokenize(text):
    return text.split(TOKEN_DIVIDER)

with open(tfidf_model_file, "rb") as f:
    tfidf_model = pickle.load(f)

#対象（名詞・形容詞）を見せる用
tfidf_model_showing = deepcopy(tfidf_model)
tfidf_model_showing.ngram = (1,1)
tfidf_model_showing.tokenizer = tokenize

## 分析

In [None]:
from IPython.display import display

### 各テキストに時期インデックスを割り振る

In [9]:
indices = []

for _, row in df_main.iterrows():
    indices.append(get_time_index(convert_time(row["date"])))

df_main["time_index"] = indices

df_main.head(3)

Unnamed: 0,date,link_index,number,content,disassembled_all,disassembled_target,Sentiment,time_index
0,2017/01/10|18/43/00,57557,4,いちおつまり やっとミミッキュ理想でた <\br>,*<334>一<334>おー<334>詰まり<334>漸と<334>ミミッキュ<334>理想...,一<334>ミミッキュ<334>理想<334>br,pos,1
1,2017/01/10|19/04/00,57557,78,ミミッキュて生き残るの？ <\br>,*<334>ミミ<334>っ<334>きゅっ<334>って<334>生き残る<334>の<3...,ミミ<334>br,neg,1
2,2017/01/10|19/39/00,57557,198,テテフよりミミッキュのほうが嫌い <\br>,*<334>ティー-tea<334>テフ-teff<334>より<334>ミミッキュ<334...,ティー-tea<334>テフ-teff<334>ミミッキュ<334>方<334>嫌い<334>br,neg,1


### キーワードごとに分析

In [16]:
#キーワードごとに
for keyword in keywords:
    scores = []
    coocs = []
    counts = []

    #時期インデックスごとに
    for time_index in range(MAX_TIME_INDEX):
        #極性スコア計算
        score = 0
        cnt = 0
        texts = []
        for _, row in df_main[df_main["time_index"] == time_index].iterrows():
            if keyword in row["disassembled_all"].split(TOKEN_DIVIDER):
                cnt += 1
                if row["Sentiment"] == "pos":
                    score += 1
                else:
                    score -= 1
                texts.append(row["disassembled_target"])
        if cnt != 0:
            score /= cnt
        else:
            score = 0
        
        ##共起
        #TFIDF計算
        if cnt != 0:
            tfidf_spycy = tfidf_model_showing.transform(texts)
            tfidf_df = pd.DataFrame(tfidf_spycy.toarray(), columns=tfidf_model_showing.get_feature_names_out())
            tfidf_mean = tfidf_df.mean(axis=0)

            #単語名と結び付け
            tfidf_mean = tfidf_mean.sort_values(ascending=False)

            #ストップワードの削除
            for word in local_stopwords:
                tfidf_mean = tfidf_mean.drop(word)

            if len(tfidf_mean) > 1:
                cooc = ", ".join(tfidf_mean[1:min(6, len(tfidf_mean))].index)
            else:
                cooc = ""
        else:
            cooc = ""
        
        scores.append(score)
        coocs.append(cooc)
        counts.append(cnt)

    df_keyword = pd.DataFrame(list(zip(counts, scores, coocs)), columns=["Count", "Score", "CoOccure"])

    print("\n"+keyword)
    display(df_keyword.style.bar(subset=["Score"], align='zero', color=['#d65f5f', '#5fba7d'], vmax=1, vmin=-1))


z


Unnamed: 0,Count,Score,CoOccure
0,32,-0.1875,"z 技, 技, ゴースト-ghost, ゴースト-ghost z, ポリ-poly"
1,69,-0.246377,"フェアリー-fairy z, z 技, フェアリー-fairy, 技, 襷"
2,0,0.0,
3,0,0.0,
4,0,0.0,
5,0,0.0,
6,0,0.0,



襷


Unnamed: 0,Count,Score,CoOccure
0,250,-0.256,"襷 ミミッキュ, 襷 ミミ, ギャラ-guarantee, ガブ, 玉"
1,334,-0.257485,"襷 ミミッキュ, 襷 ミミ, z, 良い, 剣舞"
2,0,0.0,
3,0,0.0,
4,0,0.0,
5,0,0.0,
6,2,0.0,"脅威, ダイ-dye, マックス-max, トリック-trick, 不明"



対策


Unnamed: 0,Count,Score,CoOccure
0,133,-0.24812,"ミミッキュ 対策, メガ-mega, 良い, 無い, フェアリー-fairy"
1,226,-0.345133,"ミミッキュ 対策, 無い, 良い, ポケモン, 襷"
2,0,0.0,
3,0,0.0,
4,0,0.0,
5,0,0.0,
6,1,-1.0,"環境, 滓, 途端, 禁, 擁護"



剣舞


Unnamed: 0,Count,Score,CoOccure
0,132,-0.30303,"z, 剣舞 ミミッキュ, 強い, 襷, 鬼火"
1,190,-0.357895,"z, 剣舞 ミミッキュ, 無い, 襷, 挑発"
2,0,0.0,
3,0,0.0,
4,0,0.0,
5,0,0.0,
6,1,1.0,"不明, 18, スカーフ-scarf, 剣舞, 型"



鬼火


Unnamed: 0,Count,Score,CoOccure
0,125,-0.2,"トリル-trill, ギャラ-guarantee, 剣舞, 電磁, 襷"
1,136,-0.088235,"トリル-trill, 挑発, 電磁, 剣舞, 物理"
2,0,0.0,
3,0,0.0,
4,0,0.0,
5,0,0.0,
6,1,-1.0,"15, "", 序で, 店, 底辺"



多い


Unnamed: 0,Count,Score,CoOccure
0,152,-0.342105,"無い, 人 多い, ポケモン, 奴, 奴 多い"
1,185,-0.545946,"無い, 奴 多い, 事, 奴, 1"
2,0,0.0,
3,0,0.0,
4,0,0.0,
5,0,0.0,
6,1,-1.0,"無い ポケモン, マスコット-mascot, グッズ-goods, ファン-fan（熱狂者）, 嬉しい"



レート-rate


Unnamed: 0,Count,Score,CoOccure
0,121,-0.421488,"無い, 1700, ub, 多い, パーティー-party"
1,188,-0.382979,"2000, パーティー-party, 無い, 切断, 雑魚"
2,0,0.0,
3,0,0.0,
4,0,0.0,
5,0,0.0,
6,0,0.0,



化けの皮


Unnamed: 0,Count,Score,CoOccure
0,91,-0.472527,"発動, 貫通, 襷, 状態, 無い"
1,92,-0.543478,"襷, 無い, 剣舞, 技, 特性"
2,0,0.0,
3,0,0.0,
4,0,0.0,
5,0,0.0,
6,0,0.0,
