# 重要語と、その極性を表示
ストップワードの調整・適用もここでする。

In [1]:
ID = "UT-sei"

POPULATION_ID = "population"

#分析対象テキストファイル
target_texts_file = "Responses/%s.txt"%ID

#ストップワード
stopwords_file = "Stopwords/%s.txt"%ID

#TFIDFモデル
tfidf_model_file = "Models/Tfidf/%s-from-%s.model"%(ID, POPULATION_ID)

#分析モデル
classifer_model_file = "Models/Classifers/%s-from-%s.model"%(ID, POPULATION_ID)

#ストップワード抜きの形態素解析・極性分析完了したデータの保存先
without_stopwords_text_file = "Progresses/NonStopword/%s-from-%s.txt"%(ID, POPULATION_ID)

#極性保存するファイル
sentiment_file = "Sentiment/%s-from-%s.txt"%(ID, POPULATION_ID)

#内部的に品詞を区別する区切り文字
TOKEN_DIVIDER = "<334>"

## TFIDFによる重要語の列挙

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import MeCab
import pandas as pd

### テキストを取り込み

In [3]:
df_target = pd.read_table(target_texts_file)

df_target.head(5)

Unnamed: 0,date,link_index,number,content
0,2017/10/30|23/00/00,18471,19,東京大学　新卒就職先トップ１０ ＜２０１１年＞ ＜２０１２年＞ ...
1,2009/01/07|23/05/00,4800,211,■東大生の選んだ法科大学院ランキング■ http://changi.2ch.net/te...
2,2001/01/21|02/07/00,48612,186,>184 東大生？
3,2004/06/22|13/19/00,79943,378,上智は今のレベルになってから月日があんまり経ってないせいか、あんまり知られてない。 外国...
4,2004/06/28|15/31/00,79943,799,というか、IVYに逆らってた東大生並に馬鹿。 大学ランキングで19位ってのを出してきたり...


### 10で計算したTFIDFモデルをサルベージ

In [4]:
import pickle
from copy import deepcopy

#TFIDFモデル復元用
#tokenizerをすり替え
#本ノートでは生テキストではなく形態素解析後のテキストが渡される
def tokenize(text):
    return text.split(TOKEN_DIVIDER)

with open(tfidf_model_file, "rb") as f:
    tfidf_model = pickle.load(f)

#対象（名詞・形容詞）を見せる用
tfidf_model_showing = deepcopy(tfidf_model)
tfidf_model_showing.ngram = (1,1)
tfidf_model_showing.tokenizer = tokenize

#極性を予測する用
tfidf_model_sentiment = deepcopy(tfidf_model)
tfidf_model_sentiment.tokenizer = tokenize

### 品詞を抽出

#### 形態素解析して原形をリスト化

#### 名詞・形容詞原形に限定

In [5]:
#求める品詞
WANTED_WORD = ["名詞", "形容詞"]

#返り値：[全原形], [求める品詞（原形)]
def get_target_tokens(text):
    output_all = []
    output_wanted = []
    
    tagger = MeCab.Tagger()
    node = tagger.parseToNode(text)

    while node:
        features = node.feature.split(",")

        #求める品詞なら
        if features[0] in WANTED_WORD:
            if len(features) >= 8:
                #原形を記録
                output_wanted.append(features[7])
            else:
                output_wanted.append(node.surface)

        #全品詞
        if len(features) >= 8:
            #原形を記録
            output_all.append(features[7])
        else:
            output_all.append(node.surface)

        node = node.next
    
    return TOKEN_DIVIDER.join(output_all), TOKEN_DIVIDER.join(output_wanted)

get_target_tokens("オレンジ今日も食べてみたけどまだ酸っぱくて泣いた")

('*<334>オレンジ-orange<334>今日<334>も<334>食べる<334>て<334>見る<334>た<334>けれど<334>未だ<334>酸っぱい<334>て<334>泣く<334>た<334>*',
 'オレンジ-orange<334>今日<334>酸っぱい')

### TFIDFを計算


#### テキスト群を原形に分解

In [6]:
df_target["disassembled_all"] = ""
df_target["disassembled_target"] = ""

for index, row in df_target.iterrows():
    df_target.at[index, "disassembled_all"], df_target.at[index, "disassembled_target"] = get_target_tokens(row["content"])

df_target.head(5)

Unnamed: 0,date,link_index,number,content,disassembled_all,disassembled_target
0,2017/10/30|23/00/00,18471,19,東京大学　新卒就職先トップ１０ ＜２０１１年＞ ＜２０１２年＞ ...,*<334>トウキョウ<334>大学<334>　<334>新卒<334>就職<334>先<3...,トウキョウ<334>大学<334>新卒<334>就職<334>先<334>トップ-top<3...
1,2009/01/07|23/05/00,4800,211,■東大生の選んだ法科大学院ランキング■ http://changi.2ch.net/te...,*<334>■<334>東大<334>生<334>の<334>選ぶ<334>た<334>法科...,東大<334>法科<334>大学<334>ランキング-ranking<334>http<33...
2,2001/01/21|02/07/00,48612,186,>184 東大生？,*<334>＞<334>184<334>東大<334>生<334>？<334>*,184<334>東大
3,2004/06/22|13/19/00,79943,378,上智は今のレベルになってから月日があんまり経ってないせいか、あんまり知られてない。 外国...,*<334>上智<334>は<334>今<334>の<334>レベル-level<334>に...,上智<334>今<334>レベル-level<334>月日<334>所為<334>外国<33...
4,2004/06/28|15/31/00,79943,799,というか、IVYに逆らってた東大生並に馬鹿。 大学ランキングで19位ってのを出してきたり...,*<334>と<334>言う<334>か<334>、<334>IVY<334>に<334>逆...,IVY<334>東大<334>馬鹿<334>大学<334>ランキング-ranking<334...


#### 各文書のTFIDFの平均値でランキング付け

In [7]:
#上記のテキストを分解
target_tfidf_spycy = tfidf_model_showing.transform(df_target["disassembled_target"])
target_tfidf = target_tfidf_spycy.toarray()

target_tfidf.shape

(16897, 33541)

In [8]:
#pandas行列に
df_tfidf = pd.DataFrame(target_tfidf, columns=tfidf_model.get_feature_names_out())

df_tfidf.head(5)

Unnamed: 0,"""",',' ',"' ,",' a,' i,' s,'.1,'.2,' ：,...,😯😴🥴😮😪🥴🤢🤧🤕🤕,😯😴🥴😮😪🥴🤢🤧🤕🤕 🤤🤕👺👿🤡💩👺👹🤕🤮,😯😴🥴😮😪🥴🤢🤧🤕🤕 🤤🤕👺👿🤡💩👺👹🤕🤮 💀👽👻💩😺🤖😸😸😺👺,😺🤞🏿👍👽🤞🏿😡😸👺🤮🤡,😻😾🤲🙌🏿👍👎🤞🏿,😻😾🤲🙌🏿👍👎🤞🏿 ✌,😻😾🤲🙌🏿👍👎🤞🏿 ✌ 👊🤛,🤤🤕👺👿🤡💩👺👹🤕🤮,🤤🤕👺👿🤡💩👺👹🤕🤮 💀👽👻💩😺🤖😸😸😺👺,🤤🤕👺👿🤡💩👺👹🤕🤮 💀👽👻💩😺🤖😸😸😺👺 😻😾🤲🙌🏿👍👎🤞🏿
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#平均を算出
df_tfidf_mean = df_tfidf.mean(axis=0).sort_values(ascending=False)

df_tfidf_mean.head(100)

東大      0.115418
無い      0.037442
事       0.028268
大学      0.028167
ｗ       0.021295
          ...   
3       0.004814
訳 無い    0.004806
度       0.004787
阪大      0.004776
駅弁      0.004738
Length: 100, dtype: float64

### ストップワードを反映
ここでストップワードの調整をする

#### ストップワードを取り込み

In [10]:
with open(stopwords_file, "r", encoding="utf-8") as f:
    stopwords = f.read().split()

stopwords

[]

#### 除外

In [11]:
import numpy as np

In [12]:
#ストップワードのないものを除外

def has_stopword(text_list):
    for word in stopwords:
        if word in text_list.split(TOKEN_DIVIDER):
            return True
    
    return False

#df_target[has_stopword(df_target["disassembled"]) == False]
df_non_stopwords = np.empty((0, df_target.shape[1]))
for _, row in df_target.iterrows():
    if has_stopword(row["disassembled_all"]) == False:
        df_non_stopwords = np.vstack([df_non_stopwords, row.values])

df_non_stopwords = pd.DataFrame(df_non_stopwords, columns=df_target.columns)

df_non_stopwords.shape

(16897, 6)

#### TFIDFを計算

In [13]:
target_tfidf_spycy = tfidf_model_showing.transform(df_non_stopwords["disassembled_target"])
target_tfidf = target_tfidf_spycy.toarray()

#pandas行列に
df_tfidf = pd.DataFrame(target_tfidf, columns=tfidf_model_showing.get_feature_names_out())

#平均を算出
df_tfidf_mean = df_tfidf.mean(axis=0).sort_values(ascending=False)

#並べ替え
pd.set_option('display.max_rows', 100)
df_tfidf_mean.head(100)


東大            0.115418
無い            0.037442
事             0.028268
大学            0.028167
ｗ             0.021295
良い            0.018640
学歴            0.016818
奴             0.016664
京大            0.015152
部             0.014669
馬鹿            0.014345
多い            0.013616
ワセダ           0.013595
慶応            0.013277
人             0.012258
一             0.012194
受験            0.011274
1             0.011112
方             0.010751
合格            0.010205
早慶            0.009882
学生            0.009767
年             0.009753
勉強            0.009614
物             0.009463
文             0.008982
２             0.008965
自分            0.008836
東大 京大         0.008692
無い 東大         0.008688
１             0.008499
トウキョウ         0.008283
訳             0.008185
試験            0.008095
他             0.007636
日本            0.007588
今             0.007573
医学            0.007546
就職            0.007543
本当            0.007510
頭             0.007352
所             0.007207
高い            0.007134
レベル-level  

## 重要語の極性分析

### 全体の極性

### 分類モデルをサルベージ

In [14]:
with open(classifer_model_file, "rb") as f:
    classifer = pickle.load(f)

### 極性をとる

In [15]:
def predict(disassembled):
    if len(disassembled) == 0:
        return []

    #TFIDF行列へ変換
    df_tfidf = tfidf_model_sentiment.transform(disassembled)
    df_tfidf = df_tfidf.toarray()

    #分類器に食わせる
    output = classifer.predict(df_tfidf)

    return output

In [16]:
df_non_stopwords["Sentiment"] = predict(df_non_stopwords["disassembled_all"])

df_non_stopwords.head(5)

Unnamed: 0,date,link_index,number,content,disassembled_all,disassembled_target,Sentiment
0,2017/10/30|23/00/00,18471,19,東京大学　新卒就職先トップ１０ ＜２０１１年＞ ＜２０１２年＞ ...,*<334>トウキョウ<334>大学<334>　<334>新卒<334>就職<334>先<3...,トウキョウ<334>大学<334>新卒<334>就職<334>先<334>トップ-top<3...,pos
1,2009/01/07|23/05/00,4800,211,■東大生の選んだ法科大学院ランキング■ http://changi.2ch.net/te...,*<334>■<334>東大<334>生<334>の<334>選ぶ<334>た<334>法科...,東大<334>法科<334>大学<334>ランキング-ranking<334>http<33...,neu
2,2001/01/21|02/07/00,48612,186,>184 東大生？,*<334>＞<334>184<334>東大<334>生<334>？<334>*,184<334>東大,neu
3,2004/06/22|13/19/00,79943,378,上智は今のレベルになってから月日があんまり経ってないせいか、あんまり知られてない。 外国...,*<334>上智<334>は<334>今<334>の<334>レベル-level<334>に...,上智<334>今<334>レベル-level<334>月日<334>所為<334>外国<33...,neg
4,2004/06/28|15/31/00,79943,799,というか、IVYに逆らってた東大生並に馬鹿。 大学ランキングで19位ってのを出してきたり...,*<334>と<334>言う<334>か<334>、<334>IVY<334>に<334>逆...,IVY<334>東大<334>馬鹿<334>大学<334>ランキング-ranking<334...,neg


### 全体の極性

In [17]:
cnt = 0
score = 0
for _, row in df_non_stopwords.iterrows():
    if row["Sentiment"] == "pos":
        score += 1
    elif row["Sentiment"] == "neg":
        score -= 1
    cnt += 1

score /= cnt

print(score)

-0.5420488844173522


### 単語ごと

In [18]:
#分析する数
WANTED = 100

#keywordについての極性スコアを分析
#返り値：極性スコア、個数
def get_sentiment(keyword):
    count = 0
    score = 0

    #keywordを含む行を走査
    for _, row in df_non_stopwords.iterrows():
        if keyword in row["disassembled_all"].split(TOKEN_DIVIDER):
            count += 1
            if row["Sentiment"] == "pos":
                score += 1
            elif row["Sentiment"] == "neg":
                score -= 1

    #平均極性スコアを算出
    if count == 0:
        score = 0
    else:
        score /= count

    return score, count

#[Token, TFIDF, Sentiment, Counts]
df_sentiment = np.empty((0,4))
cnt = 0
for index, value in df_tfidf_mean.items():
    score, count = get_sentiment(index)
    row = np.array([index, value, score, count])
    df_sentiment = np.vstack([df_sentiment, row])
    cnt += 1
    if cnt > WANTED:
        break

df_sentiment = pd.DataFrame(df_sentiment, columns=["Token", "TFIDF", "Sentiment", "Counts"])

df_sentiment.shape

(101, 4)

### 表示

In [19]:
import seaborn as sns

In [20]:
df_sentiment.style.background_gradient(cmap="vlag_r", axis=0, subset="Sentiment")

Unnamed: 0,Token,TFIDF,Sentiment,Counts
0,東大,0.1154181832554306,-0.6394039454745576,11811
1,無い,0.0374421338556916,-0.7634408602150538,5394
2,事,0.0282684848607945,-0.6949197860962567,3740
3,大学,0.0281674895576021,-0.6126652320934522,3253
4,ｗ,0.0212946155234131,-0.5255863539445629,1876
5,良い,0.0186404660289752,-0.5967160592711254,2497
6,学歴,0.0168177266873164,-0.627373935821873,1527
7,奴,0.0166636333124363,-0.8099032441661924,1757
8,京大,0.0151524893611191,-0.6487603305785123,1210
9,部,0.0146690253414341,-0.5497448979591837,1568


### 保存

In [21]:
df_sentiment.to_csv(sentiment_file, sep="\t", index=False)

df_non_stopwords.to_csv(without_stopwords_text_file, sep="\t", index=False)