In [1]:
import pandas as pd
import numpy as np

# 可視化用のライブラリ
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

import neologdn
import MeCab

import re

# Notebook progress bars. Import from the public `tqdm.notebook` module:
# the private `tqdm._tqdm_notebook` shim and the top-level
# `tqdm.tqdm_notebook` helper are deprecated and emit TqdmDeprecationWarning.
# Both names used by this notebook (`tqdm`, `tqdm_notebook`) stay bound.
from tqdm.notebook import tqdm, tqdm_notebook
# Hook tqdm into pandas and set the label shown on the progress bar.
tqdm_notebook.pandas(desc="progress: ")

### 保存したテキスト内容×単語のデータフレームを読み込み

In [2]:
# Load the saved "text content x word" DataFrame from CSV.
# This time encoding='utf-8' read the file without garbled characters in Jupyter.
# Earlier variant, kept for reference:
#model_tfidf_df = pd.read_csv('model_df.csv', encoding='shift_jis')
model_tfidf_df = pd.read_csv('model_tfidf_df.csv', encoding='utf-8')
model_tfidf_df

Unnamed: 0,内容,__記号-一般,t_名詞-固有名詞-組織,text_名詞-固有名詞-一般,、_記号-読点,「_記号-括弧開,」_記号-括弧閉,が_助詞-格助詞-一般,する_動詞-自立,で_助詞-格助詞-一般,の_助詞-連体化,は_助詞-係助詞,れる_動詞-接尾,単語_名詞-一般,型_名詞-接尾-一般,＼_記号-一般
0,neologd_tagger.parse(text)で各単語の原形、品詞などが1行で連続して...,0.065121,0.0,0.065121,0.038462,0.0,0.0,0.065121,0.076923,0.099052,0.038462,0.0,0.065121,0.049526,0.0,0.0
1,まずはparse()で分かち書きした単語群は1つの文字列型になっているので「＼n」で区切り、...,0.0,0.0,0.0,0.033333,0.056438,0.042923,0.0,0.066667,0.085845,0.033333,0.042923,0.0,0.042923,0.112876,0.056438
2,原形、品詞などの間には「＼t」が、分かち書きされた単語と単語の区切りには「＼n」が表示される,0.0,0.051307,0.0,0.060606,0.102615,0.078041,0.102615,0.060606,0.0,0.060606,0.078041,0.102615,0.078041,0.0,0.102615
3,次に邪魔な文字「＼t」を省きたい。よって、split('＼t')を使用したいがwakatid...,0.041296,0.082593,0.041296,0.02439,0.0,0.031407,0.0,0.02439,0.031407,0.02439,0.062814,0.0,0.0,0.082593,0.0


## 次元削減のための準備
#### DataFrameからarray型へ変換

In [3]:
# Build the numeric feature matrix used by the dimensionality-reduction cells.
# The text column '内容' is excluded by name rather than by position, so the
# selection stays correct if the CSV column order changes, and a missing
# column fails loudly with a KeyError instead of silently dropping some
# other first column.
model_tfidf_df_array = model_tfidf_df.drop(columns=['内容']).values
model_tfidf_df_array

array([[0.06512105, 0.        , 0.06512105, 0.03846154, 0.        ,
        0.        , 0.06512105, 0.07692308, 0.09905247, 0.03846154,
        0.        , 0.06512105, 0.04952623, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.03333333, 0.05643824,
        0.04292274, 0.        , 0.06666667, 0.08584547, 0.03333333,
        0.04292274, 0.        , 0.04292274, 0.11287648, 0.05643824],
       [0.        , 0.05130749, 0.        , 0.06060606, 0.10261498,
        0.07804134, 0.10261498, 0.06060606, 0.        , 0.06060606,
        0.07804134, 0.10261498, 0.07804134, 0.        , 0.10261498],
       [0.04129627, 0.08259255, 0.04129627, 0.02439024, 0.        ,
        0.03140688, 0.        , 0.02439024, 0.03140688, 0.02439024,
        0.06281376, 0.        , 0.        , 0.08259255, 0.        ]])

## NMFで次元削減

In [4]:
# Reduce the feature matrix to 2 dimensions with NMF.
from sklearn.decomposition import NMF

model = NMF(
    n_components=2,
    init='random',
    random_state=0,  # fixed seed for the random initialization
)
tfidf_NMF = model.fit_transform(model_tfidf_df_array)
tfidf_NMF

array([[0.07638544, 0.15842403],
       [0.06807093, 0.2038885 ],
       [0.34804573, 0.        ],
       [0.00532199, 0.17673225]])

## PCAで次元削減

In [5]:
# Standardize every column with StandardScaler (mean 0, standard deviation /
# variance 1). The standardized matrix contains negative values, so it cannot
# be fed to NMF for dimensionality reduction.
from sklearn.preprocessing import StandardScaler

sds = StandardScaler().fit(model_tfidf_df_array)
X_sds = sds.transform(model_tfidf_df_array)
X_sds

array([[ 1.38023241, -0.94950885,  1.38023241, -0.05516488, -0.92506408,
        -1.3650381 ,  0.52722531,  0.9992857 ,  1.11828285, -0.05516488,
        -1.56787689,  0.52722531,  0.24723323, -0.97682787, -0.92506408],
       [-0.95335641, -0.94950885, -0.95335641, -0.43940215,  0.3879301 ,
         0.17308103, -0.95349258,  0.4810418 ,  0.78990615, -0.43940215,
        -0.10311774, -0.95349258,  0.01074927,  1.27950693,  0.3879301 ],
       [-0.95335641,  0.50581312, -0.95335641,  1.60404147,  1.46219806,
         1.43154215,  1.37975986,  0.17480677, -1.3445424 ,  1.60404147,
         1.09532155,  1.37975986,  1.26841397, -0.97682787,  1.46219806],
       [ 0.5264804 ,  1.39320457,  0.5264804 , -1.10947444, -0.92506408,
        -0.23958508, -0.95349258, -1.65513427, -0.56364659, -1.10947444,
         0.57567308, -0.95349258, -1.52639647,  0.67414881, -0.92506408]])

In [6]:
# Reduce the standardized matrix X_sds to 2 dimensions with PCA.
from sklearn.decomposition import PCA
# NOTE(review): this rebinds `model`, which held the NMF estimator created in
# the NMF cell; after this cell runs, `model` refers to the PCA estimator.
model = PCA(n_components=2)
X_pca = model.fit_transform(X_sds)
X_pca

array([[-1.68344157,  3.40572594],
       [-0.38990955, -0.89529288],
       [ 4.7285164 , -0.17353908],
       [-2.65516527, -2.33689398]])

## t-SNEで次元削減

In [7]:
# Embed the standardized matrix X_sds into 2 dimensions with t-SNE.
from sklearn.manifold import TSNE

# t-SNE is stochastic: without a fixed random_state every run yields a
# different embedding, so the seed is pinned for reproducibility.
# scikit-learn also requires perplexity < n_samples; the default (30) is kept
# for larger inputs and capped to n_samples - 1 for tiny inputs such as the
# 4-row matrix used here, where the default would be rejected.
X_sne = TSNE(
    n_components=2,
    perplexity=min(30.0, X_sds.shape[0] - 1),
    random_state=0,
).fit_transform(X_sds)
X_sne

array([[-155.6651 ,  278.3851 ],
       [-147.73701,  147.23216],
       [-278.88998,  139.3046 ],
       [-286.8181 ,  270.45758]], dtype=float32)

## 次元削減後の値を元データフレームと連結

In [8]:
# Use the NMF-reduced values (tfidf_NMF) for the rest of the analysis and
# wrap the array in a DataFrame.
# The source frame's index is reused so that the subsequent index-based join
# with model_tfidf_df lines up row by row even if that index is not the
# default 0..n-1 range (e.g. after filtering or re-indexing).
dimension_reduction_df = pd.DataFrame(
    tfidf_NMF,
    columns=['vec1', 'vec2'],
    index=model_tfidf_df.index,
)
dimension_reduction_df

Unnamed: 0,vec1,vec2
0,0.076385,0.158424
1,0.068071,0.203889
2,0.348046,0.0
3,0.005322,0.176732


In [9]:
# DataFrame.join combines frames using the index as the key, so attach the
# reduced coordinates to the text column ('内容') that way.
dimension_reduction_join_df = (
    model_tfidf_df
    .loc[:, ['内容']]
    .join(dimension_reduction_df, how='left')
)
dimension_reduction_join_df

Unnamed: 0,内容,vec1,vec2
0,neologd_tagger.parse(text)で各単語の原形、品詞などが1行で連続して...,0.076385,0.158424
1,まずはparse()で分かち書きした単語群は1つの文字列型になっているので「＼n」で区切り、...,0.068071,0.203889
2,原形、品詞などの間には「＼t」が、分かち書きされた単語と単語の区切りには「＼n」が表示される,0.348046,0.0
3,次に邪魔な文字「＼t」を省きたい。よって、split('＼t')を使用したいがwakatid...,0.005322,0.176732
