# day 3

このノートブックの実行例は[こちら(HTML版)](../notebooks-sample/day-3.html)で確認できます

---

## 0. はじめに

ページ上部のメニューバーにある **Kernel** メニューをクリックし、プルダウンメニューから [**Change Kernel ...**] を選び、**gssm2023:Python** を選択してください。

<img src="images/change_kernel1.png" width="30%">

ノートブック上部の右隅に表示されたカーネル名が **gssm2023:Python** になっていることを確認してください。

<img src="images/change_kernel2.png" width="30%">

---

## 1. テキスト解析 (2)

### 1.0 事前準備 (関数の定義)

以下のセルを**修正せず**に実行してください

In [None]:
import warnings
# Install a global 'ignore' filter so that warnings are not shown in the notebook output.
warnings.simplefilter('ignore')

import random
import numpy as np

# Seed Python's and NumPy's global random number generators with the same fixed value.
seed = 42
random.seed(seed)
np.random.seed(seed)

# ワードクラウドを描画する
def plot_wordcloud(word_str, width=6, height=4):

    import matplotlib.pyplot as plt
    %matplotlib inline

    fig = plt.figure(figsize=(width, height))
    ax = fig.add_subplot(1, 1, 1)
    plot_wordcloud_ax(ax, word_str)
    plt.axis("off")
    plt.tight_layout()
    plt.show()

def plot_wordcloud_ax(ax, word_str):

    font_path = !find ${HOME} -name "ipaexg.ttf"
    # font_path = ['/Library/Fonts/Arial Unicode.ttf']

    import wordcloud

    wc = wordcloud.WordCloud(
        background_color='white',
        font_path=font_path[0],
        max_font_size=100)

    img = wc.generate(word_str)
    ax.imshow(img, interpolation='bilinear')


# トピックモデルによるワードクラウドを描画する
def plot_topic_model(lda, feature_names, n_top_words=20, width=10, height=4):

    font_path = !find ${HOME} -name "ipaexg.ttf"
    # font_path = ['/Library/Fonts/Arial Unicode.ttf']

    import matplotlib.pyplot as plt
    import wordcloud
    %matplotlib inline

    fig = plt.figure(figsize=(width, height))

    for topic_idx, topic in enumerate(lda.components_):
        sorted_text = ' '.join([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]])

        wc = wordcloud.WordCloud(
            background_color='white',
            font_path=font_path[0],
            max_font_size=100)

        ax = fig.add_subplot(2, 3, topic_idx + 1)
        img = wc.generate(sorted_text)
        ax.imshow(img, interpolation='bilinear')
        ax.set_title(f"Topic # {topic_idx+1}:")

    plt.tight_layout()
    plt.show()


# Draw a co-occurrence network (extracted word vs. extracted word)
def plot_cooccur_network(df, word_counts, cutoff, width=8, height=8):
    """Create a figure, draw the word co-occurrence network on it and show it.

    Parameters
    ----------
    df : square DataFrame of pairwise scores; passed to
        ``plot_cooccur_network_ax``
    word_counts : per-word counts, indexed like the columns of ``df``
    cutoff : only pairs whose score is greater than this value are drawn
    width, height : figure size passed to matplotlib
    """
    import matplotlib.pyplot as plt
    import japanize_matplotlib

    # Create exactly one figure; a second plt.figure() call would leave an
    # empty figure behind that is also rendered by plt.show().
    fig = plt.figure(figsize=(width, height))
    ax = fig.add_subplot(1, 1, 1)
    plot_cooccur_network_ax(ax, df, word_counts, cutoff)
    plt.axis("off")
    plt.show()

def plot_cooccur_network_ax(ax, df, word_counts, cutoff):

    import numpy as np
    import networkx as nx
    from networkx.algorithms import community
    from networkx.drawing.nx_agraph import graphviz_layout
    import matplotlib.pyplot as plt
    import japanize_matplotlib
    %matplotlib inline

    Xc = df.values
    Xc_max = Xc.max()

    words = df.columns
    count_max = word_counts.max()

    weights_w, weights_c = [], []
    for i, j in zip(*Xc.nonzero()):
        if i < j and Xc[i,j] > cutoff:
            weights_w.append((words[i], {'weight': word_counts[i] / count_max}))
            weights_w.append((words[j], {'weight': word_counts[j] / count_max}))
            weights_c.append((words[i], words[j], Xc[i,j] / Xc_max))

    G = nx.Graph()
    G.add_nodes_from(weights_w)
    G.add_weighted_edges_from(weights_c)
    G.remove_nodes_from(list(nx.isolates(G)))
    # G = nx.minimum_spanning_tree(G)

    # pos = nx.spring_layout(G, k=0.3)
    pos = graphviz_layout(G, prog='neato', args='-Goverlap="scalexy" -Gsep="+6" -Gnodesep=0.8 -Gsplines="polyline" -GpackMode="graph" -Gstart={}'.format(43))
    weights_n = np.array(list(nx.get_node_attributes(G, 'weight').values()))
    weights_e = np.array(list(nx.get_edge_attributes(G, 'weight').values()))

    communities = community.greedy_modularity_communities(G)
    color_map = []
    for node in G:
        for i, c in enumerate(communities):
            if node in c:
                color_map.append(i)

    nx.draw_networkx_nodes(G, pos, node_color=color_map, alpha=0.7, cmap=plt.cm.Set2, node_size=5000 * weights_n, ax=ax)
    nx.draw_networkx_edges(G, pos, edge_color='gray', edge_cmap=plt.cm.Blues, alpha=0.7, width=3 * weights_e, ax=ax)
    nx.draw_networkx_labels(G, pos, font_family='IPAexGothic', ax=ax)
    # ax.axis('off')


# 共起ネットワークを描画する (外部変数-抽出語用)
def plot_attrs_network(df, attr_counts, word_counts, cutoff, width=8, height=8):

    import numpy as np
    import networkx as nx
    import matplotlib.pyplot as plt
    import japanize_matplotlib
    from networkx.drawing.nx_agraph import graphviz_layout
    %matplotlib inline

    Xc = df.values
    Xc_max = Xc.max()

    attrs = list(df.index)
    attr_count_max = attr_counts.max()

    words = list(df.columns)
    word_count_max = word_counts.max()

    weights_n, weights_c = [], []
    for i, j in zip(*Xc.nonzero()):
        if Xc[i,j] > cutoff:
            weights_n.append((attrs[i], {'weight': attr_counts[i] / attr_count_max, 'type': 'attr'}))
            weights_n.append((words[j], {'weight': word_counts[j] / word_count_max, 'type': 'word'}))
            weights_c.append((attrs[i], words[j], Xc[i,j] / Xc_max))

    G = nx.Graph()
    G.add_nodes_from(weights_n)
    G.add_weighted_edges_from(weights_c)
    G.remove_nodes_from(list(nx.isolates(G)))
    # G = nx.minimum_spanning_tree(G)

    plt.figure(figsize=(width, height))
    # pos = nx.spring_layout(G, k=0.3)
    pos = graphviz_layout(G, prog='neato', args='-Goverlap="scalexy" -Gsep="+6" -Gnodesep=0.8 -Gsplines="polyline" -GpackMode="graph" -Gstart={}'.format(43))

    nodelist_a = [node for node in G.nodes if G.nodes[node]['type'] == 'attr']
    nodelist_w = [node for node in G.nodes if G.nodes[node]['type'] == 'word']
    weights_a = np.array([G.nodes[node]['weight'] for node in G.nodes if G.nodes[node]['type'] == 'attr'])
    weights_w = np.array([G.nodes[node]['weight'] for node in G.nodes if G.nodes[node]['type'] == 'word'])
    weights_e = np.array(list(nx.get_edge_attributes(G, 'weight').values()))

    color_map = []
    for node in G:
        if G.nodes[node]['type'] == 'word':
            color_map.append(G.degree(node)+3)

    nx.draw_networkx_nodes(G, pos, node_color='lightsalmon', alpha=0.7, cmap=plt.cm.Set2, node_size=1000 * weights_a, nodelist=nodelist_a, node_shape='s')
    nx.draw_networkx_nodes(G, pos, node_color=color_map, alpha=0.7, cmap=plt.cm.Set2, node_size=5000 * weights_w, nodelist=nodelist_w)
    nx.draw_networkx_edges(G, pos, edge_color='gray', edge_cmap=plt.cm.Blues, alpha=0.7, width=3 * weights_e)
    nx.draw_networkx_labels(G, pos, font_family='IPAexGothic')

    plt.axis("off")
    plt.show()


# 係り受けによる共起ネットワークを描画する
def plot_dependency_network(df, word_counts, cutoff, width=8, height=8):

    import numpy as np
    import networkx as nx
    from networkx.algorithms import community
    import matplotlib.pyplot as plt
    import japanize_matplotlib
    from networkx.drawing.nx_agraph import graphviz_layout
    %matplotlib inline

    Xc = df.values
    Xc_max = Xc.max()

    words = df.columns
    count_max = word_counts.max()

    weights_w, weights_c = [], []
    for i, j in zip(*Xc.nonzero()):
        if i != j and Xc[i,j] > cutoff:
            weights_w.append((words[i], {'weight': word_counts[i] / count_max}))
            weights_w.append((words[j], {'weight': word_counts[j] / count_max}))
            weights_c.append((words[i], words[j], Xc[i,j] / Xc_max))

    G = nx.DiGraph()
    G.add_nodes_from(weights_w)
    G.add_weighted_edges_from(weights_c)
    G.remove_nodes_from(list(nx.isolates(G)))
    # G = nx.minimum_spanning_tree(G)

    plt.figure(figsize=(width, height))
    # pos = nx.spring_layout(G, k=0.3)
    pos = graphviz_layout(G, prog='neato', args='-Goverlap="scalexy" -Gsep="+6" -Gnodesep=0.8 -Gsplines="polyline" -GpackMode="graph" -Gstart={}'.format(43))
    weights_n = np.array(list(nx.get_node_attributes(G, 'weight').values()))
    weights_e = np.array(list(nx.get_edge_attributes(G, 'weight').values()))

    communities = community.greedy_modularity_communities(G)
    color_map = []
    for node in G:
        for i, c in enumerate(communities):
            if node in c:
                color_map.append(i)

    nx.draw_networkx_nodes(G, pos, node_color=color_map, alpha=0.7, cmap=plt.cm.Set2, node_size=5000 * weights_n)
    nx.draw_networkx_edges(G, pos, edge_color='gray', edge_cmap=plt.cm.Blues, alpha=0.7, width=3 * weights_e)
    nx.draw_networkx_labels(G, pos, font_family='IPAexGothic')

    plt.axis("off")
    plt.show()


# 対応分析の結果をプロットする
def plot_coresp(row_coord, col_coord, row_labels, col_labels, explained_inertia=None, width=8, height=8):

    import matplotlib.pyplot as plt
    import japanize_matplotlib
    %matplotlib inline

    plt.figure(figsize=(width, height))

    # Plot of rows (外部変数)
    plt.plot(row_coord[:, 0], row_coord[:, 1], "*", color='red', alpha=0.5)
    for i, label in enumerate(row_labels):
        plt.text(row_coord[i, 0], row_coord[i, 1], label, color='red', ha='left', va='bottom')

    # Plot of columns (単語)
    plt.plot(col_coord[:, 0], col_coord[:, 1], "o", color='blue', alpha=0.5)
    for i, label in enumerate(col_labels):
        plt.text(col_coord[i, 0], col_coord[i, 1], label, color='blue', ha='left', va='bottom')

    plt.axvline(0, linestyle='dashed', color='gray', alpha=0.5)
    plt.axhline(0, linestyle='dashed', color='gray', alpha=0.5)

    if explained_inertia is not None:
        plt.xlabel(f"Dim 1 ({explained_inertia[0]:.3f}%)")
        plt.ylabel(f"Dim 2 ({explained_inertia[1]:.3f}%)")

    # plt.axis('equal')
    plt.show()


# PCA の結果をプロットする
def plot_pca(coeff, reduced, row_labels, col_labels, var_ratio=None, width=8, height=8):

    import matplotlib.pyplot as plt
    import japanize_matplotlib
    %matplotlib inline

    plt.figure(figsize=(width, height))

    # Plot of rows (外部変数)
    for i, label in enumerate(row_labels):
        plt.arrow(0, 0, coeff[i,0], coeff[i,1], color='r', alpha=0.5)
        plt.text(coeff[i, 0], coeff[i, 1], label, color='red', ha='left', va='bottom')

    # Plot of columns (単語)
    plt.plot(reduced[:, 0], reduced[:, 1], "o", color='blue', alpha=0.5)
    for i, label in enumerate(col_labels):
        plt.text(reduced[i, 0], reduced[i, 1], label, color='blue', ha='left', va='bottom')

    plt.axvline(0, linestyle='dashed', color='gray', alpha=0.5)
    plt.axhline(0, linestyle='dashed', color='gray', alpha=0.5)

    if var_ratio is not None:
        plt.xlabel(f"Dim 1 ({var_ratio[0]*100:.3f}%)")
        plt.ylabel(f"Dim 2 ({var_ratio[1]*100:.3f}%)")

    # plt.axis('equal')
    plt.show()


# Convert a co-occurrence count matrix into a Jaccard coefficient matrix (word vs. word)
def jaccard_coef(cooccur_df, cross_df):
    """Return the Jaccard coefficients for the upper triangle of ``cooccur_df``.

    For every non-zero cell ``(i, j)`` with ``i < j`` the result holds
    ``c / (n_i + n_j - c)``, where ``c`` is the cell value and ``n_i``,
    ``n_j`` are the column sums of ``cross_df``.  All other cells are 0.
    The result uses the columns of ``cooccur_df`` for both axes.
    """
    import numpy as np
    import pandas as pd

    counts = cooccur_df.values
    column_totals = cross_df.sum(axis=0).values
    coef = np.zeros(counts.shape)

    rows, cols = counts.nonzero()
    for r, c in zip(rows, cols):
        if r >= c:
            continue
        both = counts[r, c]
        coef[r, c] = both / (column_totals[r] + column_totals[c] - both)

    return pd.DataFrame(coef, columns=cooccur_df.columns, index=cooccur_df.columns)


# Convert a co-occurrence count matrix into a Jaccard coefficient matrix (external variable vs. word)
def jaccard_attrs_coef(df, attr_counts, word_counts, total=10000, conditional=False):
    """Return Jaccard coefficients between attribute values (rows) and words (columns).

    For a non-zero cell ``(i, j)`` the coefficient is
    ``c / (attr_counts[i] + word_counts[j] - c)`` with ``c`` the cell value.
    When ``conditional`` is false, the coefficient is kept only if
    ``c / attr_counts[i]`` is greater than ``word_counts[j] / total``;
    otherwise the cell is 0.  When ``conditional`` is true, no such filter
    is applied.
    """
    import numpy as np
    import pandas as pd

    counts = df.values
    coef = np.zeros(df.shape)

    for r, c in zip(*counts.nonzero()):
        if not conditional:
            within_attr = counts[r, c] / attr_counts[r]
            overall = word_counts[c] / total
            if not (within_attr > overall):
                # Cell keeps its initial value of 0.
                continue
        coef[r, c] = counts[r, c] / (attr_counts[r] + word_counts[c] - counts[r, c])

    return pd.DataFrame(coef, columns=df.columns, index=df.index)

### 1.1 データのダウンロード (前回ダウンロード済みのためスキップ)

以下のデータがダウンロード済みです

| ファイル名 | 件数 | データセット | 備考 |
| --- | --- | --- | --- |
| rakuten-1000-2022-2023.xlsx.zip | 10,000 | •レジャー+ビジネスの 10エリア<br>•エリアごと 1,000件 (ランダムサンプリング)<br>•期間: 2022/1~2023 GW明け | 本講義の全体を通して使用する |

In [None]:
# もし、再度ダウンロードが必要な場合は残りの行のコメントマーク「#」を除去して、このセルを再実行してください

# FILE_ID = "1n-uvGoH7XQhxexN57hYXuFrkGeHKp-HV"
# !gdown --id {FILE_ID}
# !unzip rakuten-1000-2022-2023.xlsx.zip

### 1.2 データの読み込み (DataFrame型)

In [None]:
import numpy as np
import pandas as pd

# Read the review data from the Excel file in the working directory and show its shape and first rows.
all_df = pd.read_excel("rakuten-1000-2022-2023.xlsx")
print(all_df.shape)
display(all_df.head())

### 1.3 単語の抽出

コメント列から単語を抽出する (単語を品詞「名詞」「形容詞」「未知語」で絞り込む)

In [None]:
from collections import defaultdict
import MeCab

tagger = MeCab.Tagger("-r ../tools/usr/local/etc/mecabrc --unk-feature 未知語")

# Occurrence count per (base form, POS label) key.
word_counts = defaultdict(int)
# One record per extracted word occurrence (see `columns` below for the layout).
words = []

# Translation table mapping the 94 characters U+0021..U+007E to U+FF01..U+FF5E.
ZEN = "".join(chr(0xff01 + i) for i in range(94))
HAN = "".join(chr(0x21 + i) for i in range(94))
HAN2ZEN = str.maketrans(HAN, ZEN)

# stopwords = ['する', 'ある', 'ない', 'いう', 'もの', 'こと', 'よう', 'なる', 'ほう']
stopwords = ["湯畑"]


def _classify_token(pos1, pos2):
    """Return ``(output POS label, suffix)`` for tokens to keep, else ``None``."""
    if pos1 == "名詞" and pos2 == "一般":
        return "名詞", ""
    if pos1 == "名詞" and pos2 == "形容動詞語幹":
        # The stem is stored with "だ" appended and labelled 形容動詞.
        return "形容動詞", "だ"
    if pos1 in ("形容詞", "未知語"):
        return pos1, ""
    return None


for index, row in all_df.iterrows():
    comment = row["コメント"]
    if not isinstance(comment, str):
        # An empty cell is not a string and has no .translate(); it yields no words.
        continue

    node = tagger.parseToNode(comment.translate(HAN2ZEN))
    while node:
        features = node.feature.split(',')
        pos1 = features[0]
        pos2 = features[1] if len(features) > 1 else ""
        base = features[6] if len(features) > 6 else None

        # The stopword check is applied to the dictionary base form only
        # (before falling back to the surface form).
        rule = _classify_token(pos1, pos2) if base not in stopwords else None
        if rule is not None:
            postag, suffix = rule
            base = (base if base is not None else node.surface) + suffix
            key = (base, postag)
            word_counts[key] += 1
            words.append([index + 1, base, postag, row["カテゴリー"], row["エリア"], key])

        node = node.next

columns = [
    "文書ID",
    # "単語ID",
    "表層",
    "品詞",
    "カテゴリー",
    "エリア",
    "dict_key",
]
docs_df = pd.DataFrame(words, columns=columns)
print(docs_df.shape)
display(docs_df.head())

### 1.4 単語の出現回数 (Top 75)

単語の出現頻度をカウントする

In [None]:
# Rank the (word, POS) keys by count, largest first; the rank becomes the word ID.
ranked_counts = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
word_list = [(rank, key[0], count, key) for rank, (key, count) in enumerate(ranked_counts)]

columns = ["単語ID", "表層", "出現頻度", "dict_key"]
word_counts_df = pd.DataFrame(word_list, columns=columns)
print(word_counts_df.shape)
display(word_counts_df.head(10))

単語IDを紐づける (出現回数 Top 150語のみ抽出する)

In [None]:
# Keep the first 150 rows of the frequency table and inner-join them to the
# word occurrences on dict_key, which attaches 単語ID and drops all other words.
word_counts_150_df = word_counts_df[0:150]
merged_df = pd.merge(docs_df, word_counts_150_df, how="inner", on="dict_key", suffixes=["", "_right"])
docs_150_df = merged_df[["文書ID", "単語ID", "表層", "品詞", "カテゴリー", "エリア", "dict_key"]]
print(docs_150_df.shape)
display(docs_150_df)

### 1.5 ワードクラウド

In [None]:
# Join the first 75 words of the frequency table into one space-separated string.
# Note: this rebinds `words`, which earlier held the list of extracted records.
words = ' '.join(word_counts_df['表層'][0:75])
plot_wordcloud(words)

### 1.6 「文書-抽出語」表の作成

「文書-抽出語」表を作成する (出現回数 Top 75語)

In [None]:
# Keep the first 75 rows of the frequency table and attach their word IDs to the occurrences.
word_counts_75_df = word_counts_df[0:75]
merged_df = pd.merge(docs_df, word_counts_75_df, how="inner", on="dict_key", suffixes=["", "_right"])
docs_75_df = merged_df[["文書ID", "単語ID", "表層", "品詞", "カテゴリー", "エリア", "dict_key"]]

# Count word IDs per (category, area, document ID) row.
cross_75_df = pd.crosstab(
    [
        docs_75_df['カテゴリー'], 
        docs_75_df['エリア'], 
        docs_75_df['文書ID']
    ], 
    docs_75_df['単語ID'], margins=False
)
# Relabel the word-ID columns with the word text.
# NOTE(review): assumes the crosstab columns are in the same order as the rows of word_counts_75_df — confirm.
cross_75_df.columns = word_counts_75_df["表層"]
print(cross_75_df.shape)
display(cross_75_df)

「文書-抽出語」 表を {0,1} に変換する

In [None]:
# Replace every positive count with 1 in place, turning the table into a 0/1 indicator matrix.
cross_75_df[cross_75_df > 0] = 1
print(cross_75_df.shape)
display(cross_75_df)

### 1.7 共起ネットワーク図

#### 1.7.1 共起度行列を作成する (抽出語-抽出語)

In [None]:
from scipy.sparse import csc_matrix

# Word-by-word product of the document-word table with itself; only the
# upper triangle (including the diagonal) is kept.
X = csc_matrix(cross_75_df.values)
Xc = np.triu((X.T * X).toarray())

word_labels = cross_75_df.columns
cooccur_75_df = pd.DataFrame(Xc, columns=word_labels, index=word_labels)
print(cooccur_75_df.shape)
display(cooccur_75_df.head())

#### 1.7.2 Jaccard 係数を求める (抽出語-抽出語)

In [None]:
# Convert the co-occurrence counts into Jaccard coefficients, using the column sums of cross_75_df.
jaccard_75_df = jaccard_coef(cooccur_75_df, cross_75_df)
print(jaccard_75_df.shape)
display(jaccard_75_df.head())

#### 1.7.3 プロットする

In [None]:
# Column sums of the 0/1 table; note that this rebinds `word_counts`.
word_counts = cross_75_df.sum(axis=0).values
# The cutoff is the value at position 60 (0-based) of the coefficients sorted in descending order.
plot_cooccur_network(jaccard_75_df, word_counts, np.sort(jaccard_75_df.values.reshape(-1))[::-1][60])

### 1.8 係り受けネットワーク図 

#### 1.8.1 係り受け行列を作成する

In [None]:
# Extract the words of interest from a chunk
def get_words(tree, from_chunk, stopwords):
    """Return the ``(base form, POS label)`` keys of the words kept from a chunk.

    Parameters
    ----------
    tree : parsed sentence; ``tree.token(i)`` must return a token with
        ``feature`` (comma-separated string) and ``surface`` attributes
    from_chunk : chunk with ``token_pos`` (index of its first token) and
        ``token_size`` (number of tokens)
    stopwords : container of base forms to skip

    Returns
    -------
    list of ``(base, postag)`` tuples in token order.  General nouns,
    adjectives and unknown words are kept as they are; adjectival-noun stems
    get "だ" appended and are labelled "形容動詞".
    """
    first = from_chunk.token_pos
    last = first + from_chunk.token_size

    words = []
    for i in range(first, last):
        token = tree.token(i)
        features = token.feature.split(',')
        pos1 = features[0]
        pos2 = features[1] if len(features) > 1 else ""
        base = features[6] if len(features) > 6 else None

        # The stopword check is applied to the dictionary base form only.
        if base in stopwords:
            continue

        if pos1 == "名詞" and pos2 == "一般":
            postag, suffix = "名詞", ""
        elif pos1 == "名詞" and pos2 == "形容動詞語幹":
            postag, suffix = "形容動詞", "だ"
        elif pos1 in ("形容詞", "未知語"):
            postag, suffix = pos1, ""
        else:
            continue

        if base is None:
            # Fall back to the surface form of THIS token (not of a global `node`).
            base = token.surface
        words.append((base + suffix, postag))

    return words


from collections import defaultdict

import CaboCha

cp = CaboCha.Parser("-r ../tools/usr/local/etc/cabocharc")

# Translation table mapping the 94 characters U+0021..U+007E to U+FF01..U+FF5E.
ZEN = "".join(chr(0xff01 + i) for i in range(94))
HAN = "".join(chr(0x21 + i) for i in range(94))
HAN2ZEN = str.maketrans(HAN, ZEN)

# stopwords = ['する', 'ある', 'ない', 'いう', 'もの', 'こと', 'よう', 'なる', 'ほう']
stopwords = ['*']

# Number of times a word of a chunk is linked to a word of the chunk it depends on.
pair_counts = defaultdict(int)
pairs = []

for index, row in all_df.iterrows():
    comment = row["コメント"]
    if not isinstance(comment, str):
        # An empty cell is not a string and has no .translate(); it yields no pairs.
        continue

    # print(cp.parseToString(comment.translate(HAN2ZEN)))
    tree = cp.parse(comment.translate(HAN2ZEN))

    # Collect the chunks in order; a token carries a chunk object only when it starts one.
    chunks = []
    for i in range(tree.size()):
        tok = tree.token(i)
        if tok.chunk:
            chunks.append(tok.chunk)

    for from_chunk in chunks:
        # A negative link means the chunk does not depend on another chunk.
        if from_chunk.link < 0:
            continue
        to_chunk = chunks[from_chunk.link]

        from_words = get_words(tree, from_chunk, stopwords)
        to_words = get_words(tree, to_chunk, stopwords)

        # Count every (source word, target word) pair of THIS dependency.
        # This must run inside the chunk loop; outside it, only the last
        # dependency of each document would be counted.
        for f in from_words:
            for t in to_words:
                pair_counts[(f[0], t[0])] += 1

Xc = cooccur_75_df.values
Xd = np.zeros(Xc.shape)

# Position of each word among the columns (first occurrence wins, like list.index).
columns = list(cooccur_75_df.columns)
column_pos = {}
for pos, name in enumerate(columns):
    column_pos.setdefault(name, pos)

for (f, t), v in pair_counts.items():
    if f in column_pos and t in column_pos:
        Xd[column_pos[f], column_pos[t]] = v

dep_75_df = pd.DataFrame(Xd, columns=cooccur_75_df.columns, index=cooccur_75_df.columns)
print(dep_75_df.shape)
display(dep_75_df.head())

#### 1.8.2 条件付き確率を求める

In [None]:
# Rebuild the dependency matrix, this time dividing each pair count by a per-word total of the source word.
Xc = cooccur_75_df.values
Xd = np.zeros(Xc.shape)
# NOTE(review): these are column sums of the co-occurrence matrix (which was built with np.triu),
# not per-word frequencies such as cross_75_df.sum(axis=0) — confirm this is the intended denominator.
word_counts = cooccur_75_df.sum(axis=0).values

for (f,t), v in pair_counts.items():
    # Only pairs whose both words are among the matrix columns are used.
    columns = list(cooccur_75_df.columns)
    if f in columns and t in columns:
        i = columns.index(f)
        j = columns.index(t)
        Xd[i,j] = v / word_counts[i]

dep_75_df = pd.DataFrame(Xd, columns=cooccur_75_df.columns, index=cooccur_75_df.columns)
print(dep_75_df.shape)
display(dep_75_df.head())

#### 1.8.3 プロットする

In [None]:
# Column sums of the 0/1 table are used for the node sizes.
word_counts = cross_75_df.sum(axis=0).values
# The cutoff is the value at position 60 (0-based) of the matrix values sorted in descending order.
plot_dependency_network(dep_75_df, word_counts, np.sort(dep_75_df.values.reshape(-1))[::-1][60])

### 1.9 対応分析

「文書-抽出語」 表を確認する

In [None]:
# Show the shape and the first rows of the document-word table.
print(cross_75_df.shape)
display(cross_75_df.head())

#### 1.9.1 「外部変数-抽出語」 クロス集計表を作成する

In [None]:
# Sum the document rows per category and per area, then stack the two tables
# (category rows first, area rows after).
aggregate_75_df = pd.concat([
    cross_75_df.groupby(level='カテゴリー').sum(), 
    cross_75_df.groupby(level='エリア').sum()
    ])
print(aggregate_75_df.shape)
display(aggregate_75_df)

#### 1.9.2 対応分析プロットを作成する

In [None]:
import mca

# Run the analysis on the aggregated table without the Benzecri correction.
ncols = aggregate_75_df.shape[1]
mca_ben = mca.MCA(aggregate_75_df, ncols=ncols, benzecri=False)
# First two dimensions of the row and column scores.
row_coord = mca_ben.fs_r(N=2)
col_coord = mca_ben.fs_c(N=2)

# Share of each eigenvalue in their total, expressed in percent.
eigenvalues = mca_ben.L
total = np.sum(eigenvalues)
explained_inertia = 100 * eigenvalues / total

row_labels = aggregate_75_df.index
col_labels = aggregate_75_df.columns
plot_coresp(row_coord, col_coord,row_labels, col_labels, explained_inertia)

### 1.10 トピックモデル

「文書-抽出語」 表を確認する

In [None]:
# Show the shape and the first rows of the document-word table.
print(cross_75_df.shape)
display(cross_75_df.head())

#### 1.10.1 トピックを抽出する (LDA)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Fit a 6-component LDA model on the document-word table (fixed random_state, all CPU cores).
lda = LDA(max_iter=25, learning_method='batch', random_state=0, n_jobs=-1, n_components=6)
lda.fit(cross_75_df.values)

n_top_words = 20
feature_names = cross_75_df.columns

# Print, for every topic, the n_top_words words with the largest weights on one line.
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic # {topic_idx+1}:", end=" ")
    # argsort is ascending, so this slice walks the last n_top_words indices backwards.
    for i in topic.argsort()[:-n_top_words-1:-1]:
        print(feature_names[i], end=" ")
    print()

#### 1.10.2 結果をワードクラウドで描画する

In [None]:
# Draw one word cloud per topic, using up to 75 words per topic.
n_top_words = 75
plot_topic_model(lda, feature_names, n_top_words)

---

### 1.11 外部変数の利用

#### 1.11.1 「文書-抽出語」表の作成

「文書-抽出語」表を作成する (出現回数 Top 150語)

In [None]:
# Keep the first 150 rows of the frequency table and attach their word IDs to the occurrences.
word_counts_150_df = word_counts_df[0:150]
merged_df = pd.merge(docs_df, word_counts_150_df, how="inner", on="dict_key", suffixes=["", "_right"])
docs_150_df = merged_df[["文書ID", "単語ID", "表層", "品詞", "カテゴリー", "エリア", "dict_key"]]

# Count word IDs per (category, area, document ID) row.
cross_150_df = pd.crosstab(
    [
        docs_150_df['カテゴリー'], 
        docs_150_df['エリア'], 
        docs_150_df['文書ID']
    ], 
    docs_150_df['単語ID'], margins=False
)
# Relabel the word-ID columns with the word text.
# NOTE(review): assumes the crosstab columns are in the same order as the rows of word_counts_150_df — confirm.
cross_150_df.columns = word_counts_150_df["表層"]

「文書-抽出語」表を {0,1} に変換する

In [None]:
# Replace every positive count with 1 in place, turning the table into a 0/1 indicator matrix.
cross_150_df[cross_150_df > 0] = 1
print(cross_150_df.shape)
display(cross_150_df)

#### 1.11.2 共起行列を作成する (外部変数-抽出語)

In [None]:
# Sum the document rows per category and per area, then stack the two tables
# (category rows first, area rows after).
aggregate_df = pd.concat(
    [
        cross_150_df.groupby(level='カテゴリー').sum(), 
        cross_150_df.groupby(level='エリア').sum()
    ]
)
print(aggregate_df.shape)
display(aggregate_df)

#### 1.11.3 Jaccard 係数を求める (外部変数-抽出語)

In [None]:
# Column sums of the 0/1 document-word table (one value per word).
word_counts = cross_150_df.sum(axis=0).values

# Number of reviews per category / area.  value_counts() orders its result by
# frequency, whereas the rows of aggregate_df are ordered by group key, so the
# counts are re-aligned to the row labels of aggregate_df before stacking.
n_categories = cross_150_df.index.get_level_values('カテゴリー').nunique()
category_sizes = all_df['カテゴリー'].value_counts()
area_sizes = all_df['エリア'].value_counts()
attr_counts = np.hstack(
    [
        category_sizes.reindex(aggregate_df.index[:n_categories]).values,
        area_sizes.reindex(aggregate_df.index[n_categories:]).values,
    ]
)

# Use the actual number of reviews as the total instead of a hard-coded 10000.
jaccard_attrs_df = jaccard_attrs_coef(aggregate_df, attr_counts, word_counts, total=len(all_df), conditional=False)
print(jaccard_attrs_df.shape)
display(jaccard_attrs_df)

#### 1.11.4 特徴語ランキング

In [None]:
# For every attribute value, take the 10 words with the largest coefficient
# (largest first) as a two-column frame: word, coefficient.
df_list = [
    scores.iloc[np.argsort(scores.values)[::-1][:10]].reset_index()
    for _, scores in jaccard_attrs_df.iterrows()
]

# Place the per-attribute frames side by side and label each pair of columns
# with the attribute value and "jaccard".
ranking_df = pd.DataFrame(pd.concat(df_list, axis=1))
header = []
for attr in jaccard_attrs_df.index:
    header.extend([attr, "jaccard"])
ranking_df.columns = np.array(header, dtype='object')
display(ranking_df)

#### 1.11.5 ワードクラウド (カテゴリーごと)

In [None]:
from scipy.sparse import csc_matrix

# One word cloud per category.
for name, group in cross_150_df.groupby(level='カテゴリー'):
    print(name)

    # Column positions of the 75 words with the largest coefficient for this category, largest first.
    sorted_columns = np.argsort(jaccard_attrs_df.loc[name].values)[::-1][:75]
    group_cross_df = group.iloc[:,sorted_columns]
    plot_wordcloud(" ".join(group_cross_df.columns))

#### 1.11.6 共起ネットワーク図 (カテゴリーごと)

In [None]:
from scipy.sparse import csc_matrix

# One co-occurrence network per category.
for name, group in cross_150_df.groupby(level='カテゴリー'):
    print(name)

    # Column positions of the 75 words with the largest coefficient for this category, largest first.
    sorted_columns = np.argsort(jaccard_attrs_df.loc[name].values)[::-1][:75]
    group_cross_df = group.iloc[:, sorted_columns]

    # Word-by-word co-occurrence counts within this category (upper triangle only).
    X = csc_matrix(group_cross_df.values)
    Xc = np.triu((X.T * X).toarray())

    group_cooccur_df = pd.DataFrame(Xc, columns=group_cross_df.columns, index=group_cross_df.columns)
    group_jaccard_df = jaccard_coef(group_cooccur_df, group_cross_df)

    # Node sizes are looked up by column position of the plotted matrix, so
    # the counts must come from the same 75 re-ordered columns (not from all
    # 150 columns of `group` in their original order).
    word_counts = group_cross_df.sum(axis=0).values
    plot_cooccur_network(group_jaccard_df, word_counts, np.sort(group_jaccard_df.values.reshape(-1))[::-1][60])

#### 1.11.7 【演習】 ワードクラウド (エリアごと)

In [None]:
# ToDo

#### 1.11.8 【演習】 共起ネットワーク図 (エリアごと)

In [None]:
# ToDo