# データ分析
- 前処理
    - spaCy (GiNZA)を用いた分析
        - 基礎統計量に関する分析の整理
        - 係り受けに関する分析の整理
- 可視化
    - 情報可視化: 品詞情報
    - 情報可視化: 係り受けの情報(報告なし)

## 前処理

## spaCy (GiNZA)を用いた分析

In [116]:
# 前処理と分析
import spacy
import re
import json

# Loaded spaCy pipelines, keyed by model name, so that calling analysis() once
# per document does not reload the model every time.
_NLP_CACHE = {}


def _get_nlp(model_name="ja_ginza"):
    """Return the spaCy pipeline for *model_name*, loading it on first use only."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def analysis(input_file, output_file):
    """Split a text file into sentences, parse them and dump token info as JSON.

    Each line of ``input_file`` is sentence-split with the ``ja_ginza``
    pipeline. URLs and e-mail addresses in every sentence are replaced by the
    placeholders ``URL`` and ``MAIL``, and the masked sentence is parsed again
    to collect per-token information.

    The JSON written to ``output_file`` maps a sentence index (as a string) to
    ``{'sent_id', 'sent', 'token_info'}``; ``token_info`` maps a token index
    (as a string) to the token's surface form, lemma, POS tags and dependency
    information.

    Args:
        input_file: Path of the plain-text document to analyse.
        output_file: Path of the JSON file to create (overwritten if present).
    """
    url = re.compile(r"https?://\S+")  # URL
    email = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")  # e-mail address

    nlp = _get_nlp()
    doc_sent = []
    # NOTE(review): both files are opened with the platform default encoding.
    # If an explicit encoding is introduced here, the code that reads the
    # resulting JSON must be changed at the same time.
    with open(input_file, 'r') as f:
        for line in f:
            for sent in nlp(line.strip()).sents:
                text = url.sub("URL", str(sent))
                text = email.sub("MAIL", text)
                doc_sent.append(text)

    doc_info = {}
    for i, sent in enumerate(doc_sent):
        token_info_all = {}
        for j, token in enumerate(nlp(sent)):
            # The tag is a '-'-separated hierarchy; keep its first three
            # levels and pad missing levels with 'N'.
            pos1, pos2, pos3 = (token.tag_.split('-') + ['N', 'N'])[:3]
            dist = abs(token.head.i - j)
            token_info_all[str(j)] = {
                'token_id': str(token.i),
                'token': str(token),
                'lemma': str(token.lemma_),
                'pos': str(token.pos_),
                'pos1': str(pos1),
                'pos2': str(pos2),
                'pos3': str(pos3),
                'dep': str(token.dep_),  # dependency relation label
                'head': str(token.head),  # head token
                'head.id': str(token.head.i),  # index of the head token
                'dist': str(dist),  # distance between the token and its head
                'children': [child.i for child in token.children],  # indices of the token's dependents
            }
        doc_info[str(i)] = {
            'sent_id': str(i),
            'sent': sent,
            'token_info': token_info_all,
        }

    with open(output_file, 'w') as g:
        print(json.dumps(doc_info, ensure_ascii=False, indent=2), file=g)

In [117]:
# 分析用ファイルの作成
import os
mt_folder = './MultiEnJa/MT-PE/en-ja.mt'
ht_folder = './MultiEnJa/human-translation/en-ja.final'
mt_out_folder = './MultiEnJa/mt-info'
ht_out_folder = './MultiEnJa/ht-info'

# Run analysis() on every file of the machine-translation folder, then of the
# human-translation folder, writing <document id>.json into the matching
# output folder.
for src_folder, out_folder in ((mt_folder, mt_out_folder), (ht_folder, ht_out_folder)):
    # open(..., 'w') does not create missing directories, so create them here.
    os.makedirs(out_folder, exist_ok=True)
    for file in os.listdir(src_folder):
        doc_id = os.path.splitext(file)[0]
        analysis(os.path.join(src_folder, file),
                 os.path.join(out_folder, doc_id + '.json'))

### 基礎統計量に関する分析の整理

In [118]:
# 語に関する基礎統計量、品詞情報(頻度)の格納
import json
from collections import defaultdict

def analysis_2(output_file):
    """Compute basic vocabulary statistics and POS frequencies for one document.

    Args:
        output_file: Path of a JSON file in the format written by
            ``analysis`` (sentence id -> ``{'token_info': {token id -> info}}``),
            e.g. ``./MultiEnJa/mt-info/00000088-B-2-X-9.json``.

    Returns:
        A tuple ``(types, tokens, ttr, part_of_speech)`` where ``types`` is the
        number of distinct surface forms, ``tokens`` the total number of
        tokens, ``ttr`` the type/token ratio rounded to three decimals (``0.0``
        for a document without tokens) and ``part_of_speech`` a nested dict
        ``pos1 -> pos2 -> pos3 -> frequency``.
    """
    word_freq = defaultdict(int)
    part_of_speech = {}
    with open(output_file, 'r') as f:
        mt_info = json.load(f)

    for sent_info in mt_info.values():
        for token_info in sent_info['token_info'].values():
            word_freq[token_info['token']] += 1
            # Walk/create the pos1 -> pos2 levels, then count at the pos3 level.
            pos3_counts = (part_of_speech
                           .setdefault(token_info['pos1'], {})
                           .setdefault(token_info['pos2'], {}))
            pos3 = token_info['pos3']
            pos3_counts[pos3] = pos3_counts.get(pos3, 0) + 1

    n_types = len(word_freq)
    n_tokens = sum(word_freq.values())
    # An empty document has no tokens; report a TTR of 0.0 instead of dividing by zero.
    ttr = round(n_types / n_tokens, 3) if n_tokens else 0.0
    return n_types, n_tokens, ttr, part_of_speech

In [None]:
# 品詞情報(頻度)の記録(1文書ずつ)
import os
import json
mt_pos_folder = './MultiEnJa/mt-pos-info'
ht_pos_folder = './MultiEnJa/ht-pos-info'
mt_out_folder = './MultiEnJa/mt-info'
ht_out_folder = './MultiEnJa/ht-info'

# For every per-document JSON (machine translation first, then human
# translation), store the nested POS frequency dict returned by analysis_2()
# as <document id>.json in the matching POS folder.
for info_folder, pos_folder in ((mt_out_folder, mt_pos_folder), (ht_out_folder, ht_pos_folder)):
    # open(..., 'w') does not create missing directories, so create them here.
    os.makedirs(pos_folder, exist_ok=True)
    for file in os.listdir(info_folder):
        doc_id = os.path.splitext(file)[0]
        _, _, _, pos = analysis_2(os.path.join(info_folder, file))
        with open(os.path.join(pos_folder, doc_id + '.json'), 'w') as f:
            print(json.dumps(pos, ensure_ascii=False, indent=1), file=f)

In [120]:
# ヘッダーのみ作成
import csv
def add_header(file):
    """Create (or truncate) the TSV at *file* and write only the header row.

    Args:
        file: Path of the tab-separated file to create.
    """
    header = ['doc_id', 'type', 'token', 'TTR']
    # newline='' lets the csv module control line endings itself; without it
    # extra carriage returns are written on platforms that translate '\n'.
    with open(file, 'w', newline='') as f:
        csv.writer(f, delimiter="\t").writerow(header)

In [121]:
# Create the six per-domain statistics files, each containing only the header row.
# Domain A: finance / economics / legal, B: medicine / pharmaceuticals,
# C: industry / science and technology.
domain_A_mt, domain_B_mt, domain_C_mt = (
    './MultiEnJa/domain_A_mt.tsv',
    './MultiEnJa/domain_B_mt.tsv',
    './MultiEnJa/domain_C_mt.tsv',
)
domain_A_ht, domain_B_ht, domain_C_ht = (
    './MultiEnJa/domain_A_ht.tsv',
    './MultiEnJa/domain_B_ht.tsv',
    './MultiEnJa/domain_C_ht.tsv',
)

files = [domain_A_mt, domain_B_mt, domain_C_mt, domain_A_ht, domain_B_ht, domain_C_ht]
for tsv_path in files:
    add_header(tsv_path)

In [122]:
# 異なり語数、延べ語数、TTRの記録
import os
import csv  # imported here so this cell does not depend on another cell's import

mt_out_folder = './MultiEnJa/mt-info'
ht_out_folder = './MultiEnJa/ht-info'
# Domain A: finance / economics / legal, B: medicine / pharmaceuticals,
# C: industry / science and technology.
domain_A_mt = './MultiEnJa/domain_A_mt.tsv'
domain_B_mt = './MultiEnJa/domain_B_mt.tsv'
domain_C_mt = './MultiEnJa/domain_C_mt.tsv'
domain_A_ht = './MultiEnJa/domain_A_ht.tsv'
domain_B_ht = './MultiEnJa/domain_B_ht.tsv'
domain_C_ht = './MultiEnJa/domain_C_ht.tsv'

# Append one row (doc_id, types, tokens, TTR) per document to the TSV of the
# document's domain: machine-translation documents first, then human ones.
for info_folder, tsv_a, tsv_b, tsv_c in (
    (mt_out_folder, domain_A_mt, domain_B_mt, domain_C_mt),
    (ht_out_folder, domain_A_ht, domain_B_ht, domain_C_ht),
):
    for file in os.listdir(info_folder):
        doc_id = os.path.splitext(file)[0]
        # analysis_2 returns (types, tokens, TTR, POS info) in that order.
        n_types, n_tokens, ttr, _ = analysis_2(os.path.join(info_folder, file))
        # The domain is recognised by the letter contained in the document id;
        # ids containing neither 'A' nor 'B' are recorded under domain C.
        if 'A' in doc_id:
            target = tsv_a
        elif 'B' in doc_id:
            target = tsv_b
        else:
            target = tsv_c
        # newline='' lets the csv module control line endings itself.
        with open(target, 'a', newline='') as f:
            csv.writer(f, delimiter="\t").writerow([doc_id, n_types, n_tokens, ttr])

In [154]:
# 分野ごとに品詞情報をまとめる
import os
import json
import csv
from collections import defaultdict
mt_pos_folder = './MultiEnJa/mt-pos-info'
ht_pos_folder = './MultiEnJa/ht-pos-info'
# Domain A: finance / economics / legal, B: medicine / pharmaceuticals,
# C: industry / science and technology.
domain_A_mt = './MultiEnJa/domain_A_mt_pos.tsv'
domain_B_mt = './MultiEnJa/domain_B_mt_pos.tsv'
domain_C_mt = './MultiEnJa/domain_C_mt_pos.tsv'
domain_A_ht = './MultiEnJa/domain_A_ht_pos.tsv'
domain_B_ht = './MultiEnJa/domain_B_ht_pos.tsv'
domain_C_ht = './MultiEnJa/domain_C_ht_pos.tsv'
header = ['pos1', 'pos2', 'pos3', 'freq']


def _aggregate_pos_by_domain(pos_folder):
    """Sum the per-document POS frequencies of *pos_folder* for each domain.

    Returns a dict ``{'A': counts, 'B': counts, 'C': counts}`` whose values map
    ``(pos1, pos2, pos3)`` to the total frequency. The domain is the first of
    'A', 'B', 'C' contained in the file's base name; files matching none of
    them are skipped without being opened.
    """
    totals = {domain: defaultdict(int) for domain in 'ABC'}
    for file in os.listdir(pos_folder):
        doc_id = os.path.splitext(file)[0]
        domain = next((d for d in 'ABC' if d in doc_id), None)
        if domain is None:
            continue
        with open(os.path.join(pos_folder, file), 'r') as f:
            pos_info = json.load(f)
        for pos1, pos2_info in pos_info.items():
            for pos2, pos3_info in pos2_info.items():
                for pos3, freq in pos3_info.items():
                    totals[domain][(pos1, pos2, pos3)] += int(freq)
    return totals


def _write_pos_tsv(path, pos_totals):
    """Write the header plus one ``pos1, pos2, pos3, freq`` row per entry to *path*."""
    # UTF-8 because these files are read back with pandas.read_csv, which
    # decodes as UTF-8 unless told otherwise; newline='' lets the csv module
    # control line endings itself.
    with open(path, 'w', encoding='utf-8', newline='') as g:
        writer = csv.writer(g, delimiter="\t")
        writer.writerow(header)
        writer.writerows([*pos_key, freq] for pos_key, freq in pos_totals.items())


for pos_folder, out_paths in (
    (mt_pos_folder, {'A': domain_A_mt, 'B': domain_B_mt, 'C': domain_C_mt}),
    (ht_pos_folder, {'A': domain_A_ht, 'B': domain_B_ht, 'C': domain_C_ht}),
):
    totals = _aggregate_pos_by_domain(pos_folder)
    for domain, path in out_paths.items():
        _write_pos_tsv(path, totals[domain])

In [155]:
# 品詞情報の格納(htとmtを結合)
import pandas as pd
# For each domain, join the machine-translation and human-translation POS
# tables on the three POS levels and store both frequencies side by side.
domain_list = ['domain_A', 'domain_B', 'domain_C']
for domain in domain_list:
    mt_path = './MultiEnJa/' + domain + '_mt_pos.tsv'
    ht_path = './MultiEnJa/' + domain + '_ht_pos.tsv'
    mt_ht_out = './MultiEnJa/' + domain + '_all_pos.tsv'
    mt_df = pd.read_csv(mt_path, sep="\t")
    ht_df = pd.read_csv(ht_path, sep="\t")

    # Outer join keeps POS combinations that occur on one side only; their
    # missing frequency becomes 0 before the columns are cast back to int.
    merged_df = (
        mt_df.merge(ht_df, on=["pos1", "pos2", "pos3"], how="outer", suffixes=("_1", "_2"))
        .fillna(0)
        .astype({"freq_1": int, "freq_2": int})
        .rename(columns={"freq_1": "mt_freq", "freq_2": "ht_freq"})
    )
    merged_df.to_csv(mt_ht_out, sep="\t", index=False)


In [156]:
# Sum the merged MT/HT POS frequencies of the three domains into one table
# covering all documents.

file1 = './MultiEnJa/domain_A_all_pos.tsv'
file2 = './MultiEnJa/domain_B_all_pos.tsv'
file3 = './MultiEnJa/domain_C_all_pos.tsv'

df1 = pd.read_csv(file1, sep="\t")
df2 = pd.read_csv(file2, sep="\t")
df3 = pd.read_csv(file3, sep="\t")

all_data = pd.concat([df1, df2, df3])
merged_df = all_data.groupby(["pos1", "pos2", "pos3"], as_index=False).sum()
# Written next to the per-domain tables: the visualisation cell reads
# './MultiEnJa/all_pos.tsv', not a file in the working directory.
merged_df.to_csv('./MultiEnJa/all_pos.tsv', sep="\t", index=False)

## 情報可視化

### 情報可視化: 品詞情報

In [129]:
import pandas as pd
import altair as alt
import itertools
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Turn off Altair's limit on the number of data rows per chart.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [130]:
# Fonts used by the charts: label_font for legend labels, caption_font for
# axis labels/titles, chart titles and legend titles.
label_font = "IBM Plex Mono"
caption_font = "Noto Sans JP"

In [159]:
# Alternative single-document inputs, kept for reference:
#file_A = './MultiEnJa/compare_mt_ht/00000046-A-4-X-8_pos.tsv'
#file_B = './MultiEnJa/compare_mt_ht/00000140-B-5-X-8_pos.tsv'
#file_C = './MultiEnJa/compare_mt_ht/00000189-C-3-X-8_pos.tsv'
file_A = './MultiEnJa/domain_A_all_pos.tsv'
file_B = './MultiEnJa/domain_B_all_pos.tsv'
file_C = './MultiEnJa/domain_C_all_pos.tsv'
file_all = './MultiEnJa/all_pos.tsv'

# Load the per-domain tables and the all-document table.
df_A, df_B, df_C, df_all = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (file_A, file_B, file_C, file_all)
)

# Show the whole domain-A table without truncation.
print(df_A.to_string())

    pos1   pos2     pos3  mt_freq  ht_freq
0    代名詞      N        N       29       38
1     副詞      N        N       23       29
2    助動詞      N        N      412      418
3     助詞    係助詞        N      236      252
4     助詞    副助詞        N       43       54
5     助詞   接続助詞        N      153      156
6     助詞    格助詞        N      967     1092
7     助詞   準体助詞        N        9        6
8     助詞    終助詞        N        1        0
9     動詞     一般        N      172      242
10    動詞  非自立可能        N      415      441
11    名詞   固有名詞       一般       22       15
12    名詞   固有名詞       人名       51       33
13    名詞   固有名詞       地名       65       75
14    名詞     数詞        N      319      306
15    名詞   普通名詞     サ変可能      540      646
16    名詞   普通名詞  サ変形状詞可能        5        8
17    名詞   普通名詞       一般     1159     1317
18    名詞   普通名詞     副詞可能      152      136
19    名詞   普通名詞    助数詞可能      178      208
20    名詞   普通名詞    形状詞可能       65       55
21   形容詞     一般        N       17       17
22   形容詞  非

In [160]:
def add_pos_column(row):
    """Build a hyphen-joined POS label from a row's pos1/pos2/pos3 values.

    'N' marks a missing level: the label is pos1 alone when pos2 is 'N',
    'pos1-pos2' when only pos3 is 'N', and 'pos1-pos2-pos3' otherwise.
    """
    if row["pos2"] == "N":
        return row["pos1"]
    two_levels = f"{row['pos1']}-{row['pos2']}"
    if row["pos3"] == "N":
        return two_levels
    return f"{two_levels}-{row['pos3']}"

In [161]:
# Reshape each POS table to long format for plotting.
def _to_long_format(pos_table):
    """Return *pos_table* with one row per (POS, translation type).

    The mt_freq/ht_freq columns become a 'category' column plus a 'freq'
    column, a combined 'pos' label is added with add_pos_column, and the
    category values are replaced by their Japanese display names.
    """
    long_df = pd.melt(
        pos_table,
        id_vars=["pos1", "pos2", "pos3"],
        value_vars=["mt_freq", "ht_freq"],
        var_name="category",
        value_name="freq",
    )
    long_df['pos'] = long_df.apply(add_pos_column, axis=1)
    long_df['category'] = long_df['category'].replace({'mt_freq': '機械翻訳', 'ht_freq': '人手翻訳'})
    return long_df


df_A_melted = _to_long_format(df_A)
df_B_melted = _to_long_format(df_B)
df_C_melted = _to_long_format(df_C)
df_all_melted = _to_long_format(df_all)

# Subsets restricted to nouns (pos1 == '名詞').
is_noun = lambda long_df: long_df['pos1'] == '名詞'
df_A_noun = df_A_melted[is_noun(df_A_melted)]
df_B_noun = df_B_melted[is_noun(df_B_melted)]
df_C_noun = df_C_melted[is_noun(df_C_melted)]
df_all_noun = df_all_melted[is_noun(df_all_melted)]
# Subset restricted to particles (disabled):
#df_A_particle = df_A_melted[df_A_melted['pos1'] == '助詞']

In [162]:
# Domain A: bar chart of frequency per top-level POS, coloured by the detailed
# POS label and split into one column per translation type.
bars = alt.Chart(df_A_melted).mark_bar().encode(
    x=alt.X("pos1:N", sort="-y", title="品詞"),
    y=alt.Y("freq:Q", title="頻度"),
    tooltip=alt.Tooltip(["pos"]),
    color=alt.Color("pos:N", title='品詞細分類', scale=alt.Scale(scheme="set2")),
    column=alt.Column("category:N"),
    # NOTE(review): this size channel names no field, only a legend layout;
    # presumably meant to position a legend - confirm it has any effect.
    size=alt.Size(legend=alt.Legend(orient='bottom', titleOrient='left')),
)
chart = (
    bars.properties(title="品詞(第一細分類)ごとの頻度 (金融・経済・法務)")
    .configure_axis(labelFont=caption_font, titleFont=caption_font,
                    labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=16, font=caption_font)
    .configure_legend(labelFont=label_font, labelFontSize=14,
                      titleFont=caption_font, titleFontSize=16)
)
# Export is disabled for this chart:
#chart.save("./chart/grouped_bar_chart_all_pos_A.pdf", scale_factor=4)
chart.show()

In [163]:
# Domain A (finance / economics / legal): frequency per top-level POS, with
# machine and human translation as side-by-side bars.
bars = alt.Chart(df_A_melted).mark_bar().encode(
    x=alt.X("pos1:N", sort="-y", title="品詞"),
    y=alt.Y("freq:Q", title="頻度"),
    tooltip=alt.Tooltip(["freq"]),
    color=alt.Color("category:N", title='翻訳の種別', scale=alt.Scale(scheme="accent")),
    xOffset=alt.XOffset("category:N"),
)
chart = (
    bars.properties(title="品詞(大分類)ごとの頻度 (金融・経済・法務)")
    .configure_axis(labelFont=caption_font, titleFont=caption_font,
                    labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=16, font=caption_font)
    .configure_legend(labelFont=label_font, labelFontSize=14,
                      titleFont=caption_font, titleFontSize=16)
)
chart.save("./chart/grouped_bar_chart_all_pos_A.pdf", scale_factor=4)
chart.show()

In [164]:
# Domain B (medicine / pharmaceuticals): frequency per top-level POS, with
# machine and human translation as side-by-side bars.
bars = alt.Chart(df_B_melted).mark_bar().encode(
    x=alt.X("pos1:N", sort="-y", title="品詞"),
    y=alt.Y("freq:Q", title="頻度"),
    tooltip=alt.Tooltip(["freq"]),
    color=alt.Color("category:N", title='翻訳の種別', scale=alt.Scale(scheme="accent")),
    xOffset=alt.XOffset("category:N"),
)
chart = (
    bars.properties(title="品詞(大分類)ごとの頻度 (医学・医薬)")
    .configure_axis(labelFont=caption_font, titleFont=caption_font,
                    labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=16, font=caption_font)
    .configure_legend(labelFont=label_font, labelFontSize=14,
                      titleFont=caption_font, titleFontSize=16)
)
chart.save("./chart/grouped_bar_chart_all_pos_B.pdf", scale_factor=4)
chart.show()

In [165]:
# Domain C (industry / science and technology): frequency per top-level POS,
# with machine and human translation as side-by-side bars.
bars = alt.Chart(df_C_melted).mark_bar().encode(
    x=alt.X("pos1:N", sort="-y", title="品詞"),
    y=alt.Y("freq:Q", title="頻度"),
    tooltip=alt.Tooltip(["freq"]),
    color=alt.Color("category:N", title='翻訳の種別', scale=alt.Scale(scheme="accent")),
    xOffset=alt.XOffset("category:N"),
)
chart = (
    bars.properties(title="品詞(大分類)ごとの頻度 (工業・科学技術)")
    .configure_axis(labelFont=caption_font, titleFont=caption_font,
                    labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=16, font=caption_font)
    .configure_legend(labelFont=label_font, labelFontSize=14,
                      titleFont=caption_font, titleFontSize=16)
)
chart.save("./chart/grouped_bar_chart_all_pos_C.pdf", scale_factor=4)
chart.show()

In [166]:
# All documents: frequency per top-level POS, with machine and human
# translation as side-by-side bars.
bars = alt.Chart(df_all_melted).mark_bar().encode(
    x=alt.X("pos1:N", sort="-y", title="品詞"),
    y=alt.Y("freq:Q", title="頻度"),
    tooltip=alt.Tooltip(["freq"]),
    color=alt.Color("category:N", title='翻訳の種別', scale=alt.Scale(scheme="accent")),
    xOffset=alt.XOffset("category:N"),
)
chart = (
    bars.properties(title="品詞(大分類)ごとの頻度 (全文書)")
    .configure_axis(labelFont=caption_font, titleFont=caption_font,
                    labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=16, font=caption_font)
    .configure_legend(labelFont=label_font, labelFontSize=14,
                      titleFont=caption_font, titleFontSize=16)
)
chart.save("./chart/grouped_bar_chart_all_pos_all.pdf", scale_factor=4)
chart.show()

In [167]:
# Domain A (finance / economics / legal), nouns only: frequency per detailed
# POS label, with machine and human translation as side-by-side bars.
bars = alt.Chart(df_A_noun).mark_bar().encode(
    x=alt.X("pos:N", sort="-y", title="品詞"),
    y=alt.Y("freq:Q", title="頻度"),
    tooltip=alt.Tooltip(["freq"]),
    color=alt.Color("category:N", title='翻訳の種別', scale=alt.Scale(scheme="accent")),
    xOffset=alt.XOffset("category:N"),
)
chart = (
    bars.properties(title="名詞の品詞小分類ごとの頻度 (金融・経済・法務)")
    .configure_axis(labelFont=caption_font, titleFont=caption_font,
                    labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=16, font=caption_font)
    .configure_legend(labelFont=label_font, labelFontSize=14,
                      titleFont=caption_font, titleFontSize=16)
)
chart.save("./chart/grouped_bar_chart_noun_A.pdf", scale_factor=4)
chart.show()

In [168]:
# Domain B (medicine / pharmaceuticals), nouns only: frequency per detailed
# POS label, with machine and human translation as side-by-side bars.
bars = alt.Chart(df_B_noun).mark_bar().encode(
    x=alt.X("pos:N", sort="-y", title="品詞"),
    y=alt.Y("freq:Q", title="頻度"),
    tooltip=alt.Tooltip(["freq"]),
    color=alt.Color("category:N", title='翻訳の種別', scale=alt.Scale(scheme="accent")),
    xOffset=alt.XOffset("category:N"),
)
chart = (
    bars.properties(title="名詞の品詞小分類ごとの頻度 (医学・医薬)")
    .configure_axis(labelFont=caption_font, titleFont=caption_font,
                    labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=16, font=caption_font)
    .configure_legend(labelFont=label_font, labelFontSize=14,
                      titleFont=caption_font, titleFontSize=16)
)
chart.save("./chart/grouped_bar_chart_noun_B.pdf", scale_factor=4)
chart.show()

In [169]:
# Domain C (industry / science and technology), nouns only: frequency per
# detailed POS label, with machine and human translation as side-by-side bars.
bars = alt.Chart(df_C_noun).mark_bar().encode(
    x=alt.X("pos:N", sort="-y", title="品詞"),
    y=alt.Y("freq:Q", title="頻度"),
    tooltip=alt.Tooltip(["freq"]),
    color=alt.Color("category:N", title='翻訳の種別', scale=alt.Scale(scheme="accent")),
    xOffset=alt.XOffset("category:N"),
)
chart = (
    bars.properties(title="名詞の品詞小分類ごとの頻度 (工業・科学技術)")
    .configure_axis(labelFont=caption_font, titleFont=caption_font,
                    labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=16, font=caption_font)
    .configure_legend(labelFont=label_font, labelFontSize=14,
                      titleFont=caption_font, titleFontSize=16)
)
chart.save("./chart/grouped_bar_chart_noun_C.pdf", scale_factor=4)
chart.show()

In [170]:
# All documents, nouns only: frequency per detailed POS label, with machine
# and human translation as side-by-side bars.
bars = alt.Chart(df_all_noun).mark_bar().encode(
    x=alt.X("pos:N", sort="-y", title="品詞"),
    y=alt.Y("freq:Q", title="頻度"),
    tooltip=alt.Tooltip(["freq"]),
    color=alt.Color("category:N", title='翻訳の種別', scale=alt.Scale(scheme="accent")),
    xOffset=alt.XOffset("category:N"),
)
chart = (
    bars.properties(title="名詞の品詞小分類ごとの頻度 (全文書)")
    .configure_axis(labelFont=caption_font, titleFont=caption_font,
                    labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=16, font=caption_font)
    .configure_legend(labelFont=label_font, labelFontSize=14,
                      titleFont=caption_font, titleFontSize=16)
)
chart.save("./chart/grouped_bar_chart_noun_all.pdf", scale_factor=4)
chart.show()