## 文書の選択・削除
分野ごとの分析ファイルの数を揃えるため、最もファイル数が少ない分野A(7ファイル)を基準として、分野B、分野Cのファイルをそれぞれランダムに7件選択し、選択されなかったファイルは削除する。

In [2]:
import os
import random

def select_files(directory, domain_id, file_ids, sample_size=7):
    """Keep a subset of one domain's files in ``directory`` and delete the rest.

    Files belong to the domain when ``domain_id`` occurs anywhere in their
    file name (plain substring match). Files of other domains are never
    touched.

    WARNING: the non-selected files of the domain are removed from disk with
    ``os.remove``; this cannot be undone.

    Args:
        directory: Path of the directory to filter in place.
        domain_id: Substring identifying the domain in a file name.
        file_ids: File names to keep. When ``None``, up to ``sample_size``
            names are drawn at random from the domain's files instead.
        sample_size: Maximum number of files to draw when ``file_ids`` is
            ``None``. Defaults to 7, the value previously hard-coded.

    Returns:
        The list of kept file names: the random sample, or ``file_ids``
        itself when it was given.
    """
    # Sort the candidates: os.listdir returns entries in arbitrary order, so
    # without sorting a seeded random.sample would not be reproducible.
    domain_files = sorted(
        name
        for name in os.listdir(directory)
        if domain_id in name and os.path.isfile(os.path.join(directory, name))
    )

    if file_ids is None:
        selected_files = random.sample(
            domain_files, min(sample_size, len(domain_files))
        )
    else:
        selected_files = file_ids

    # Set membership keeps the deletion pass linear in the number of files.
    keep = set(selected_files)
    for name in domain_files:
        if name not in keep:
            os.remove(os.path.join(directory, name))
    return selected_files


In [3]:
# Corpus directories: MT (presumably machine translation) output and the
# human-translation (HT) counterpart.
mt_dir = './MultiEnJa/MT-PE/en-ja.mt'
ht_dir = './MultiEnJa/human-translation/en-ja.final'
# Substrings passed to select_files as domain_id to pick each domain's files.
domain_B = 'B'
domain_C = 'C'

# NOTE: select_files deletes the non-selected files of the domain from disk.
# For each domain, the MT directory is sampled first (file_ids=None); the
# sampled names are then passed to the HT call so the same file names are
# kept on the HT side.
mt_domain_ids_B = select_files(mt_dir, domain_B, None)
ht_domain_ids_B = select_files(ht_dir, domain_B, mt_domain_ids_B)
mt_domain_ids_C = select_files(mt_dir, domain_C, None)
ht_domain_ids_C = select_files(ht_dir, domain_C, mt_domain_ids_C)

In [4]:
# Show the file names kept for domain B and domain C, one list per line.
print(mt_domain_ids_B, mt_domain_ids_C, sep='\n')

['00000088-B-2-X-9.txt', '00000140-B-5-X-8.txt', '00000109-B-3-X-9.txt', '00000117-B-4-X-8.txt', '00000407-B-5-X-11.txt', '00000450-B-5-X-13.txt', '00000069-B-1-X-8.txt']
['00000164-C-2-X-8.txt', '00000340-C-7-X-18.txt', '00000366-C-8-X-13.txt', '00000500-C-10-X-13.txt', '00000373-C-8-X-19.txt', '00000198-C-4-X-14.txt', '00000474-C-4-X-13.txt']


## 基礎統計量の算出
analysis.ipynbで作成済みのファイルを利用

In [5]:
# 文書数、延べ語数、異なり語数
import pandas as pd
def count(file):
    """Summarize a tab-separated statistics file.

    Args:
        file: Path to a TSV file with at least ``token`` and ``type`` columns.

    Returns:
        A tuple ``(row_count, token_sum, type_sum)``: the number of data rows
        (used by the caller as the document count), the sum of the ``token``
        column, and the sum of the ``type`` column.
    """
    table = pd.read_csv(file, sep='\t')
    return len(table), table['token'].sum(), table['type'].sum()

In [6]:
# Per-domain statistics tables (TSV), MT side then HT side.
A_mt = './MultiEnJa/domain_A_mt.tsv'
B_mt = './MultiEnJa/domain_B_mt.tsv'
C_mt = './MultiEnJa/domain_C_mt.tsv'
A_ht = './MultiEnJa/domain_A_ht.tsv'
B_ht = './MultiEnJa/domain_B_ht.tsv'
C_ht = './MultiEnJa/domain_C_ht.tsv'

# Print row count, token total and type total for every table, in the order
# A/B/C for MT followed by A/B/C for HT.
for _label, _tsv in (
    ('domain A MT: ', A_mt),
    ('domain B MT: ', B_mt),
    ('domain C MT: ', C_mt),
    ('domain A HT: ', A_ht),
    ('domain B HT: ', B_ht),
    ('domain C HT: ', C_ht),
):
    doc_num, token_sum, type_sum = count(_tsv)
    print(_label, doc_num, token_sum, type_sum)

domain A MT:  7 6070 1979
domain B MT:  7 8833 2583
domain C MT:  7 8408 2317
domain A HT:  7 6633 2130
domain B HT:  7 9157 2615
domain C HT:  7 8627 2335


In [7]:
# 文数
import spacy
import os

def count_sent(folder):
    """Count sentences per domain over every file directly inside ``folder``.

    Each line of each file is stripped, parsed with the ``ja_ginza`` spaCy
    pipeline, and the sentences it yields are counted. A file is assigned to
    domain A if its base name (without extension) contains ``'A'``, otherwise
    to domain B if it contains ``'B'``, and otherwise to domain C.

    Args:
        folder: Path of the directory holding the text files.

    Returns:
        A tuple ``(sent_num_A, sent_num_B, sent_num_C)`` of sentence counts.
    """
    # Only regular files are read: sub-directories (e.g. Jupyter's
    # ``.ipynb_checkpoints``) would make ``open`` fail.
    paths = [
        os.path.join(folder, name)
        for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    ]
    totals = {'A': 0, 'B': 0, 'C': 0}
    if not paths:
        # Nothing to parse, so skip the model load.
        return totals['A'], totals['B'], totals['C']

    # Load the pipeline once; the model does not change between files.
    nlp = spacy.load("ja_ginza")

    for path in paths:
        idx = os.path.splitext(os.path.basename(path))[0]
        # The domain depends only on the file name, so decide it once per file.
        if 'A' in idx:
            domain = 'A'
        elif 'B' in idx:
            domain = 'B'
        else:
            domain = 'C'
        # NOTE(review): the file is opened with the platform default encoding,
        # as before; confirm the corpus encoding and pass it explicitly if the
        # notebook must run on systems with a different default.
        with open(path, 'r') as f:
            for line in f:
                totals[domain] += sum(1 for _ in nlp(line.strip()).sents)
    return totals['A'], totals['B'], totals['C']

In [8]:
# Folders holding the MT files and the human-translation (HT) files.
mt_folder = './MultiEnJa/MT-PE/en-ja.mt'
ht_folder = './MultiEnJa/human-translation/en-ja.final'

# Sentence counts per domain (A, B, C): MT folder first, then HT folder.
(sent_num_A_mt, sent_num_B_mt, sent_num_C_mt) = count_sent(mt_folder)
(sent_num_A_ht, sent_num_B_ht, sent_num_C_ht) = count_sent(ht_folder)

# One line per folder, counts separated by single spaces.
print(f"{sent_num_A_mt} {sent_num_B_mt} {sent_num_C_mt}")
print(f"{sent_num_A_ht} {sent_num_B_ht} {sent_num_C_ht}")

247 372 453
252 376 484
