# 基礎統計量の算出
analysis.ipynbで作成済みのファイルを利用

In [25]:
# 文書数、延べ語数、異なり語数
import pandas as pd
def count(file):
    """Return basic corpus statistics read from a tab-separated file.

    The file must provide ``token`` and ``type`` columns; every row is
    counted as one document.

    Returns:
        A tuple ``(row_count, token_sum, type_sum)``: the number of rows,
        the sum of the ``token`` column (running word count) and the sum of
        the ``type`` column (the per-row distinct-word counts added up).
    """
    table = pd.read_csv(file, sep='\t')
    token_total, type_total = (
        table[column].sum() for column in ('token', 'type')
    )
    # One row per document, so the row count is the document count.
    return len(table), token_total, type_total

In [26]:
# Per-domain statistics TSVs for the MT and HT sets.
A_mt = './MultiEnJa/domain_A_mt.tsv'
B_mt = './MultiEnJa/domain_B_mt.tsv'
C_mt = './MultiEnJa/domain_C_mt.tsv'
A_ht = './MultiEnJa/domain_A_ht.tsv'
B_ht = './MultiEnJa/domain_B_ht.tsv'
C_ht = './MultiEnJa/domain_C_ht.tsv'

# Print row count, token sum and type sum for each file, MT first, then HT.
for label, tsv_path in (
    ('domain A MT: ', A_mt),
    ('domain B MT: ', B_mt),
    ('domain C MT: ', C_mt),
    ('domain A HT: ', A_ht),
    ('domain B HT: ', B_ht),
    ('domain C HT: ', C_ht),
):
    doc_num, token_sum, type_sum = count(tsv_path)
    print(label, doc_num, token_sum, type_sum)

domain A MT:  7 6070 1979
domain B MT:  13 18444 5021
domain C MT:  26 37291 9279
domain A HT:  7 6633 2130
domain B HT:  13 18866 4955
domain C HT:  26 38681 9287


In [21]:
# 文数
import spacy
import os

def count_sent(folder, nlp=None):
    """Count sentences per domain (A, B, C) over every file in ``folder``.

    Every line of every file is stripped and passed to the pipeline; the
    sentences it yields are tallied under a domain picked from the file
    name without its extension: names containing ``'A'`` count towards A,
    otherwise names containing ``'B'`` count towards B, and every other
    file counts towards C.

    Args:
        folder: Directory whose entries are all read as text files.
        nlp: Optional callable returning an object with a ``sents``
            iterable. When omitted, the "ja_ginza" model is loaded with
            ``spacy.load``.

    Returns:
        A tuple ``(sent_num_A, sent_num_B, sent_num_C)``.
    """
    # Load the model once: it does not depend on the file being read, and
    # reloading it for every file only adds start-up cost.
    if nlp is None:
        nlp = spacy.load("ja_ginza")

    sent_num_A = 0
    sent_num_B = 0
    sent_num_C = 0
    for file in os.listdir(folder):
        path = os.path.join(folder, file)
        idx = os.path.splitext(file)[0]
        # NOTE(review): the file is decoded with the platform default
        # encoding -- confirm the corpus encoding and pass it explicitly.
        with open(path, 'r') as f:
            sent_count = sum(
                1 for line in f for _ in nlp(line.strip()).sents
            )
        # The domain depends only on the file name, so decide it per file.
        if 'A' in idx:
            sent_num_A += sent_count
        elif 'B' in idx:
            sent_num_B += sent_count
        else:
            sent_num_C += sent_count
    return sent_num_A, sent_num_B, sent_num_C

In [22]:
# Source folders holding the MT and HT documents.
mt_folder = './MultiEnJa/MT-PE/en-ja.mt'
ht_folder = './MultiEnJa/human-translation/en-ja.final'

# Count sentences per domain in each folder, then print one line per folder.
mt_counts = count_sent(mt_folder)
ht_counts = count_sent(ht_folder)
sent_num_A_mt, sent_num_B_mt, sent_num_C_mt = mt_counts
sent_num_A_ht, sent_num_B_ht, sent_num_C_ht = ht_counts
print(*mt_counts)
print(*ht_counts)

247 863 2018
252 863 2067


In [23]:
# 考察用のファイルの作成
import spacy
import re
import json
import csv

def analysis(input_file, output_file, nlp=None):
    """Write a token-level part-of-speech table for ``input_file``.

    Each stripped input line is split into sentences by the pipeline. URLs
    and e-mail addresses in every sentence are replaced by the placeholders
    ``URL`` and ``MAIL``, and the masked sentences are run through the
    pipeline again to obtain tokens. One row per token is written to
    ``output_file`` as tab-separated values under the header
    ``token, pos1, pos2, pos3``.

    The POS columns come from splitting ``token.tag_`` on ``'-'``: missing
    levels are filled with ``'N'`` and levels beyond the third are dropped.

    Args:
        input_file: Path of the text file to analyse.
        output_file: Path of the TSV file to write (opened in ``'w'`` mode).
        nlp: Optional pipeline callable to reuse across calls. When
            omitted, the "ja_ginza" model is loaded with ``spacy.load``
            for this call.
    """
    if nlp is None:
        nlp = spacy.load("ja_ginza")

    url = re.compile(r"https?://\S+")  # URL
    email = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")  # e-mail address

    # NOTE(review): both files use the platform default encoding -- confirm
    # the corpus encoding and pass it explicitly.
    doc_sent = []
    with open(input_file, 'r') as f:
        for line in f:
            for sent in nlp(line.strip()).sents:
                masked = url.sub("URL", str(sent))
                doc_sent.append(email.sub("MAIL", masked))

    # newline='' lets the csv module control row endings itself; without it
    # rows end in "\r\r\n" on platforms that translate "\n" on write.
    with open(output_file, 'w', newline='') as g:
        writer = csv.writer(g, delimiter="\t")
        writer.writerow(['token', 'pos1', 'pos2', 'pos3'])
        for sent in doc_sent:
            for token in nlp(sent):
                levels = token.tag_.split('-')
                # Pad to three levels with 'N', then drop any extra levels.
                pos1, pos2, pos3 = (levels + ['N', 'N'])[:3]
                writer.writerow([str(token), pos1, pos2, pos3])

In [24]:
# Output directories for the per-document token/POS TSVs, one per domain
# (A, B, C) and per set (mt, ht).
domain_A_mt_morphs = './MultiEnJa/domain_A_mt_morphs'
domain_A_ht_morphs = './MultiEnJa/domain_A_ht_morphs'
domain_B_mt_morphs = './MultiEnJa/domain_B_mt_morphs'
domain_B_ht_morphs = './MultiEnJa/domain_B_ht_morphs'
domain_C_mt_morphs = './MultiEnJa/domain_C_mt_morphs'
domain_C_ht_morphs = './MultiEnJa/domain_C_ht_morphs'
# Input folders that hold the source documents.
mt_folder = './MultiEnJa/MT-PE/en-ja.mt'
ht_folder = './MultiEnJa/human-translation/en-ja.final'


def _analyze_folder(folder, morph_dirs):
    """Run ``analysis`` on every file in ``folder``, routing output by domain.

    ``morph_dirs`` maps the domain keys ``'A'``, ``'B'`` and ``'C'`` to
    output directories. A file whose name (without extension) contains
    ``'A'`` goes to domain A, otherwise one containing ``'B'`` goes to
    domain B, and every other file goes to domain C. The output file is
    named after the input file with a ``.tsv`` extension.
    """
    # Create the output directories up front so that opening the output
    # file for writing cannot fail on a missing directory.
    for out_dir in morph_dirs.values():
        os.makedirs(out_dir, exist_ok=True)

    for file in os.listdir(folder):
        idx = os.path.splitext(file)[0]
        if 'A' in idx:
            domain = 'A'
        elif 'B' in idx:
            domain = 'B'
        else:
            domain = 'C'
        analysis(folder + '/' + file, morph_dirs[domain] + '/' + idx + '.tsv')


_analyze_folder(
    mt_folder,
    {'A': domain_A_mt_morphs, 'B': domain_B_mt_morphs, 'C': domain_C_mt_morphs},
)
_analyze_folder(
    ht_folder,
    {'A': domain_A_ht_morphs, 'B': domain_B_ht_morphs, 'C': domain_C_ht_morphs},
)
