# 概要
* transcription_jobの結果を評価する
* 2_kick_transcribeで起動したジョブが全て完了したころに実行すること

# 成果物
* 各ファイルごとの、正解の文字起こし結果と、transcribeの文字起こし結果と、WERのcsvファイル(ローカルに保存)

In [None]:
# Install the libraries used by this notebook.
# janome is installed for morphological analysis: mecab is troublesome to install, so janome is used because it is easy to install even though it is slower.
!pip install janome
!pip install python-Levenshtein

In [None]:
import pandas as pd
from janome.tokenizer import Tokenizer
import boto3,json,EvaluateSTT,Levenshtein

In [None]:
# Constants
# Name of the S3 bucket the transcription result JSON files are read from.
BUCKET_NAME = 'transcribe-output-bucket-202004161130'

In [None]:
# S3 client used to download the transcription results.
client = boto3.client('s3')
# janome tokenizer used to split text into morphemes before computing WER.
t = Tokenizer()

In [None]:
# Load the label table; later cells read its 'transcript_data' and 'file_path' columns.
label_df = pd.read_csv('./label.csv')
# Preview the first rows.
label_df.head()

In [None]:
# Fetch the transcription result written to S3 for every row of label_df and
# collect the transcript text in transcripts_result, in row order.
transcripts_result = []
for i, r in label_df.iterrows():
    # The second column of the row, plus ".json", is the S3 object key.
    # .iloc is used because the row is a Series labelled by column name, and
    # integer keys on such a Series are a deprecated positional fallback.
    body = client.get_object(Bucket=BUCKET_NAME, Key=r.iloc[1] + ".json")['Body'].read().decode('utf-8')
    # Remove the spaces from the transcript before storing it.
    transcript = json.loads(body)['results']['transcripts'][0]['transcript'].replace(' ', '')
    transcripts_result.append(transcript)
    # Progress indicator: print the row label once every 100 rows.
    if i % 100 == 99:
        print(i)

In [None]:
# Attach the transcripts fetched from S3 as a new column (one entry per row).
label_df['transcripts_result'] = transcripts_result

In [None]:
# Preview the first rows to confirm the transcripts_result column was added.
label_df.head()

In [None]:
# Checkpoint the labels together with the fetched transcripts to a local CSV.
label_df.to_csv('./eval.csv',index=False)

In [None]:
# Reload the checkpoint so the evaluation below can be rerun without fetching from S3 again.
label_df = pd.read_csv('./eval.csv')
# An empty transcript is written to the CSV as an empty field and read back as
# NaN (a float), which breaks the string operations in the later cells.
# Restore such entries to the empty string.
label_df['transcripts_result'] = label_df['transcripts_result'].fillna('')

In [None]:
# Reference texts (test_y) and Transcribe outputs (pred_y) as plain Python lists, in row order.
test_y = label_df['transcript_data'].tolist()
pred_y = label_df['transcripts_result'].tolist()

# WERの算出
* 形態素解析したあとWERを算出する


In [None]:
# Tokenize every reference/prediction pair with janome, keep the token lists,
# and score each pair with EvaluateSTT.
segment_test_list, segment_pred_list, eval_list = [], [], []
for reference, hypothesis in zip(test_y, pred_y):
    reference_tokens = [tok.surface for tok in t.tokenize(reference)]
    hypothesis_tokens = [tok.surface for tok in t.tokenize(hypothesis)]
    segment_test_list.append(reference_tokens)
    segment_pred_list.append(hypothesis_tokens)
    # The prediction is passed first and the reference second.
    eval_list.append(
        EvaluateSTT.output_result(
            EvaluateSTT.Levenshtein_distance(hypothesis_tokens, reference_tokens)
        )
    )

In [None]:
# Store the token lists and the raw evaluation results next to the texts they were computed from.
for column_name, column_values in (
    ('transcript_data_segment', segment_test_list),
    ('transcript_result_segment', segment_pred_list),
    ('eval', eval_list),
):
    label_df[column_name] = column_values

In [None]:
# Inspect the raw evaluation results before the WER value is parsed out of them.
label_df['eval'].head()

In [None]:
# The first element of each evaluation result is a string whose numeric part
# starts at offset 5 (presumably after a short "WER"-style prefix; TODO confirm
# against EvaluateSTT.output_result).
wer_values = label_df['eval'].map(lambda result: float(result[0][5:]))
label_df['WER'] = wer_values

In [None]:
# Character-level edit distance between the third and fourth columns of each row
# (presumably transcript_data and transcripts_result; TODO confirm against label.csv).
# .iloc is used for positional access: each row is a Series labelled by column
# name, and integer keys on it are a deprecated positional fallback.
label_df['levenshtein_distance'] = label_df.apply(
    lambda x: Levenshtein.distance(x.iloc[2], x.iloc[3]),
    axis=1,
)

In [None]:
# Preview the table with the metric columns added so far.
label_df.head()

In [None]:
# Number of characters in each reference text.
label_df['string_length'] = label_df['transcript_data'].map(len)

# 句読点を除去したWERの算出

In [None]:
# Drop the Japanese full stop and comma from both the reference and the
# Transcribe output, then recompute the reference length and the edit distance.
label_df['transcript_data_remove_punctuation_marks'] = label_df['transcript_data'].apply(lambda x: x.replace('。', '').replace('、', ''))
label_df['transcript_result_remove_punctuation_marks'] = label_df['transcripts_result'].apply(lambda x: x.replace('。', '').replace('、', ''))
label_df['string_length_remove_punctuation_marks'] = label_df['transcript_data_remove_punctuation_marks'].apply(len)
# Columns are addressed by name rather than by integer position: each row is a
# Series labelled by column name, and integer keys on it are deprecated.
label_df['levenshtein_distance_remove_punctuation_marks'] = label_df[
    ['transcript_data_remove_punctuation_marks', 'transcript_result_remove_punctuation_marks']
].apply(
    lambda x: Levenshtein.distance(
        x['transcript_data_remove_punctuation_marks'],
        x['transcript_result_remove_punctuation_marks'],
    ),
    axis=1,
)
label_df.head()

In [None]:
# Character-level accuracy: one minus the edit distance divided by the reference length.
char_error_ratio = (
    label_df['levenshtein_distance_remove_punctuation_marks']
    / label_df['string_length_remove_punctuation_marks']
)
label_df['accuracy_remove_punctuation_marks'] = 1 - char_error_ratio

In [None]:
# Punctuation-free reference texts (test_y) and Transcribe outputs (pred_y) as plain lists.
test_y = label_df['transcript_data_remove_punctuation_marks'].tolist()
pred_y = label_df['transcript_result_remove_punctuation_marks'].tolist()

In [None]:
# Repeat the tokenize-and-score pass on the punctuation-free texts held in
# test_y / pred_y, rebuilding the three result lists from scratch.
segment_test_list, segment_pred_list, eval_list = [], [], []
for reference, hypothesis in zip(test_y, pred_y):
    reference_tokens = [tok.surface for tok in t.tokenize(reference)]
    hypothesis_tokens = [tok.surface for tok in t.tokenize(hypothesis)]
    segment_test_list.append(reference_tokens)
    segment_pred_list.append(hypothesis_tokens)
    # The prediction is passed first and the reference second.
    eval_list.append(
        EvaluateSTT.output_result(
            EvaluateSTT.Levenshtein_distance(hypothesis_tokens, reference_tokens)
        )
    )

In [None]:
# Store the punctuation-free token lists and their raw evaluation results as new columns.
for column_name, column_values in (
    ('transcript_data_segment_remove_punctuation_marks', segment_test_list),
    ('transcript_result_segment_remove_punctuation_marks', segment_pred_list),
    ('eval_remove_punctuation_marks', eval_list),
):
    label_df[column_name] = column_values

In [None]:
# The first element of each evaluation result is a string whose numeric part
# starts at offset 5 (presumably after a short "WER"-style prefix; TODO confirm
# against EvaluateSTT.output_result).
wer_values = label_df['eval_remove_punctuation_marks'].map(lambda result: float(result[0][5:]))
label_df['WER_remove_punctuation_marks'] = wer_values

In [None]:
# The voice type is the fourth '/'-separated component of file_path.
path_parts = label_df['file_path'].apply(lambda path: path.split('/'))
label_df['voice_type'] = path_parts.apply(lambda parts: parts[3])

In [None]:
# Save the final table with all metric columns, overwriting the earlier checkpoint file.
label_df.to_csv('./eval.csv',index=False)

In [None]:
# Average every numeric metric per voice type.
# numeric_only=True skips the text and token-list columns, which cannot be
# averaged and make mean() raise a TypeError on pandas 2.x.
label_df.groupby('voice_type').mean(numeric_only=True)