#!/usr/bin/env python
#
# Copyright  2014 Nickolay V. Shmyrev
#            2016 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
#
# Source: egs/tedlium/s5_r3/local/join_suffix.py, added in commit
# f1f9a485bf6b ("[egs] Add missing file local/join_suffix.py in TEDLIUM
# s5_r3", #2741).

"""Join split-off apostrophe suffixes back onto the preceding word.

Reads transcript lines from stdin and writes the rewritten lines to stdout.
The first whitespace-separated field of each line is passed through
untouched; in the remaining fields, any token that starts with an apostrophe
is glued onto the token before it, e.g. "you 're" -> "you're".

The TEDLIUM transcripts are normalized in a way that's not traditional for
speech recognition, which is why this fix-up is needed.
"""

import sys
# NOTE(review): `open` is not used anywhere in this script; the import is
# kept only so the module's namespace stays as it was.
from codecs import open


def join_suffixes(words):
    """Return a new list with apostrophe-initial tokens merged leftwards.

    A token is merged into its left neighbour when it starts with "'".
    Only pairs are merged: once a word has absorbed one suffix, scanning
    resumes after that suffix, so a second consecutive suffix token is
    emitted (or used as a host word) on its own.  A suffix-like token in
    the very first position has nothing to attach to and is kept as is.

    Args:
        words: list of non-empty word strings (e.g. the result of
            ``str.split()``).

    Returns:
        A list of strings; the input list is not modified.
    """
    joined = []
    i = 0
    n = len(words)
    while i < n:
        if i + 1 < n and words[i + 1].startswith("'"):
            # Consume both the word and its suffix so the suffix is not
            # emitted a second time on the next iteration.
            joined.append(words[i] + words[i + 1])
            i += 2
        else:
            joined.append(words[i])
            i += 1
    return joined


def process_line(line):
    """Rewrite one transcript line and return it without a trailing newline.

    The first field is copied verbatim and followed by a single space, then
    the suffix-joined remaining fields separated by single spaces.  A line
    holding only the first field therefore yields that field plus a trailing
    space, matching the script's historical output.  A blank or
    whitespace-only line yields an empty string instead of raising
    IndexError.

    Args:
        line: one input line, with or without its line terminator.

    Returns:
        The rewritten line as a string.
    """
    items = line.split()
    if not items:
        # Nothing to rewrite; keep output lines aligned with input lines.
        return ''
    return items[0] + ' ' + ' '.join(join_suffixes(items[1:]))


def main():
    """Filter stdin to stdout, rewriting one line at a time."""
    for line in sys.stdin:
        print(process_line(line))


if __name__ == '__main__':
    main()