Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Match tabs as tokens

  • Loading branch information...
commit 69fdf1ac410fcb635edadffc7317e9c1cdd6256a (1 parent: a4af087)
Authored by hinrik (@hinrik)
View
2  Changes
@@ -7,6 +7,8 @@ Revision history for Hailo
- Word tokenizer: Improve matching/capitalization of filenames and
domain names
+ - Word tokenizer: Match tabs as tokens
+
0.68 2011-05-03 13:16:05
- Speed up the learning of repetitive sentences by caching more
View
2  lib/Hailo/Tokenizer/Words.pm
@@ -70,7 +70,7 @@ sub make_tokens {
my @tokens;
$input =~ s/$DASH\K\s*\n+\s*//;
$input =~ s/\s*\n+\s*/ /gm;
- my @chunks = split /\s+/, $input;
+ my @chunks = split /[\n ]+/, $input;
# process all whitespace-delimited chunks
for my $chunk (@chunks) {
View
5 t/tokenizer/Words.t
@@ -468,6 +468,11 @@ subtest make_output => sub {
[qw<the file is C:\\hlagh\\bar\\foo.txt . just read it .>],
"The file is C:\\hlagh\\bar\\foo.txt. Just read it.",
],
+ [
+ "Tabs\ttabs\ttabs.",
+ ['tabs', "\t", 'tabs', "\t", 'tabs', '.'],
+ "Tabs\ttabs\ttabs.",
+ ],
);
my $toke = Hailo::Tokenizer::Words->new();
Please sign in to comment.
Something went wrong with that request. Please try again.