-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
162 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,37 @@ | ||
# Pangu.cr | ||
Paranoid text spacing in Crystal | ||
# pangu | ||
|
||
Paranoid text spacing for good readability, to automatically insert whitespace between CJK (Chinese, Japanese, Korean) and half-width characters (alphabetical letters, numerical digits and symbols). | ||
|
||
This is a Crystal port of [bugtender/rangu](https://github.com/bugtender/rangu). | ||
|
||
## Installation | ||
|
||
Add the dependency to your `shard.yml`: | ||
|
||
```yaml | ||
dependencies: | ||
pangu: | ||
github: isaced/pangu.cr | ||
``` | ||
Run `shards install` | ||
|
||
## Usage | ||
|
||
```crystal | ||
require "pangu" | ||
Pangu.spacing("當你凝視著bug,bug也凝視著你") | ||
=> "當你凝視著 bug,bug 也凝視著你" | ||
Pangu.spacing("path/to/file.txt") | ||
=> "與 PM 戰鬥的人,應當小心自己不要成為 PM" | ||
``` | ||
|
||
## Contributing | ||
|
||
1. Fork it (<https://github.com/isaced/pangu.cr/fork>) | ||
2. Create your feature branch (`git checkout -b my-new-feature`) | ||
3. Commit your changes (`git commit -am 'Add some feature'`) | ||
4. Push to the branch (`git push origin my-new-feature`) | ||
5. Create a new Pull Request |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
name: pangu | ||
version: 0.1.0 | ||
|
||
authors: | ||
- isaced <isaced@163.com> | ||
|
||
crystal: 0.27.2 | ||
|
||
license: MIT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
require "./spec_helper" | ||
|
||
# Behavioural examples for Pangu.spacing: each example feeds a short string
# and checks where spaces are (or are not) inserted.
describe Pangu do
  # An underscore between characters must not trigger any spacing, and
  # already-spaced input must be returned unchanged.
  it "略過 _ 符號" do
    Pangu.spacing("前面_後面").should eq "前面_後面"
    Pangu.spacing("前面 _ 後面").should eq "前面 _ 後面"
    Pangu.spacing("Vinta_Mollie").should eq "Vinta_Mollie"
    Pangu.spacing("Vinta _ Mollie").should eq "Vinta _ Mollie"
  end

  # Latin letters next to CJK text, checked in both directions
  # (CJK followed by letters, and letters followed by CJK).
  it "處理 Alphabets" do
    Pangu.spacing("中文abc").should eq "中文 abc"
    Pangu.spacing("abc中文").should eq "abc 中文"
  end

  # Digits next to CJK text, checked in both directions.
  it "處理 Numbers" do
    Pangu.spacing("中文123").should eq "中文 123"
    Pangu.spacing("123中文").should eq "123 中文"
  end

  # https://unicode-table.com/en/blocks/latin-1-supplement/
  it "處理 Latin-1 Supplement" do
    Pangu.spacing("中文Ø漢字").should eq "中文 Ø 漢字"
    Pangu.spacing("中文 Ø 漢字").should eq "中文 Ø 漢字"
  end

  # https://unicode-table.com/en/blocks/greek-coptic/
  it "處理 Greek and Coptic" do
    Pangu.spacing("中文β漢字").should eq "中文 β 漢字"
    Pangu.spacing("中文 β 漢字").should eq "中文 β 漢字"
    Pangu.spacing("我是α,我是Ω").should eq "我是 α,我是 Ω"
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
require "spec" | ||
require "../src/pangu" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# | ||
# Paranoid text spacing for good readability, to automatically insert whitespace between CJK (Chinese, Japanese, Korean) and half-width characters (alphabetical letters, numerical digits and symbols). | ||
# | ||
# ```crystal | ||
# require "pangu" | ||
# | ||
# Pangu.spacing("當你凝視著bug,bug也凝視著你") | ||
# => "當你凝視著 bug,bug 也凝視著你" | ||
# | ||
# Pangu.spacing("path/to/file.txt") | ||
# => "與 PM 戰鬥的人,應當小心自己不要成為 PM" | ||
# ``` | ||
# | ||
module Pangu
  VERSION = "0.1.0"

  # Pattern naming used below:
  #   CJK - one character from the CJK ranges listed in the character class
  #         (radicals, Hiragana, Katakana, Bopomofo, enclosed CJK letters,
  #         CJK unified/compatibility ideographs).
  #   ANS - Latin or Greek letters and digits; CJK_ANS / ANS_CJK additionally
  #         accept a set of half-width symbols.
  # A name of the form X_Y matches an X immediately followed by a Y.

  # CJK character directly followed by a quote, and the reverse.
  # Note that QUOTE_CJK uses a narrower CJK range than the other patterns.
  CJK_QUOTE = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])(["\'])/i
  QUOTE_CJK = /(["\'])([\x{3040}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i

  # FIX_QUOTE captures whitespace just inside an opening/closing quote or
  # bracket run (groups 2 and 4) so it can be dropped again.
  # FIX_SINGLE_QUOTE captures "CJK, space, apostrophe, letter" so the space
  # in front of the apostrophe can be removed.
  FIX_QUOTE = /(["\'\(\[\{<\x{201c}]+)(\s*)(.+?)(\s*)(["\'\)\]\}>\x{201d}]+)/i
  FIX_SINGLE_QUOTE = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])( )(\')([A-Za-zΑ-Ωα-ω])/i

  # CJK character followed by "#word", and "word#" followed by a CJK character.
  CJK_HASH = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])(#(\S+))/i
  HASH_CJK = /((\S+)#)([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i

  # A single operator character sitting between a CJK character and a
  # letter/digit (either order).
  CJK_OPERATOR_ANS = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])([\+\-\*\/=&\\|<>])([A-Za-zΑ-Ωα-ω0-9])/i
  ANS_OPERATOR_CJK = /([A-Za-zΑ-Ωα-ω0-9])([\+\-\*\/=&\\|<>])([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i

  # Bracket handling: a bracketed span enclosed by CJK on both sides, a CJK
  # character followed by a bracket, a bracket followed by a CJK character,
  # and whitespace just inside a bracket pair.
  CJK_BRACKET_CJK = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])([\(\[\{<\x{201c}]+(.*?)[\)\]\}>\x{201d}]+)([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i
  CJK_BRACKET = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])([\(\[\{<\x{201c}>])/i
  BRACKET_CJK = /([\)\]\}>\x{201d}<])([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i
  FIX_BRACKET = /([\(\[\{<\x{201c}]+)(\s*)(.+?)(\s*)([\)\]\}>\x{201d}]+)/i

  # CJK character, one half-width punctuation mark, then a letter/digit.
  FIX_SYMBOL = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])([~!;:,\.\?\x{2026}])([A-Za-zΑ-Ωα-ω0-9])/i

  # The general case: a CJK character adjacent to a letter, digit or symbol.
  CJK_ANS = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])([A-Za-zΑ-Ωα-ω0-9`\$%\^&\*\-=\+\\\|\/@\x{00a1}-\x{00ff}\x{2022}\x{2027}\x{2150}-\x{218f}])/i
  ANS_CJK = /([A-Za-zΑ-Ωα-ω0-9`~\$%\^&\*\-=\+\\\|\/!;:,\.\?\x{00a1}-\x{00ff}\x{2022}\x{2026}\x{2027}\x{2150}-\x{218f}])([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i

  # Returns a copy of *text* with spaces inserted between CJK characters and
  # adjacent half-width letters, digits and symbols.
  #
  # The substitutions run in a fixed order; later passes operate on the output
  # of earlier ones, so the order must not be changed.
  def self.spacing_text(text : String)
    # Separate CJK from an adjacent quote character.
    text = text.gsub(CJK_QUOTE, "\\1 \\2")
    text = text.gsub(QUOTE_CJK, "\\1 \\2")

    # Drop whitespace captured just inside quote/bracket delimiters, and the
    # space between a CJK character and an apostrophe followed by a letter.
    text = text.gsub(FIX_QUOTE, "\\1\\3\\5")
    text = text.gsub(FIX_SINGLE_QUOTE, "\\1\\3\\4")

    # Separate CJK from "#word" / "word#".
    text = text.gsub(CJK_HASH, "\\1 \\2")
    text = text.gsub(HASH_CJK, "\\1 \\3")

    # Put a space on both sides of an operator between CJK and a letter/digit.
    text = text.gsub(CJK_OPERATOR_ANS, "\\1 \\2 \\3")
    text = text.gsub(ANS_OPERATOR_CJK, "\\1 \\2 \\3")

    # First try to space a whole bracketed span enclosed by CJK on both sides.
    # Only when that pass changed nothing are the one-sided bracket rules
    # applied.
    old_text = text
    text = text.gsub(CJK_BRACKET_CJK, "\\1 \\2 \\4")
    if old_text == text
      text = text.gsub(CJK_BRACKET, "\\1 \\2")
      text = text.gsub(BRACKET_CJK, "\\1 \\2")
    end
    # Drop whitespace captured just inside the brackets.
    text = text.gsub(FIX_BRACKET, "\\1\\3\\5")

    # Add a space after punctuation that sits between CJK and a letter/digit.
    text = text.gsub(FIX_SYMBOL, "\\1\\2 \\3")

    # Finally, separate CJK from directly adjacent letters/digits/symbols.
    text = text.gsub(CJK_ANS, "\\1 \\2")
    text = text.gsub(ANS_CJK, "\\1 \\2")
    text
  end

  # Reads the whole file at *path* and returns its content processed by
  # `spacing_text`. The file itself is not modified.
  #
  # `File.read` is used so the file handle is released even if reading raises.
  def self.spacing_file(path)
    spacing_text(File.read(path))
  end

  # Convenience entry point: if *text_or_path* names an existing regular file,
  # the file's content is spaced; otherwise the argument itself is treated as
  # the text to space.
  def self.spacing(text_or_path)
    # A string containing a NUL byte can never be a path, and File.file?
    # rejects it with ArgumentError, so such input goes straight to
    # spacing_text.
    if !text_or_path.includes?('\0') && File.file?(text_or_path)
      spacing_file(text_or_path)
    else
      spacing_text(text_or_path)
    end
  end
end