Skip to content

Commit

Permalink
Copy from rangu
Browse files Browse the repository at this point in the history
  • Loading branch information
isaced committed Feb 11, 2019
1 parent 0c288ac commit 6d56d91
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 2 deletions.
39 changes: 37 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,37 @@
# Pangu.cr
Paranoid text spacing in Crystal
# pangu

Paranoid text spacing for good readability, to automatically insert whitespace between CJK (Chinese, Japanese, Korean) and half-width characters (alphabetical letters, numerical digits and symbols).

This is a Crystal port of [bugtender/rangu](https://github.com/bugtender/rangu).

## Installation

Add the dependency to your `shard.yml`:

```yaml
dependencies:
  pangu:
    github: isaced/pangu.cr
```
Run `shards install`

## Usage

```crystal
require "pangu"
Pangu.spacing("當你凝視著bug,bug也凝視著你")
=> "當你凝視著 bug,bug 也凝視著你"
# When the argument is the path of an existing file, the file's contents are spaced:
Pangu.spacing("path/to/file.txt")
=> "與 PM 戰鬥的人,應當小心自己不要成為 PM"
```

## Contributing

1. Fork it (<https://github.com/isaced/pangu.cr/fork>)
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create a new Pull Request
9 changes: 9 additions & 0 deletions shard.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
name: pangu
version: 0.1.0

authors:
- isaced <isaced@163.com>

crystal: 0.27.2

license: MIT
33 changes: 33 additions & 0 deletions spec/pangu_spec.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
require "./spec_helper"

# Behavioural examples for Pangu.spacing: each example feeds a mixed
# CJK / half-width string and checks where spaces are (or are not) inserted.
describe Pangu do
  # Underscores are left untouched on either side of CJK or Latin text.
  it "略過 _ 符號" do
    Pangu.spacing("前面_後面").should eq "前面_後面"
    Pangu.spacing("前面 _ 後面").should eq "前面 _ 後面"
    Pangu.spacing("Vinta_Mollie").should eq "Vinta_Mollie"
    Pangu.spacing("Vinta _ Mollie").should eq "Vinta _ Mollie"
  end

  # Both directions are covered: CJK followed by letters, and letters
  # followed by CJK (mirrors the Numbers example below).
  it "處理 Alphabets" do
    Pangu.spacing("中文abc").should eq "中文 abc"
    Pangu.spacing("abc中文").should eq "abc 中文"
  end

  it "處理 Numbers" do
    Pangu.spacing("中文123").should eq "中文 123"
    Pangu.spacing("123中文").should eq "123 中文"
  end

  # https://unicode-table.com/en/blocks/latin-1-supplement/
  it "處理 Latin-1 Supplement" do
    Pangu.spacing("中文Ø漢字").should eq "中文 Ø 漢字"
    Pangu.spacing("中文 Ø 漢字").should eq "中文 Ø 漢字"
  end

  # https://unicode-table.com/en/blocks/greek-coptic/
  it "處理 Greek and Coptic" do
    Pangu.spacing("中文β漢字").should eq "中文 β 漢字"
    Pangu.spacing("中文 β 漢字").should eq "中文 β 漢字"
    Pangu.spacing("我是α,我是Ω").should eq "我是 α,我是 Ω"
  end
end
2 changes: 2 additions & 0 deletions spec/spec_helper.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
require "spec"
require "../src/pangu"
81 changes: 81 additions & 0 deletions src/pangu.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#
# Paranoid text spacing for good readability, to automatically insert whitespace between CJK (Chinese, Japanese, Korean) and half-width characters (alphabetical letters, numerical digits and symbols).
#
# ```crystal
# require "pangu"
#
# Pangu.spacing("當你凝視著bug,bug也凝視著你")
# => "當你凝視著 bug,bug 也凝視著你"
#
# # When the argument is the path of an existing file, the file's contents are spaced:
# Pangu.spacing("path/to/file.txt")
# => "與 PM 戰鬥的人,應當小心自己不要成為 PM"
# ```
#
module Pangu
  VERSION = "0.1.0"

  # Every pattern below shares one "CJK" character class built from these
  # Unicode ranges: CJK Radicals Supplement (2E80-2EFF), Kangxi Radicals
  # (2F00-2FDF), Hiragana (3040-309F), Katakana (30A0-30FF), Bopomofo
  # (3100-312F), Enclosed CJK Letters and Months (3200-32FF), CJK Extension A
  # (3400-4DBF), CJK Unified Ideographs (4E00-9FFF) and CJK Compatibility
  # Ideographs (F900-FAFF).

  # A CJK character immediately followed by a straight quote.
  CJK_QUOTE = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])(["\'])/i
  # A straight quote immediately followed by a CJK character.
  # NOTE(review): this class omits the radical ranges \x{2e80}-\x{2fdf} that the
  # other CJK classes include - confirm whether that is intentional.
  QUOTE_CJK = /(["\'])([\x{3040}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i

  # An opening quote/bracket run, optional inner whitespace, content, optional
  # inner whitespace and a closing quote/bracket run; used to drop the inner
  # whitespace (groups 2 and 4).
  FIX_QUOTE = /(["\'\(\[\{<\x{201c}]+)(\s*)(.+?)(\s*)(["\'\)\]\}>\x{201d}]+)/i
  # CJK, a single space, an apostrophe and a letter; used to drop that space.
  FIX_SINGLE_QUOTE = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])( )(\')([A-Za-zΑ-Ωα-ω])/i

  # A CJK character followed by "#" plus non-whitespace text.
  CJK_HASH = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])(#(\S+))/i
  # Non-whitespace text ending in "#" followed by a CJK character.
  HASH_CJK = /((\S+)#)([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i

  # CJK, an operator character, then a Latin/Greek letter or digit.
  CJK_OPERATOR_ANS = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])([\+\-\*\/=&\\|<>])([A-Za-zΑ-Ωα-ω0-9])/i
  # A Latin/Greek letter or digit, an operator character, then CJK.
  ANS_OPERATOR_CJK = /([A-Za-zΑ-Ωα-ω0-9])([\+\-\*\/=&\\|<>])([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i

  # A bracketed span with a CJK character directly on both sides.
  CJK_BRACKET_CJK = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])([\(\[\{<\x{201c}]+(.*?)[\)\]\}>\x{201d}]+)([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i
  # CJK followed by an opening bracket (the set also contains ">").
  CJK_BRACKET = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])([\(\[\{<\x{201c}>])/i
  # A closing bracket (the set also contains "<") followed by CJK.
  BRACKET_CJK = /([\)\]\}>\x{201d}<])([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i
  # Same shape as FIX_QUOTE but for brackets only; used to drop inner whitespace.
  FIX_BRACKET = /([\(\[\{<\x{201c}]+)(\s*)(.+?)(\s*)([\)\]\}>\x{201d}]+)/i

  # CJK, a punctuation mark, then a Latin/Greek letter or digit.
  FIX_SYMBOL = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])([~!;:,\.\?\x{2026}])([A-Za-zΑ-Ωα-ω0-9])/i

  # CJK directly followed by a half-width letter, digit or symbol.
  CJK_ANS = /([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])([A-Za-zΑ-Ωα-ω0-9`\$%\^&\*\-=\+\\\|\/@\x{00a1}-\x{00ff}\x{2022}\x{2027}\x{2150}-\x{218f}])/i
  # A half-width letter, digit or symbol directly followed by CJK.
  ANS_CJK = /([A-Za-zΑ-Ωα-ω0-9`~\$%\^&\*\-=\+\\\|\/!;:,\.\?\x{00a1}-\x{00ff}\x{2022}\x{2026}\x{2027}\x{2150}-\x{218f}])([\x{2e80}-\x{2eff}\x{2f00}-\x{2fdf}\x{3040}-\x{309f}\x{30a0}-\x{30ff}\x{3100}-\x{312f}\x{3200}-\x{32ff}\x{3400}-\x{4dbf}\x{4e00}-\x{9fff}\x{f900}-\x{faff}])/i

  # Returns a copy of *text* with spaces inserted between CJK characters and
  # adjacent half-width characters.
  #
  # The substitutions run in a fixed order; later passes see the output of
  # earlier ones, so the order must not be changed.
  def self.spacing_text(text : String)
    # Separate quotes from neighbouring CJK...
    text = text.gsub(CJK_QUOTE, "\\1 \\2")
    text = text.gsub(QUOTE_CJK, "\\1 \\2")

    # ...then remove whitespace just inside quote/bracket pairs and the space
    # before an apostrophe that follows CJK.
    text = text.gsub(FIX_QUOTE, "\\1\\3\\5")
    text = text.gsub(FIX_SINGLE_QUOTE, "\\1\\3\\4")

    text = text.gsub(CJK_HASH, "\\1 \\2")
    text = text.gsub(HASH_CJK, "\\1 \\3")

    # Operators between CJK and half-width text get a space on both sides.
    text = text.gsub(CJK_OPERATOR_ANS, "\\1 \\2 \\3")
    text = text.gsub(ANS_OPERATOR_CJK, "\\1 \\2 \\3")

    # Prefer the two-sided bracket rule; only when it changed nothing are the
    # one-sided rules applied.
    old_text = text
    text = text.gsub(CJK_BRACKET_CJK, "\\1 \\2 \\4")
    if old_text == text
      text = text.gsub(CJK_BRACKET, "\\1 \\2")
      text = text.gsub(BRACKET_CJK, "\\1 \\2")
    end
    text = text.gsub(FIX_BRACKET, "\\1\\3\\5")

    # Punctuation after CJK keeps its position; the space goes after it.
    text = text.gsub(FIX_SYMBOL, "\\1\\2 \\3")

    text = text.gsub(CJK_ANS, "\\1 \\2")
    text = text.gsub(ANS_CJK, "\\1 \\2")
    text
  end

  # Reads the whole file at *path* and returns its contents with spacing
  # applied via `spacing_text`. The file itself is not modified.
  def self.spacing_file(path)
    # File.read closes the handle on every path, including when reading raises.
    spacing_text(File.read(path))
  end

  # Spaces the contents of the file at *text_or_path* when it names an existing
  # regular file; otherwise treats the argument as the text to space.
  #
  # NOTE(review): any text that happens to equal an existing file name is read
  # as a file. File.file? may also raise for strings that are not valid paths
  # (presumably null bytes or over-long names) - confirm. Use `spacing_text`
  # directly for untrusted or arbitrary text.
  def self.spacing(text_or_path)
    if File.file?(text_or_path)
      spacing_file(text_or_path)
    else
      spacing_text(text_or_path)
    end
  end
end

0 comments on commit 6d56d91

Please sign in to comment.