forked from kurehajime/dajarep
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dajarep.go
124 lines (112 loc) · 3.26 KB
/
dajarep.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
package dajarep
import (
"math"
"regexp"
"strings"
"github.com/ikawaha/kagome/tokenizer"
)
// init calls tokenizer.SysDic once when the package is loaded and discards
// the returned value.
// NOTE(review): presumably this pre-loads kagome's system dictionary so the
// first call to Dajarep does not pay the loading cost — confirm against the
// kagome documentation.
func init() {
tokenizer.SysDic()
}
// word is a single token of a sentence, as filled in by getSentences.
type word struct {
// str is the token text taken from feature index 6 of the tokenizer output
// (presumably the dictionary base form — confirm against the dictionary layout).
str string
// kana is the token reading taken from feature index 7
// (presumably katakana — confirm against the dictionary layout).
kana string
// wtype is the value of feature index 0; isDajare compares it with "名詞" (noun).
wtype string
}
// sentence is one sentence of the input text together with its analysis.
type sentence struct {
// str is the sentence text as split out of the input.
str string
// kana is the concatenation of the readings of all words in the sentence.
kana string
// words holds the tokens of the sentence that carried enough features to be kept.
words []word
}
// Dajarep splits text into sentences and returns the ones judged to be puns
// (dajare).
//
// dajares holds the text of every sentence that isDajare accepted, in input
// order. debugStrs holds, at the same index, the reading that isDajare
// reported for that sentence. Both results are nil when nothing matched.
func Dajarep(text string) (dajares []string, debugStrs []string) {
	for _, s := range getSentences(text) {
		hit, reading := isDajare(s)
		if !hit {
			continue
		}
		dajares = append(dajares, s.str)
		debugStrs = append(debugStrs, reading)
	}
	return dajares, debugStrs
}
// isDajare reports whether sen looks like a pun (dajare). When it does, the
// second result is the reading of the noun that triggered the decision;
// otherwise it is the empty string.
//
// For every word whose type is "名詞" (noun) and whose reading is at least two
// runes long, it counts
//   - how often the written form occurs in the sentence text, and
//   - how often the reading, loosened by fixWord, occurs in the sentence
//     reading, both as-is and after fixSentence removed omissible characters.
//
// If the reading occurs more often than the written form, the same sound is
// produced by some other wording in the sentence, which is taken as a pun.
func isDajare(sen sentence) (bool, string) {
	// The stripped sentence reading does not depend on the word, so build it once.
	stripped := fixSentence(sen.kana)

	for _, w := range sen.words {
		if w.wtype != "名詞" || len([]rune(w.kana)) <= 1 {
			continue
		}

		// QuoteMeta keeps any regex metacharacter in the reading literal;
		// fixWord then inserts the intended character classes. Compile is used
		// instead of MustCompile so an unexpected pattern skips the word rather
		// than panicking inside library code.
		reKana, err := regexp.Compile(fixWord(regexp.QuoteMeta(w.kana)))
		if err != nil {
			continue
		}

		// The written form is plain text, not a pattern: count it literally so
		// that characters such as "*" or "(" cannot panic or change the match.
		hitStr := strings.Count(sen.str, w.str)
		hitKana := len(reKana.FindAllString(sen.kana, -1))
		hitKana2 := len(reKana.FindAllString(stripped, -1))

		// Fewer matches in the text than in the reading means the sound is
		// repeated by a different spelling.
		if hitStr < int(math.Max(float64(hitKana), float64(hitKana2))) {
			return true, w.kana
		}
	}
	return false, ""
}
// fixWord turns a reading into a regular-expression pattern that tolerates
// interchangeable characters.
//
// The small "ッ", the long-vowel mark "ー" and the small vowels become optional
// character classes that also accept the full-size kana. The small "ャ", "ュ"
// and "ョ" become mandatory classes that accept either size. All other
// characters are copied unchanged.
func fixWord(text string) string {
	loosen := strings.NewReplacer(
		"ッ", "[ツッ]?",
		"ー", "[ー]?",
		"ァ", "[アァ]?",
		"ィ", "[イィ]?",
		"ゥ", "[ウゥ]?",
		"ェ", "[エェ]?",
		"ォ", "[オォ]?",
		"ャ", "[ヤャ]",
		"ュ", "[ユュ]",
		"ョ", "[ヨョ]",
	)
	return loosen.Replace(text)
}
// fixSentence returns text with every omissible character removed: the small
// "ッ", the long-vowel mark "ー", the commas "、", "," and full-width "，", and
// both the ASCII space and the full-width (ideographic) space.
//
// The full-width variants are written as escapes so they cannot be confused
// with, or silently normalized into, their ASCII look-alikes.
func fixSentence(text string) string {
	strip := strings.NewReplacer(
		"ッ", "",
		"ー", "",
		"、", "",
		",", "",
		"\uff0c", "", // full-width comma
		" ", "",
		"\u3000", "", // full-width (ideographic) space
	)
	return strip.Replace(text)
}
// getSentences splits text into sentences and tokenizes each one.
//
// "。" and "." end a sentence and are dropped. "?" and "!" (ASCII and
// full-width) end a sentence and are kept as its last character. Existing
// newlines also separate sentences. All terminators are handled in a single
// pass so that none of them is expanded twice, which would otherwise insert
// spurious empty sentences.
//
// For every token that reports more than seven features a word is recorded
// from features 6 (text), 7 (reading) and 0 (type); tokens with fewer features
// are ignored. The sentence reading is the concatenation of the recorded
// readings.
// NOTE(review): the feature indexes assume the IPA-dictionary layout used by
// kagome — confirm if the dictionary changes.
func getSentences(text string) []sentence {
	var sentences []sentence
	t := tokenizer.New()

	splitter := strings.NewReplacer(
		"。", "\n",
		".", "\n",
		"?", "?\n",
		"!", "!\n",
		"\uff1f", "\uff1f\n", // full-width question mark
		"\uff01", "\uff01\n", // full-width exclamation mark
	)

	for _, line := range strings.Split(splitter.Replace(text), "\n") {
		var words []word
		var readings []string
		for _, tk := range t.Tokenize(line) {
			ft := tk.Features()
			if len(ft) <= 7 {
				// Not enough features to carry a reading; skip the token.
				continue
			}
			words = append(words, word{
				str:   ft[6],
				kana:  ft[7],
				wtype: ft[0],
			})
			readings = append(readings, ft[7])
		}
		sentences = append(sentences, sentence{
			str:   line,
			words: words,
			kana:  strings.Join(readings, ""),
		})
	}
	return sentences
}