This repository has been archived by the owner on Sep 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
392,350 additions
and
155 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,4 +6,5 @@ | |
|
||
build/ | ||
|
||
lib/assets/dic.csv | ||
#lib/assets/dic.csv | ||
coverage/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,127 +1,34 @@ | ||
library ringo; | ||
|
||
import 'src/utils/path.dart'; | ||
import 'src/utils/string_utils.dart'; | ||
import 'src/utils/list_utils.dart'; | ||
import 'package:ringo/src/lattice/lattice.dart'; | ||
|
||
/* | ||
main() async { | ||
final ringo = await Ringo.init(); | ||
final tokenized = ringo.tokenize('吾輩はRingoである'); | ||
print('tokenized: $tokenized'); | ||
} | ||
*/ | ||
import 'src/data/dictionary.dart'; | ||
import 'src/data/double_array.dart'; | ||
import 'src/data/tokenized_word.dart'; | ||
import 'src/search/longest_match.dart'; | ||
|
||
class Ringo { | ||
const Ringo(this._doubleArray); | ||
const Ringo(this._doubleArray, this._dictionary); | ||
final DoubleArray _doubleArray; | ||
final Dictionary _dictionary; | ||
|
||
static Future<Ringo> init() async { | ||
final base = await ListUtils.loadList(Path.base); | ||
final check = await ListUtils.loadList(Path.check); | ||
final doubleArray = DoubleArray(base: base, check: check); | ||
return Ringo(doubleArray); | ||
final doubleArray = await DoubleArray.init(); | ||
final dictionary = await Dictionary.init(); | ||
return Ringo(doubleArray, dictionary); | ||
} | ||
|
||
List<String> tokenize(String query) { | ||
final searchdResult = _searchPrefix(query); | ||
final longestMatchResult = _longestMatch(searchdResult, query); | ||
final searchdResult = _doubleArray.searchCommonPrefix(query); | ||
final longestMatchResult = LongestMatch.simpleTokenize(searchdResult, query); | ||
|
||
return longestMatchResult; | ||
} | ||
|
||
/// Splits [query] into tokens by greedy longest match over [words].
///
/// At each cursor position the hit that starts there and has the largest
/// `end` is emitted, and the cursor jumps to that `end`. Positions with no
/// hit are buffered and emitted as one unknown-word token when the next hit
/// (or the end of [query]) is reached.
///
/// [words] is only read; the caller's list keeps its original order.
List<String> _longestMatch(List<SearchedWord> words, String query) {
  // Index the furthest-reaching hit per start offset in a single pass,
  // rather than sorting the caller's list in place and re-filtering it at
  // every cursor position.
  final longestByStart = <int, SearchedWord>{};
  for (final candidate in words) {
    final best = longestByStart[candidate.start];
    if (best == null || candidate.end > best.end) {
      longestByStart[candidate.start] = candidate;
    }
  }

  final result = <String>[];
  final unknownWordBuffer = StringBuffer();
  var current = 0;
  // The step counter bounds the loop even if a hit does not move the cursor
  // forward (for example a hit whose `end` is not past its `start`).
  for (var step = 0; step < query.length && current < query.length; step++) {
    final hit = longestByStart[current];
    if (hit != null) {
      // Flush any pending unknown text before emitting the dictionary word.
      if (unknownWordBuffer.isNotEmpty) {
        result.add(unknownWordBuffer.toString());
        unknownWordBuffer.clear();
      }

      // Emit the dictionary word and continue right after it.
      current = hit.end;
      result.add(hit.word);
    } else {
      // Unknown text is merged into a single token, so keep buffering.
      unknownWordBuffer.write(query[current]);
      current++;
    }
  }

  // Handle the case where the query ends with unknown text.
  if (unknownWordBuffer.isNotEmpty) {
    result.add(unknownWordBuffer.toString());
    unknownWordBuffer.clear();
  }
  return result;
}
|
||
/// Collects every substring range of [word] accepted by
/// [_searchPrefixHelper].
///
/// Ranges are visited by increasing `start`, then increasing `end`, so the
/// returned list is in that order. `end` is exclusive.
List<SearchedWord> _searchPrefix(String word) {
  final searchResult = <SearchedWord>[];
  for (var start = 0; start < word.length; start++) {
    // Begin at start + 1: ranges with end <= start are empty or inverted and
    // are always rejected by the helper, so visiting them is wasted work.
    for (var end = start + 1; end <= word.length; end++) {
      final result = _searchPrefixHelper(word, start, end);
      if (result != null) {
        searchResult.add(result);
      }
    }
  }
  return searchResult;
}
|
||
SearchedWord? _searchPrefixHelper(String word, int start, int end) { | ||
if (start == end || start > end) { | ||
return null; | ||
} | ||
final query = word.substring(start, end); | ||
if (_search(query)) { | ||
final word = SearchedWord(word: query, start: start, end: end); | ||
return word; | ||
} | ||
return null; | ||
void analyzeMorphologic(String query) { | ||
final words = _doubleArray.searchCommonPrefix(query); | ||
final allWords = _dictionary.allWordsFromTokenizedWords(words); | ||
print(words.first.dicWords); | ||
//print(allWords.length); | ||
} | ||
|
||
/// Whether the codes of [word] can be followed through the double array,
/// starting from node 0.
///
/// For each code `c` the next node is `base[current] + c`; the step is
/// accepted only when `check[next] - 1` equals the current node. Returns
/// `false` on the first rejected step, including a step whose index falls
/// outside `base` or `check` (previously this raised a [RangeError]).
/// An empty [word] takes no steps and yields `true`.
bool _search(String word) {
  final base = _doubleArray.base;
  final check = _doubleArray.check;
  final codes = StringUtils.stringToUint8List(word);
  var current = 0;
  for (var i = 0; i < codes.length; i++) {
    // A node beyond the base table has no outgoing transitions.
    if (current >= base.length) {
      return false;
    }
    final next = base[current] + codes[i];
    // Guard the lookup: an index outside the check table is not a transition.
    if (next < 0 || next >= check.length) {
      return false;
    }
    if (check[next] - 1 != current) {
      return false;
    }
    current = next;
  }
  return true;
}
} | ||
|
||
/// Holds the `base` and `check` integer tables of a double array.
class DoubleArray {
  /// Per-node offsets; a transition target is `base[node] + code`.
  final List<int> base;

  /// Per-node ownership values compared against the current node on lookup.
  final List<int> check;

  /// Creates a double array from already-loaded [base] and [check] tables.
  const DoubleArray({required this.base, required this.check});
}
|
||
/// A substring found during lookup, together with its position in the query.
class SearchedWord {
  /// The matched text.
  final String word;

  /// Offset in the query where the match begins (inclusive).
  final int start;

  /// Offset in the query where the match stops (exclusive).
  final int end;

  /// Creates a match record for [word] covering `[start, end)`.
  const SearchedWord({
    required this.word,
    required this.start,
    required this.end,
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import 'package:ringo/src/utils/list_utils.dart'; | ||
import 'tokenized_word.dart'; | ||
import 'word.dart'; | ||
|
||
/// Word list used to attach dictionary entries to tokenized words.
class Dictionary {
  /// Wraps an already-loaded list of [words].
  const Dictionary(this.words);

  /// All entries held by this dictionary.
  final List<Word> words;

  /// Loads the entries through [ListUtils.loadWords] and wraps them.
  static Future<Dictionary> init() async {
    final words = await ListUtils.loadWords();
    return Dictionary(words);
  }

  /// Attaches to each element of [tokenizedWords] every entry of [words]
  /// whose `word` equals the token's `word`.
  ///
  /// Matching entries are appended to the token's `dicWords`. Returns all
  /// attached entries in token order; a surface form that occurs in several
  /// tokens contributes its entries once per token. Tokens with no matching
  /// entry are left unchanged.
  List<Word> allWordsFromTokenizedWords(List<TokenizedWord> tokenizedWords) {
    // Group entries by the surface forms that are actually needed, scanning
    // the dictionary once instead of once per token.
    final entriesBySurface = <String, List<Word>>{
      for (final tokenizedWord in tokenizedWords) tokenizedWord.word: <Word>[],
    };
    for (final entry in words) {
      entriesBySurface[entry.word]?.add(entry);
    }

    final attached = <Word>[];
    for (final tokenizedWord in tokenizedWords) {
      final entries = entriesBySurface[tokenizedWord.word];
      if (entries == null || entries.isEmpty) {
        continue;
      }
      tokenizedWord.dicWords.addAll(entries);
      attached.addAll(entries);
    }
    return attached;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import '../utils/string_utils.dart'; | ||
import '../utils/list_utils.dart'; | ||
import '../utils/path.dart'; | ||
import 'tokenized_word.dart'; | ||
|
||
/// Double-array lookup structure backed by `base` and `check` tables.
class DoubleArray {
  /// Creates a double array from already-loaded [base] and [check] tables.
  const DoubleArray({
    required this.base,
    required this.check,
  });

  /// Per-node offsets; a transition target is `base[node] + code`.
  final List<int> base;

  /// Per-node values; a transition into `next` is accepted when
  /// `check[next] - 1` equals the node it was reached from.
  final List<int> check;

  /// Loads both tables via [ListUtils.loadList] from [Path.base] and
  /// [Path.check].
  static Future<DoubleArray> init() async {
    final base = await ListUtils.loadList(Path.base);
    final check = await ListUtils.loadList(Path.check);
    return DoubleArray(base: base, check: check);
  }

  /// Returns every substring range of [word] that [_search] accepts.
  ///
  /// Ranges are visited by increasing `start`, then increasing `end`
  /// (exclusive), and results are returned in that order.
  List<TokenizedWord> searchCommonPrefix(String word) {
    final searchResult = <TokenizedWord>[];
    for (var start = 0; start < word.length; start++) {
      // Begin at start + 1: ranges with end <= start are empty or inverted
      // and are always rejected by the helper.
      for (var end = start + 1; end <= word.length; end++) {
        final result = _searchCommonPrefixHelper(word, start, end);
        if (result != null) {
          searchResult.add(result);
        }
      }
    }
    return searchResult;
  }

  /// Returns a [TokenizedWord] for `word.substring(start, end)` when that
  /// substring is accepted by [_search], or `null` otherwise.
  ///
  /// Empty and inverted ranges (`start >= end`) yield `null`.
  TokenizedWord? _searchCommonPrefixHelper(String word, int start, int end) {
    if (start >= end) {
      return null;
    }
    final query = word.substring(start, end);
    if (!_search(query)) {
      return null;
    }
    return TokenizedWord(word: query, start: start, end: end);
  }

  /// Whether the codes of [word] can be followed from node 0.
  ///
  /// Returns `false` on the first rejected step, including a step whose
  /// index falls outside [base] or [check] (previously a [RangeError]).
  /// An empty [word] takes no steps and yields `true`.
  bool _search(String word) {
    final codes = StringUtils.stringToUint8List(word);
    var current = 0;
    for (var i = 0; i < codes.length; i++) {
      // A node beyond the base table has no outgoing transitions.
      if (current >= base.length) {
        return false;
      }
      final next = base[current] + codes[i];
      // Guard the lookup: an index outside the check table is no transition.
      if (next < 0 || next >= check.length) {
        return false;
      }
      if (check[next] - 1 != current) {
        return false;
      }
      current = next;
    }
    return true;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import 'package:ringo/src/data/word.dart'; | ||
|
||
/// A substring found in a query, its position, and the dictionary entries
/// attached to it.
class TokenizedWord {
  /// Creates a token for [word] covering `[start, end)`.
  ///
  /// [dicWords], when given, seeds the token's entry list. The list is
  /// copied, so the token always owns a growable list and later additions do
  /// not affect the caller's list. Previously this argument was accepted but
  /// silently discarded.
  TokenizedWord({
    required this.word,
    required this.start,
    required this.end,
    List<Word>? dicWords,
  }) : dicWords = <Word>[...?dicWords];

  /// The matched text.
  final String word;

  /// Offset in the query where the match begins (inclusive).
  final int start;

  /// Offset in the query where the match stops (exclusive).
  final int end;

  /// Dictionary entries attached to this token; empty until populated.
  final List<Word> dicWords;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
/// One dictionary entry.
///
/// NOTE(review): the context ids and cost look like the usual
/// connection-cost fields of a morphological-analysis dictionary; confirm
/// against the loader before relying on that reading.
class Word {
  /// Surface form of the entry.
  final String word;

  /// Context id associated with the left side of the entry.
  final int leftContextId;

  /// Context id associated with the right side of the entry.
  final int rightContextId;

  /// Cost value stored with the entry.
  final int occurrenceCost;

  /// Creates an entry from its four required fields.
  const Word({
    required this.word,
    required this.leftContextId,
    required this.rightContextId,
    required this.occurrenceCost,
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.