From 56de02e4f4c58535488745c457a52ea513942181 Mon Sep 17 00:00:00 2001 From: Marcus Lager Date: Mon, 1 May 2017 21:55:21 +0200 Subject: [PATCH] deflate index flag --- rn-write.bat | 17 ++++++++++------- src/Resin.Cli/Program.cs | 4 +--- src/Resin/IO/IxInfo.cs | 2 ++ src/Resin/IO/Read/DocumentReader.cs | 8 ++++---- src/Resin/IO/Serializer.cs | 18 +++++++++++------- src/Resin/Searcher.cs | 12 ++++++------ src/Resin/UpsertOperation.cs | 3 ++- 7 files changed, 36 insertions(+), 28 deletions(-) diff --git a/rn-write.bat b/rn-write.bat index 55c45d35..edc83f26 100644 --- a/rn-write.bat +++ b/rn-write.bat @@ -1,7 +1,10 @@ -call rn write --file c:\temp\10Mwikipedia.json --dir d:\resin\wikipedia --skip 0 --take 1000000 -call rn write --file c:\temp\10Mwikipedia.json --dir d:\resin\wikipedia --skip 1000000 --take 1000000 -call rn write --file c:\temp\10Mwikipedia.json --dir d:\resin\wikipedia --skip 2000000 --take 1000000 -call rn write --file c:\temp\10Mwikipedia.json --dir d:\resin\wikipedia --skip 3000000 --take 1000000 -call rn write --file c:\temp\10Mwikipedia.json --dir d:\resin\wikipedia --skip 4000000 --take 1000000 -call rn write --file c:\temp\10Mwikipedia.json --dir d:\resin\wikipedia --skip 5000000 --take 1000000 -call rn write --file c:\temp\10Mwikipedia.json --dir d:\resin\wikipedia --skip 6000000 --take 1000000 \ No newline at end of file +rem call rn write --file "C:\Users\Marcus\Downloads\enwiki-20120502-lines-1k.txt" --dir D:\resin\wikipedia --take 100000 --compress +call rn write --file "C:\Users\Marcus\Downloads\enwiki-20120502-lines-1k.txt" --dir D:\resin\wikipedia --skip 100000 --take 100000 --compress +call rn write --file "C:\Users\Marcus\Downloads\enwiki-20120502-lines-1k.txt" --dir D:\resin\wikipedia --skip 200000 --take 100000 --compress +call rn write --file "C:\Users\Marcus\Downloads\enwiki-20120502-lines-1k.txt" --dir D:\resin\wikipedia --skip 300000 --take 100000 --compress +call rn write --file "C:\Users\Marcus\Downloads\enwiki-20120502-lines-1k.txt" --dir D:\resin\wikipedia --skip 400000 --take 100000 --compress +call rn write --file "C:\Users\Marcus\Downloads\enwiki-20120502-lines-1k.txt" --dir D:\resin\wikipedia --skip 500000 --take 100000 --compress +call rn write --file "C:\Users\Marcus\Downloads\enwiki-20120502-lines-1k.txt" --dir D:\resin\wikipedia --skip 600000 --take 100000 --compress +call rn write --file "C:\Users\Marcus\Downloads\enwiki-20120502-lines-1k.txt" --dir D:\resin\wikipedia --skip 700000 --take 100000 --compress +call rn write --file "C:\Users\Marcus\Downloads\enwiki-20120502-lines-1k.txt" --dir D:\resin\wikipedia --skip 800000 --take 100000 --compress +call rn write --file "C:\Users\Marcus\Downloads\enwiki-20120502-lines-1k.txt" --dir D:\resin\wikipedia --skip 900000 --take 100000 --compress \ No newline at end of file diff --git a/src/Resin.Cli/Program.cs b/src/Resin.Cli/Program.cs index a3de142c..590a94b7 100644 --- a/src/Resin.Cli/Program.cs +++ b/src/Resin.Cli/Program.cs @@ -84,9 +84,7 @@ static void Query(string[] args) { string dir = null; string indexName = null; - bool deflate = false; - if (Array.IndexOf(args, "--deflate") > 0) deflate = true; if (Array.IndexOf(args, "--dir") > 0) dir = args[Array.IndexOf(args, "--dir") + 1]; if (Array.IndexOf(args, "--name") > 0) indexName = args[Array.IndexOf(args, "--name") + 1]; @@ -106,7 +104,7 @@ static void Query(string[] args) { var timer = new Stopwatch(); timer.Start(); - using (var s = new Searcher(dir, new QueryParser(new Analyzer()), new Tfidf(), deflate)) + using (var s = new Searcher(dir, new QueryParser(new Analyzer()), new Tfidf())) { result = s.Search(q, page, size); diff --git a/src/Resin/IO/IxInfo.cs b/src/Resin/IO/IxInfo.cs index 0fe903cd..4a7e2e70 100644 --- a/src/Resin/IO/IxInfo.cs +++ b/src/Resin/IO/IxInfo.cs @@ -12,6 +12,8 @@ public class IxInfo public int DocumentCount { get; set; } + public bool Compressed { get; set; } + public static IxInfo Load(string fileName) { var time = new Stopwatch(); diff --git a/src/Resin/IO/Read/DocumentReader.cs b/src/Resin/IO/Read/DocumentReader.cs index 96fc975b..46ceae61 100644 --- a/src/Resin/IO/Read/DocumentReader.cs +++ b/src/Resin/IO/Read/DocumentReader.cs @@ -4,16 +4,16 @@ namespace Resin.IO.Read { public class DocumentReader : BlockReader { - private readonly bool _withCompression; + private readonly bool _deflate; - public DocumentReader(Stream stream, bool withCompression) : base(stream) + public DocumentReader(Stream stream, bool deflate) : base(stream) { - _withCompression = withCompression; + _deflate = deflate; } protected override Document Deserialize(byte[] data) { - return Serializer.DeserializeDocument(data, _withCompression); + return Serializer.DeserializeDocument(data, _deflate); } } } \ No newline at end of file diff --git a/src/Resin/IO/Serializer.cs b/src/Resin/IO/Serializer.cs index 030e3129..4a18b0f0 100644 --- a/src/Resin/IO/Serializer.cs +++ b/src/Resin/IO/Serializer.cs @@ -183,6 +183,7 @@ public static byte[] Serialize(this IxInfo ix) stream.Write(versionBytes, 0, sizeof(long)); stream.Write(docCountBytes, 0, sizeof(int)); + stream.WriteByte(EncodedBoolean[ix.Compressed]); return stream.ToArray(); } @@ -198,6 +199,8 @@ public static IxInfo DeserializeIxInfo(Stream stream) stream.Read(docCountBytes, 0, sizeof(int)); + var compressed = stream.ReadByte(); + if (!BitConverter.IsLittleEndian) { Array.Reverse(versionBytes); @@ -207,7 +210,8 @@ public static IxInfo DeserializeIxInfo(Stream stream) return new IxInfo { VersionId= BitConverter.ToInt64(versionBytes, 0), - DocumentCount = BitConverter.ToInt32(docCountBytes, 0) + DocumentCount = BitConverter.ToInt32(docCountBytes, 0), + Compressed = compressed==1 }; } @@ -419,7 +423,7 @@ public static LcrsNode DeserializeNode(Stream stream) } } - public static Document DeserializeDocument(byte[] data, bool wasCompressed) + public static Document DeserializeDocument(byte[] data, bool deflate) { var idBytes = new byte[sizeof(int)]; Array.Copy(data, 0, idBytes, 0, sizeof(int)); @@ -434,7 +438,7 @@ public static Document DeserializeDocument(byte[] data, bool wasCompressed) } var id = BitConverter.ToInt32(idBytes, 0); - var dic = DeserializeStringStringDic(dicBytes, wasCompressed).ToDictionary(x=>x.Key, y=>y.Value); + var dic = DeserializeStringStringDic(dicBytes, deflate).ToDictionary(x=>x.Key, y=>y.Value); return new Document(dic) {Id = id}; } @@ -536,15 +540,15 @@ public static IEnumerable DeserializePostings(byte[] data) } } - public static IEnumerable> DeserializeStringStringDic(byte[] data, bool wasCompressed) + public static IEnumerable> DeserializeStringStringDic(byte[] data, bool deflate) { using (var stream = new MemoryStream(data)) { - return DeserializeStringStringDic(stream, wasCompressed).ToList(); + return DeserializeStringStringDic(stream, deflate).ToList(); } } - public static IEnumerable> DeserializeStringStringDic(Stream stream, bool wasCompressed) + public static IEnumerable> DeserializeStringStringDic(Stream stream, bool deflate) { while (true) { @@ -592,7 +596,7 @@ public static IEnumerable DeserializePostings(byte[] data) Array.Reverse(valBytes); } - string value = wasCompressed ? Compressor.DecompressText(valBytes) : Encoding.GetString(valBytes); + string value = deflate ? Compressor.DecompressText(valBytes) : Encoding.GetString(valBytes); yield return new KeyValuePair(key, value); } diff --git a/src/Resin/Searcher.cs b/src/Resin/Searcher.cs index ee44fbfb..aca08f09 100644 --- a/src/Resin/Searcher.cs +++ b/src/Resin/Searcher.cs @@ -21,22 +21,20 @@ public class Searcher : IDisposable private readonly string _directory; private readonly QueryParser _parser; private readonly IScoringScheme _scorerFactory; - private readonly bool _compression; private readonly IList _ixs; private readonly int _blockSize; private readonly int _documentCount; - public Searcher(string directory, bool compression = false) - :this(directory, new QueryParser(new Analyzer()), new Tfidf(), compression) + public Searcher(string directory) + :this(directory, new QueryParser(new Analyzer()), new Tfidf()) { } - public Searcher(string directory, QueryParser parser, IScoringScheme scorerFactory, bool compression = false) + public Searcher(string directory, QueryParser parser, IScoringScheme scorerFactory) { _directory = directory; _parser = parser; _scorerFactory = scorerFactory; - _compression = compression; _ixs = Util.GetIndexFileNamesInChronologicalOrder(directory).Select(IxInfo.Load).ToList(); @@ -123,7 +121,9 @@ private IEnumerable GetDocs(IList scores, IxInfo var docFileName = Path.Combine(_directory, ix.VersionId + ".doc"); - using (var docReader = new DocumentReader(new FileStream(docFileName, FileMode.Open, FileAccess.Read, FileShare.Read, 4096*4, FileOptions.SequentialScan), _compression)) + using (var docReader = new DocumentReader( + new FileStream(docFileName, FileMode.Open, FileAccess.Read, FileShare.Read, 4096*4, FileOptions.SequentialScan), + ix.Compressed)) { var dic = scores.ToDictionary(x => x.DocumentId, y => y.Score); diff --git a/src/Resin/UpsertOperation.cs b/src/Resin/UpsertOperation.cs index 557d78fb..7a6be76e 100644 --- a/src/Resin/UpsertOperation.cs +++ b/src/Resin/UpsertOperation.cs @@ -284,7 +284,8 @@ private IxInfo CreateIxInfo() return new IxInfo { VersionId = _indexVersionId, - DocumentCount = _docId + DocumentCount = _docId, + Compressed = _compression }; } }