db/key.cpp

// @file key.cpp

/**
*    Copyright (C) 2011 10gen Inc.
*
*    This program is free software: you can redistribute it and/or  modify
*    it under the terms of the GNU Affero General Public License, version 3,
*    as published by the Free Software Foundation.
*
*    This program is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU Affero General Public License for more details.
*
*    You should have received a copy of the GNU Affero General Public License
*    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "pch.h"
#include "key.h"
#include "../util/unittest.h"

namespace mongo {

    // [ ][HASMORE][x][y][canontype_4bits]
    enum CanonicalsEtc { 
        cminkey=1,
        cnull=2,
        cdouble=4,
        cstring=6,
        cbindata=7,
        coid=8,
        cfalse=10,
        ctrue=11,
        cdate=12,
        cmaxkey=14,
        cCANONTYPEMASK = 0xf,
        cY = 0x10,
        cint = cY | cdouble,
        cX = 0x20,
        clong = cX | cdouble,
        cHASMORE = 0x40,
        cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care
    };

    // bindata bson type
    const unsigned BinDataLenMask = 0xf0;  // lengths are powers of 2 of this value
    const unsigned BinDataTypeMask = 0x0f; // 0-7 as you would expect, 8-15 are 128+value.  see BinDataType.
    const int BinDataLenMax = 32;
    const int BinDataLengthToCode[] = { 
        0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 
        0x80, -1/*9*/, 0x90/*10*/, -1/*11*/, 0xa0/*12*/, -1/*13*/, 0xb0/*14*/, -1/*15*/,
        0xc0/*16*/, -1, -1, -1, 0xd0/*20*/, -1, -1, -1, 
        0xe0/*24*/, -1, -1, -1, -1, -1, -1, -1, 
        0xf0/*32*/ 
    };
    const int BinDataCodeToLength[] = { 
        0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32
    };

    int binDataCodeToLength(int codeByte) { 
        return BinDataCodeToLength[codeByte >> 4];
    }

    /** object cannot be represented in compact format.  so store in traditional bson format 
        with a leading sentinel byte IsBSON to indicate it's in that format.

        Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here 
        so that we don't have to do an extra malloc.
    */
    void KeyV1Owned::traditional(const BSONObj& obj) { 
        b.reset();
        b.appendUChar(IsBSON);
        b.appendBuf(obj.objdata(), obj.objsize());
        _keyData = (const unsigned char *) b.buf();
    }

    // fromBSON to Key format
    KeyV1Owned::KeyV1Owned(const BSONObj& obj) {
        BSONObj::iterator i(obj);
        assert( i.more() );
        unsigned char bits = 0;
        while( 1 ) { 
            BSONElement e = i.next();
            if( i.more() )
                bits |= cHASMORE;
            switch( e.type() ) { 
            case MinKey:
                b.appendUChar(cminkey|bits);
                break;
            case jstNULL:
                b.appendUChar(cnull|bits);
                break;
            case MaxKey:
                b.appendUChar(cmaxkey|bits);
                break;
            case Bool:
                b.appendUChar( (e.boolean()?ctrue:cfalse) | bits );
                break;
            case jstOID:
                b.appendUChar(coid|bits);
                b.appendBuf(&e.__oid(), sizeof(OID));
                break;
            case BinData:
                {
                    int t = e.binDataType();
                    // 0-7 and 0x80 to 0x87 are supported by Key
                    if( (t & 0x78) == 0 && t != ByteArrayDeprecated ) {
                        int len;
                        const char * d = e.binData(len);
                        int code = BinDataLengthToCode[len];
                        if( code >= 0 ) {
                            if( t >= 128 )
                                t = (t-128) | 0x08;
                            dassert( (code&t) == 0 );
                            b.appendUChar( cbindata|bits );
                            b.appendUChar( code | t );
                            b.appendBuf(d, len);
                            break;
                        }
                    }
                    traditional(obj);
                    return;
                }
            case Date:
                b.appendUChar(cdate|bits);
                b.appendStruct(e.date());
                break;
            case String:
                {
                    b.appendUChar(cstring|bits);
                    // note we do not store the terminating null, to save space.
                    unsigned x = (unsigned) e.valuestrsize() - 1;
                    if( x > 255 ) { 
                        traditional(obj);
                        return;
                    }
                    b.appendUChar(x);
                    b.appendBuf(e.valuestr(), x);
                    break;
                }
            case NumberInt:
                b.appendUChar(cint|bits);
                b.appendNum((double) e._numberInt());
                break;
            case NumberLong:
                {
                    long long n = e._numberLong();
                    double d = (double) n;
                    if( d != n ) { 
                        traditional(obj);
                        return;
                    }
                    b.appendUChar(clong|bits);
                    b.appendNum(d);
                    break;
                }
            case NumberDouble:
                {
                    double d = e._numberDouble();
                    bool nan = !( d <= numeric_limits< double >::max() &&
                        d >= -numeric_limits< double >::max() );
                    if( !nan ) { 
                        b.appendUChar(cdouble|bits);
                        b.appendNum(d);
                        break;
                    }
                    // else fall through and return a traditional BSON obj so our compressed keys need not check for nan
                }
            default:
                // if other types involved, store as traditional BSON
                traditional(obj);
                return;
            }
            if( !i.more() )
                break;
            bits = 0;
        }
        _keyData = (const unsigned char *) b.buf();
        dassert( b.len() == dataSize() ); // check datasize method is correct
        dassert( (*_keyData & cNOTUSED) == 0 );
    }

    BSONObj KeyV1::toBson() const { 
        if( !isCompactFormat() )
            return bson();

        BSONObjBuilder b(512);
        const unsigned char *p = _keyData;
        while( 1 ) { 
            unsigned bits = *p++;

            switch( bits & 0x3f ) {
                case cminkey: b.appendMinKey(""); break;
                case cnull:   b.appendNull(""); break;
                case cfalse:  b.appendBool("", false); break;
                case ctrue:   b.appendBool("", true); break;
                case cmaxkey: 
                    b.appendMaxKey(""); 
                    break;
                case cstring:
                    {
                        unsigned sz = *p++;
                        // we build the element ourself as we have to null terminate it
                        BufBuilder &bb = b.bb();
                        bb.appendNum((char) String);
                        bb.appendUChar(0); // fieldname ""
                        bb.appendNum(sz+1);
                        bb.appendBuf(p, sz);
                        bb.appendUChar(0); // null char at end of string
                        p += sz;
                        break;
                    }
                case coid:
                    b.appendOID("", (OID *) p);
                    p += sizeof(OID);
                    break;
                case cbindata:
                    {
                        int len = binDataCodeToLength(*p);
                        int subtype = (*p) & BinDataTypeMask;
                        if( subtype & 0x8 ) { 
                            subtype = (subtype & 0x7) | 0x80;
                        }
                        b.appendBinData("", len, (BinDataType) subtype, ++p);
                        p += len;
                        break;
                    }
                case cdate:
                    b.appendDate("", (Date_t&) *p);
                    p += 8;
                    break;
                case cdouble:
                    b.append("", (double&) *p);
                    p += sizeof(double);
                    break;
                case cint:
                    b.append("", (int) ((double&) *p));
                    p += sizeof(double);
                    break;
                case clong:
                    b.append("", (long long) ((double&) *p));
                    p += sizeof(double);
                    break;
                default:
                    assert(false);
            }

            if( (bits & cHASMORE) == 0 )
                break;
        }
        return b.obj();
    }

    static int compare(const unsigned char *&l, const unsigned char *&r) { 
        int lt = (*l & cCANONTYPEMASK);
        int rt = (*r & cCANONTYPEMASK);
        int x = lt - rt;
        if( x ) 
            return x;

        l++; r++;

        // same type
        switch( lt ) { 
        case cdouble:
            {
                double L = *((double *) l);
                double R = *((double *) r);
                if( L < R )
                    return -1;
                if( L > R )
                    return 1;
                l += 8; r += 8;
                break;
            }
        case cstring:
            {
                int lsz = *l;
                int rsz = *r;
                int common = min(lsz, rsz);
                l++; r++; // skip the size byte
                // use memcmp as we (will) allow zeros in UTF8 strings
                int res = memcmp(l, r, common);
                if( res ) 
                    return res;
                // longer string is the greater one
                int diff = lsz-rsz;
                if( diff ) 
                    return diff;
                l += lsz; r += lsz;
                break;
            }
        case coid:
            {
                int res = memcmp(l, r, sizeof(OID));
                if( res ) 
                    return res;
                l += 12; r += 12;
                break;
            }
        case cbindata:
            {
                int L = *l;
                int R = *r;
                int diff = L-R; // checks length and subtype simultaneously
                if( diff )
                    return diff;
                // same length, same type
                l++; r++;
                int len = binDataCodeToLength(L);
                int res = memcmp(l, r, len);
                if( res ) 
                    return res;
                l += len; r += len;
                break;
            }
        case cdate:
            {
                long long L = *((long long *) l);
                long long R = *((long long *) r);
                if( L < R )
                    return -1;
                if( L > R )
                    return 1;
                l += 8; r += 8;
                break;
            }
        default:
            // all the others are a match -- e.g. null == null
            ;
        }

        return 0;
    }

    // at least one of this and right are traditional BSON format
    int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const { 
        BSONObj L = toBson();
        BSONObj R = right.toBson();
        return L.woCompare(R, order, /*considerfieldname*/false);
    }

    int KeyV1::woCompare(const KeyV1& right, const Ordering &order) const {
        const unsigned char *l = _keyData;
        const unsigned char *r = right._keyData;

        if( (*l|*r) == IsBSON ) // only can do this if cNOTUSED maintained
            return compareHybrid(right, order);

        unsigned mask = 1;
        while( 1 ) { 
            char lval = *l; 
            char rval = *r;
            {
                int x = compare(l, r); // updates l and r pointers
                if( x ) {
                    if( order.descending(mask) )
                        x = -x;
                    return x;
                }
            }

            {
                int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE));
                if( x ) 
                    return x;
                if( (lval & cHASMORE) == 0 )
                    break;
            }

            mask <<= 1;
        }

        return 0;
    }

    static unsigned sizes[] = {
        0,
        1, //cminkey=1,
        1, //cnull=2,
        0,
        9, //cdouble=4,
        0,
        0, //cstring=6,
        0,
        13, //coid=8,
        0,
        1, //cfalse=10,
        1, //ctrue=11,
        9, //cdate=12,
        0,
        1, //cmaxkey=14,
        0
    };

    inline unsigned sizeOfElement(const unsigned char *p) { 
        unsigned type = *p & cCANONTYPEMASK;
        unsigned sz = sizes[type];
        if( sz == 0 ) {
            if( type == cstring ) { 
                sz = ((unsigned) p[1]) + 2;
            }
            else {
                assert( type == cbindata );
                sz = binDataCodeToLength(p[1]) + 2;
            }
        }
        return sz;
    }

    int KeyV1::dataSize() const { 
        const unsigned char *p = _keyData;
        if( !isCompactFormat() ) {
            return bson().objsize() + 1;
        }

        bool more;
        do { 
            unsigned z = sizeOfElement(p);
            more = (*p & cHASMORE) != 0;
            p += z;
        } while( more );
        return p - _keyData;
    }

    bool KeyV1::woEqual(const KeyV1& right) const {
        const unsigned char *l = _keyData;
        const unsigned char *r = right._keyData;

        if( (*l|*r) == IsBSON ) {
            return toBson().woEqual(right.toBson());
        }

        while( 1 ) { 
            char lval = *l; 
            char rval = *r;
            if( (lval&(cCANONTYPEMASK|cHASMORE)) != (rval&(cCANONTYPEMASK|cHASMORE)) )
                return false;
            l++; r++;
            switch( lval&cCANONTYPEMASK ) { 
            case coid:
                if( *((unsigned*) l) != *((unsigned*) r) )
                    return false;
                l += 4; r += 4;
            case cdate:
            case cdouble:
                if( *((unsigned long long *) l) != *((unsigned long long *) r) )
                    return false;
                l += 8; r += 8;
                break;
            case cstring:
                {
                    unsigned sz = ((unsigned) *l) + 1;
                    if( memcmp(l, r, sz) ) // first byte checked is the length byte
                        return false;
                    l += sz; r += sz;
                    break;
                }
            case cbindata:
                {
                    int len = binDataCodeToLength(*l) + 1;
                    if( memcmp(l, r, len) ) 
                        return false;
                    l += len; r += len;
                    break;
                }
            case cminkey:
            case cnull:
            case cfalse:
            case ctrue:
            case cmaxkey:
                break;
            default:
                assert(false);
            }
            if( (lval&cHASMORE) == 0 )
                break;
        }
        return true;
    }

    struct CmpUnitTest : public UnitTest {
        void run() {
            char a[2];
            char b[2];
            a[0] = -3;
            a[1] = 0;
            b[0] = 3;
            b[1] = 0;
            assert( strcmp(a,b)>0 && memcmp(a,b,2)>0 );
        }
    } cunittest;

}