# Group sizes



In [28]:
import numpy as np
from scipy.stats import entropy
from tabulate import tabulate
from pymongo import MongoClient

db = MongoClient()['stores']
col = db.data

TOTAL_NUMBER_OF_PRODUCTS = col.count()

results = col.aggregate(
    [
        {
            "$group": {
                "_id": "$size",
                "count": {"$sum": 1},
            }
        },
        {
            "$sort": {
                "count": -1,
            }
        }
    ]
)

TOP_SIZES = [(str(x['_id']), x['count']) for x in list(results)]

print(
    "\n" + 
    tabulate(TOP_SIZES[:20], headers=['Size', 'Number of Products'], tablefmt="orgtbl") +
    "\n"
)




| Size     |   Number of Products |
|----------+----------------------|
| M        |               202965 |
| L        |               202641 |
| S        |               202589 |
| XL       |               186166 |
| 2XL      |               152014 |
| 3XL      |               106966 |
| XS       |                71104 |
| 4XL      |                50869 |
| MEDIUM   |                31645 |
| LARGE    |                31599 |
| SMALL    |                31506 |
| EXTRA    |                31098 |
| 2X       |                20091 |
| 3X       |                20085 |
| 5XL      |                17541 |
| ONE      |                13786 |
| XXL      |                11288 |
| OS       |                 7615 |
| ONE SIZE |                 7269 |
| XX       |                 7184 |



In [4]:
# calculate probability vector
p = []
for _, count in TOP_SIZES:
    p.append(count)
size_prob_vector = np.array(p) / TOTAL_NUMBER_OF_PRODUCTS

# calculate entropy
first_entropy = entropy(size_prob_vector)

print("Data entropy:", first_entropy)

3.07781166841


In [16]:
sizes_mapping = {
    'ALL': [],
    'NO SIZE': ['PLAIN', 'CONE', 'BLANKET'],
    'ONE': ['OS', 'ONE SIZE', '1 SIZ'],
    'XS': ['XXS', 'XX-SMALL', '2XS'],
    'S': ['SMALL', 'S/M'],
    'M': ['MEDIUM', 'S/M', 'M/L'],
    'L': ['LARGE', 'L/XL', 'M/L'],
    'XL': ['EXTRA', 'XLT', 'XT', 'L/XL'],
    '2XL': ['2X', 'XXL', '2XT', '2XLL', '2X/', '2XLT'],
    '3XL': ['3X', '3XT', '3XLL', '3XLT'],
    '4XL': ['4X', '4XT', '4XLT'],
    '5XL': ['5X', '5XT', '5XLT'],
    '6XL': ['6X'],
}

def build_matching_table(matching_rules):
    """Build matching table from matching rules

    :param matching_rules: matching rules used to build matching table
    :type matching_rules: dict
    :return: matching table ``{'S/M: ['S', 'M']...}``
    :rtype: dict
    """
    matching_table = {}
    # transform matching rules to the "shortcut": "group_key" table
    for key, values in matching_rules.items():
        if not values:  # skip undefined rules i.e. "[]"
            continue

        if key not in matching_table:
            # NOTE: set('ab') would be {'a', 'b'}
            # so it's impossible to matching_table[key] = set(key)
            matching_table[key] = set()
            matching_table[key].add(key)
            
        for value in values:
            if value not in matching_table:
                matching_table[value] = set()
                matching_table[value].add(key)
            else:
                matching_table[value].add(key)
    return matching_table

 
MATCHING_TABLE = build_matching_table(sizes_mapping)
print(MATCHING_TABLE)

{'6X': {'6XL'}, 'MEDIUM': {'M'}, '6XL': {'6XL'}, 'NO SIZE': {'NO SIZE'}, 'XLT': {'XL'}, 'XXS': {'XS'}, 'L': {'L'}, 'L/XL': {'L', 'XL'}, '2XL': {'2XL'}, '3XL': {'3XL'}, '4XT': {'4XL'}, 'BLANKET': {'NO SIZE'}, 'XL': {'XL'}, '4XL': {'4XL'}, '5X': {'5XL'}, '4XLT': {'4XL'}, 'M/L': {'M', 'L'}, '5XLT': {'5XL'}, 'S': {'S'}, 'XX-SMALL': {'XS'}, 'S/M': {'S', 'M'}, 'XXL': {'2XL'}, 'PLAIN': {'NO SIZE'}, '3XT': {'3XL'}, '5XT': {'5XL'}, '5XL': {'5XL'}, 'EXTRA': {'XL'}, '3X': {'3XL'}, '2XLT': {'2XL'}, 'SMALL': {'S'}, '3XLL': {'3XL'}, 'ONE': {'ONE'}, '2XS': {'XS'}, 'XT': {'XL'}, 'OS': {'ONE'}, '3XLT': {'3XL'}, '2XLL': {'2XL'}, '4X': {'4XL'}, 'ONE SIZE': {'ONE'}, 'XS': {'XS'}, '2X': {'2XL'}, '2XT': {'2XL'}, '1 SIZ': {'ONE'}, 'M': {'M'}, '2X/': {'2XL'}, 'CONE': {'NO SIZE'}, 'LARGE': {'L'}}


In [26]:
# process data into the new table
def get_groups(mtable, size):
    """Get size groups for the given ``size`` according to matching table

    :param size: size (case insensetive)
    :type size: str
    :return: list of strings i.e. size groups or ``['UNDEFINED']``
    if not found
    :rtype: list or ['UNDEFINED']
    """
    return list(mtable.get(size, ['UNDEFINED']))


# create new collection
col.aggregate(
    [
        {
            "$project": {
                "_id": 1,
                "source": 1,
                "size": 1,
            },
        },
        {
            "$out": "size_mapping"
        }
        
    ]
)

# create indexes
db.size_mapping.create_index([("size", 1)])
db.size_mapping.create_index([("source", 1)])

for k, v in MATCHING_TABLE.items():
    res = db.size_mapping.update_many(
        {"size": k}, 
        {"$set": {"size": get_groups(MATCHING_TABLE, k)}})
    print(res.raw_result)
    


{'n': 794, 'updatedExisting': True, 'ok': 1, 'nModified': 794}
{'n': 31645, 'updatedExisting': True, 'ok': 1, 'nModified': 31645}
{'n': 7787, 'updatedExisting': True, 'ok': 1, 'nModified': 6993}
{'n': 94, 'updatedExisting': True, 'ok': 1, 'nModified': 94}
{'n': 3819, 'updatedExisting': True, 'ok': 1, 'nModified': 3819}
{'n': 1837, 'updatedExisting': True, 'ok': 1, 'nModified': 1837}
{'n': 202641, 'updatedExisting': True, 'ok': 1, 'nModified': 202641}
{'n': 3901, 'updatedExisting': True, 'ok': 1, 'nModified': 3901}
{'n': 152014, 'updatedExisting': True, 'ok': 1, 'nModified': 152014}
{'n': 106966, 'updatedExisting': True, 'ok': 1, 'nModified': 106966}
{'n': 260, 'updatedExisting': True, 'ok': 1, 'nModified': 260}
{'n': 51, 'updatedExisting': True, 'ok': 1, 'nModified': 51}
{'n': 193886, 'updatedExisting': True, 'ok': 1, 'nModified': 190067}
{'n': 51129, 'updatedExisting': True, 'ok': 1, 'nModified': 50869}
{'n': 2448, 'updatedExisting': True, 'ok': 1, 'nModified': 2448}
{'n': 2535, 'upda

Let's calculate data entropy for results

In [27]:
results = db.size_mapping.aggregate(
    [
        {
            "$group": {
                "_id": "$size",
                "count": {"$sum": 1},
            }
        },
        {
            "$sort": {
                "count": -1,
            }
        }
    ]
)

NEW_SIZES = [(str(x['_id']), x['count']) for x in list(results)]

print(
    "\n" + 
    tabulate(NEW_SIZES[:20], headers=['Size', 'Number of Products'], tablefmt="orgtbl") +
    "\n"
)

# calculate probability vector
p = []
for _, count in NEW_SIZES:
    p.append(count)
size_prob_vector = np.array(p) / TOTAL_NUMBER_OF_PRODUCTS

# calculate entropy
first_entropy = entropy(size_prob_vector)

print("Data entropy: ", first_entropy)


| Size    |   Number of Products |
|---------+----------------------|
| ['M']   |               238640 |
| ['L']   |               234240 |
| ['S']   |               234095 |
| ['XL']  |               225167 |
| ['2XL'] |               187326 |
| ['3XL'] |               130454 |
| ['XS']  |                73671 |
| ['4XL'] |                60609 |
| ['ONE'] |                29188 |
| ['5XL'] |                20189 |
| ['6XL'] |                 7787 |
| XX      |                 7184 |
| ALL     |                 4087 |
| OSFA    |                 3687 |
| LT      |                 3355 |
| YL      |                 2341 |
| YM      |                 2305 |
| YS      |                 2304 |
| 4T      |                 2185 |
| 2T      |                 2145 |

Data entropy:  2.61658173526
