In [42]:
from sqlalchemy import create_engine,text
import pandas as pd
import numpy as np
import time
# NOTE(review): the connection URL embeds the database user and password directly in the
# notebook source; consider reading it from an environment variable before sharing.
engine = create_engine('postgresql://dslab:dslab2018@localhost/dslab')
# Open connection; later cells pass `c` to pandas to run SQL queries.
c = engine.connect()

# STEP 1, 3, 4, 5 code
SQL instructions that create the tables we need for category 6, as described in the workflow_summary document.

18.10.2018: `reduced_purchase`, `category_purchased_6` created


# STEP 2 code

### First define the new set of possible answers.

In [8]:
# Note on np.histogram (used below): every bin is half-open [left, right) except the last (right-most) bin, which also includes its right edge.
def create_categories(df_category):
    """Build the set of possible answers for every filter of one category.

    A filter (PropertyDefinitionId) is classified as:
      * 'option' when it has at least one non-null PropertyDefinitionOptionId;
        its answers are the distinct option ids.
      * 'bin' when it has no option ids and more than 10 distinct non-null
        PropertyValue entries; its answers are the bin edges returned by
        np.histogram with its default settings.
      * 'value' otherwise; its answers are the distinct non-null PropertyValue
        entries.

    Args:
        df_category: the product table restricted to one single category.

    Returns:
        A tuple (result, type_filters) where result maps each filter to its
        array of possible answers and type_filters maps each filter to
        'option', 'bin' or 'value'.
    """
    def _distinct(row_mask, column):
        # Distinct non-null entries of `column` among the selected rows.
        return df_category.loc[row_mask, column].dropna().drop_duplicates().values

    answers_by_filter = {}
    kind_by_filter = {}
    filter_ids = df_category["PropertyDefinitionId"].drop_duplicates().values
    n_binned = 0

    for filter_id in filter_ids:
        row_mask = df_category["PropertyDefinitionId"] == filter_id

        option_ids = _distinct(row_mask, 'PropertyDefinitionOptionId')
        if len(option_ids) != 0:
            answers_by_filter[filter_id] = option_ids
            kind_by_filter[filter_id] = 'option'
            continue

        # No fixed set of options: fall back to the raw property values.
        raw_values = _distinct(row_mask, 'PropertyValue')
        if len(raw_values) <= 10:
            answers_by_filter[filter_id] = raw_values
            kind_by_filter[filter_id] = 'value'
            continue

        bin_edges = np.histogram(raw_values)[1]
        answers_by_filter[filter_id] = bin_edges
        kind_by_filter[filter_id] = 'bin'
        n_binned += 1

    print('Have to categorize {} filters out of {}'.format(n_binned, len(filter_ids)))
    return (answers_by_filter, kind_by_filter)

Test the function

In [43]:
# Load the product/property rows of product type 6 from the product_only_ids table,
# using the connection `c` opened in the first cell.
filters_category_6 = pd.read_sql_query('''
SELECT "ProductId", "BrandId", "ProductTypeId", "PropertyValue", "PropertyDefinitionId", "PropertyDefinitionOptionId"
FROM product_only_ids
WHERE "ProductTypeId"='6' ;
''', c)

# Preview the rows with index labels 1 through 10 (label slicing with .loc is inclusive).
print(filters_category_6.loc[1:10, ])

    ProductId  BrandId  ProductTypeId  PropertyValue  PropertyDefinitionId  \
1     6611001        8              6            NaN                  8745   
2     6611001        8              6            NaN                   346   
3     6611001        8              6            NaN                  9649   
4     6611001        8              6            NaN                 10525   
5     6611001        8              6            NaN                  7202   
6     6611001        8              6            NaN                 11296   
7     6611001        8              6            NaN                   746   
8     6611001        8              6            NaN                 11339   
9     6611001        8              6            NaN                 10526   
10    6611001        8              6            NaN                 10656   

    PropertyDefinitionOptionId  
1                     306566.0  
2                        496.0  
3                       7090.0  
4        

In [44]:
# Build the answer sets and the filter types ('option', 'bin' or 'value') for category 6.
filters_def_dict, type_filters  = create_categories(filters_category_6)

Have to categorize 22 filters out of 139


In [87]:
def map_origAnswer_newAnswer(df, filters_def_dict, type_filters):
    """Find the new answer for each row of the dataframe.

    Args:
        df: frame with the columns "PropertyDefinitionId",
            "PropertyDefinitionOptionId" and "PropertyValue".
        filters_def_dict: dict mapping each filter to its possible answers; for
            'bin' filters this is the ascending array of bin edges.
        type_filters: dict mapping each filter to 'option', 'value' or 'bin'.

    Returns:
        A list with one answer per row, in row order:
          * 'option' filters -> the row's PropertyDefinitionOptionId,
          * 'value' filters  -> the row's PropertyValue,
          * 'bin' filters    -> the left edge of the bin containing the row's
            PropertyValue (the last bin includes its right edge). A missing
            PropertyValue maps to NaN, and values outside the edges are
            clamped to the first/last bin.

    Raises:
        KeyError: if a row's filter is missing from type_filters (or from
            filters_def_dict for a 'bin' filter).
    """
    answers = []
    # Iterate over the raw column arrays instead of df.loc[i, ...] per row: this is
    # much faster and does not break when the index contains duplicate labels.
    rows = zip(df["PropertyDefinitionId"].values,
               df["PropertyDefinitionOptionId"].values,
               df["PropertyValue"].values)
    for filter_id, option_id, value in rows:
        kind = type_filters[filter_id]
        if kind == 'option':
            answers.append(option_id)
        elif kind == 'value':
            answers.append(value)
        elif pd.isna(value):
            # A missing value belongs to no bin (previously it was silently mapped
            # to the last bin edge through bins[-1]).
            answers.append(np.nan)
        else:
            bins = filters_def_dict[filter_id]
            last_bin = len(bins) - 2  # index of the left edge of the last bin
            # Number of edges <= value, minus one, is the index of the containing bin.
            position = int(np.searchsorted(bins, value, side='right')) - 1
            answers.append(bins[min(max(position, 0), last_bin)])
    return answers

In [85]:
# Show rows labelled 1..10 next to the answers computed for the same rows.
print(filters_category_6.loc[1:10,])
print(map_origAnswer_newAnswer(filters_category_6.loc[1:10,], filters_def_dict, type_filters))

    ProductId  BrandId  ProductTypeId  PropertyValue  PropertyDefinitionId  \
1     6611001        8              6            NaN                  8745   
2     6611001        8              6            NaN                   346   
3     6611001        8              6            NaN                  9649   
4     6611001        8              6            NaN                 10525   
5     6611001        8              6            NaN                  7202   
6     6611001        8              6            NaN                 11296   
7     6611001        8              6            NaN                   746   
8     6611001        8              6            NaN                 11339   
9     6611001        8              6            NaN                 10526   
10    6611001        8              6            NaN                 10656   

    PropertyDefinitionOptionId    answer  
1                     306566.0  306566.0  
2                        496.0     496.0  
3           

In [88]:
# Store the mapped answer of every row in a new "answer" column (modifies filters_category_6 in place).
filters_category_6["answer"] = map_origAnswer_newAnswer(filters_category_6, filters_def_dict, type_filters)

In [91]:
# Spot-check the new column against the two source columns for rows labelled 1000..1010.
filters_category_6.loc[1000:1010, ["answer", "PropertyValue", "PropertyDefinitionOptionId"]]

Unnamed: 0,answer,PropertyValue,PropertyDefinitionOptionId
1000,3991.0,,3991.0
1001,19530.0,,19530.0
1002,167407.0,,167407.0
1003,12936.0,,12936.0
1004,374279.0,,374279.0
1005,3465.0,,3465.0
1006,5767.0,,5767.0
1007,324876.0,,324876.0
1008,185639.0,,185639.0
1009,168077.0,,168077.0


# STEP 6 and 7

## Need the parser created by Mohammed (with small modifications)

In [None]:
# PropertyGroup is for example Width, Table Properties or RAM
# PropertyDefinition is for example Shape, Size or Ram Type
# PropertyDefinitions have a Type. 
# - Either its just a value then we find it in the PropertyValue
# - If we have a fixed set of options they are PropertyDefinitionOption objects.
# PropertyDefinitionOption is actually a choice of Shape or RAM Type
# A Property links a ProductType a ProductDefinition and a Product Group
# A Property itself can also take one or multiple values -> TODO

# The parser handles the query string on pages that support filtering.
# Its output represents the currently selected options.

import json

def handle_opt(opt, result):
    '''
    This section handles properties that either directly have a primitive type value (bool, multidimensional properties)
    or Properties that link a PropertyGroup and PropertyDefinition which again map directly to a primitive type value.
    The Properties that directly have a primitive value will have the value in the column PropertyValue
    Also the properties with a list of possible values will have the value in the column PropertyValue

    The '|'-separated parts of opt are dispatched on their first character:
      t<id> / f<id>           boolean property     -> result['Property'][int(id)] = 1 / 0
      m<id>:<v1>,<v2>,...     multidimensional     -> result['Property'][int(id)] = [v1, v2, ...] (strings)
      v<group>-<def>:<value>  single value         ->
          result['PropertyGroup'][group]['PropertyDefinition'][def] = value (ids and value stay strings)
    'v' parts whose prefix contains '~' are skipped, as are empty parts and parts
    starting with any other character.

    result is updated in place and returned.
    Raises ValueError when a part does not have the expected shape (e.g. missing ':' or '-').
    '''
    for part in opt.split('|'):
        if not part:
            # Empty opt value or stray '|': nothing to record (part[0] would raise IndexError).
            continue
        kind = part[0]

        # Boolean properties
        if kind == 't' or kind == 'f':
            property_id = int(part[1:])
            result.setdefault('Property', dict())[property_id] = 1 if kind == 't' else 0

        # Multidimensional properties
        elif kind == 'm':
            prefix, values = part.split(':')
            property_id = int(prefix[1:])
            result.setdefault('Property', dict())[property_id] = values.split(',')

        # Single Value properties
        elif kind == 'v':
            prefix, value = part.split(':')
            if '~' in prefix:
                continue
            property_group_id, property_definition_id = prefix[1:].split('-')

            # Look the group up inside result['PropertyGroup']. Testing membership in
            # `result` itself was always true and reset the group on every part,
            # dropping definitions recorded earlier for the same group.
            groups = result.setdefault('PropertyGroup', dict())
            group = groups.setdefault(property_group_id, dict())
            group.setdefault('PropertyDefinition', dict())[property_definition_id] = value

    return result

def handle_bra(section, result):
    '''
    Handles the brand section of the query string.
    The '|'-separated brand ids are stored, as strings, under result['Brands']
    (replacing any previous entry) and result is returned.
    Example:
    https://www.galaxus.ch/de/s1/producttype/notebook-6?bra=1|47&tagIds=614
    ProductType 6 (Notebook)
    Selected are the Brands 1 (ASUS) and 47 (Apple)
    '''
    result['Brands'] = section.split('|')
    return result

def handle_rng_rou(section, result):
    '''
    This section handles ranges.
    I.e. Table Width for example.
    The min and max values are to be found in the column PropertyValue

    section is a '|'-separated list of "<group>-<definition>:<min>,<max>" tuples.
    For each tuple,
        result['PropertyGroup'][group]['PropertyDefinition'][definition]
    is set to {'Min': min, 'Max': max} (ids and bounds are kept as strings).
    Tuples whose prefix contains '~' and empty tuples are skipped.

    result is updated in place and returned.
    Raises ValueError when a tuple does not have the expected shape.
    '''
    for tup in section.split('|'):
        if not tup:
            # Empty section or stray '|': nothing to record.
            continue
        prefix, suffix = tup.split(':')
        if '~' in prefix:
            continue
        property_group_id, property_definition_id = prefix.split('-')
        minimum, maximum = suffix.split(',')

        # Look the group up inside result['PropertyGroup']. Testing membership in
        # `result` itself was always true and reset the group on every tuple,
        # dropping ranges recorded earlier for the same group.
        groups = result.setdefault('PropertyGroup', dict())
        group = groups.setdefault(property_group_id, dict())
        definitions = group.setdefault('PropertyDefinition', dict())
        definitions[property_definition_id] = {'Min': minimum, 'Max': maximum}
    return result

def handle_pdo(section, result):
    '''
    This section handles ProductPropertyOptions.
    These represent the fixed sets of options there are on a specific product type
    Example:
    https://www.galaxus.ch/de/s1/producttype/notebook-6?pdo=13-6885:277226&tagIds=614
    ProductType 6 (Notebook)
    Selected is the PropertyDefinitionOption 277226 (Windows 10 Pro)
    This is a option of the PropertyDefinition 6885 (Windows Version)
    This again is a definition in the PropertyGroup 13 (Operating System)

    section is a '|'-separated list of "<group>-<definition>:<option>" parts.
    Each option id is appended (as a string) to
        result['PropertyGroup'][group]['PropertyDefinition'][definition]['PropertyDefinitionOptionIds']
    so several options selected for the same definition accumulate in one list.
    Parts whose prefix contains '~' and empty parts are skipped.

    result is updated in place and returned.
    Raises ValueError when a part does not have the expected shape.
    '''
    for part in section.split('|'):
        if not part:
            # Empty section or stray '|': nothing to record.
            continue
        prefix, property_definition_option_id = part.split(':')
        if '~' in prefix:
            continue
        property_group_id, property_definition_id = prefix.split('-')

        # Look the group up inside result['PropertyGroup']. Testing membership in
        # `result` itself was always true and reset the group on every part, so
        # option ids of the same group never accumulated.
        groups = result.setdefault('PropertyGroup', dict())
        group = groups.setdefault(property_group_id, dict())
        definitions = group.setdefault('PropertyDefinition', dict())
        definition = definitions.setdefault(property_definition_id, dict())
        definition.setdefault('PropertyDefinitionOptionIds', []).append(property_definition_option_id)

    return result
    
def handle_section(section, result):
    '''
    Routes one section of the query string to its handler.
    The first three characters of the section select the handler
    (opt, bra, rng/rou or pdo) and everything from the fifth character on is
    handed to it. Sections with any other key leave result unchanged.
    '''
    key = section[:3]
    payload = section[4:]
    if key == 'opt':
        return handle_opt(payload, result)
    if key == 'bra':
        return handle_bra(payload, result)
    if key in ('rng', 'rou'):
        return handle_rng_rou(payload, result)
    if key == 'pdo':
        return handle_pdo(payload, result)
    return result

def parse_query_string(query_string):
    """Parse the filter query string of a request URL into a nested dict.

    The string is split on '&' and every section is passed to handle_section,
    which accumulates the selected brands, properties and property groups.

    Args:
        query_string: the query string to parse.

    Returns:
        The accumulated dict on success. If anything goes wrong while parsing
        (non-string input, malformed section, ...) the *string* '{}' is
        returned instead.
        NOTE(review): the fallback is a str, not a dict, so callers that index
        the result must check its type first; it is kept for compatibility.
    """
    try:
        sections = query_string.split('&')
        result = dict()
        for section in sections:
            result = handle_section(section, result)
        return result
    except Exception:
        # Best-effort parsing: a malformed query string yields the fallback value.
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate.
        return '{}'

# Example query string with one section of every kind the parser dispatches on (opt, bra, rng, rou, pdo).
# The remaining sections (nov, p, rfb, sale, pr, sr) match no handler in handle_section and are ignored.
example_qstring = 'opt=t44|m141:1,-2,3.14159,4,5|v3125-598080:6&bra=3301&nov=1:-30|2:15&rng=12-123:0.667,5|12-124:-2.7,1.414&rou=11-125:4,6.283|11-126:2.7,4&pdo=3126-598081:132|344-576:298&p=7667:1928|5123:1815&rfb=1&sale=1&pr=1&sr=1'
parse_query_string(example_qstring)

## STEP 6.a. Parse the RequestURL to get the list of filters and the raw JSON for the answers

In [92]:
def filters_answers_per_requestURL(SessionId, RequestUrl):
    """Extract the filters used in one request URL and their raw answers.

    Args:
        SessionId: identifier of the session the URL belongs to; it is not
            used by the parsing itself.
        RequestUrl: the query string to parse with parse_query_string.

    Returns:
        A tuple (propgroup_list, filters, dict_dict_answers):
          * propgroup_list: the property group ids found (maybe not needed),
          * filters: the PropertyDefinition ids found,
          * dict_dict_answers: {filter_id: [raw_answer]} where raw_answer is
            whatever the parser stored for that definition (a plain value or a
            nested dict that still needs further processing).
        All three are empty when the URL selects no property group or cannot
        be parsed.
    """
    parsed = parse_query_string(RequestUrl)
    # parse_query_string returns the string '{}' on failure, and a URL without
    # opt/rng/rou/pdo sections has no 'PropertyGroup' key: both mean "no filters"
    # rather than an error.
    groups = parsed.get("PropertyGroup", {}) if isinstance(parsed, dict) else {}

    propgroup_list = []  # optional
    filters = []
    dict_dict_answers = {}
    for propgroup, group_dict in groups.items():
        propgroup_list.append(propgroup)
        for propdef, optProp in group_dict.get('PropertyDefinition', {}).items():
            filters.append(propdef)  # PropertyDefinitionId
            # The raw answer is wrapped in a list; it may still be a nested dict
            # that needs further functions to parse it.
            dict_dict_answers[propdef] = [optProp]
    return (propgroup_list, filters, dict_dict_answers)

Test the function TO DO

In [None]:
# NOTE(review): traffic_purchased_6 is not defined in any visible cell; it must be loaded
# before this cell can run.
df = traffic_purchased_6
# Session id of the row labelled 1.
SessionId  = df["SessionId"][1]
# First request URL of that session. Select it by position: after filtering, the Series
# keeps the original row labels, so a label lookup with [0] fails unless row 0 happens
# to belong to the same session.
RequestUrl = df.loc[df["SessionId"] == SessionId, "RequestUrl"].iloc[0]
propgroup_list, filters, dict_dict_answers = filters_answers_per_requestURL(SessionId, RequestUrl)
print('group list')
print(propgroup_list)
print('filters list')
print(filters)
print('dict')
print(dict_dict_answers)

## STEP 6.b Process the JSON answer thing

Possible outputs to take into account 

In [95]:
def process_answers_filter(filter, answers_list_dict, filters_def_dict=None, type_filters=None):
    """Turn the raw answers parsed for one filter into a list of distinct answers.

    Args:
        filter: the filter (PropertyDefinitionId) the answers belong to.
        answers_list_dict: list of raw answers for that filter. Each item is
            either a dict produced by the parser (with a
            'PropertyDefinitionOptionIds' list and/or 'Min'/'Max' bounds) or a
            directly given value.
        filters_def_dict: dict mapping filters to their possible answers,
            forwarded to categorize. Defaults to the notebook-level variable
            of the same name.
        type_filters: dict mapping filters to their type, forwarded to
            categorize. Defaults to the notebook-level variable of the same
            name.

    Returns:
        The distinct answers, in no particular order. Option ids are taken
        as-is; ranges and direct values are mapped through categorize.

    Raises:
        ValueError: if an item has a 'Max' bound but no 'Min' bound.
        KeyError: if categorize is needed, the two dicts are not passed, and
            no notebook-level variable of that name exists.

    NOTE(review): the parser yields filter and option ids as strings, whereas
    the dicts built from the product table appear to be keyed by numeric
    ids; confirm that callers convert ids before relying on the lookup.
    """
    def _categorize(min_value, max_value=None):
        # Resolve the lookup tables lazily so inputs made only of option ids work
        # without them; fall back to the notebook globals when they are not passed.
        defs = filters_def_dict if filters_def_dict is not None else globals()['filters_def_dict']
        kinds = type_filters if type_filters is not None else globals()['type_filters']
        return categorize(filter, defs, kinds, min_value, max_value)

    answers = []
    for answers_item in answers_list_dict:
        if isinstance(answers_item, dict):
            answers.extend(answers_item.get('PropertyDefinitionOptionIds', []))
            if 'Max' in answers_item:
                if 'Min' not in answers_item:
                    # Previously this silently reused the 'Min' of an earlier item
                    # (or raised NameError on the first one).
                    raise ValueError("range answer has a 'Max' bound but no 'Min' bound")
                answers.extend(_categorize(answers_item['Min'], answers_item['Max']))
        else:
            # case where we have directly the answer:
            # no PropertyDefinitionOptionIds but a PropertyValue
            answers.extend(_categorize(answers_item))
    # TODO: check that only one of the two kinds of answers is present for a filter
    return list(set(answers))

In [96]:
def categorize(filter, filters_def_dict, type_filters, min_value, max_value=None):
    """Map a chosen value, or a chosen range, to the new answers of a filter.

    Args:
        filter: the filter (PropertyDefinitionId) the choice belongs to.
        filters_def_dict: dict mapping filters to their possible answers; for
            'bin' filters this is the ascending array of bin edges.
        type_filters: dict mapping filters to 'option', 'value' or 'bin'.
        min_value: the chosen value, or the lower bound when max_value is
            given. Numeric strings (as found in a query string) are accepted.
        max_value: the upper bound of the chosen range, or None for a single
            value.

    Returns:
        A list of answers:
          * single value, 'bin' filter: the left edge of the bin containing
            the value (the last bin includes its right edge);
          * single value, other filters: [min_value] unchanged;
          * range, 'bin' filter: the left edges of every bin that intersects
            [min_value, max_value];
          * range, other filters: the defined answers lying inside the range.
            NOTE(review): presumably the intended meaning of a range on a
            non-binned filter; confirm.
        Values outside the bin edges are clamped to the first/last bin.

    Raises:
        KeyError: if filter is missing from type_filters or filters_def_dict.
        ValueError: if a bound that must be compared is not numeric.
    """
    def _bin_index(edges, value):
        # Number of edges <= value, minus one, is the index of the containing bin;
        # clamp so out-of-range values and the closed right edge stay in a real bin.
        last_bin = len(edges) - 2
        position = int(np.searchsorted(edges, value, side='right')) - 1
        return min(max(position, 0), last_bin)

    kind = type_filters[filter]

    if max_value is None:
        if kind != 'bin':
            return [min_value]  # nothing has to be done
        edges = np.asarray(filters_def_dict[filter])
        # find the bin corresponding to the chosen value
        return [edges[_bin_index(edges, float(min_value))]]

    low, high = float(min_value), float(max_value)
    possible_answers = filters_def_dict[filter]
    if kind == 'bin':
        edges = np.asarray(possible_answers)
        # find the bins corresponding to the chosen range
        return list(edges[_bin_index(edges, low):_bin_index(edges, high) + 1])
    return [answer for answer in possible_answers if low <= float(answer) <= high]

TO DO test the function

In [None]:
# NOTE(review): unfinished test cell. No visible cell assigns a notebook-level `filter`
# (so the name resolves to the Python builtin here) or `answers_list_dict`; both need
# to be set, e.g. from `filters` / `dict_dict_answers` of STEP 6.a, before this can run.
print(dict_dict_answers)
print(filter)
process_answers_filter(filter, answers_list_dict)