### Note: You must download the `brands.csv` file from Slack and add it to the `data` folder before running this notebook

In [29]:
import pickle as pkl
import pandas as pd
import numpy as np

In [30]:
def load_obj(name):
    """
    Shortcut function to load an object stored as pkl
    Args:
        name: filename of the object, without the '.pkl' extension
    Returns:
        the unpickled object
    """
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        restored = pkl.load(handle)
    return restored
    
def save_obj(obj, name):
    """
    Pickle `obj` to disk using the highest available pickle protocol.
    Args:
        obj: object to save
        name: destination filename, without the '.pkl' extension
              (an existing file with that name is overwritten)
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as sink:
        pkl.dump(obj, sink, protocol=pkl.HIGHEST_PROTOCOL)

In [31]:
def create_categories(df_category):
    """ Defines the new answers.
    Note:
        This is a helper function.
    Args:
        df_category: the product table restricted to one single category.
                     Must contain the columns 'PropertyDefinitionId',
                     'PropertyDefinitionOptionId' and 'PropertyValue'.

    Returns:
        filters_def_dict: a dict mapping str(filterId) to a 1-D array holding the
                          new set of possible answers (bin edges for 'bin' filters).
                          Filters of type 'no_answer' have no entry.
        type_filters: a dict mapping filters to type of filters (option, bin or value or mixed)
                      {'questionid':'option'|'bin'|'value'|'mixed'|'no_answer'}
    """
    filters_def_dict = {}
    type_filters = {}
    n_filters = 0
    n_binned = 0
    for f in df_category["PropertyDefinitionId"].drop_duplicates().values:
        n_filters += 1
        key = str(f)
        # Select the rows of this filter once and reuse the mask for both columns.
        rows = df_category["PropertyDefinitionId"] == f
        values_defOpt = df_category.loc[rows, 'PropertyDefinitionOptionId'].dropna().drop_duplicates().values
        valuesProp = df_category.loc[rows, 'PropertyValue'].dropna().drop_duplicates().values
        has_options = len(values_defOpt) > 0
        has_values = len(valuesProp) > 0

        # Case filter is of 'option' type (i.e. answer is in defined set of possibilities)
        if has_options and not has_values:
            filters_def_dict[key] = values_defOpt
            type_filters[key] = 'option'

        # Case filter is of type 'value' or 'bin' (i.e. answer is a value not an id)
        elif has_values and not has_options:
            if len(valuesProp) > 10:
                # More than 10 possible values: the new answers are 10 bins whose
                # edges are the deciles of the distinct values.
                bins = np.percentile(valuesProp, [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
                filters_def_dict[key] = bins
                type_filters[key] = 'bin'
                n_binned += 1
            else:
                # Few enough values: keep the original answers.
                filters_def_dict[key] = valuesProp
                type_filters[key] = 'value'

        # The answer is sometimes stored as an id and sometimes as a value
        # in the original dataframe. Keep the original answers from both columns.
        elif has_options and has_values:
            # np.array(<set>) would build a 0-d object array wrapping the set;
            # concatenate + unique yields a proper 1-D array of distinct answers.
            filters_def_dict[key] = pd.unique(np.concatenate([values_defOpt, valuesProp]))
            type_filters[key] = 'mixed'

        # No answer at all for this filter.
        else:
            print('No answer is provided for filter {}'.format(f))
            type_filters[key] = 'no_answer'
    print('Have to categorize {} filters out of {}'.format(n_binned, n_filters))
    return filters_def_dict, type_filters

def eliminate_filters_no_answers(df, type_filters):
    """To eliminate questions for which there are no
    value available in the catalog.

    Note:
        First you need to create_categories in order to get the
        type_filters dictonary.

    Args:
        df: input product_catalog to clean (left unmodified)
        type_filters: input type_filters dict (cf. create_categories); its keys
                      must be convertible with float().

    Returns:
        new: new dataframe without the rows of the 'no_answer' filters.

    Example:
        >>> df = load_obj('../data/products_table')
        >>> filters_def_dict, type_filters = create_categories(df)
        >>> df = eliminate_filters_no_answers(df, type_filters)
    """
    unanswered_ids = [float(f) for f, kind in type_filters.items() if kind == 'no_answer']
    # A NaN id never compares equal to anything, so it selects no row; remove it
    # explicitly because Series.isin would otherwise match NaN cells.
    unanswered_ids = [fid for fid in unanswered_ids if not np.isnan(fid)]
    # Filter with a boolean mask instead of dropping index labels: dropping by
    # label also removes unrelated rows whenever the index is not unique.
    keep = ~df["PropertyDefinitionId"].isin(unanswered_ids)
    return df.loc[keep].copy()


def _bin_lower_edge(value, bins):
    """Return the lower edge of the bin of `bins` (ascending edges) containing `value`."""
    if pd.isna(value):
        # A missing value belongs to no bin.
        return np.nan
    last = len(bins) - 1
    # The final edge is only an upper bound, so count the lower edges <= value.
    n_edges_below = int(np.searchsorted(bins[:last], value, side='right'))
    # Clamp so a value below the first edge maps to the first bin, not to bins[-1].
    return bins[max(n_edges_below - 1, 0)]


def map_origAnswer_newAnswer(df, filters_def_dict, type_filters):
    """ Function to construct the final 'answer' column.
    Note:
        First run create_categories to get filters_def_dict and
        type_filters

    Args:
        df: input product_catalog with the row answers (columns
            'PropertyDefinitionId', 'PropertyDefinitionOptionId', 'PropertyValue')
        filters_def_dict: as described in create_categories
        type_filters: as described in create_categories

    Returns:
        answers: list of values to be used as the new 'answer' column, one entry
                 per row of df, in row order. Rows whose filter is 'no_answer'
                 (or of an unknown type) get NaN so the list stays aligned with df.

    Raises:
        KeyError: if a row's filter id is missing from type_filters.

    Example:
        >>> df = load_obj('../data/products_table')
        >>> filters_def_dict, type_filters = create_categories(df)
        >>> df['answer'] = map_origAnswer_newAnswer(df, filters_def_dict, type_filters)
    """
    answers = []
    # Iterate positionally over the raw column values: label-based df.loc[i, col]
    # returns a Series (not a scalar) when the index has duplicate labels.
    rows = zip(df["PropertyDefinitionId"].values,
               df["PropertyDefinitionOptionId"].values,
               df["PropertyValue"].values)
    for question_id, option_id, value in rows:
        key = str(question_id)
        kind = type_filters[key]
        # construct the new answer depending on the type of question
        if kind == 'option':
            answers.append(option_id)
        elif kind == 'value':
            answers.append(value)
        elif kind == 'bin':
            # map the original answer to the lower edge of its bin
            answers.append(_bin_lower_edge(value, filters_def_dict[key]))
        elif kind == 'mixed':
            # use the option id when present, otherwise the raw value
            answers.append(value if pd.isna(option_id) else option_id)
        else:
            # keep one entry per row even when there is nothing to map
            answers.append(np.nan)
    return answers

In [32]:
def question_id_to_text(question, question_df):
    """Return the text of a question given its id.

    Args:
        question: question (PropertyDefinitionId) as a number or numeric string,
                  e.g. 19219, 19219.0, '19219' or '19219.0'
        question_df: dataframe with the columns 'PropertyDefinitionId' and
                     'PropertyDefinition'

    Returns:
        The 'PropertyDefinition' of the first matching row, or
        'No text equivalent for question' when no row matches.

    Raises:
        ValueError / TypeError: if `question` cannot be converted to a number.
    """
    wanted_id = int(float(question))
    # Compare numerically so ids stored as strings, ints or floats all match
    # (the brand question added by this notebook is stored as an int).
    known_ids = pd.to_numeric(question_df["PropertyDefinitionId"], errors='coerce')
    matches = question_df.loc[known_ids == wanted_id, "PropertyDefinition"].values
    if len(matches) == 0:
        return 'No text equivalent for question'
    return matches[0]

def answer_id_to_text(answer, question, answer_df):
    """Translate a sequence of answer ids of one question into their texts.

    Args:
        answer: iterable of answer ids; the markers 'idk' and 'none' are passed through
        question: id of the question the answers belong to
        answer_df: dataframe with the columns 'answer_id', 'question_id', 'answer_text'

    Returns:
        A list with one entry per answer: the answer text as a string, the
        original answer when the lookup raises a TypeError, or 'Not Found'
        when no row matches.
    """
    answer_list = []
    for ans in answer:
        if ans == 'idk':
            text = 'idk'
        elif ans == 'none':
            text = 'none'
        else:
            try:
                same_answer = answer_df["answer_id"] == ans
                same_question = answer_df["question_id"] == int(question)
                candidates = answer_df.loc[same_answer & same_question, "answer_text"]
                text = candidates.astype(str).values[0]
            except TypeError:
                # lookup not possible for this input: fall back to the raw answer
                text = ans
            except IndexError:
                # no row for this (answer, question) pair
                text = 'Not Found'
        answer_list.append(text)
    return answer_list

In [33]:
# Load the pickled input tables (these are local project files; pickles from
# untrusted sources must not be loaded).
products_cat, traffic_cat, question_text_df, answer_text = map(
    load_obj,
    (
        '../data/products_table',
        '../data/traffic_table',
        '../data/question_text_df',
        '../data/answer_text',
    ),
)

In [34]:
# Adding Brands (in a separate column) as filter in products_cat
print("Adding brands as property in dataframe...")
brandId = 99999  # synthetic PropertyDefinitionId used for the "Brand" question

# Products that already carry a brand row (e.g. the cell was run before) are
# skipped, so re-running this cell does not duplicate brand rows.
products_with_brand = products_cat.loc[products_cat["PropertyDefinitionId"] == brandId, "ProductId"]

# The first row of each product supplies its BrandId and ProductTypeId.
first_row_per_product = (
    products_cat
    .dropna(subset=["ProductId"])
    .drop_duplicates(subset="ProductId")
)
first_row_per_product = first_row_per_product[
    ~first_row_per_product["ProductId"].isin(products_with_brand)
]

# Build all brand rows at once; appending row by row copies the whole frame on
# every iteration and DataFrame.append no longer exists in pandas >= 2.0.
brand_rows = pd.DataFrame({
    "ProductId": first_row_per_product["ProductId"].values,
    "BrandId": first_row_per_product["BrandId"].values,
    "ProductTypeId": first_row_per_product["ProductTypeId"].values,
    "PropertyDefinitionId": brandId,
    "PropertyDefinitionOptionId": first_row_per_product["BrandId"].values,
})
# Same string labels the former row-by-row loop produced: the frame grew by one
# row per iteration, hence the step of 2.
n_rows_before = len(products_cat)
brand_rows.index = [str(n_rows_before + 2 * k + 1) for k in range(len(brand_rows))]

if len(brand_rows) > 0:
    products_cat = pd.concat([products_cat, brand_rows], sort=False)
print("Added brands as property")
products_cat.head(10)

Adding brands as property in dataframe...
Added brands as property


Unnamed: 0,ProductId,BrandId,ProductTypeId,PropertyValue,PropertyDefinitionId,PropertyDefinitionOptionId,answer
0,6299632.0,314.0,6.0,5306.0,19219.0,,5306.0
1,6300952.0,314.0,6.0,5306.0,19219.0,,5306.0
2,6843624.0,314.0,6.0,325949.0,19219.0,,325949.0
3,6973019.0,314.0,6.0,325949.0,19219.0,,325949.0
4,9391967.0,331.0,6.0,325949.0,19219.0,,325949.0
5,9559743.0,47.0,6.0,5306.0,19219.0,,5306.0
6,9559750.0,47.0,6.0,5306.0,19219.0,,5306.0
7,9559752.0,47.0,6.0,5306.0,19219.0,,5306.0
8,9559759.0,47.0,6.0,5306.0,19219.0,,5306.0
9,9559766.0,47.0,6.0,5306.0,19219.0,,5306.0


In [36]:
# New answer definition:
#   1. derive the possible answers and the type of every filter,
#   2. remove the rows of filters that have no answer at all,
#   3. compute the final 'answer' column from the remaining rows.
filters_def_dict, type_filters = create_categories(products_cat)
products_cat = eliminate_filters_no_answers(products_cat, type_filters)
products_cat = products_cat.assign(
    answer=map_origAnswer_newAnswer(products_cat, filters_def_dict, type_filters)
)


Have to categorize 22 filters out of 136


In [38]:
# Adding brand to answer_text
brand_text_df = pd.read_csv("../data/brands.csv")
brand_text_df['question_id'] = brandId
# Columns are renamed by position.
# NOTE(review): assumes brands.csv has exactly two columns, brand id then brand name -- confirm.
brand_text_df.columns = ["answer_id", "answer_text", "question_id"]
# pd.concat replaces DataFrame.append (removed in pandas 2.0); sort=True keeps the
# alphabetical column order the legacy append produced when columns were not aligned.
# drop_duplicates also removes brand rows re-added by a second run of this cell.
answer_text = pd.concat([answer_text, brand_text_df], sort=True).drop_duplicates()
answer_text.head(10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


Unnamed: 0,answer_id,answer_text,question_id
0,5306.0,Business,19219
2,325949.0,Privat,19219
1539,0.0,0.0,15415
1593,1.0,1.0,15415
3986,0.0,0.0,15430
4020,1.0,1.0,15430
5505,3379.0,Silber,10526
5506,6596.0,802.11n,8246
5507,7090.0,Intel,9649
5508,3988.0,Deutsch,7202


In [39]:
# Adding brand to question_text
# question_id_to_text looks ids up as strings (str(int(question))), so the brand
# id is stored as a string as well; an int would never be matched.
brand_question_id = str(brandId)
brand_question_known = (
    question_text_df["PropertyDefinitionId"].astype(str) == brand_question_id
).any()
# Only add the row once, so re-running the cell does not duplicate the question.
if not brand_question_known:
    brand_question_row = pd.DataFrame(
        {"PropertyDefinition": ["Brand"], "PropertyDefinitionId": [brand_question_id]},
        index=[str(len(question_text_df))],
    )
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    question_text_df = pd.concat([question_text_df, brand_question_row], sort=False)

In [40]:
# Saving everything (ONLY RUN ONCE!)
# The products, question and answer tables overwrite the pickles loaded at the
# top of the notebook.
for _obj, _path in (
    (products_cat, '../data/products_table'),
    (filters_def_dict, '../data/filters_def_dict'),
    (type_filters, '../data/type_filters'),
    (question_text_df, '../data/question_text_df'),
    (answer_text, '../data/answer_text'),
):
    save_obj(_obj, _path)