In [7]:
import numpy as np
import pickle as pkl
import pandas as pd

def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pkl.dump(obj, f, pkl.HIGHEST_PROTOCOL)


def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pkl.load(f)

In [8]:
        product_set = load_obj('../data/products_table')
        traffic_set = load_obj('../data/traffic_table')
        purchased_set = load_obj('../data/purchased_table')
        question_text_df = load_obj('../data/question_text_df')
        answer_text_df = load_obj('../data/answer_text')
        filters_def_dict = load_obj('../data/filters_def_dict')

In [197]:
filters_def_dict

{'19219.0': array({5306.0, 325949.0}, dtype=object),
 '15415.0': array([0., 1.]),
 '15430.0': array([0., 1.]),
 '10526.0': array({1122.0, 419.0, 1123.0, 1126.0, 1415.0, 1416.0, 1930.0, 3379.0, 181621.0, 1430.0, 1433.0, 1915.0, 1916.0, 1918.0, 1913.0},
       dtype=object),
 '8246.0': array({6592.0, 6593.0, 6594.0, 6595.0, 6596.0, 6591.0}, dtype=object),
 '9649.0': array({356800.0, 165445.0, 8138.0, 392299.0, 361841.0, 7090.0, 288377.0, 10367.0},
       dtype=object),
 '7202.0': array({180711.0, 180713.0, 180722.0, 180723.0, 3988.0, 3989.0, 3990.0, 3991.0, 3992.0, 180729.0},
       dtype=object),
 '60.0': array({325762.0, 1157.0, 333332.0, 328982.0, 189082.0, 326560.0, 295589.0, 326566.0, 334247.0, 333360.0, 437.0, 8118.0, 326076.0, 305350.0, 1098.0, 200656.0, 198613.0, 732.0, 325730.0, 1635.0, 362.0, 327278.0, 356594.0, 325239.0, 195065.0, 295290.0},
       dtype=object),
 '11299.0': array({166528.0, 285698.0, 325253.0, 193545.0, 301322.0, 325899.0, 289932.0, 166670.0, 351892.0, 365719

In [207]:
def get_onehot_question(question_list, filters_def_dict):
    """ Compute a list of one-hot vectors to represent the questions asked:
    Args:
         question_list: questions considered
         filters_def_dict: dict where key is questionId, value is array of all possible (modified) answers
    Returns:
        all_one_hot: a list of one-hot vectors indicating the questions asked
    """
    questions_sorted=np.asarray(sorted(filters_def_dict.keys()))
    print(questions_sorted)
    all_one_hot = []
    for q in question_list:
        print(q)
        i = np.where(questions_sorted==str(q))[0][0]
        all_one_hot.append(i)
    return np.asarray(all_one_hot)

In [211]:
state = {'1.0': [1,2]}
number_filters = len(filters_def_dict.keys())
question_asked = state.keys()
print(question_asked)
one_hot_questions_asked = get_onehot_question(question_asked, filters_def_dict)
print(one_hot_questions_asked)
mask = np.ones(number_filters)
for q in one_hot_questions_asked:  # If question was already asked, set corresponding mask value to 0
    print(q)
    mask[q] = 0
mask

dict_keys(['1.0'])
['1.0' '100.0' '10058.0' '10104.0' '10525.0' '10526.0' '10527.0' '10551.0'
 '10552.0' '10588.0' '10589.0' '10600.0' '10601.0' '10604.0' '10628.0'
 '10629.0' '10631.0' '10656.0' '108.0' '10990.0' '11075.0' '112.0'
 '11280.0' '11281.0' '11282.0' '11288.0' '11294.0' '11295.0' '11296.0'
 '11297.0' '11298.0' '11299.0' '11339.0' '11550.0' '12129.0' '12418.0'
 '13.0' '13176.0' '13177.0' '13181.0' '13201.0' '13202.0' '13211.0'
 '13212.0' '134.0' '137.0' '14221.0' '1475.0' '15101.0' '15102.0'
 '15415.0' '15430.0' '16.0' '182.0' '19219.0' '19424.0' '2049.0' '230.0'
 '25.0' '2564.0' '2667.0' '2747.0' '2748.0' '3011.0' '3014.0' '3039.0'
 '3106.0' '3124.0' '3139.0' '3142.0' '3159.0' '3173.0' '3191.0' '3195.0'
 '346.0' '347.0' '349.0' '4.0' '403.0' '404.0' '417.0' '421.0' '459.0'
 '460.0' '522.0' '540.0' '5543.0' '56.0' '59.0' '6.0' '60.0' '611.0'
 '635.0' '656.0' '657.0' '6885.0' '697.0' '7.0' '705.0' '714.0' '7202.0'
 '725.0' '727.0' '7302.0' '734.0' '746.0' '748.0' '797.0' '797

array([0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [185]:
purchased_set

Unnamed: 0,ProductId,UserId,OrderId,SessionId,Items_ProductId,Items_ItemCount
0,329015,2326810,14202712,1.834223e+09,329015,1
1,329015,2865292,14691437,1.960941e+09,329015,1
2,329015,1582035,14504466,1.920271e+09,329015,1
3,438618,1800738,14791067,1.981204e+09,438618,1
4,723305,1764422,13962451,1.781320e+09,723305,1
5,723305,646258,13677947,1.721101e+09,723305,1
6,723305,907368,13954398,1.780284e+09,723305,1
7,725087,1392481,15539859,2.144386e+09,725087,1
8,2468309,609492,15899375,2.220953e+09,2468309,1
9,2479908,1153054,14510966,1.921881e+09,2479908,1


In [177]:
get_proba_Y_distribution(product_set, purchased_set, alpha=1)

Unnamed: 0,uniform,proportion_sold,final_proba
6299632.0,0.000128,0.000169,0.000149
6300952.0,0.000128,0.000000,0.000064
6843624.0,0.000128,0.000407,0.000267
6973019.0,0.000128,0.000169,0.000149
9391967.0,0.000128,0.000000,0.000064
9559743.0,0.000128,0.000000,0.000064
9559750.0,0.000128,0.000000,0.000064
9559752.0,0.000128,0.000000,0.000064
9559759.0,0.000128,0.000000,0.000064
9559766.0,0.000128,0.000000,0.000064


In [181]:
p = get_proba_Y_distribution(product_set, purchased_set, alpha=1)

In [184]:
p = get_proba_Y_distribution(products_cat, purchased_set, alpha=1)["final_proba"].values
y_array = np.random.choice(products_cat["ProductId"].drop_duplicates().values, size = 200, p = p_y)

In [152]:
y_array = np.random.choice(products_cat["ProductId"].drop_duplicates().values, size = 200, p = p_y)

ValueError: a and p must have same size

In [170]:
def get_proba_Y_distribution(products_cat, purchased_cat, alpha=1):
    """Compute the probability of the products according to history:
        Args:
            products_cat: product table [ProductId	BrandId	ProductTypeId	PropertyValue	PropertyDefinitionId	PropertyDefinitionOptionId	answer]
            purchased_cat: purchased table [ProductId	UserId	OrderId	SessionId	Items_ProductId	Items_ItemCount]
            alpha: alpha = 0 means uniform distribution for all the products, otherwise the bigger it is the more history is taken into account
        Returns:
            distribution: probability of the products according to history
    """
    # step 1 compute uniform distribution
    distribution = pd.DataFrame()
    unique_ids = products_cat['ProductId'].drop_duplicates().values
    number_prod_category_6 = len(unique_ids)
    proba_u = 1.0/number_prod_category_6 #uniform distribution: each product has the same probability of being bought by the client
    distribution["uniform"] = np.repeat(proba_u, number_prod_category_6)
    distribution.index = unique_ids
    distribution["proportion_sold"] = 0.0 # init to 0

    # step 2 take history into accounts
    if len(purchased_cat) > 0:
        sold_by_product = purchased_cat.groupby('ProductId').sum()["Items_ItemCount"]
        prod_ids = sold_by_product.index.values
        total_sold = np.sum(sold_by_product.values)
        adjust_proba_by_product = sold_by_product.values/float(total_sold)
        distribution.loc[prod_ids, "proportion_sold"] = adjust_proba_by_product
    
    # step 3 add uniform and history*alpha and renormalize to get a probability
    unormalized_final_proba = distribution["uniform"].values + alpha*distribution["proportion_sold"].values 
    distribution["final_proba"] = unormalized_final_proba/np.sum(unormalized_final_proba)
    return(distribution)

In [142]:
traffic_cat

Unnamed: 0,SessionId,answers_selected,Items_ProductId
0,2.229595e+09,"{'1': [0.39115999999999995], '7302': ['5767']}",5967297
1,2.235801e+09,{'1': [0.39115999999999995]},4678279
2,2.237479e+09,"{'7302': ['185245'], '11297': [0.0]}",5993239
3,2.238819e+09,"{'746': ['278771', '306565', '365632'], '6885'...",5940321
4,2.238902e+09,{'16667': ['310362']},5967297
5,2.239168e+09,{'11550': ['311476']},5967297
6,2.239470e+09,"{'12129': ['185639'], '1': [0.35559999999999997]}",5967297
7,2.239906e+09,"{'12129': ['5306', '185639', '12936'], '1': [0...",5967297
8,2.241201e+09,"{'12129': ['185639'], '7302': ['5767'], '11297...",5967297
9,2.241959e+09,"{'11280': [1000000000.0], '6885': ['277226']}",5940321


In [92]:
def question_id_to_text(question, question_df):
    try:
        question_text = question_df.loc[question_df["PropertyDefinitionId"] == str(int(question)), "PropertyDefinition"].values[0]
    except IndexError:
        question_text = 'No text equivalent for question'
    return question_text

In [97]:
question_text = question_id_to_text('7302', question_text_df)
print(question_text)

Betriebssystem


In [25]:
traffic_cat["answers_selected"]

0         {'1': [0.39115999999999995], '7302': ['5767']}
1                           {'1': [0.39115999999999995]}
2                   {'7302': ['185245'], '11297': [0.0]}
3      {'746': ['278771', '306565', '365632'], '6885'...
4                                  {'16667': ['310362']}
5                                  {'11550': ['311476']}
6      {'12129': ['185639'], '1': [0.35559999999999997]}
7      {'12129': ['5306', '185639', '12936'], '1': [0...
8      {'12129': ['185639'], '7302': ['5767'], '11297...
9          {'11280': [1000000000.0], '6885': ['277226']}
10                                      {'1': [0.32004]}
11     {'16': ['294'], '746': ['306568', '365631', '2...
12     {'11297': [400000000000.0, 0.0], '6885': ['324...
13     {'9668': ['180435'], '7302': ['5767'], '11339'...
14                                      {'1': [0.42672]}
15                                      {'1': [0.42672]}
16             {'12129': ['185639'], '6885': ['277226']}
17                          {'1

In [16]:
traffic_cat["answers_selected"].iloc[3]

{'746': ['278771', '306565', '365632'], '6885': ['277226']}

Compute the list of all the filters used in history:

In [43]:
list_filters_used = []
i = 0
for t in traffic_cat["answers_selected"]:
    i += 1
    for k in t.keys():
        list_filters_used.append(k)

Number of different filters used:

In [44]:
unique_filters = set(list_filters_used)
len(unique_filters)

50

There are many filters that don't occur in product tables! I deleted them...

Names and frequency:

In [97]:
question_text_list = []
df_history = pd.DataFrame(columns=["QuestionId", "text", "frequency"])
total_freq = 0
for f in unique_filters:
    question_text = question_id_to_text(f, question_text_df)
    if not question_text =='No text equivalent for question':
        question_text_list.append(question_text)
        freq = list_filters_used.count(f)
        total_freq += freq
        df_history.loc[len(df_history)] = [f, question_text, freq]
df_history["frequency"] = df_history["frequency"] / total_freq
df_history

Unnamed: 0,ProductId,text,frequency
0,16,Bildschirmoberfläche,0.0271536
1,697,Speicherkartentyp,0.00093633
2,13,Tastaturlayout,0.00093633
3,8745,Prozessortyp,0.0290262
4,460,Speicherkapazität,0.00280899
5,1,Bildschirmgrösse,0.250936
6,12129,Notebook Anwendungsbereich,0.114232
7,60,Datenspeicher Schnittstelle,0.00374532
8,748,Bildschirmtechnologie,0.00468165
9,7302,Betriebssystem,0.144195


In [104]:
def create_history(traffic_cat):
    #Compute the list of all the filters used in history
    list_filters_used = []
    i = 0
    for t in traffic_cat["answers_selected"]:
        i += 1
        for k in t.keys():
            list_filters_used.append(k)
    unique_filters = set(list_filters_used)
    question_text_list = []
    df_history = pd.DataFrame(columns=["ProductId", "text", "frequency"])
    total_freq = 0
    for f in unique_filters:
        question_text = question_id_to_text(f, question_text_df)
        if not question_text == 'No text equivalent for question':
            question_text_list.append(question_text)
            freq = list_filters_used.count(f)
            total_freq += freq
            df_history.loc[len(df_history)] = [f, question_text, freq]
    df_history["frequency"] = df_history["frequency"] / total_freq
    return df_history

In [110]:
t = time.time()
df_history =create_history(traffic_cat,question_text_df)
print(time.time()-t)
df_history

0.08571958541870117


Unnamed: 0,questionId,text,frequency
0,6885,Windows Version,0.100187
1,182,Formfaktor,0.00187266
2,9668,Optisches Laufwerktyp,0.0131086
3,1,Bildschirmgrösse,0.250936
4,16,Bildschirmoberfläche,0.0271536
5,421,Dedizierter Grafikspeicher,0.00187266
6,7202,Sprache,0.00093633
7,11297,Kapazität SSD,0.0561798
8,697,Speicherkartentyp,0.00093633
9,7302,Betriebssystem,0.144195


In [111]:
t = time.time()
df_history2 =create_history2(traffic_cat, question_text_df)
print(time.time()-t)
df_history2

0.07949209213256836


Unnamed: 0,questionId,text,frequency
0,6885,Windows Version,0.100187
1,182,Formfaktor,0.00187266
2,9668,Optisches Laufwerktyp,0.0131086
3,1,Bildschirmgrösse,0.250936
4,16,Bildschirmoberfläche,0.0271536
5,421,Dedizierter Grafikspeicher,0.00187266
6,7202,Sprache,0.00093633
7,11297,Kapazität SSD,0.0561798
8,697,Speicherkartentyp,0.00093633
9,7302,Betriebssystem,0.144195


In [108]:
def create_history2(traffic_cat, question_text_df):
    """ Create history dataframe of filters used:
        Args:
            traffic_cat: traffic table [SessionId	answers_selected	Items_ProductId]
            question_text_df: table to link questionId to text [PropertyDefinition	PropertyDefinitionId]
        Returns:
            df_history: history dataframe of filters used [QuestionId	text	frequency]
    """
    """ did not manage to make it faster"""
    # Compute the list of all the filters used in history
    list_filters_used = []
    [list_filters_used.append(k) for t in traffic_cat["answers_selected"]for k in t.keys()]
    unique_filters = set(list_filters_used)
    df_history = pd.DataFrame(columns=["questionId", "text", "frequency"])
    total_freq = 0
    for f in unique_filters:
        question_text = question_id_to_text(f, question_text_df)
        if not question_text == 'No text equivalent for question':
            freq = list_filters_used.count(f)
            total_freq += freq
            df_history.loc[len(df_history)] = [f, question_text, freq]
    df_history["frequency"] = df_history["frequency"] / total_freq
    return df_history

In [130]:
df_history["ProductId"].values


array(['16', '697', '13', '8745', '460', '1', '12129', '60', '748',
       '7302', '11297', '746', '421', '11298', '11280', '11550', '11339',
       '7', '347', '4', '59', '9668', '6885', '182', '10526', '3195',
       '7202'], dtype=object)

In [151]:
alpha = 2
question_set = ["7202", "421", "2"]
Q_proba = np.zeros(len(question_set))
for i in range(len(question_set)):
    q_id = str(int(question_set[i]))
    Q_proba[i] = 1/len(question_set)
    if q_id in df_history["ProductId"].values:
        Q_proba[i] += alpha * df_history["frequency"].loc[df_history["ProductId"] == q_id].values[0]
Q_proba = Q_proba / Q_proba.sum()
print(Q_proba)

[0.33333333 0.33519553 0.33147114]


In [143]:
df_history["frequency"].loc[df_history["ProductId"] == str(int(MI_matrix[1,0]))].values[0]
#int(MI_matrix[0,0]) in df_history["ProductId"]

0.0018726591760299626

In [163]:
a = [1,2,3,2] 
a = set(a)
list(a)

[1, 2, 3]

In [16]:
def select_subset(product_set, traffic_set = [], question = None, answer = None, purchased_set = []):
    """
    new took 0.4918787479400635
    old took 0.5476908683776855

    function assumes you have already build the answer column
    """
    all_products = set(product_set["ProductId"].values)
    if np.array_equal(["idk"], answer): # case i don't know the answer return everything
        return(product_set, traffic_set, [])
    else:
        answer = [str(x) for x in answer]
        tmp = product_set.loc[(product_set["PropertyDefinitionId"]==int(question)) & (product_set["answer"].astype(str).isin(answer)), ]
        products_to_keep = np.unique(tmp["ProductId"])
        product_set = product_set.loc[product_set["ProductId"].isin(products_to_keep),]
        if len(traffic_set) != 0:
            traffic_set = traffic_set.loc[traffic_set["Items_ProductId"].isin(products_to_keep),]
        if len(purchased_set) != 0:
            purchased_set = purchased_set.loc[purchased_set["Items_ProductId"].isin(products_to_keep),]
        if len(products_to_keep)==0:
            print('problem')
            print(len(all_products))
        return(product_set, traffic_set, purchased_set)

In [17]:
def get_proba_Y_distribution(products_cat, purchased_cat, alpha=1):
    distribution = pd.DataFrame()
    unique_ids = products_cat['ProductId'].drop_duplicates().values
    number_prod_category_6 = len(unique_ids)
    proba_u = 1.0/number_prod_category_6 # if all products had the same probability to be bought
    distribution["uniform"] = np.repeat(proba_u, number_prod_category_6)
    distribution.index = unique_ids
    
    distribution["proportion_sold"] = 0.0 # init to 0
    # step 2 take history into accounts
    if len(purchased_cat) > 0:
        sold_by_product = purchased_cat.groupby('ProductId').sum()["Items_ItemCount"]
        prod_ids = sold_by_product.index.values
        total_sold = np.sum(sold_by_product.values)
        adjust_proba_by_product = sold_by_product.values/float(total_sold)
        distribution.loc[prod_ids, "proportion_sold"] = adjust_proba_by_product
    
    # step 3 add uniform and history and renormalize to get a proba
    unormalized_final_proba = distribution["uniform"].values + alpha*distribution["proportion_sold"].values 
    distribution["final_proba"] = unormalized_final_proba/np.sum(unormalized_final_proba)
    return(distribution)


In [72]:
def conditional_entropy(answer, question, product_set, traffic_set, purchased_set):
    product_set, traffic_set, purchased_set = select_subset(question=question, answer=answer,
                                                            product_set=product_set, traffic_set =traffic_set,
                                                            purchased_set = purchased_set)
    product_ids = product_set["ProductId"].drop_duplicates().values
    try:
        p_product_given_a = get_proba_Y_distribution(product_set, purchased_set, alpha=1)["final_proba"]
    except ZeroDivisionError:
        print('pbm only {} product left'.format(product_ids))
        print(answer)
        print(question)
    #t = time.time()
    prob_y_given_a = [p_product_given_a.loc[product] for product in product_ids]
    cond_entropy_y = np.sum(prob_y_given_a*np.log(prob_y_given_a))
    """ CHECKED new version is little faster (not much though)
    print('new {}'.format(time.time()-t))
    t = time.time()
    cond_entropy_y = 0
    for product in product_ids:
        prob_y_given_a = p_product_given_a.loc[product]
        cond_entropy_y += prob_y_given_a * np.log(prob_y_given_a)
    print(time.time()-t)
    print(cond_entropy_y)
    """
    return cond_entropy_y

def get_proba_A_distribution_none(question, products_cat, traffic_processed, alpha=1):
    """
    old took 0.9141941070556641
    new took 0.033429861068725586
    time divided by 27 !!!!!

    assumes answer is already constructed
    """
    distribution = pd.DataFrame()
    number_products_total = len(products_cat['ProductId'].drop_duplicates().values)
    
    if (number_products_total==0):
        print('Nothing to return there is no product left with this filter')
        return(distribution)
    
    # step 1: probas is number of product per answer to the question (no history)
    products_cat = products_cat.loc[products_cat["PropertyDefinitionId"]==int(question), ]
    nb_prod_with_answer = len(np.unique(products_cat["ProductId"])) # new
    distribution["nb_prod"] = products_cat[['ProductId','answer']].groupby(['answer']).count()["ProductId"]
    distribution.index = distribution.index.astype(float)
    nb_prod_without_answer = number_products_total - nb_prod_with_answer
    distribution["catalog_proba"] = distribution["nb_prod"]/number_products_total
    
    #step 2: add the history if available just for KNOWN answers
    distribution["history_proba"] = 0
    if (len(traffic_processed)>0):
        history_answered = []
        response = traffic_processed["answers_selected"].values
        for r_dict in response:
            if str(question) in r_dict:
                history_answered.extend(r_dict[str(question)])
        if not history_answered == []: 
            series = pd.Series(history_answered)
            add_probas = series.value_counts()
            s_add = sum(add_probas.values)
            add_probas = add_probas/s_add
            index = add_probas.index
            for i in index:
                if float(i) in distribution.index:
                    distribution.loc[float(i), "history_proba"] = add_probas.loc[i]
    distribution["final_proba"] = distribution["history_proba"].values + alpha*distribution["catalog_proba"].values
    # add the idk case JUST FROM CATALOG
    if nb_prod_without_answer!=0:
        distribution.loc["idk", "final_proba"] = nb_prod_without_answer/float(number_products_total)
    # renormalize everything
    distribution["final_proba"] = distribution["final_proba"]/distribution["final_proba"].sum()
    return(distribution)

In [88]:
import time
t = time.time()
proba_A = get_proba_A_distribution_none(697, product_set, traffic_set, alpha=1)["final_proba"]
possible_answers = proba_A.index
short_mutual_info = 0
for answer in possible_answers:
    short_mutual_info += proba_A.loc[answer]* \
                             conditional_entropy(np.asarray([answer]), 16, product_set, traffic_set, purchased_set)
print(time.time()-t)

  return bool(asarray(a1 == a2).all())


problem
7797
pbm only [] product left
[3450.]
16
problem
7797
pbm only [] product left
[3453.]
16
problem
7797
pbm only [] product left
[3454.]
16
problem
7797
pbm only [] product left
[3455.]
16
problem
7797
pbm only [] product left
[3456.]
16
problem
7797
pbm only [] product left
[3457.]
16
problem
7797
pbm only [] product left
[3458.]
16
problem
7797
pbm only [] product left
[3459.]
16
problem
7797
pbm only [] product left
[3460.]
16
problem
7797
pbm only [] product left
[3461.]
16
problem
7797
pbm only [] product left
[3462.]
16
problem
7797
pbm only [] product left
[3464.]
16
problem
7797
pbm only [] product left
[3465.]
16
problem
7797
pbm only [] product left
[6105.]
16
problem
7797
pbm only [] product left
[6107.]
16
problem
7797
pbm only [] product left
[8214.]
16
6.30974268913269


In [21]:
short_mutual_info

-7.281193481102179

In [44]:
def conditional_entropyA(a):
    return conditional_entropy(np.asarray([a]), 16, product_set, traffic_set, purchased_set)

In [84]:
    t = time.time()
    proba_A = get_proba_A_distribution_none(697, product_set, traffic_set, alpha=1)["final_proba"]
    possible_answers = proba_A.index
    print(possible_answers)
    short_mutual_info = proba_A.loc[possible_answers]
    conditional = list(map(lambda x: conditional_entropy(np.asarray([x]), 16, product_set, traffic_set, purchased_set), possible_answers))
    short_mutual_info = sum(short_mutual_info* conditional)
    print(time.time()-t)

Index([294.0, 394.0, 'idk'], dtype='object', name='answer')


  return bool(asarray(a1 == a2).all())


1.3045220375061035


In [62]:
answer = np.asarray([possible_answers[0]])
answer

array([294.])

In [117]:
        products_cat = load_obj('../data/products_table')
        traffic_processed = load_obj('../data/traffic_table')
        purchased_set = load_obj('../data/purchased_table')
        question_text_df = load_obj('../data/question_text_df')
        answer_text_df = load_obj('../data/answer_text')

In [None]:
0.08621597290039062

In [71]:
    product_set, traffic_set, purchased_set = select_subset(question=16, answer=answer,
                                                            product_set=product_set, traffic_set =traffic_set,
                                                            purchased_set = purchased_set)
    product_ids = product_set["ProductId"].drop_duplicates().values
    try:
        p_product_given_a = get_proba_Y_distribution(product_set, purchased_set, alpha=1)["final_proba"]
    except ZeroDivisionError:
        print('pbm only {} product left'.format(product_ids))
        print(answer)

    t = time.time()
    prob_y_given_a = [p_product_given_a.loc[product] for product in product_ids]
    cond_entropy_y = np.sum(prob_y_given_a*np.log(prob_y_given_a))
    print(time.time())
    cond_entropy_y

  return bool(asarray(a1 == a2).all())


0.09097599983215332


-7.162092207534139

In [133]:
    question = 16
    alpha = 1
    distribution = pd.DataFrame()
    number_products_total = len(products_cat['ProductId'].drop_duplicates().values)
    #step 1: probas is number of product per answer to the question (no history)
    products_cat = products_cat.loc[products_cat["PropertyDefinitionId"]==int(question), ]
    nb_prod_with_answer = len(np.unique(products_cat["ProductId"]))
    distribution["nb_prod"] = products_cat[['ProductId','answer']].groupby(['answer']).count()["ProductId"]
    print(distribution)
    distribution.index = distribution.index.astype(float)
    nb_prod_without_answer = number_products_total - nb_prod_with_answer
    distribution["catalog_proba"] = distribution["nb_prod"]/number_products_total

        nb_prod
answer         
294.0      6616
394.0      4456


In [137]:
products_cat[['ProductId','answer']].groupby(['answer']).count()["ProductId"]

answer
294.0    6616
394.0    4456
Name: ProductId, dtype: int64

In [141]:
products_cat[['answer','ProductId']].groupby(['answer']).count()["ProductId"]

answer
294.0    6616
394.0    4456
Name: ProductId, dtype: int64

In [132]:
products_cat

Unnamed: 0,ProductId,BrandId,ProductTypeId,PropertyValue,PropertyDefinitionId,PropertyDefinitionOptionId,answer
5521,7885879,8,6,,16,294.0,294.0
5559,7885938,8,6,,16,294.0,294.0
5585,7889431,8,6,,16,294.0,294.0
5614,8021422,5,6,,16,394.0,394.0
5647,8021439,5,6,,16,394.0,394.0
5687,7924831,8,6,,16,294.0,294.0
5713,7889449,8,6,,16,294.0,294.0
5745,7931823,8,6,,16,294.0,294.0
5792,7928558,8,6,,16,294.0,294.0
5820,7932450,8,6,,16,294.0,294.0


In [131]:
    #step 2: add the history if available just for KNOWN answers
    distribution["history_proba"] = 0
    if (len(traffic_processed)>0):
        history_answered = []
        response = traffic_processed["answers_selected"].values
        for r_dict in response:
            if str(question) in r_dict:
                history_answered.extend(r_dict[str(question)])
        if not history_answered == []: 
            series = pd.Series(history_answered)
            add_probas = series.value_counts()
            s_add = sum(add_probas.values)
            add_probas = add_probas/s_add
            index = add_probas.index
            for i in index:
                if float(i) in distribution.index:
                    distribution.loc[float(i), "history_proba"] = add_probas.loc[i]
    distribution

Unnamed: 0_level_0,nb_prod,catalog_proba,history_proba
answer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
294.0,6616,1.195303,0.933333
394.0,4456,0.805059,0.066667


In [None]:
    #step 3: combine the two probabilities as: p1 + alpha * p2
    distribution["final_proba"] = distribution["catalog_proba"].values + alpha*distribution["history_proba"].values
    # add the idk case JUST FROM CATALOG
    if nb_prod_without_answer!=0:
        distribution.loc["idk", "final_proba"] = nb_prod_without_answer/float(number_products_total)
    # renormalize everything
    distribution["final_proba"] = distribution["final_proba"]/distribution["final_proba"].sum()
    distribution

In [215]:
int(float('9264.0'))

9264

In [218]:
a = 2
while True:
    print(1)
    if a == 2:
        if a == 2:
            break

1


In [2]:
import tensorlayer as tl

  from ._conv import register_converters as _register_converters


In [12]:
data = tl.files.load_npy_to_any(name='../teacher_dagger/s500_p2a0.0_p3a0.0_pidk0.0_a1.0_1543674668_tmp.npy')
state_list = data['state_list'][0:10]
question_list = data['act'][0:20]
#print('Data found and loaded')

In [13]:
state_list

[{},
 {99999: [19.0]},
 {99999: [19.0], 8: [0.02455]},
 {},
 {99999: [19.0]},
 {99999: [19.0], 8: [0.02455]},
 {},
 {99999: ['idk']},
 {99999: ['idk'], 8: ['idk']},
 {99999: ['idk'], 8: ['idk'], 6: [0.321]}]

In [20]:
t = get_onehot_state(state_list, filters_def_dict)

[0.218]
[19.0]


In [19]:
def get_onehot_state(state, filters_def_dict):
    """ Compute the one-hot vector state from state:
    Args:
        state: {"q1":[a1,a2], "q2":[a3], ..}
        filters_def_dict:
    Returns:
        onehot_state: one-hot vector state ([0,0,1,1,0,0,...,0,0])
    """
    questions = [int(float(q)) for q in sorted(filters_def_dict.keys())]
    onehot_state = []
    for q in questions:
        # Get all sorted possible answers
        # some questions have an answer type object and other a normal array
        if filters_def_dict[str(float(q))].dtype == object:
            all_a = sorted(filters_def_dict[str(float(q))].item())
        else:
            all_a = sorted(filters_def_dict[str(float(q))])
        # if q has been answered in state
        if q in state.keys():
            a = state[q]  #get answers from that question
            print(a)
            if not isinstance(a,list):
                a = a.tolist()
            for a_h in all_a: #for all possible answers of q
                if a_h in a:
                    onehot_state.append(1)
                else:
                    onehot_state.append(0)
        # if q has NOT been answered in state
        else:
            [onehot_state.append(0) for i in range(len(all_a))]
    return onehot_state

In [245]:
question_list = [99999, 8, 6]
state_list = []
all_questions_list = []
        # Divide the entire trajectory in {state, action}
history = {}
state_list.append(history.copy())
for q in question_list[: -1]:
    answers = [34]
    history[q] = answers
    state_list.append(history.copy())
    all_questions_list.append(q)
all_questions_list.append(question_list[-1])
print(state_list)
all_questions_list

[{}, {99999: [34]}, {99999: [34], 8: [34]}]


[99999, 8, 6]

In [252]:
        question_list = [99999, 8, 6]
        state_list = []
        all_questions_list = []
        history = {}     # first state is state zero
        state_list.append(history.copy())
        for q in question_list[: -1]:
            answers = [33]
            history[q] = answers
            state_list.append(history.copy())
            all_questions_list.append(q)
        all_questions_list.append(question_list[-1])

In [254]:
all_questions_list

[99999, 8, 6]

In [255]:
question_list[-1]

6

In [19]:
import numpy as np
def process_all_teacher_files(list_filenames, outputname='_tmp.npy'):
    """This function is used to merge several data files from 
    several teacher runs.
    
    Args:
        list_filenames: list of filenames to be merged
        outputname: filename where to save the merged data
    
    Example:
        list_filenames = ['s200_p2a0.0_p3a0.0_pidk0.0_a1.0_tmp.npy', 's200_p2a0.2_p3a0.1_pidk0.1_a1.0_tmp.npy']
        process_all_teacher_files(list_filenames, outputname='test.npy')  
    """
    tmp = tl.files.load_npy_to_any(name=list_filenames[0])
    act = tmp['act']
    state = tmp['state_list']
    for d in list_filenames[1:]:
        tmp = tl.files.load_npy_to_any(name=d)
        act = np.append(act, tmp['act'])
        state = np.append(state, tmp['state_list'])
    tl.files.save_any_to_npy(save_dict={'state_list': state, 'act': act}, name=outputname)

In [22]:
list_filenames = ['../teacher_dagger/s200_p2a0.2_p3a0.1_pidk0.0_a1.0_1543608003_tmp.npy', 
                  '../teacher_dagger/s200_p2a0.2_p3a0.1_pidk0.0_a1.0_1543679755_tmp.npy', 
                  '../teacher_dagger/s200_p2a0.2_p3a0.1_pidk0.1_a1.0_1543627716_tmp.npy', 
                  '../teacher_dagger/s200_p2a0.2_p3a0.1_pidk0.1_a1.0_1543677266_tmp.npy', 
                  '../teacher_dagger/s200_p2a0.3_p3a0.2_pidk0.1_a1.0_1543617966_tmp.npy',
                    '../teacher_dagger/s200_p2a0.3_p3a0.2_pidk0.1_a1.0_1543667462_tmp.npy',
                  '../teacher_dagger/s500_p2a0.0_p3a0.0_pidk0.0_a1.0_1543674668_tmp.npy',
                  '../teacher_dagger/s500_p2a0.0_p3a0.0_pidk0.0_a1.0_1543703593_tmp.npy',
                  '../teacher_dagger/1000_tmp.npy'
                 ]

In [23]:
process_all_teacher_files(list_filenames, outputname='_tmp.npy')