In [3]:
import time
from collections import deque
import csv
import numpy as np
import tensorflow as tf
from six import next
from tensorflow.core.framework import summary_pb2
import pandas as pd
import dataio
import ops

In [4]:
np.random.seed(13575)

BATCH_SIZE = 100
USER_NUM =  206202
ITEM_NUM = 32
DIM = 15
EPOCH_MAX = 100
DEVICE = "/cpu:0"

In [5]:
def clip(x):
    return np.clip(x, 1.0, None)


def make_scalar_summary(name, val):
    return summary_pb2.Summary(value=[summary_pb2.Summary.Value(tag=name, simple_value=val)])


def get_data():
    df = dataio.read_process("data/user_basket_size.csv", sep=",")
    df['group_indicator'] = (df.ix[:,0] != df.ix[:,0].shift(-1)).astype(int)

    df_train = df.loc[df.group_indicator==0]
    df_train = df_train.drop('group_indicator', axis=1)

    df_test =  df.loc[df.group_indicator==1]
    df_test = df_test.drop('group_indicator', axis=1)
    df = df.drop('group_indicator', axis=1)

    return df_train, df_test

In [6]:
def svd(train, test):
    samples_per_batch = len(train) // BATCH_SIZE
    print test.head(10)
    iter_train = dataio.ShuffleIterator([train["user"],
                                         train["days_since_prior_order"],
                                         train["basket_size"]],
                                        batch_size=BATCH_SIZE)

    iter_test = dataio.OneEpochIterator([test["user"],
                                         test["days_since_prior_order"],
                                         test["basket_size"]],
                                        batch_size=-1)

    user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
    days_since_prior_order_batch = tf.placeholder(tf.int32, shape=[None], name="id_days_since_prior_order")
    basket_size_batch = tf.placeholder(tf.float32, shape=[None])

    infer, regularizer = ops.inference_svd(user_batch, days_since_prior_order_batch, user_num=USER_NUM, item_num=ITEM_NUM, dim=DIM,
                                           device=DEVICE)
    global_step = tf.contrib.framework.get_or_create_global_step()
    _, train_op = ops.optimization(infer, regularizer, basket_size_batch, learning_rate=0.001, reg=0.05, device=DEVICE)

    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        summary_writer = tf.summary.FileWriter(logdir="/tmp/svd/log", graph=sess.graph)
        print("{} {} {} {}".format("epoch", "train_error", "val_error", "elapsed_time"))
        errors = deque(maxlen=samples_per_batch)
        start = time.time()
        min = 100
        predList = []
        actList = []
        finalPred = []
        finalAct = []
        finalpr = []
        finalac = []
        for i in range(EPOCH_MAX * samples_per_batch):

            users, days_since_prior_orders, basket_sizes = next(iter_train)
            _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                                   days_since_prior_order_batch: days_since_prior_orders,
                                                                   basket_size_batch: basket_sizes})
            pred_batch = clip(pred_batch)
            errors.append(np.power(pred_batch - basket_sizes, 2))
            if i % samples_per_batch == 0:
                train_err = np.sqrt(np.mean(errors))
                test_err2 = np.array([])
                for users, days_since_prior_orders, basket_sizes in iter_test:
                    pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                            days_since_prior_order_batch: days_since_prior_orders})
                    #pred_batch = clip(pred_batch)
                    test_err2 = np.append(test_err2, np.power(pred_batch - basket_sizes, 2))

                    pr = pred_batch
                    ac = basket_sizes
                end = time.time()
                test_err = np.sqrt(np.mean(test_err2))
                print("{:3d} {:f} {:f} {:f}(s)".format(i // samples_per_batch, train_err, test_err,
                                                       end - start))
                train_err_summary = make_scalar_summary("training_error", train_err)
                test_err_summary = make_scalar_summary("test_error", test_err)
                summary_writer.add_summary(train_err_summary, i)
                summary_writer.add_summary(test_err_summary, i)
                start = end

                if train_err < min:
                    min = train_err
                    finalpr = pr
                    finalac = ac

        return finalpr, finalac

In [7]:
if __name__ == '__main__':
    df_train, df_test = get_data()
    pr, ac = svd(df_train, df_test)
    print pr, type(pr), ac, type(ac)
    prdf = pd.DataFrame(pr)
    acdf = pd.DataFrame(ac)
    print df_test.head(10)

    result = pd.concat([prdf, acdf], axis=1)

    user  days_since_prior_order  basket_size       st
5      1                      19          4.0  3367565
11   201                       6          9.0  3056620
16   401                      31          2.0  3088579
21   601                      30         14.0  2155743
27   801                      31          2.0  3007470
32  1001                      31         17.0  3020360
37  1201                      28          4.0  3332416
43  1401                      30          6.0  3018159
49  1601                       5          7.0  3136784
53  1801                       5          3.0  3233025
epoch train_error val_error elapsed_time
  0 11.435908 12.577744 0.066063(s)
  1 11.776064 12.500966 1.282444(s)
  2 11.843151 12.419222 1.268513(s)
  3 11.813773 12.324445 1.259443(s)
  4 12.051254 12.210741 1.282468(s)
  5 11.769066 12.076256 1.250977(s)
  6 11.757210 11.919237 1.253225(s)
  7 12.018563 11.740817 1.255026(s)
  8 11.607823 11.543197 1.264547(s)
  9 11.647133 11.324613 1.2631

In [8]:
import numpy as np # linear algebra
import pprint
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import csv

In [7]:
orders = pd.read_csv("data/orders_train_test.csv")
prior = pd.read_csv("data/order_products__prior.csv")
train = pd.read_csv("data/order_products__train.csv")

In [8]:
frames = [prior, train]
products = result = pd.concat(frames)

In [9]:
orders = orders.loc[orders['group_indicator'] == 1]

# find the products that were actually bought in the last order of every user
test_orders = pd.merge(orders, products, on='order_id')
print test_orders.head(10)

test_orders2 = test_orders[['user_id', 'order_id', 'product_id']]
print test_orders2.head(10)

# create a list of lists
# list of the last order of each user
# containing lists of the products that each user bought in his last order
test_orders2 =  test_orders2.groupby(['user_id', 'order_id'])['product_id'].apply(list)
print test_orders2.head(10)
#filename = 'actual_products.csv'
#test_orders2.to_csv(filename, index=False, encoding='utf-8', header=False)


test_set = pd.read_csv("data/test_set_.csv", names = ["user_id", "days_since_prior_order", "basket", "order_id"])
# the next dataset contains the predicted basket size of the next basket (output of svd_train_val.py)
preds = pd.read_csv("data/pred-actual.csv",  names = ["pred", "actual"])
# this dataset contains statistics concerning users' consumer behaviour
user_prod_stats = pd.read_csv("data/user_product_stats.csv")
#act_prods = pd.read_csv("data/actual_products.csv")


   order_id  user_id  days_since_prior_order  basket_size  order_seq  \
0   1187899        1                    14.0           11          5   
1   1187899        1                    14.0           11          5   
2   1187899        1                    14.0           11          5   
3   1187899        1                    14.0           11          5   
4   1187899        1                    14.0           11          5   
5   1187899        1                    14.0           11          5   
6   1187899        1                    14.0           11          5   
7   1187899        1                    14.0           11          5   
8   1187899        1                    14.0           11          5   
9   1187899        1                    14.0           11          5   

   group_indicator  product_id  add_to_cart_order  reordered  
0                1         196                  1          1  
1                1       25133                  2          1  
2                1

In [10]:
test_preds = pd.concat([test_set, preds], axis=1)

pred_prods =pd.DataFrame()
l=int(len(test_set))
c=int(1)
final_pred_prods = []
final_pred_prods2 = pd.DataFrame()

In [11]:
i = 0

# iterate through the dataframe containing the user_id and the predicted number of his next basket

# for every user check the predicted size of his next basket and accordingly predict the products that he buy

# the prediction of the next basket products depends on the following:
# 1. predicted basket size
# 2. the preferences of the user,
for index, row in test_preds.iterrows():
     user_stats = []
     basket_size = int(round(row['pred'],0))
     user = row['user_id']

     user_stats = user_prod_stats.loc[user_prod_stats['user_id'] == user]
     user_products = user_stats['product_id']

     pred_prods  =  user_products.head(basket_size)
     df_row = pred_prods.tolist()
     final_pred_prods.append(df_row)

     i = i+1

print type(final_pred_prods)
print type(final_pred_prods[1])

print 'results'
for xs in final_pred_prods:
    print ",".join(map(str, xs))


<type 'list'>
<type 'list'>
results
196,10258,25133,12427,46149,49235,13032
13712,329,27845,23339
24852,47766,13249,45066,9387,22312,486,49683,21903,47626,47119,1516,28985,15579,16185,39024,26209
39527,34050,30440,4793,27966,23233
30742,9681,47626,20574,47209,17027,11576,13176
33894,9508,14778,28199,45596,39832,4792,36978,17982,34287,31663,24799,20955,17743,41650
42450,23106,38241,7533,42598
10032,49296,15472,26910,49633,42016,23026,46674,287
15649,31717,41433,17794,5456,24852,39812,7948
13176,27845,47029,11843,2716,21137
49683,42450,7131,21903,28465,44359,34466,34214,31040,29487,25043,45066,18926,432,10749,24799
5077,19678,34126,10644,22935,9365,34358,14540,17316
9337,15803,26209,25890,25659,24964,24852,30776,21903,44632,43295,41259,35951,45007,24035,41149,49683,34137,13535
40516,10193,10555,30551,16797,21938
3957,49235,5876,42736,16617,40992,21903,46676,21405
24852,4210,38452,22035,30995,46166,27086,4605,21938
16696,38371
48364,8899,26618,9710,5916,19904,33174,2856,2846,43867,11071
1

In [13]:
print 'get results for 4789'
for xs in final_pred_prods:
    print ",".join(map(str, xs))

13712


In [14]:
# create a list of lists
# list of the last order of each user
# containing lists of the predicted products for this order
with open('data/pred_products.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    #wr = ",".join(map(str, wr))
    wr.writerow(final_pred_prods)

In [15]:

import csv

#preds = pd.read_csv("pred_products2.csv")
predss = []


# read CSV file & load into list
with open("data/pred_products.csv", 'r') as my_file:
    reader = csv.reader(my_file, delimiter='\t')
    preds = list(reader)

with open("data/actual_products.csv", 'r') as my_file:
    reader = csv.reader(my_file, delimiter='\t')
    acts = list(reader)

acts = [l[0] for l in acts]

In [16]:
TTP = 0
TFP = 0
TFN = 0
TT = 0

i= 0

In [17]:
for pred, act in zip(preds, acts):
        act = str(act)
        act = act.replace(" ", "")
        pred = str(pred)

        pred = pred.replace(" ", "")
        pred = pred.replace("'", "")
        pred = pred.replace("[", "")
        pred = pred.replace("]", "")
        act = act.replace("[", "")
        act = act.replace("]", "")

        act = act.split(",")
        pred = pred.split(",")


        pred = set(pred)
        act = set(act)

        TP = len(set.intersection(act, pred))

        UN = len(set.union(act, pred))

        FP = len(pred)-TP
        FN = len(act)-TP
        T = len(act)

        AC = TP/float(T)
        #print TP, UN, FP, FN, T
        TTP=TTP+TP
        TFP=TFP+FP
        TFN=TFN+FN
        TT=TT+T
        #print TTP, TFP, TFN, TT

In [18]:
TAC = TTP/float(TT)
#print TTP,TT
PRE = TTP/float((TTP+TFP))
REC = TTP/float((TTP+TFN))
F1 = (2*(PRE*REC))/float((PRE+REC))

i = i+1
print 'true positives', TTP, '\nfalse positives', TFP, '\nfalse negatives', TFN, '\ntotal products bought', TT
print '\naccuracy', TAC, '\nprecision', PRE, '\nrecall', REC, '\nf1', F1


true positives 10 
false positives 5143 
false negatives 1 
total products bought 11

accuracy 0.909090909091 
precision 0.00194061711624 
recall 0.909090909091 
f1 0.00387296669249
