In [49]:
import time
from collections import deque
import csv
import numpy as np
import tensorflow as tf
from six import next
from tensorflow.core.framework import summary_pb2
import pandas as pd
import dataio
import ops
import pymongo
from pymongo import MongoClient
import datetime


In [2]:
np.random.seed(13575)

BATCH_SIZE = 100
USER_NUM =  206202
ITEM_NUM = 32
DIM = 15
EPOCH_MAX = 100
DEVICE = "/cpu:0"

In [3]:
def clip(x):
    return np.clip(x, 1.0, None)


def make_scalar_summary(name, val):
    return summary_pb2.Summary(value=[summary_pb2.Summary.Value(tag=name, simple_value=val)])


def get_data():
    df = dataio.read_process("data/user_basket_size.csv", sep=",")
    df['group_indicator'] = (df.ix[:,0] != df.ix[:,0].shift(-1)).astype(int)

    df_train = df.loc[df.group_indicator==0]
    df_train = df_train.drop('group_indicator', axis=1)

    df_test =  df.loc[df.group_indicator==1]
    df_test = df_test.drop('group_indicator', axis=1)
    df = df.drop('group_indicator', axis=1)

    return df_train, df_test

In [4]:
def svd(train, test):
    samples_per_batch = len(train) // BATCH_SIZE
    print test.head(10)
    iter_train = dataio.ShuffleIterator([train["user"],
                                         train["days_since_prior_order"],
                                         train["basket_size"]],
                                        batch_size=BATCH_SIZE)

    iter_test = dataio.OneEpochIterator([test["user"],
                                         test["days_since_prior_order"],
                                         test["basket_size"]],
                                        batch_size=-1)

    user_batch = tf.placeholder(tf.int32, shape=[None], name="id_user")
    days_since_prior_order_batch = tf.placeholder(tf.int32, shape=[None], name="id_days_since_prior_order")
    basket_size_batch = tf.placeholder(tf.float32, shape=[None])

    infer, regularizer = ops.inference_svd(user_batch, days_since_prior_order_batch, user_num=USER_NUM, item_num=ITEM_NUM, dim=DIM,
                                           device=DEVICE)
    global_step = tf.contrib.framework.get_or_create_global_step()
    _, train_op = ops.optimization(infer, regularizer, basket_size_batch, learning_rate=0.001, reg=0.05, device=DEVICE)

    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        summary_writer = tf.summary.FileWriter(logdir="/tmp/svd/log", graph=sess.graph)
        print("{} {} {} {}".format("epoch", "train_error", "val_error", "elapsed_time"))
        errors = deque(maxlen=samples_per_batch)
        start = time.time()
        min = 100
        predList = []
        actList = []
        finalPred = []
        finalAct = []
        finalpr = []
        finalac = []
        for i in range(EPOCH_MAX * samples_per_batch):

            users, days_since_prior_orders, basket_sizes = next(iter_train)
            _, pred_batch = sess.run([train_op, infer], feed_dict={user_batch: users,
                                                                   days_since_prior_order_batch: days_since_prior_orders,
                                                                   basket_size_batch: basket_sizes})
            pred_batch = clip(pred_batch)
            errors.append(np.power(pred_batch - basket_sizes, 2))
            if i % samples_per_batch == 0:
                train_err = np.sqrt(np.mean(errors))
                test_err2 = np.array([])
                for users, days_since_prior_orders, basket_sizes in iter_test:
                    pred_batch = sess.run(infer, feed_dict={user_batch: users,
                                                            days_since_prior_order_batch: days_since_prior_orders})
                    #pred_batch = clip(pred_batch)
                    test_err2 = np.append(test_err2, np.power(pred_batch - basket_sizes, 2))

                    pr = pred_batch
                    ac = basket_sizes
                end = time.time()
                test_err = np.sqrt(np.mean(test_err2))
                print("{:3d} {:f} {:f} {:f}(s)".format(i // samples_per_batch, train_err, test_err,
                                                       end - start))
                train_err_summary = make_scalar_summary("training_error", train_err)
                test_err_summary = make_scalar_summary("test_error", test_err)
                summary_writer.add_summary(train_err_summary, i)
                summary_writer.add_summary(test_err_summary, i)
                start = end

                if train_err < min:
                    min = train_err
                    finalpr = pr
                    finalac = ac

        return finalpr, finalac

In [5]:
if __name__ == '__main__':
    df_train, df_test = get_data()
    pr, ac = svd(df_train, df_test)
    print pr, type(pr), ac, type(ac)
    prdf = pd.DataFrame(pr)
    acdf = pd.DataFrame(ac)
    print df_test.head(10)

    result = pd.concat([prdf, acdf], axis=1)

    user  days_since_prior_order  basket_size       st
5      1                      19          4.0  3367565
11   201                       6          9.0  3056620
16   401                      31          2.0  3088579
21   601                      30         14.0  2155743
27   801                      31          2.0  3007470
32  1001                      31         17.0  3020360
37  1201                      28          4.0  3332416
43  1401                      30          6.0  3018159
49  1601                       5          7.0  3136784
53  1801                       5          3.0  3233025
epoch train_error val_error elapsed_time
  0 11.435908 12.478551 0.088045(s)
  1 11.776064 12.402349 1.293321(s)
  2 11.843151 12.323633 1.180987(s)
  3 11.813773 12.237721 1.178559(s)
  4 12.051254 12.140120 1.178191(s)
  5 11.769066 12.027857 1.197734(s)
  6 11.757401 11.898135 1.189355(s)
  7 12.023698 11.750636 1.188937(s)
  8 11.630015 11.585739 1.185149(s)
  9 11.698745 11.402171 1.2031

In [7]:
import numpy as np # linear algebra
import pprint
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import csv

In [8]:
orders = pd.read_csv("data/orders_train_test.csv")
prior = pd.read_csv("data/order_products__prior.csv")
train = pd.read_csv("data/order_products__train.csv")

In [9]:
frames = [prior, train]
products = result = pd.concat(frames)

In [10]:
orders = orders.loc[orders['group_indicator'] == 1]

# find the products that were actually bought in the last order of every user
test_orders = pd.merge(orders, products, on='order_id')
print test_orders.head(10)

test_orders2 = test_orders[['user_id', 'order_id', 'product_id']]
print test_orders2.head(10)

# create a list of lists
# list of the last order of each user
# containing lists of the products that each user bought in his last order
test_orders2 =  test_orders2.groupby(['user_id', 'order_id'])['product_id'].apply(list)
print test_orders2.head(10)
#filename = 'actual_products.csv'
#test_orders2.to_csv(filename, index=False, encoding='utf-8', header=False)


test_set = pd.read_csv("data/test_set_.csv", names = ["user_id", "days_since_prior_order", "basket", "order_id"])
# the next dataset contains the predicted basket size of the next basket (output of svd_train_val.py)
preds = pd.read_csv("data/pred-actual.csv",  names = ["pred", "actual"])
# this dataset contains statistics concerning users' consumer behaviour
user_prod_stats = pd.read_csv("data/user_product_stats.csv")
#act_prods = pd.read_csv("data/actual_products.csv")


   order_id  user_id  days_since_prior_order  basket_size  order_seq  \
0   1187899        1                    14.0           11          5   
1   1187899        1                    14.0           11          5   
2   1187899        1                    14.0           11          5   
3   1187899        1                    14.0           11          5   
4   1187899        1                    14.0           11          5   
5   1187899        1                    14.0           11          5   
6   1187899        1                    14.0           11          5   
7   1187899        1                    14.0           11          5   
8   1187899        1                    14.0           11          5   
9   1187899        1                    14.0           11          5   

   group_indicator  product_id  add_to_cart_order  reordered  
0                1         196                  1          1  
1                1       25133                  2          1  
2                1

In [11]:
test_preds = pd.concat([test_set, preds], axis=1)

pred_prods =pd.DataFrame()
l=int(len(test_set))
c=int(1)
final_pred_prods = []
final_pred_prods2 = pd.DataFrame()

In [24]:
i = 0

# iterate through the dataframe containing the user_id and the predicted number of his next basket

# for every user check the predicted size of his next basket and accordingly predict the products that he buy

# the prediction of the next basket products depends on the following:
# 1. predicted basket size
# 2. the preferences of the user,
sample_user = ''
sample_size = ''
sample_reco = ''
for index, row in test_preds.iterrows():
     user_stats = []
     basket_size = int(round(row['pred'],0))
     user = row['user_id']

     user_stats = user_prod_stats.loc[user_prod_stats['user_id'] == user]
     user_products = user_stats['product_id']

     pred_prods  =  user_products.head(basket_size)
     df_row = pred_prods.tolist()
     sample_user = user
     sample_size = basket_size
     sample_reco = pred_prods
     final_pred_prods.append(df_row)

     i = i+1

print sample_user
# print sample_size
# print sample_reco

206201.0


In [109]:
test_preds.user_id

0            1
1          201
2          401
3          601
4          801
5         1001
6         1201
7         1401
8         1601
9         1801
10        2001
11        2201
12        2401
13        2601
14        2801
15        3001
16        3201
17        3401
18        3601
19        3801
20        4001
21        4201
22        4401
23        4601
24        4801
25        5001
26        5201
27        5401
28        5601
29        5801
         ...  
1002    200401
1003    200601
1004    200801
1005    201001
1006    201201
1007    201401
1008    201601
1009    201801
1010    202001
1011    202201
1012    202401
1013    202601
1014    202801
1015    203001
1016    203201
1017    203401
1018    203601
1019    203801
1020    204001
1021    204201
1022    204401
1023    204601
1024    204801
1025    205001
1026    205201
1027    205401
1028    205601
1029    205801
1030    206001
1031    206201
Name: user_id, dtype: int64

In [33]:
user_stats = []
b = int(round(row['pred'],0))
user = 201

user_stats = user_prod_stats.loc[user_prod_stats['user_id'] == user]
user_products = user_stats['product_id']

pred_prods  =  user_products.head(basket_size)
df_row = pred_prods.tolist()
sample_user = user
sample_size = basket_size
sample_reco = pred_prods
final_pred_prods.append(df_row)

In [48]:
print 'User:', sample_user
print 'Basket size:', int(sample_size)
print  'recommeded products:',list(sample_reco)

User: 201
Basket size: 18
recommeded products: [13712, 329, 27845, 23339, 23734, 6203, 21137, 30700, 20597, 42016, 5025, 5550, 5451, 13176, 18176, 21162]
16


In [81]:
client = MongoClient()
client = MongoClient('localhost', 27017)
db = client.retail_db

In [82]:
users = db.users

In [83]:
use = list(users.find({"text": "Ishwar Sawale"}).sort([("date", pymongo.DESCENDING)]).limit(1))

In [86]:
print use[0]['text']

Ishwar Sawale


In [126]:
# ub = client.user_table

In [127]:
# user = {"user_name": "Sujata Dev", "user_id": 5801}
# unames = ub.users
# user_id = unames.insert_one(user).inserted_id

In [128]:
# unames = ub.users

In [129]:
# unames

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), u'user_table'), u'users')

In [132]:
# ub = client.user_table
# user = {"user_name": "Sujata Dev", "user_id": 5801}
# unames = ub.users
# user_id = unames.insert_one(user).inserted_id

In [133]:
# ub = client.user_table
# user = {"user_name": "Ishwar Sawale", "user_id": 201}
# unames = ub.users
# user_id = unames.insert_one(user).inserted_id

In [134]:
# ub = client.user_table
# user = {"user_name": "Gaurav Lotekar", "user_id": 1}
# unames = ub.users
# user_id = unames.insert_one(user).inserted_id

In [135]:
# ub = client.user_table
# user = {"user_name": "Savvy Jain", "user_id": 401}
# unames = ub.users
# user_id = unames.insert_one(user).inserted_id