In [1]:
import os

# Candidate locations of the Instacart dataset, checked in order; the first
# directory that exists on this machine is used as the data root.
root_paths = [
    "/data/kaggle-instacart",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart"
]
root = None
for p in root_paths:
    if os.path.exists(p):
        root = p
        break
if root is None:
    # Fail here with an actionable message instead of letting
    # os.path.join(None, ...) raise an opaque TypeError below.
    raise FileNotFoundError(
        "Instacart data directory not found; tried: " + ", ".join(root_paths))
# `path` is the directory the CSVs are read from; `sbpath` is the sub-directory
# used for derived artifacts (pickles).
path = root
sbpath = os.path.join(root, 'sb')

In [2]:
import pandas as pd
import numpy as np

# Orders table, read with narrow integer dtypes and a categorical `eval_set`.
gorders = pd.read_csv(
    os.path.join(path, "orders.csv"),
    dtype={
        'order_id': np.uint32,
        'user_id': np.uint32,
        'eval_set': 'category',
        'order_number': np.uint8,
        'order_dow': np.uint8,
        'order_hour_of_day': np.uint8,
    },
)

# Order lines of the "prior" set; the left join attaches the owning user_id
# to every order line.
gorder_prior = pd.read_csv(
    os.path.join(path, "order_products__prior.csv"),
    dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint8,
        'reordered': bool,
    },
).merge(gorders[['order_id', 'user_id']], on='order_id', how='left')

# Order lines of the "train" set, loaded and joined the same way.
gorder_train = pd.read_csv(
    os.path.join(path, "order_products__train.csv"),
    dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint8,
        'reordered': bool,
    },
).merge(gorders[['order_id', 'user_id']], on='order_id', how='left')

In [3]:
def load():
    """Return the module-level frames as ``(gorders, gorder_prior, gorder_train)``."""
    # Read-only access to module globals needs no `global` declaration.
    frames = (gorders, gorder_prior, gorder_train)
    return frames

In [5]:
import warnings
from concurrent.futures import ThreadPoolExecutor
from functools import reduce

import tensorflow as tf
import numpy as np
import math
from sklearn.utils import shuffle


class Product2VecSkipGram:
    def __init__(self, data, cv_data, batch_size, num_skips, skip_window, vocabulary_size, embedding_size=32,
                 num_negative_sampled=64, len_ratio = 0.5):
        self.data = data
        self.cv_data = cv_data
        self.data_index = 0
        self.batch_size = batch_size
        self.num_skips = num_skips
        self.skip_window = skip_window
        self.embedding_size = embedding_size
        self.num_negative_sampled = num_negative_sampled
        self.vocabulary_size = vocabulary_size
        self.len_ratio = len_ratio
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
        self.build_graph()

    def predict(self, products):
        result = []
        for i in range(0, len(products), self.batch_size):
            batch = products[i:i+self.batch_size]
            batch = self.sess.run(self.gathered, feed_dict={self.train_inputs: batch})
            result.append(batch)
        return np.concatenate(result, axis=0)

    def train(self, num_steps, cv_every_n_steps, cv_steps, lrs):
        with ThreadPoolExecutor(max_workers=2) as executor:
            average_loss = 0
            learning_rate = 1.0
            current = executor.submit(self.generate_batch)
            for step in range(num_steps):
                if step in lrs:
                    learning_rate = lrs[step]
                batch_inputs, batch_labels = current.result()
                current = executor.submit(self.generate_batch)
                feed_dict = {self.train_inputs: batch_inputs,
                             self.train_labels: batch_labels,
                             self.learning_rate: learning_rate}

                _, loss_val = self.sess.run([self.optimizer, self.loss], feed_dict=feed_dict)
                average_loss += loss_val

                if step % 2000 == 0:
                    if step > 0:
                        average_loss /= 2000
                    print('Average loss at step ', step, ': ', average_loss)
                    average_loss = 0
                if step % cv_every_n_steps == 0:
                    self.data = shuffle(self.data, random_state=0)
                    self.save_model(step)
                    cv_loss = 0
                    for batch_inputs, batch_labels in self.generate_test(cv_steps):
                        feed_dict = {self.train_inputs: batch_inputs,
                                     self.train_labels: batch_labels,
                                     self.learning_rate: learning_rate}
                        loss_val = self.sess.run(self.loss, feed_dict=feed_dict)
                        cv_loss += loss_val
                    print('CV',cv_loss / cv_steps)

    def save_model(self, step):
        self.saver.save(self.sess, 'sb/prod2vec_skip_gram', global_step=step)

    def load_model(self, path):
        self.saver.restore(self.sess, path)

    def build_graph(self):
        self.train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.train_labels = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.learning_rate = tf.placeholder(tf.float32)

        # variables
        embeddings = tf.Variable(tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0))

        softmax_weights = tf.Variable(tf.truncated_normal([self.embedding_size, self.vocabulary_size],
                                                          stddev=1.0 / math.sqrt(self.embedding_size)))
        softmax_biases = tf.Variable(tf.zeros([self.vocabulary_size]))

        self.gathered = tf.gather(embeddings, self.train_inputs)

        prediction = tf.matmul(self.gathered, softmax_weights) + softmax_biases
        self.loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.train_labels, logits=prediction))

        self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()


    def inc(self):
        self.data_index = (self.data_index + 1) % len(self.data)

    def inc_cv(self, data_index):
        return (data_index + 1) % len(self.cv_data)

    def generate_batch(self):
        batch = np.ndarray(shape=(self.batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(self.batch_size), dtype=np.int32)
        counter = 0
        while counter < self.batch_size:
            current = self.data.iloc[self.data_index]
            if len(current) == 1:
                warnings.warn("lenght is one", RuntimeWarning)
                self.inc()
                continue

            span = min(2 * self.skip_window + 1, len(current))

            x = target = np.random.randint(0, len(current))

            targets_to_avoid = [x]

            for j in range(self.num_skips):  # target varies!!! X constant!
                while target in targets_to_avoid and len(targets_to_avoid) != span:
                    target = np.random.randint(0, span)
                if len(targets_to_avoid) == span or counter == self.batch_size:
                    break
                targets_to_avoid.append(target)
                batch[counter] = current[x]
                labels[counter] = current[target]
                counter += 1
            self.inc()

        return batch, labels

    def generate_test(self, num_steps):
        data_index = 0
        for _ in range(num_steps):
            batch = np.ndarray(shape=(self.batch_size), dtype=np.int32)
            labels = np.ndarray(shape=(self.batch_size), dtype=np.int32)

            counter = 0
            while counter < self.batch_size:
                current = self.cv_data.iloc[data_index]
                if len(current) == 1:
                    warnings.warn("lenght is one", RuntimeWarning)
                    data_index = self.inc_cv(data_index)
                    continue

                span = min(2 * self.skip_window + 1, len(current))

                x = target = np.random.randint(0, len(current))

                targets_to_avoid = [x]

                for j in range(self.num_skips):  # target varies!!! X constant!
                    while target in targets_to_avoid and len(targets_to_avoid) != span:
                        target = np.random.randint(0, span)
                    if len(targets_to_avoid) == span or counter == self.batch_size:
                        break
                    targets_to_avoid.append(target)
                    batch[counter] = current[x]
                    labels[counter] = current[target]
                    counter += 1
                data_index = self.inc_cv(data_index)

            yield batch, labels

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def prod2vec():
    """Build one row per prior order with its list of product ids and cache it.

    The product lists are joined with the order attributes on ``order_id`` and
    the frame is pickled to ``<sbpath>/prod2vec.pkl``.

    Returns
    -------
    pandas.DataFrame
        Columns ``order_id``, ``products`` (list of product ids) plus the
        columns of the orders table.
    """
    orders, order_prior, order_train = load()

    # groupby already orders the groups by key and keeps rows inside each group
    # in their original order, so no explicit sort is needed; the previous
    # non-stable sort_values could reorder the products within an order.
    data = order_prior.groupby('order_id')['product_id']\
        .apply(lambda x: x.tolist()).to_frame('products').reset_index()
    data = pd.merge(data, orders, on='order_id')
    # Make sure the artifact directory exists before writing into it.
    os.makedirs(sbpath, exist_ok=True)
    data.to_pickle(os.path.join(sbpath, 'prod2vec.pkl'))
    return data

def skip_gram_train(batch_size=1024, num_steps=120001, cv_every_n_steps=20000):
    """Train the skip-gram product embedding on the cached per-order product lists.

    Parameters
    ----------
    batch_size : int
        Pairs per training batch.
    num_steps : int
        Number of training steps.
    cv_every_n_steps : int
        Interval (in steps) between checkpoints / held-out evaluations.

    The defaults reproduce the full run; small values (e.g. ``batch_size=10,
    num_steps=601, cv_every_n_steps=200``) give a quick smoke test.
    """
    np.random.seed(2017)
    products = pd.read_csv(os.path.join(root, 'products.csv'))
    # Read from the same location prod2vec() writes to, not a cwd-relative path.
    df = pd.read_pickle(os.path.join(sbpath, 'prod2vec.pkl')).products
    print('initial size', len(df))

    df_train, df_cv = train_test_split(df, test_size=0.1, random_state=2017)
    # Learning-rate schedule: step -> new rate (the rate starts at 1.0 in train()).
    rates = {100000: 0.5,
             200000: 0.25,
             500000: 0.1}
    # Product ids index the embedding table directly, hence max id + 1 rows.
    model = Product2VecSkipGram(df_train, df_cv, batch_size, 1, 1, np.max(products.product_id) + 1)
    # At least one CV batch, so a held-out set smaller than one batch does not
    # lead to a division by zero when the CV loss is averaged.
    cv_steps = max(1, len(df_cv) // batch_size)
    model.train(num_steps, cv_every_n_steps, cv_steps, rates)
    
def skip_gram_get(checkpoint='sb/prod2vec_skip_gram-120000'):
    """Load a trained checkpoint and return the products table with embeddings.

    Parameters
    ----------
    checkpoint : str
        Checkpoint path to restore; the default is the one saved at step
        120000. Pass e.g. ``'sb/prod2vec_skip_gram-600'`` for a short debug run.

    Returns
    -------
    pandas.DataFrame
        ``products.csv`` with one extra column per embedding dimension. The
        frame is also pickled to ``sb/product_embeddings.pkl``.
    """
    np.random.seed(2017)
    products = pd.read_csv(os.path.join(root, 'products.csv'))
    # Read from the same location prod2vec() writes to, not a cwd-relative path.
    df = pd.read_pickle(os.path.join(sbpath, 'prod2vec.pkl')).products.tolist()
    print('initial size', len(df))

    df_train, df_cv = train_test_split(df, test_size = 0.1, random_state=2017)
    # batch_size == number of products, so predict() looks up every product in
    # a single batch.
    model = Product2VecSkipGram(df_train, df_cv, len(products), 1, 1, np.max(products.product_id) + 1)
    model.load_model(checkpoint)
    embd = model.predict(products.product_id.values)
    # Column-wise concat aligns on the row index of both frames.
    products = pd.concat([products, pd.DataFrame(embd)], axis=1)
    products.to_pickle('sb/product_embeddings.pkl')
    return products

In [31]:
# Build the per-order product lists and pickle them; the returned frame is
# shown as this cell's output.
prod2vec()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,order_id,products,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2339,"[21709, 40001, 28465, 1559, 35164, 37301, 2303...",103000,prior,7,0,23,11.0
1,3048,"[26767, 36866, 34278, 37011, 23744, 43990, 333...",90000,prior,19,2,17,9.0
2,4179,"[47209, 29993, 44359, 43961, 35883, 8615, 3228...",46000,prior,31,4,18,12.0
3,4484,"[36730, 21903, 47491, 13454, 18027, 23576, 496...",74000,prior,21,2,7,30.0
4,5191,"[47936, 46522, 48425, 40598, 14852, 30599, 190...",116000,prior,5,0,13,26.0
5,5531,"[14491, 19627, 14211, 10749, 31635, 49026, 133...",122000,prior,10,1,20,2.0
6,5670,"[27344, 16185, 44233, 28597, 37011, 21009]",16000,prior,3,5,13,4.0
7,6729,"[30975, 25144, 40348, 45271, 15655, 10724]",112000,prior,6,1,14,14.0
8,6915,"[48043, 16797, 46969, 44422, 32029, 4920, 2485...",185000,prior,12,3,16,11.0
9,8752,"[34358, 46979, 40604, 21616, 28535, 32465]",54000,prior,5,0,22,15.0


In [45]:
# Clear TensorFlow's default graph so re-running this cell builds a fresh model.
tf.reset_default_graph()
# NOTE(review): the recorded output below has only four CV lines, which does not
# match a 120001-step run evaluated every 20000 steps -- it was presumably
# produced with different settings; re-run before relying on it.
skip_gram_train()

initial size 3479
Average loss at step  0 :  10.9303941727




CV 10.9818634426




CV 10.4397347394
CV 10.1194904552
CV 9.86514258385


In [6]:
# Clear TensorFlow's default graph before the model is rebuilt for inference.
tf.reset_default_graph()
# NOTE(review): the recorded output below restores checkpoint "-600", while
# skip_gram_get() loads "-120000" by default -- the output is from another revision.
prod_embed = skip_gram_get()

initial size 3479
INFO:tensorflow:Restoring parameters from sb/prod2vec_skip_gram-600


In [7]:
# Display the products table with the embedding columns appended.
prod_embed

Unnamed: 0,product_id,product_name,aisle_id,department_id,0,1,2,3,4,5,...,22,23,24,25,26,27,28,29,30,31
0,1,Chocolate Sandwich Cookies,61,19,-0.970010,0.171590,0.798789,-0.133580,-0.176275,0.581903,...,0.971217,0.819943,-0.651784,0.084176,0.789331,-0.215438,0.849891,0.730186,-0.785567,-0.259129
1,2,All-Seasons Salt,104,13,-0.596838,0.980348,-0.838716,-0.811713,-0.744768,0.313960,...,-0.212219,0.624846,-0.260462,0.777511,0.552866,0.998635,-0.360216,0.327688,-0.541538,-0.455822
2,3,Robust Golden Unsweetened Oolong Tea,94,7,-0.762892,0.236987,0.153728,-0.927299,-0.781095,-0.323117,...,-0.305328,0.598888,0.488858,0.410444,-0.575706,-0.342449,-0.442767,-0.009856,-0.323770,-0.684442
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,-0.824512,0.561596,0.631227,0.026149,0.577660,0.797567,...,-0.118528,-0.493572,0.043844,0.581200,0.258920,0.460504,-0.666104,0.678550,0.055810,-0.283577
4,5,Green Chile Anytime Sauce,5,13,-0.810503,0.001060,-0.434116,0.119153,-0.303208,-0.559618,...,-0.572686,-0.792660,-0.762249,-0.806787,-0.789166,-0.906198,-0.706342,-0.077416,0.237468,-0.787732
5,6,Dry Nose Oil,11,11,0.182834,-0.587086,0.699687,-0.479459,0.959657,-0.641762,...,0.807987,0.597940,0.027971,0.658686,-0.388490,0.274201,-0.795846,-0.801478,-0.079233,0.790985
6,7,Pure Coconut Water With Orange,98,7,-0.637949,-0.566242,0.931436,-0.346844,0.978590,-0.001374,...,0.886792,0.091645,-0.036522,-0.312167,-0.058305,0.041019,0.638304,0.729886,-0.446153,0.859040
7,8,Cut Russet Potatoes Steam N' Mash,116,1,0.229798,-0.446561,-0.259312,-0.791635,0.359986,0.733481,...,0.409371,0.265234,0.127857,-0.986935,0.002816,0.636082,0.740284,-0.062685,0.861429,0.084230
8,9,Light Strawberry Blueberry Yogurt,120,16,0.627415,0.609792,0.012339,0.700814,0.928576,0.228739,...,0.898590,-0.551851,0.684792,-0.144335,0.023301,-0.855848,0.153975,-0.347752,-0.920489,0.117424
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,0.167046,-0.713977,0.251581,-0.709931,0.071411,-0.231655,...,-0.839222,0.515164,-0.574791,-1.007391,-1.013350,-0.480932,0.020191,0.692434,-0.267219,-0.110825
