# **Import libraries and set random seed**

In [None]:
import time
t = time.time()
import string
import random
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk, re, time
from nltk.corpus import stopwords
# from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Fixed seeds for repeatable runs; uncomment the two lines below to draw random seeds instead.
# np_rand_seed = random.randint(0,100)
# tf_rand_seed = random.randint(0,100)
np_rand_seed = 44
tf_rand_seed = 40
np.random.seed(np_rand_seed)
# NOTE(review): tf.set_random_seed sets a graph-level seed on the current default graph only.
tf.set_random_seed(tf_rand_seed)
print("numpy random seed: ",np_rand_seed)
print("tensorflow random seed: ", tf_rand_seed)

# **Loading and inspecting the data**

In [None]:
# Read the tab-separated train and test files into pandas DataFrames.
data = pd.read_csv("../input/train.tsv", delimiter="\t")
test_data = pd.read_csv("../input/test.tsv", delimiter="\t")

In [None]:
# Preview the first 10 rows of the training frame.
data.head(10)

In [None]:
# Preview the first 10 rows of the test frame.
test_data.head(10)

In [None]:
# (rows, columns) of the training frame.
data.shape

In [None]:
# (rows, columns) of the test frame.
test_data.shape

# **Feature Engineering**

In [None]:
# data['brand_name'] = data['brand_name'].fillna("Nobrand")
# test_data['brand_name'] = test_data['brand_name'].fillna("Nobrand")

In [None]:
# brand_name = pd.concat((data['brand_name'], test_data['brand_name']), axis=0)
# label_encoder = LabelBinarizer()
# label_encoder.fit(brand_name)
# brand_label_encoded_train = label_encoder.transform(data['brand_name'])
# brand_label_encoded_test = label_encoder.transform(test_data['brand_name'])

In [None]:
# print("Shape of brand_label_encoded_train:",brand_label_encoded_train.shape)
# print("Shape of brand_label_encoded_test:",brand_label_encoded_test.shape)

In [None]:
# English stop words (NLTK list) that clean_text removes from every text field.
sw = set(stopwords.words("english"))
# stemmer = SnowballStemmer("english")
# Translation table that turns every ASCII punctuation character into a single space.
translator = str.maketrans(dict.fromkeys(string.punctuation, ' '))

## **Clean Text**

In [None]:
def clean_text(text):
    '''Normalise a raw text field into lowercase, space-separated alphabetic words.

    Steps, in order: cast to str, drop HTML line-break tags, replace
    punctuation with spaces, lowercase, remove stop words, replace every
    remaining non a-z character with a space, and collapse whitespace.

    Relies on the module-level `translator` (punctuation -> space table) and
    `sw` (stop-word set).

    text: any value; it is converted with str(), so a missing value (NaN)
          becomes the literal word "nan".
    returns: the cleaned string (possibly empty), without leading, trailing
             or repeated spaces.
    '''

    text = str(text)

    # Strip HTML line breaks BEFORE punctuation is removed: once "<", "/" and
    # ">" have been turned into spaces the tag can no longer be recognised
    # and would leave a stray "br" token behind.
    text = re.sub(r"<br\s*/?>", " ", text, flags=re.IGNORECASE)

    # replace every punctuation mark with a space (keeps neighbouring words apart)
    text = text.translate(translator)

    # lowercase and remove stop words
    words = [word for word in text.lower().split() if word not in sw]
    text = " ".join(words)

    # stemming
#     text = [stemmer.stem(word) for word in text.split()]
#     text = " ".join(text)

    # keep letters only; digits and other symbols become spaces
    text = re.sub(r"[^a-z]", " ", text)

    # collapse whitespace runs of any length into one space and trim the ends
    text = " ".join(text.split())

    return text

In [None]:
# Clean the three free-text columns of the training frame in place,
# reporting how long each column took.
for column, done_message in (
        ('name', "Finished cleaning the name of train set."),
        ('category_name', "Finished cleaning the catogory name of train set."),
        ('item_description', "Finished cleaning the description of train set"),
):
    started = time.time()
    data[column] = data[column].apply(clean_text)
    print(done_message, "Time needed:", time.time()-started)

In [None]:
# Clean the three free-text columns of the test frame in place,
# reporting how long each column took.
# (The progress messages used to say "train set" here, a copy-paste slip.)
for column, done_message in (
        ('name', "Finished cleaning the name of test set."),
        ('category_name', "Finished cleaning the catogory name of test set."),
        ('item_description', "Finished cleaning the description of test set"),
):
    started = time.time()
    test_data[column] = test_data[column].apply(clean_text)
    print(done_message, "Time needed:", time.time()-started)

## **Tokenize Text**

In [None]:
# Fit one tokenizer on train + test descriptions so both splits share a single vocabulary.
all_description = pd.concat((data['item_description'], test_data['item_description']),axis=0)
tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(all_description)
# The concatenated copy is only needed for fitting; free it.
del(all_description)

In [None]:
# Convert every description (train, then test) into a sequence of integer word indices.
description = tokenizer1.texts_to_sequences(data['item_description'])
print("train description is complete.")
description_test = tokenizer1.texts_to_sequences(test_data['item_description'])
print("test description is complete")

In [None]:
# Bring every description sequence to a fixed length of 120 indices.
# NOTE: max_length is a shared global that the name/category cells overwrite later.
max_length = 120
description = pad_sequences(description, maxlen = max_length)
print("train decription pad is complete.")
description_test = pad_sequences(description_test, maxlen = max_length)
print("test decription pad is complete.")

In [None]:
# Fit a separate tokenizer on train + test item names (shared vocabulary for both splits).
all_name = pd.concat((data['name'], test_data['name']),axis=0)
tokenizer2 = Tokenizer()
tokenizer2.fit_on_texts(all_name)
# The concatenated copy is only needed for fitting; free it.
del(all_name)

In [None]:
# Convert every item name (train, then test) into a sequence of integer word indices.
name = tokenizer2.texts_to_sequences(data['name'])
print("train name is complete.")
name_test = tokenizer2.texts_to_sequences(test_data['name'])
print("test name is complete")

In [None]:
# Bring every name sequence to a fixed length of 15 indices.
max_length = 15
name = pad_sequences(name, maxlen = max_length)
print("train name pad is complete.")
name_test = pad_sequences(name_test, maxlen = max_length)
print("test name pad is complete.")

In [None]:
# Fit a separate tokenizer on train + test category names (shared vocabulary for both splits).
all_category = pd.concat((data['category_name'], test_data['category_name']),axis=0)
tokenizer3 = Tokenizer()
tokenizer3.fit_on_texts(all_category)
# The concatenated copy is only needed for fitting; free it.
del(all_category)

In [None]:
# Convert every category name (train, then test) into a sequence of integer word indices.
category = tokenizer3.texts_to_sequences(data['category_name'])
print("train category is complete.")
category_test = tokenizer3.texts_to_sequences(test_data['category_name'])
print("test category is complete")

In [None]:
# Bring every category sequence to a fixed length of 15 indices.
max_length = 15
category = pad_sequences(category, maxlen = max_length)
print("train category pad is complete.")
category_test = pad_sequences(category_test, maxlen = max_length)
print("test category pad is complete.")

In [None]:
# Pull the remaining model inputs/targets out of the frames as NumPy arrays.
# `.values` replaces the deprecated DataFrame.as_matrix(), which newer pandas
# releases no longer provide; it returns the same array.
extra_data = data[['item_condition_id','shipping']].values
extra_data_test = test_data[['item_condition_id','shipping']].values
price = data['price'].values
test_id = test_data['test_id'].values

In [None]:
# The DataFrames are no longer needed once the arrays above exist; release them.
del(data, test_data)

# **Create RNN for the 3 sequences**

In [None]:
# Start from an empty default graph.
tf.reset_default_graph()
# The graph-level random seed belongs to the graph that was just discarded,
# so it has to be set again on the new default graph; otherwise the variable
# initialisers and dropout below are not seeded.
tf.set_random_seed(tf_rand_seed)

In [None]:
# Embedding-table sizes: one row per word index of each tokenizer, plus one extra row.
# NOTE(review): the +1 presumably accounts for index 0 used by padding - confirm.
n_words1 = len(tokenizer1.word_index)+1
n_words2 = len(tokenizer2.word_index)+1
n_words3 = len(tokenizer3.word_index)+1
# Embedding width per input (1 = description, 2 = name, 3 = category).
embed_size1 = 60
embed_size2 = 10
embed_size3 = 10
# LSTM units per layer for each input.
n_neurons1 = 60
n_neurons2 = 10
n_neurons3 = 10
# Suffixes that keep the variable scopes of the three RNNs distinct.
name_1 = "1"
name_2 = "2"
name_3 = "3"
# Number of stacked LSTM layers in every RNN.
n_layers = 2
# NOTE(review): rnn_dropout is not referenced anywhere else in the visible code.
rnn_dropout = tf.placeholder(tf.float32)

In [None]:
def build_rnn(inputs, n_words, embed_size, n_neurons, n_layers, name):
    '''Build an embedding + stacked-LSTM encoder for one integer sequence input.

    inputs: int tensor of word indices that is fed to the embedding lookup.
    n_words: number of rows of the embedding table (vocabulary size).
    embed_size: width of each embedding vector.
    n_neurons: number of units in every LSTM layer.
    n_layers: number of stacked LSTM layers.
    name: suffix appended to the "embedding" and "rnn" variable scopes so
          that several encoders can coexist in one graph.

    returns: the hidden state (h) of the top LSTM layer after the last step.
    '''

    # trainable embedding table followed by the lookup for the input indices
    with tf.variable_scope("embedding"+name):
        # The positional (-0.1, 0.1) arguments used previously were interpreted
        # by tf.truncated_normal as mean=-0.1, stddev=0.1, which biases every
        # embedding towards negative values. They read as range bounds, so
        # draw uniformly from [-0.1, 0.1) instead.
        embedding = tf.Variable(
            tf.random_uniform((n_words, embed_size), minval=-0.1, maxval=0.1))
        embed = tf.nn.embedding_lookup(embedding, inputs, validate_indices=False)

    with tf.variable_scope("rnn"+name):
        lstm_cells = [tf.contrib.rnn.BasicLSTMCell(num_units=n_neurons)
                      for _ in range(n_layers)]
        multi_cell = tf.contrib.rnn.MultiRNNCell(lstm_cells)
        # only the final state is needed; the per-step outputs are discarded
        _, state = tf.nn.dynamic_rnn(multi_cell, embed, dtype=tf.float32)
        # state[-1] is the (c, h) pair of the top layer; index 1 selects h
        top_layer_h_state = state[-1][1]

    return top_layer_h_state

In [None]:
# Placeholder and encoder for the item description sequences.
input1 = tf.placeholder(tf.int32, [None, None])
rnn_output1 = build_rnn(input1, n_words1, embed_size1, n_neurons1, n_layers, name_1)
rnn_output1

In [None]:
# Placeholder and encoder for the item name sequences.
input2 = tf.placeholder(tf.int32, [None, None])
rnn_output2 = build_rnn(input2, n_words2, embed_size2, n_neurons2, n_layers, name_2)
rnn_output2

In [None]:
# Placeholder and encoder for the category name sequences.
input3 = tf.placeholder(tf.int32, [None, None])
rnn_output3 = build_rnn(input3, n_words3, embed_size3, n_neurons3, n_layers, name_3)
rnn_output3

In [None]:
# Join the three encoder outputs along axis 1 into one feature vector per example.
rnn_output_combined = tf.concat((rnn_output1, rnn_output2, rnn_output3), axis=1)
rnn_output_combined

# **Create Fully Connected Layers**

In [None]:
# input4: the two extra numeric features per example (fed with item_condition_id and shipping).
input4 = tf.placeholder(tf.float32, [None, 2])
# prices: target price, one column per example.
prices = tf.placeholder(tf.float32, [None, 1])
# keep_prob: dropout keep probability (fed with 0.8 for training steps and 1.0 for evaluation).
keep_prob  = tf.placeholder(tf.float32)

In [None]:
def create_weights(shape):
    '''Return a trainable weight variable of the given shape.

    Initial values are drawn from a truncated normal with stddev 0.05.
    '''
    initial_values = tf.truncated_normal(shape, stddev=0.05)
    return tf.Variable(initial_values)
 
def create_biases(size):
    '''Return a trainable 1-D bias variable of length `size`, filled with 0.05.'''
    initial_values = tf.constant(0.05, shape=[size])
    return tf.Variable(initial_values)

In [None]:
def create_fc_layer(input,
                    num_inputs,
                    num_outputs,
                    use_relu=True,
                    batch_norm = True,
                    dropout = False,
                    keep_prob = 0.2,
                    training = True):

    '''Create a fully connected layer.

    The layer computes: matmul -> [batch norm] -> + bias -> [ReLU] -> [dropout],
    where the bracketed stages are optional.

    input: 2-D float tensor that is multiplied with the weight matrix.
    num_inputs: size of the second dimension of `input`.
    num_outputs: number of units of the layer.
    use_relu: apply a ReLU activation when True.
    batch_norm: apply batch normalization (before the bias) when True.
    dropout: apply dropout to the layer output when True.
    keep_prob: keep probability for dropout; a Python float or a tensor.
    training: forwarded to tf.layers.batch_normalization. The default True
              keeps the previous behaviour (batch statistics are always
              used). Pass a boolean placeholder to use moving averages at
              inference time; in that case the ops in
              tf.GraphKeys.UPDATE_OPS must be run together with the train
              step so the moving averages get updated.

    returns: the output tensor of the layer.
    '''

    # trainable weights and biases
    weights = create_weights(shape=[num_inputs, num_outputs])
    biases = create_biases(num_outputs)

    # matrix multiplication between input and weight matrix
    layer = tf.matmul(input, weights)

    # batch normalization if wanted
    if batch_norm:
        layer = tf.layers.batch_normalization(layer, training=training)

    # add the bias to the fully connected layer
    layer += biases

    # add relu activation if wanted
    if use_relu:
        layer = tf.nn.relu(layer)

    # if dropout is wanted add dropout
    if dropout:
        layer = tf.nn.dropout(layer, keep_prob)

    return layer

In [None]:
# Number of units of the three hidden fully connected layers.
fc_size1 = 25
fc_size2 = 5
fc_size3 = 5

In [None]:
# Hidden layer on top of the concatenated RNN outputs.
fully_connected_layer1 = create_fc_layer(rnn_output_combined,
                                         rnn_output_combined.get_shape()[1].value,
                                         fc_size1,
                                        use_relu=True,
                                        batch_norm = True,
                                        dropout =True,
                                        keep_prob = keep_prob)
fully_connected_layer1

In [None]:
# Hidden layer on top of the two extra numeric features (input4).
fully_connected_layer2 = create_fc_layer(input4,
                                         input4.get_shape()[1].value,
                                         fc_size2,
                                            use_relu=True,
                                            batch_norm = True,
                                            dropout =True,
                                            keep_prob = keep_prob)
fully_connected_layer2

In [None]:
# Merge the text branch and the numeric branch along axis 1.
combined_layer = tf.concat((fully_connected_layer1, fully_connected_layer2), axis=1)
combined_layer

In [None]:
# Hidden layer on top of the merged branches.
fully_connected_layer3 = create_fc_layer(combined_layer,
                                         combined_layer.get_shape()[1].value,
                                         fc_size3,
                                            use_relu=True,
                                            batch_norm = True,
                                            dropout =True,
                                            keep_prob = keep_prob)
fully_connected_layer3

In [None]:
# Final single-unit layer producing the predicted price: no ReLU, batch norm or dropout,
# so the prediction is a plain linear combination of the previous layer.
outputs = create_fc_layer(fully_connected_layer3,
                         fc_size3,
                         1,
                            use_relu=False,
                            batch_norm = False,
                            dropout =False)
outputs

In [None]:
# Root mean squared logarithmic error between predicted and true prices.
# The output layer is linear, so a prediction can drop to -1 or below, where
# log(outputs+1) is NaN/-inf and would poison every weight on the next update.
# Clamping just above -1 keeps the loss finite while leaving the loss and its
# gradient untouched for every prediction that was valid before.
min_prediction = -1.0 + 1e-6
safe_outputs = tf.maximum(outputs, min_prediction)
# log1p(x) computes log(1 + x)
loss = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(tf.log1p(safe_outputs),tf.log1p(prices)))))

In [None]:
# learning rate of optimizer (evaluates to roughly 0.3)
learning_rate = (1e-1)*3
# train step: a single Adam update that minimises `loss`
train_step = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# **Train model**

## **Split data**

In [None]:
# Randomly assign 99.3% of the rows to training; the remainder is the validation set.
n_samples = len(price)
train_indices = np.random.choice(n_samples, round(n_samples*0.993), replace=False)
# Complement of train_indices, computed in NumPy instead of building Python
# sets over every row. train_indices holds no duplicates (replace=False), so
# assume_unique is safe. The result is sorted, which is irrelevant for the
# validation loss.
validation_indices = np.setdiff1d(np.arange(n_samples), train_indices, assume_unique=True)

description_train = description[train_indices]
description_validation = description[validation_indices]

name_train = name[train_indices]
name_validation = name[validation_indices]

category_train = category[train_indices]
category_validation = category[validation_indices]

extra_data_train = extra_data[train_indices]
extra_data_validation = extra_data[validation_indices]

price_train = price[train_indices]
price_validation = price[validation_indices]

# the full arrays are no longer needed once they have been split
del(description, name, category, extra_data, price)

In [None]:
# Report the shape of every array produced by the train/validation split.
for split_name, split_array in (
        ("description_train", description_train),
        ("description_validation", description_validation),
        ("name_train", name_train),
        ("name_validation", name_validation),
        ("category_train", category_train),
        ("category_validation", category_validation),
        ("extra_data_train", extra_data_train),
        ("extra_data_validation", extra_data_validation),
        ("price_train", price_train),
        ("price_validation", price_validation),
):
    print("Shape of " + split_name + ":", split_array.shape)

In [None]:
# lists to store the train loss and the validation loss at each iteration
train_loss = []
valid_loss = []

# batch size: number of training rows sampled per optimisation step
batch_size = 100
# max iteration: upper bound on the number of optimisation steps
max_iter = 1000

## **Train and save the best model**

In [None]:
# create a saver object; max_to_keep=1 keeps only the most recent checkpoint,
# which is always the best model so far because we only save on improvement
saver = tf.train.Saver(max_to_keep=1)

# best validation loss seen so far and the iteration at which it was reached
best_loss = np.inf
best_iteration = None

iteration = 0

# wall-clock budget in seconds, counted from the notebook start time `t`
# (45 minutes, to finish before the kernel is killed)
time_budget = 45*60

# create a graph session and optimize under it
with tf.Session() as sess:

    # initialize variables
    sess.run(tf.global_variables_initializer())

    # the validation feed never changes, so build it once outside the loop
    feed_dict_validation =  {input1: description_validation,
                             input2: name_validation,
                             input3: category_validation,
                             input4: extra_data_validation,
                             prices: price_validation.reshape(-1, 1),
                             keep_prob: 1.0}

    # stop at max_iter steps or when the time budget is used up
    while iteration < max_iter and (time.time()-t) < time_budget:

        # randomly choosing the indices of the batch (sampled with replacement)
        rand_index = np.random.choice(price_train.shape[0], size=batch_size)

        # same batch evaluated without dropout, used to report the train loss
        feed_dict_train =  {input1: description_train[rand_index],
                            input2: name_train[rand_index],
                            input3: category_train[rand_index],
                            input4: extra_data_train[rand_index],
                            prices: price_train[rand_index].reshape(-1, 1),
                            keep_prob: 1.0}
        # batch used for the optimization step: identical data, dropout enabled
        feed_dict_batch = dict(feed_dict_train)
        feed_dict_batch[keep_prob] = 0.8

        # execute optimization step
        sess.run(train_step, feed_dict=feed_dict_batch)

        # calculate temporary train loss and append it to the designated list
        temp_train_loss = loss.eval(session=sess, feed_dict=feed_dict_train)
        train_loss.append(temp_train_loss)
        # calculate temporary validation loss and append it to the designated list
        temp_validation_loss = loss.eval(session=sess, feed_dict=feed_dict_validation)
        valid_loss.append(temp_validation_loss)

        # checkpoint whenever the validation loss improves
        if (temp_validation_loss < best_loss):
            best_loss = temp_validation_loss
            best_iteration = iteration
            saver.save(sess, './my-model', global_step = best_iteration)

        # print metric info
        print("iterations:",iteration,
              "| train_loss:", temp_train_loss,
              "| validation_loss:", temp_validation_loss)

        # increment iteration
        iteration = iteration+1

In [None]:
# Lowest validation loss reached during training.
print("Best loss:", best_loss)

## **Compute predictions for test set**

In [None]:
# Without a saved checkpoint there is nothing to restore; fail with a clear message
# instead of trying to load "./my-model-None".
if best_iteration is None:
    raise RuntimeError("No checkpoint was saved during training; cannot compute predictions.")

with tf.Session() as sess:

    t_start = time.time()

    # restore the best model
    model_path = "./"+"my-model-"+str(best_iteration)
    saver.restore(sess, model_path)

    # break the test set into k folds otherwise the kernel runs out of memory
    n = len(test_id)
    k = 100
    step = n//k

    # per-fold predictions, joined once after the loop
    # (np.append inside the loop would copy the growing array on every fold)
    fold_outputs = []

    # iterate through each fold
    for i in range(k):

        # start and end indices of the fold; the last fold takes the remainder
        start = step*i
        end = step*(i+1) if i != (k-1) else n

        # feed dictionary for the fold
        feed_dict_test =  {input1: description_test[start:end],
                           input2: name_test[start:end],
                           input3: category_test[start:end],
                           input4: extra_data_test[start:end],
                           keep_prob: 1.0}

        # evaluate predictions of the fold
        fold_preds = outputs.eval(session=sess, feed_dict = feed_dict_test)
        fold_outputs.append(fold_preds)

        print("Computed predictions for fold",i)

    # flatten to one float64 value per test row (the dtype np.append used to produce)
    preds = np.concatenate(fold_outputs).ravel().astype(np.float64)

    # save the submission csv file
    submission_path = "./sample_submission.csv"
    # the id column must be named exactly "test_id" (it used to carry a trailing space);
    # `columns` pins the column order regardless of the pandas version
    submission = pd.DataFrame({"test_id": test_id, "price": preds},
                              columns=["test_id", "price"])
    submission.to_csv(submission_path, header = True, index=False)

    print("Time required to compute prediction:", time.time()-t_start)

## **Plot of loss vs iterations**

In [None]:
# Plot the per-iteration train and validation loss recorded during training.
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 8), dpi= 80, facecolor='w', edgecolor='k')
# x axis: 1-based step numbers, one per completed training iteration
iterations = list(range(1,iteration+1))
plt.plot(iterations, train_loss, label = "train loss")
plt.plot(iterations, valid_loss, label = "valid loss")
plt.title("Loss")
plt.xlabel("iter")
plt.ylabel("loss")
plt.legend()
plt.grid()
plt.show()