In [1]:
import os
import csv
import math
import pickle
import random
import numpy as np
import pandas as pd
import pprint as pp
import tensorflow as tf
from Data_Operations import*
from Buyer_Persona_Clustering import*
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

##  Autoencoder class

Function list:
1. init
2. construct model
3. loss_optimize
4. run session
5. GetNextPredProducts
6. run_model
7. store_model

In [2]:
class Autoencoder(object):

    def __init__(self,User_Product_Matrix):
        
        num_input = User_Product_Matrix.shape[1]  ## No. of products
        num_hidden_1 = 10
        num_hidden_2 = 5

        self.X = tf.placeholder(tf.float64, [None, num_input])

        self.weights = {
            'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
            'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
            'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
            'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
        }

        self.biases = {
            'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
            'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
            'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
            'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
        }

    
    def construct_model(self):
        
        # Encoder: 2 Hidden layers with sigmoid activations
        layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(self.X, self.weights['encoder_h1']), self.biases['encoder_b1']))
        layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, self.weights['encoder_h2']), self.biases['encoder_b2']))
        self.encoder_op = layer_2
        
        # Decoder: 2 Hidden layers with sigmoid activations
        layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(self.encoder_op, self.weights['decoder_h1']), self.biases['decoder_b1']))
        layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, self.weights['decoder_h2']), self.biases['decoder_b2']))
        self.decoder_op = layer_2
        
        
    # Define loss and optimizer, minimize the squared error
    def loss_optimize(self):
        
        y_true = self.X
        y_pred = self.decoder_op
        self.loss = tf.losses.mean_squared_error(y_true, y_pred)
        self.optimizer = tf.train.RMSPropOptimizer(0.03).minimize(self.loss)

        # Define evaluation metrics
        eval_x = tf.placeholder(tf.int32, )
        eval_y = tf.placeholder(tf.int32, )
        pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)
        
        
    ## Batch-wise training of all users of the cluster
    def run_session(self, User_Product_Matrix):
        
        predictions = pd.DataFrame()
        init = tf.global_variables_initializer()
        local_init = tf.local_variables_initializer()
    
        with tf.Session() as session:
            epochs = 100
            batch_size = 50

            session.run(init)
            session.run(local_init)

            num_batches = math.ceil(User_Product_Matrix.shape[0] / batch_size)
            BatchIndex = 0

            for i in range(epochs):
                avg_cost = 0
                
                for batchNum in range(num_batches) :
                    if(batchNum == num_batches-1):
                        batch = User_Product_Matrix.iloc[batchNum :, :]
                    else :
                        batch = User_Product_Matrix.iloc[batchNum : batchNum+batch_size, :]
                        
                    _, l = session.run([self.optimizer, self.loss], feed_dict={self.X: batch})
                    
                    avg_cost += l
                    batchNum += batch_size

                avg_cost /= num_batches
                
#                 print("Epoch: {} Loss: {}".format(i + 1, avg_cost))
#             print("Predictions...")

            preds = session.run(self.decoder_op, feed_dict={self.X: User_Product_Matrix})
            predictions = predictions.append(pd.DataFrame(preds))
        return predictions
    
    
    def GetNextPredProduct(self, N, Pred_Matrix, Customer_Ids):

        # returns top N most likely to buy products for each user 
        # Data structure : Dict with userID key and list of N productIDs as value
  
        Pred_Products = {}
    
        No_Users = Pred_Matrix.shape[0]
        No_Products = Pred_Matrix.shape[1]
    
        for user in range(No_Users) :
            Pred_Row = np.array(Pred_Matrix.iloc[user, :])
            Sorted_Row = np.argsort(Pred_Row)[::-1][:N]
            Sorted_Row += 1
    
            userID = Customer_Ids[user]
            Pred_Products[userID] = Sorted_Row
    
        return Pred_Products
    
    
    def run_model(self, User_Product_Matrix, Cluster):
    
        Cluster_Num = Cluster[0]
        Cust_Id_List = Cluster[1]
        
        print("Constructing model with 2 encoder layer and 2 decoder layers..")
        self.construct_model()

        print ('Building optimizer, defining losses..')
        self.loss_optimize()
        
        print ('Ready to run..')
        predictions = self.run_session(User_Product_Matrix)
        
        print ('Got the predictions!')
        Pred_Products = self.GetNextPredProduct(10, predictions, Cust_Id_List)
        
        print ("Writing to file..")
        pp.pprint(Pred_Products)
        write_dict_to_csv(Pred_Products, str(Cluster_Num)+"_preds.csv")
        
        print ('Success!')
        
    
#     def store_model(self, path, model_num):
#         if not os.path.exists(path):
#             os.mkdir(path)
#         pickle.dump(self, open(path + '/' + str(model_num) + '_AE.pkl', 'wb'))

In [3]:
def write_dict_to_csv(dict, path):
    w = csv.writer(open(path, "w"))
    for key, val in dict.items():
        w.writerow([key, val])

In [4]:
## Train all clusters and get predictions
def Train_all_Clusters(Clusters):
    
    for Cluster in Clusters.items():
        
        Cluster_Num = Cluster[0]
        Cust_Id_List = Cluster[1]
        print ('\n====================================================================')
        print ('Processing begins for cluster number: ',+ Cluster_Num)
        Combined_Data = Combine_Data(Sales_data, User_data, Products_data, Cust_Id_List, UserID_Col, ProductID_Col)
        Processed_Data = Data_Processing(Combined_Data)
        
        FilePath = './Cluster_Matrices/' + str(Cluster_Num) + 'UP_Mat.csv'
        
        if os.path.exists(FilePath):
            print ('Loading User_Product_Matrix..')
            User_Product_Matrix = pd.read_csv(FilePath, sep=',')
        else:
            
            print ('Building User_Product_Matrix..')
            User_Product_Mat = CF_Matrix()
            User_Product_Mat.Build_User_Product_Matrix(Processed_Data, UserID_Col, ProductID_Col, Quantity_Col, Binary_Flag=1)
            User_Product_Matrix = pd.DataFrame(User_Product_Mat.Matrix)
            
            print ('Storing Matrix..')
            if not os.path.exists('./Cluster_Matrices'):
                os.mkdir('./Cluster_Matrices')
            User_Product_Matrix.to_csv(FilePath)
        
        
        print ('Matrix shape: ', User_Product_Matrix.shape)
        
        print ('Initializing Autoencoder parameters: weights, biases, X..')
        autoencoder = Autoencoder(User_Product_Matrix)
        autoencoder.run_model(User_Product_Matrix, Cluster)
        
#         print ('Storing model..')
#         path = './Autoencoders'
#         store_model(autoencoder, path, Cluster_num)


## Product predictions on test data

In [5]:
def fetch_stored_pred(pred_df, user_id):
    temp = pd.DataFrame()
    temp = pred_df[pred_df[0] == user_id]
    
    all_prod = temp.iloc[0,1]
    
    return all_prod

In [6]:
def Products_prediction(Cluster, user_id, Test_Vector, Col_List):

    cluster_num = Cluster[0]
    all_cust = Cluster[1]
 
    # read the predictions for this cluster
    pred_csv_path = './' + str(cluster_num) + "_preds.csv"
    pred_df = pd.read_csv(pred_csv_path, header=None)
    
    temp = []
    if user_id in all_cust:
        # existing user, fetch the prediction
        temp = fetch_pred(pred_df, user_id)
    else:
        # load User_Product_Matrix corresponding to cluster
        FilePath = './Cluster_Matrices/' + str(cluster_num) + 'UP_Mat.csv'
        User_Product_Matrix = pd.read_csv(FilePath, sep=',')
        
        # Determine User with highest cosine similarity wrt this user
        max_sim = -1
        for i in range(len(User_Product_Matrix)):
            sim = cosine_similarity([Test_Vector[Col_List]],[User_Product_Matrix.iloc[i,1:]])[0][0]
            if sim>max_sim:
                max_sim = sim
                similar_user = int(User_Product_Matrix.iloc[i,0])

        temp = fetch_stored_pred(pred_df, all_cust[similar_user])
        
    temp = temp.replace("[", "")
    temp = temp.replace("]", "")
    temp = temp.split()
    
    results = list(map(int, temp))
    print ("Reccommended product ids for user_" + str(user_id) + ": ")
    print (similar_user,':   ', results)
    
    return results


In [7]:
def Prediction(test_file_path, sep, Clusters):
    with open("./Saved_Model.pkl", 'rb') as file:
        Saved_Model = pickle.load(file)

    Col_List = [str(i) for i in range(1,453)]
    Test_Data = pd.read_csv(test_file_path, sep=sep)
    Test_Matrix = Test_Data[Col_List]    
    
    Pca_Dimensions = Saved_Model['Dimensions']
    Model_Predict = Saved_Model['Model']

    if Pca_Dimensions != Test_Matrix.shape[1]:
        Reduced_Matrix = PCA_Transform(Test_Matrix, Pca_Dimensions)
    else:
        Reduced_Matrix = Test_Matrix
        
    p = []
    for i, row in Reduced_Matrix.iterrows():
        row = [row]
        cluster_num = Model_Predict.predict(row)[0]
        Cluster = (cluster_num,Clusters[cluster_num])
        p.append(Products_prediction(Cluster,Test_Data.iloc[i,1],Test_Data.iloc[i,:], Col_List))
        
    return p
    

## Execution begins here...

In [None]:
if __name__ == '__main__':
    Data_Folder_Path ="../Data/SalesDB/"
    UserID_Col = "CustomerID"
    ProductID_Col = "ProductID"
    Quantity_Col = "Quantity"

    Sales_filepath = Data_Folder_Path + "sales.csv"
    Sales_Col_list = ["SalesID", UserID_Col, "ProductID", "Quantity", "SalesDate"]

    Products_filepath = Data_Folder_Path + "products.csv"
    Products_Col_list = ["ProductID", "ProductName", "CategoryID", "IsAllergic"]

    Users_filepath = Data_Folder_Path + "customers.csv"
    Users_Col_list = [UserID_Col, "FirstName", "LastName", "CityID"]
    
    TopN = 500 
    
    ## Create datasets from locally stored data which would be used by all the clusters
    Sales_data = Load_Data(Sales_filepath, Sales_Col_list, seperator=";")
    Products_data = Load_Data(Products_filepath, Products_Col_list, seperator=";")
    User_data = Load_Data(Users_filepath, Users_Col_list, seperator=";")
    
    ## Load clusters of training data(Buyer_Persona_Clustering)
    Clusters = pickle.load(open("./Clusters.pkl","rb"))
    
    # Train autoencoders to get predictions for train data
    Train_all_Clusters(Clusters)
    
    # Predictions for test data
    print ('\n\n====================================================================')
    print ('Getting Predictions for Test users..')
    test_file_path = "./User_Product_Matrix_Test.csv"
    sep = ','
    Test_data_predictions = Prediction(test_file_path, sep, Clusters)
    
    print (Test_data_predictions)

Loading data from  ../Data/SalesDB/sales.csv
number of rows, cols  (6758125, 5)
Loading data from  ../Data/SalesDB/products.csv
number of rows, cols  (452, 4)
Loading data from  ../Data/SalesDB/customers.csv
number of rows, cols  (98759, 4)

Processing begins for cluster number:  0
Loading User_Product_Matrix..
Matrix shape:  (296, 453)
Initializing Autoencoder parameters: weights, biases, X..
Instructions for updating:
Colocations handled automatically by placer.
Constructing model with 2 encoder layer and 2 decoder layers..
Building optimizer, defining losses..
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Ready to run..
Got the predictions!
Writing to file..
{38: array([  1,   2, 131, 285, 224, 333,  55, 362, 154,  58]),
 282: array([  1,   2, 131, 285, 224, 333,  55, 362, 154,  58]),
 575: array([  1,   2, 273, 331,  33, 190,  39, 401, 156, 424]),
 740: array([  1,   2, 131, 285, 224, 333,  55, 362, 154

Constructing model with 2 encoder layer and 2 decoder layers..
Building optimizer, defining losses..
Ready to run..
Got the predictions!
Writing to file..
{5448: array([156, 216,   4,  39,  24, 224, 248, 208, 228, 124]),
 28008: array([156, 216,   4,  39,  24, 224, 208, 248, 228, 124]),
 66950: array([156, 216,   4,  39,  24, 208, 224, 248, 124,  99]),
 95972: array([156, 216,   4,  39, 248, 228, 208, 224,  24, 124])}
Success!


Getting Predictions for Test users..
Reccommended product ids for user_34503: 
0 :    [1, 2, 59, 429, 409, 144, 61, 190, 359, 183]
Reccommended product ids for user_39625: 
65 :    [1, 2, 131, 285, 224, 333, 55, 362, 154, 58]
Reccommended product ids for user_18502: 
10 :    [1, 2, 57, 245, 95, 328, 437, 166, 257, 296]
Reccommended product ids for user_28607: 
3 :    [1, 19, 326, 222, 2, 183, 368, 33, 26, 414]
Reccommended product ids for user_95971: 
4 :    [1, 2, 390, 121, 328, 107, 450, 30, 418, 433]
Reccommended product ids for user_19067: 
5 :    [1, 2, 25