# RECOMMENDER USING NLP ON PRODUCT TITLE

4 phases
  1. Importing libraries and loading the dataset
  2. Preparing the dataset through data exploration and clean-up
  3. Preprocessing the data with stop-word removal
  4. Creating IDF-weighted Word2Vec features for titles and one-hot encoded features for brands


## 1 Importing Libraries and Preparing Dataset 

### 1.1 Importing Libraries

In [None]:
# disable auto-scrolling

%%javascript
IPython.OutputArea.prototype._should_scroll = 9999;

In [None]:
from PIL import Image as PilImage
import itertools
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import gzip
import json
import warnings
import random
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
import pickle
#Deep-Learning Library
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras import applications
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout

from IPython.display import display, Image, SVG, Math, YouTubeVideo

plotly.offline.init_notebook_mode(connected=True)
warnings.filterwarnings("ignore")


### 1.2 Loading Dataset

In [None]:
path = '../../../Data/meta_Clothing_Shoes_and_Jewelry.json.gz'

In [None]:
# Open the gzip-compressed metadata dump as UTF-8 text. The handle is left
# open on purpose: the following cell keeps reading records from `f`.
f = gzip.open(path, mode='rt', encoding='utf8')

# Peek at the first raw line to see what a record looks like. This consumes
# that line from `f`, so later reads start at the second line.
lines = [f.readline()]
lines

In [None]:
# load json
#
# Parse up to 90,000 records (one JSON document per line), continuing from
# the current read position of `f`.
# NOTE: this is a sequential read of the next lines in the file, not a random
# sample of it. islice stops cleanly at end of file, and blank lines are
# skipped rather than passed to json.loads (which would raise on '').
dataset = []
for raw_record in itertools.islice(f, 90000):
    if raw_record.strip():
        dataset.append(json.loads(raw_record))

In [None]:
# Build the working DataFrame from the list of parsed records
# (presumably one dict per product -- each becomes one row).
data = pd.DataFrame(dataset)

In [None]:
data.head()

In [None]:
# look at the columns of the dataframe
data.columns

In [None]:
data['similar_item']

## 2 Preparing Data

### 2.1 Removing Unnecessary features and Nan rows

In [None]:
# Keep only the columns the recommender uses, then preview the result
# (first rows and the (rows, columns) shape).
data = data[[
    'asin', 'brand', 'description', 'feature',
    'image', 'title', 'price',
]]
print(data.head())
print(data.shape)

In [None]:
# Drop every row whose 'image' entry is missing, then show the new shape.
data = data.dropna(axis=0, how='any', subset=['image'])
data.shape

In [None]:
# Keep only element [0] of each 'image' entry
# (presumably a list of image URLs -- confirm against the raw records).
first_images = data['image'].map(lambda urls: urls[0])
data.loc[:, 'image'] = first_images

In [None]:
# Show full cell contents (no truncation) when printing the image column.
# pandas >= 1.0 spells "no limit" as None and pandas 2.x rejects the legacy
# value -1; older releases only accept the integer form, so fall back to it.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
print(data.loc[:,'image'])

### 2.2 Data Exploration

In [None]:
print(data['brand'].describe())

In [None]:
brand_count = Counter(list(data['brand']))
brand_count.most_common(10)

In [None]:
print(data['title'].describe())

In [None]:
# Keep only the rows that have a title, then show the new shape.
data = data.loc[data['title'].notnull()]
data.shape

In [None]:
data.columns

In [None]:
# Keep only the rows that have an image value, then show the new shape.
data = data.loc[data['image'].notnull()]
data.shape

### 2.3 Removing duplicates for 'Title'

In [None]:
print(sum(data.duplicated('title')))

In [None]:
# Remove all products whose title has 4 words or fewer, then order the
# remaining rows by title so that near-identical titles sit next to each
# other: the de-duplication pass that follows only compares neighbouring rows
# of data_sorted.
has_enough_words = data['title'].apply(lambda title: len(title.split()) > 4)
data_sorted = data[has_enough_words].sort_values('title', ascending=False)
print("After removal of products with short description:", data_sorted.shape[0])

In [None]:
def _count_differing_words(words_a, words_b):
    """Return how many word positions differ between two tokenised titles.

    Words are compared position by position. Positions that exist in only one
    of the titles (because their lengths differ) count as differences, e.g.
    ['a', 'b', 'c', 'd'] vs ['a', 'b', 'd'] differ in 2 positions.
    """
    matches = sum(
        1
        for left, right in itertools.zip_longest(words_a, words_b)
        if left == right
    )
    return max(len(words_a), len(words_b)) - matches


def _dedupe_adjacent_titles(titles, asins, max_diff=2):
    """Keep one ASIN per run of consecutive near-duplicate titles.

    Args:
        titles: Iterable of title strings, in the order to scan.
        asins: Iterable of ASINs aligned with ``titles``.
        max_diff: Titles differing in at most this many word positions from
            the current representative are treated as the same product.

    Returns:
        List of ASINs: the first title of every run, in scan order.
    """
    title_words = [title.split() for title in titles]
    asin_list = list(asins)
    total = len(title_words)

    kept = []
    i = 0
    while i < total:
        # Skip forward over every following title that is a near-duplicate
        # of title i.
        j = i + 1
        while (j < total
               and _count_differing_words(title_words[i], title_words[j]) <= max_diff):
            j += 1

        # Title i represents its run. Appending here (rather than only when a
        # differing successor is found) makes sure the final run is kept too.
        kept.append(asin_list[i])

        # Continue from the first title that differed, or stop at the end.
        i = j
    return kept


# Row labels and row count of the frame being de-duplicated.
indices = list(data_sorted.index)
num_data_points = len(indices)

# Two titles are considered the same product when they differ in at most
# 2 word positions, e.g.
#   ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', "Women's", 'Shirt', 'X-Large']
#   ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', "Women's", 'Shirt', 'Small']
stage1_dedupe_asins = _dedupe_adjacent_titles(data_sorted['title'], data_sorted['asin'])

In [None]:
# Keep only the products whose ASIN survived title de-duplication.
is_kept = data['asin'].isin(stage1_dedupe_asins)
data = data.loc[is_kept]
print('Number of data points : ', data.shape[0])

In [None]:
data.head()

## 3 Data Preprocessing by Removing Stop Words

In [None]:
# English stop words as a set, for fast membership tests.
stop_words = set(stopwords.words('english'))
print ('list of stop words:', stop_words)


def nlp_preprocessing(total_text, index, column):
    """Clean one text value and write it back into the global ``data`` frame.

    Each whitespace-separated token has its non-alphanumeric characters
    removed and is lower-cased; tokens found in ``stop_words`` are dropped.
    The surviving tokens, each followed by a single space, are stored at
    ``data.loc[index, column]``.

    Args:
        total_text: Raw text to clean. Non-string values (numbers, NaN, ...)
            are left untouched.
        index: Row label in ``data`` to update.
        column: Column name in ``data`` to update.
    """
    if not isinstance(total_text, str):
        return

    kept = []
    for raw_word in total_text.split():
        # Remove special characters such as '"#$@!%^&*()_+-~?>< and lower-case.
        word = "".join(ch for ch in raw_word if ch.isalnum()).lower()
        # Stop-word removal.
        if word not in stop_words:
            kept.append(word + " ")

    # Assign through .loc: chained indexing (data[column][index] = ...) may
    # write to a temporary copy and leave ``data`` unchanged.
    data.loc[index, column] = "".join(kept)

In [None]:
# time.clock() was removed in Python 3.8; perf_counter() is the standard
# timer for measuring elapsed time.
start_time = time.perf_counter()
# Text-preprocess every title. Labels and titles are snapshotted first
# because nlp_preprocessing writes back into `data` while we loop.
for index, title in zip(list(data.index), data['title'].tolist()):
    nlp_preprocessing(title, index, 'title')
# Report how long preprocessing all titles took.
print(time.perf_counter() - start_time, "seconds")

In [None]:
data.head()

In [None]:
print(data.shape[0])

In [None]:
# Bag-of-words counts for the cleaned titles; display the matrix shape.
title_vectorizer = CountVectorizer()
title_features = title_vectorizer.fit_transform(data['title'])
title_features.shape

## 4 Creating the IDF weighted Word2Vec Title, Brand Using One Hot Encoder

### Defining the features 

### 4.1 IDF Title Features and W2V Title Features

In [None]:
# Fit a count vectorizer on the cleaned titles. idf_title_features starts out
# holding the vectorizer's counts; a later cell overwrites the non-zero
# entries with per-word IDF values.
idf_title_vectorizer = CountVectorizer()
idf_title_features = idf_title_vectorizer.fit_transform(data['title'])


#### Defining the Features for Brands and Color 

In [None]:
# Some of the brand values are missing; replace them with the placeholder
# string "Not given". The result is assigned back instead of calling
# fillna(inplace=True) on the selected column, which can operate on a
# temporary copy and leave `data` unchanged.
data['brand'] = data['brand'].fillna("Not given")

# replace spaces with hyphens
brands = [x.replace(" ", "-") for x in data['brand'].values]

# colors = [x.replace(" ", "-") for x in data['color'].values]

# Encode the brands with a count vectorizer.
# NOTE(review): CountVectorizer's default token pattern splits on non-word
# characters, so hyphenated brand names are presumably still broken into
# several tokens rather than encoded as one category -- confirm.
brand_vectorizer = CountVectorizer()
brand_features = brand_vectorizer.fit_transform(brands)

# color_vectorizer = CountVectorizer()
# color_features = color_vectorizer.fit_transform(colors)

#### Defining the Features for Visuals

In [None]:
#load the features and corresponding ASINS info.
# bottleneck_features_train = np.load('Apparel-Recommendation-System-master/data_1_cnn_features.npy')
# asins = np.load('Apparel-Recommendation-System-master/data_1_cnn_feature_asins.npy')
# asins = list(asins)

# load the original 16K dataset
# data = pd.read_pickle('pickels/16k_apperal_data_preprocessed')
# df_asins = list(data['asin'])

#### 4.1.1 Utility Functions for the IDF Word2Vec Weighted Title

In [None]:
word2vec_model = r'Apparel-Recommendation-System-master/word2vec_model'

In [None]:
def get_word_vec(sentence, doc_id, m_name):
    """Return one vector of length 300 per word of ``sentence``, in word order.

    Args:
        sentence: Title of the apparel; split on whitespace.
        doc_id: Row of ``idf_title_features`` that belongs to this title.
        m_name: ``'avg'`` to use ``model[word]`` as is, or ``'weighted'`` to
            multiply ``model[word]`` by the word's entry in
            ``idf_title_features``.

    Returns:
        numpy array with one row per word in ``sentence``. A word that cannot
        be embedded (not in ``vocab``, not in the IDF vocabulary in
        ``'weighted'`` mode, or an unrecognised ``m_name``) gets an all-zero
        row, so the rows always line up with ``sentence.split()``.
    """
    idf_vocabulary = idf_title_vectorizer.vocabulary_
    vec = []
    for word in sentence.split():
        # Default to zeros so every word contributes exactly one row.
        word_vec = np.zeros(shape=(300,))
        if word in vocab:
            if m_name == 'weighted':
                if word in idf_vocabulary:
                    word_vec = idf_title_features[doc_id, idf_vocabulary[word]] * model[word]
            elif m_name == 'avg':
                word_vec = model[word]
        vec.append(word_vec)
    return np.array(vec)

In [None]:
def build_avg_vec(sentence, num_features, doc_id, m_name):
    """Average the (optionally weighted) word vectors of a title.

    Args:
        sentence: Title of the apparel; split on whitespace.
        num_features: Length of the accumulator vector (the notebook passes
            300, the length of the word2vec vectors).
        doc_id: Row of ``idf_title_features`` that belongs to this title.
        m_name: ``'avg'`` adds ``model[word]``; ``'weighted'`` adds
            ``model[word]`` multiplied by the word's entry in
            ``idf_title_features``.

    Returns:
        1-D array of length ``num_features``: the sum of the contributing
        word vectors divided by the total number of words in the title.
        Words that contribute no vector still count toward the divisor.
        A title without words yields all zeros.
    """
    words = sentence.split()
    use_weights = m_name == 'weighted'
    total = np.zeros((num_features,), dtype="float32")

    for word in words:
        if word not in vocab:
            continue
        if use_weights and word in idf_title_vectorizer.vocabulary_:
            column = idf_title_vectorizer.vocabulary_[word]
            total = total + idf_title_features[doc_id, column] * model[word]
        elif m_name == 'avg':
            total = total + model[word]

    if not words:
        return total
    return total / len(words)

In [None]:
data.to_pickle('pickles/50K_data')

In [None]:
data = pd.read_pickle('pickles/50K_data')

In [None]:
# Load the pickled word -> vector mapping.
# NOTE(review): pickle.load can execute arbitrary code; only load a model
# file that comes from a trusted source.
with open(word2vec_model, 'rb') as model_file:
    model = pickle.load(model_file)

# vocab holds every word that has a vector in the loaded model.
vocab = model.keys()

# Build one weighted average vector per title; the position of a title in
# data['title'] is used as its row number in idf_title_features.
# NOTE(review): in top-to-bottom cell order idf_title_features still holds
# raw counts at this point (the IDF values are filled in by a later cell),
# so these vectors would be count-weighted -- confirm the intended run order.
title_vectors = [
    build_avg_vec(title, 300, row_number, 'weighted')
    for row_number, title in enumerate(data['title'])
]
doc_id = len(title_vectors)  # number of titles processed

# One row per title, 300 columns.
w2v_title_weight = np.array(title_vectors)

In [None]:
def n_containing(word):
    """Count the titles in ``data`` that contain ``word`` as a whole token."""
    doc_count = 0
    for title in data['title']:
        if word in title.split():
            doc_count += 1
    return doc_count

def idf(word):
    """Return log(N / df) for ``word``.

    N is the number of rows in ``data`` and df is ``n_containing(word)``.
    Raises ZeroDivisionError when no title contains ``word``.
    """
    total_docs = data.shape[0]
    docs_with_word = n_containing(word)
    return math.log(total_docs / docs_with_word)

In [None]:
data.to_pickle('pickles/50K_data')

In [None]:
data = pd.read_pickle('pickles/50K_data')

In [None]:
# The count matrix has to hold floating-point IDF values. The np.float alias
# was removed in NumPy 1.24; np.float64 is the type it stood for.
idf_title_features = idf_title_features.astype(np.float64)

for word, column in idf_title_vectorizer.vocabulary_.items():
    # IDF value of this vocabulary word over the whole corpus.
    idf_val = idf(word)

    # Rows (documents) whose entry for this word is non-zero, i.e. the
    # documents in which the word is present.
    doc_rows = idf_title_features[:, column].nonzero()[0]

    # Replace the stored count with the word's IDF value in each of them:
    # idf_title_features[doc_id, index_of_word_in_corpus] = idf value of word
    for row in doc_rows:
        idf_title_features[row, column] = idf_val

In [None]:
def get_distance(vec1, vec2):
    """Return the matrix of euclidean distances between two sets of vectors.

    Args:
        vec1: Sequence of vectors, one per word of the first title.
        vec2: Sequence of vectors, one per word of the second title.

    Returns:
        numpy array where entry [i, j] is the euclidean distance
        (``np.linalg.norm`` of the difference) between ``vec1[i]`` and
        ``vec2[j]``.
    """
    distance_rows = [
        np.array([np.linalg.norm(left - right) for right in vec2])
        for left in vec1
    ]
    return np.array(distance_rows)

In [None]:
# def display_img(url,ax,fig):
#     try:
#         #get the url of the apparel and download it
#         response = requests.get(url)
#         img = PilImage.open(BytesIO(response.content))
#         #display it in notebook 
#         plt.imshow(img)
#     except 

In [None]:
img_name = []

In [None]:
def heat_map_w2v(sentence1, sentence2, url, asin1, asin2, doc_id1, doc_id2, model):
    """Plot a heat map of word-to-word distances between two titles.

    Args:
        sentence1: Title of the input apparel (heat-map rows).
        sentence2: Title of the recommended apparel (heat-map columns); also
            used as the plot title.
        url: Image URL of the recommended apparel. Currently unused.
        asin1: ASIN of the input apparel. Currently unused.
        asin2: ASIN of the recommended apparel. Currently unused.
        doc_id1: Document id of the input apparel.
        doc_id2: Document id of the recommended apparel.
        model: Vector mode forwarded to ``get_word_vec``: ``'avg'`` or
            ``'weighted'``.
    """
    # One (weighted/avg) vector per word of each title.
    s1_vec = get_word_vec(sentence1, doc_id1, model)
    s2_vec = get_word_vec(sentence2, doc_id2, model)

    # s1_s2_dist[i, j] = euclidean distance between word i of title 1 and
    # word j of title 2.
    s1_s2_dist = get_distance(s1_vec, s2_vec)

    fig = plt.figure(figsize=(10,10))

    # Plot the heat map of the pairwise distances, labelled with the words
    # of the recommended title (x axis) and the input title (y axis).
    ax = sns.heatmap(np.round(s1_s2_dist,4), annot=True)
    ax.set_xticklabels(sentence2.split())
    ax.set_yticklabels(sentence1.split())
    ax.set_title(sentence2)

    plt.show()
    # Release the figure explicitly: this function is called once per
    # recommendation, and figures left open accumulate in memory on backends
    # that do not close them after show().
    plt.close(fig)
    

In [None]:
feat_asin_list = []
rec_asin_list = []
distance_list = []
# image_list = []

In [None]:
def weighted_w2v_model(doc_id, num_results):
    """Show and record the titles closest to one title in weighted-w2v space.

    For each of the ``num_results`` nearest rows of ``w2v_title_weight`` a
    heat map is drawn, the ASIN, brand and distance are printed, and the
    featured ASIN, recommended ASIN and distance are appended to the global
    ``feat_asin_list``, ``rec_asin_list`` and ``distance_list``.

    Args:
        doc_id: Row position of the input apparel in ``w2v_title_weight``
            (and, positionally, in ``data``).
        num_results: Number of nearest rows to report. The input apparel
            itself is normally among them, at distance 0.
    """
    # Distance from the input apparel to every apparel. pairwise_distances
    # uses the euclidean metric when no metric is given.
    query = w2v_title_weight[doc_id].reshape(1,-1)
    pairwise_dist = pairwise_distances(w2v_title_weight, query).flatten()

    # Row positions of the num_results smallest distances, and the distances.
    indices = np.argsort(pairwise_dist)[0:num_results]
    pdists = pairwise_dist[indices]

    # Data frame labels of those rows.
    df_indices = list(data.index[indices])

    # The featured item is always the queried row. Taking it from the top of
    # the ranking is not reliable: another apparel at distance 0 (identical
    # vector) can sort ahead of it.
    featured_label = data.index[doc_id]
    featured_title = data['title'].loc[featured_label]
    featured_asin = data['asin'].loc[featured_label]

    for position, (row, label) in enumerate(zip(indices, df_indices)):
        rec_asin = data['asin'].loc[label]
        heat_map_w2v(featured_title, data['title'].loc[label], data['image'].loc[label], featured_asin, rec_asin, doc_id, row, 'weighted')

        feat_asin_list.append(featured_asin)
        rec_asin_list.append(rec_asin)
        distance_list.append(pdists[position])

        print('ASIN :', rec_asin)
        print('Brand :', data['brand'].loc[label])
        print('euclidean distance from input :', pdists[position])
        print('='*125)

In [None]:
from PIL import Image as PilImage

In [None]:
# weighted_w2v_model(4000,20)

In [None]:
# Generate 5 recommendations for every title. Iterate over the rows that
# actually exist in w2v_title_weight instead of a hard-coded row count, which
# breaks (or silently skips rows) whenever the dataset size changes.
for i in range(len(w2v_title_weight)):
    print('')
    print(i)
    weighted_w2v_model(i, 5)

In [None]:
len(feat_asin_list)

In [None]:
len(rec_asin_list)

In [None]:
len(distance_list)

In [None]:
df = pd.DataFrame(columns = ['Featured Item', 'Recommended Item', 'Eucledian Distance'])

In [None]:
df['Featured Item'] = feat_asin_list

In [None]:
df['Recommended Item'] = rec_asin_list

In [None]:
df['Eucledian Distance'] = distance_list

In [None]:
df.head()