In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw Amazon apparel catalogue and report its dimensions.
fashion = pd.read_json('/kaggle/input/amazon-apparel-dataset/tops_fashion.json')
n_rows, n_cols = fashion.shape
print('Number of Data Points: ', n_rows)
print('Number of Features   : ', n_cols)

> The JSON file has shape (183138, 19). Below we check which features to keep and which to ignore, depending on their importance.

In [None]:
fashion.columns

**since this is a project to basically build an end to end clothing recommendation system lets use the following features:**

* asin -> Amazon Standard Identification Number
* product_type_name -> Type of the apparel, ex: SHIRT/TSHIRT
* formatted_price -> Price of the product
* color -> Color information of apparel, it can contain many colors as a value ex: red and black stripes
* brand -> Brand to which the product belongs to
* title -> Title of the product
* medium_image_url -> url of the image

We could use the other columns as well, but they would not matter much for the recommendations, so we leave them out.

In [None]:
fashion_data = fashion[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]

In [None]:
print('Number of Data Points: ', fashion_data.shape[0])
print('Number of Features   : ', fashion_data.shape[1])

In [None]:
fashion_data.head()

# **Now let's look at the products that we have**

In [None]:
fashion_data['product_type_name'].describe()

In [None]:
# so let's see all the unique fashion that we have
fashion_data['product_type_name'].unique()

In [None]:
# let's look at the 10 most frequent products
from collections import Counter

count_product_type = Counter(list(fashion_data['product_type_name']))
count_product_type.most_common(10)

# **Nice, now let's look at the brands**

In [None]:
# Only the last expression of a cell is displayed, so the unique-brand count
# is printed explicitly; describe() remains the cell's rich output.
print('Number of unique brands: ', fashion_data['brand'].nunique())
fashion_data['brand'].describe()

In [None]:
# ok we have some missing values in brands. let's see the most frequent brand names
# (stored under its own name so the product-type counter is not overwritten)

count_brands = Counter(list(fashion_data['brand']))
count_brands.most_common(10)

# **Colors**

In [None]:
fashion_data['color'].describe()

In [None]:
#183138 - 64956 = 118182 missing values, approx. 64.5% missing colors
#find the 10 most frequent colors
#118182 values are None (missing)

count_colors = Counter(list(fashion_data['color']))
count_colors.most_common(10)


# **What about price?**

In [None]:
fashion_data['formatted_price'].describe()

In [None]:
#183138 - 28395 = 154743 missing values, approx. 84.50% missing price😱


count_prices = Counter(list(fashion_data['formatted_price']))
count_prices.most_common(10)


# **Title**

In [None]:
fashion_data['title'].describe()

#all of the products have a title & they are fairly descriptive of what the product is
#we use titles extensively as they are short and informative

Now let's perform data cleaning/preprocessing. Here price has the highest null values so let's drop all the rows having null values

In [None]:
# Keep only the rows that actually carry a price.
has_price = fashion_data['formatted_price'].notnull()
fashion_data = fashion_data.loc[has_price]
print("Number of data points after eliminating null price values: ", fashion_data.shape[0])

In [None]:
# Keep only the rows that actually carry a color.
has_color = fashion_data['color'].notnull()
fashion_data = fashion_data.loc[has_color]
print("Number of data points after eliminating null color values: ", fashion_data.shape[0])

> So, out of 100% of the available data, we are left with 15.499% after removing the null values.
This is still sufficient, but it would have been better to have the full data without null values.

**Now lets save this data using pickle and also download the images for products from the URLs provided**

In [None]:
import pickle
fashion_data.to_pickle("/kaggle/working/28k_dataset")

In [None]:
fashion_data = pd.read_pickle("/kaggle/working/28k_dataset")

In [None]:
fashion_data.head()

In [None]:
# let's find the number of products having same titles since it isn't visible above
sum(fashion_data.duplicated('title'))

Too many, so let's start removing the duplicate titles.
The first step is to remove products whose titles have fewer than 5 words.

In [None]:
# Titles with fewer than five whitespace-separated words are filtered out.
has_enough_words = fashion_data['title'].map(lambda title: len(title.split()) >= 5)
sorted_fashion_data = fashion_data[has_enough_words]
print("After removal of products with short description: ", sorted_fashion_data.shape[0])

In [None]:
# Sort the whole data by title in descending (reverse-alphabetical) order so
# that near-identical titles end up next to each other.
# sort_values returns a new frame; sorting in place would mutate a filtered
# slice of fashion_data, which pandas flags with SettingWithCopyWarning.
sorted_fashion_data = sorted_fashion_data.sort_values('title', ascending=False)
sorted_fashion_data.head()

In [None]:
import itertools


def _dedupe_adjacent_titles(titles, asins, max_diff=2):
    """Keep one ASIN per run of near-identical neighbouring titles.

    ``titles`` and ``asins`` are parallel lists, already sorted by title.
    Starting from an anchor title, every following title is compared to it
    word by word (position by position); titles that differ from the anchor
    in at most ``max_diff`` positions belong to the anchor's group. The
    first title that differs in more positions starts the next group.

    Returns the list of anchor ASINs, one per group, including the final
    group (which has no differing successor to close it).
    """
    kept = []
    total = len(titles)
    start = 0
    while start < total:
        anchor_words = titles[start].split()
        nxt = start + 1
        while nxt < total:
            other_words = titles[nxt].split()
            longest = max(len(anchor_words), len(other_words))
            # zip_longest pads the shorter title with None, which never
            # equals a word, so padded positions count as differences
            same = sum(x == y for x, y in itertools.zip_longest(anchor_words, other_words))
            if longest - same > max_diff:
                break
            nxt += 1
        kept.append(asins[start])
        start = nxt
    return kept


indices = list(sorted_fashion_data.index)
num_data_points = sorted_fashion_data.shape[0]

# NOTE: despite its name, duplicate_ID holds the ASINs that are KEPT
# (one representative per group of near-identical adjacent titles).
duplicate_ID = _dedupe_adjacent_titles(
    sorted_fashion_data['title'].tolist(),
    sorted_fashion_data['asin'].tolist(),
)


The above code is trying to find similar products based on their titles. Here's a simplified explanation of what the code does:

1. It creates an empty list called `indices` to store the row indices of the fashion data.
2. It loops through the rows of the fashion data and appends the indices to the `indices` list.
3. It initializes an empty list called `duplicate_ID` to store the duplicate product IDs.
4. It sets up variables `i` and `j` to keep track of indices.
5. It determines the total number of data points in the fashion data.
6. It enters a while loop that continues as long as `i` and `j` are within the data points range.
7. Inside the loop, it splits the title of the current row into a list of words, stored in variable `a`.
8. It enters another while loop to sequentially search for similar products.
9. Inside the second loop, it splits the title of the next row into a list of words, stored in variable `b`.
10. It calculates the maximum length between the two lists of words.
11. It iterates over the corresponding words in `a` and `b`, counting the number of matching words.
12. If the difference between the maximum length and the count of matching words is greater than 2, it considers the products different.
13. It appends the product ID of the first item (`indices[i]`) to the `duplicate_ID` list.
14. It updates the value of `i` to the index of the second item (`indices[j]`).
15. If the difference is not greater than 2, it increments `j` to check the next item.
16. If the value of `previous_i` is equal to `i`, it means no new similar apparels were found, and the outer loop is exited.
17. Finally, `duplicate_ID` holds the product IDs that are kept: one representative for each run of adjacent, near-identical titles. Despite its name, it is not a list of the discarded duplicates; the next cell uses it as a keep-list.

In summary, the code walks through the title-sorted products, treats consecutive titles that differ in at most two word positions as the same product, and collects one product ID per such group in `duplicate_ID` so that only those rows are kept in the next step.

In [None]:
# now we removed the duplicates which differ only at the end

fashion_data = sorted_fashion_data.loc[sorted_fashion_data['asin'].isin(duplicate_ID)]

In [None]:
print("Number of data points after removing duplicates which differ only at the end: ",fashion_data.shape[0])

**We don't know how many images are actually present in the website at present, so let's only use those rows which have working urls**

In [None]:
from PIL import Image
import requests
from io import BytesIO

# Probe every image URL. Failing rows are collected and dropped in one step
# after the loop, so the frame is never modified while it is being iterated.
unreachable_rows = []
for index, url in fashion_data['medium_image_url'].items():
    try:
        # the timeout keeps a single stalled connection from hanging the cell
        response = requests.get(url, timeout=10)
        # treat HTTP error responses (404, 5xx, ...) as a missing image
        response.raise_for_status()
        # opening the payload confirms that it really is an image
        with Image.open(BytesIO(response.content)):
            pass
    except Exception as e:
        # best effort: any failure just means the row is not usable
        print(f"Error processing image for row {index}: {e}")
        unreachable_rows.append(index)

fashion_data = fashion_data.drop(index=unreachable_rows)

print("Number of data points after removing rows having error in retrieving images: ",fashion_data.shape[0])

17592-17509 = 83 is the amount of images that are not there anymore. **GOOD**

In the previous cell, we sorted whole data in alphabetical order of titles. Then, we removed titles which are adjacent and very similar title.

But there are some products whose titles are not adjacent but very similar.

Examples:
Titles-1
 UltraClub Women's Classic Wrinkle-Free Long Sleeve Oxford Shirt, Pink, XX-Large
 UltraClub Ladies Classic Wrinkle-Free Long-Sleeve Oxford Light Blue XXL

Titles-2
 EVALY Women's Cool University Of UTAH 3/4 Sleeve Raglan Tee
 EVALY Women's Unique University Of UTAH 3/4 Sleeve Raglan Tees
 EVALY Women's New University Of UTAH 3/4-Sleeve Raglan Tshirt
 
let's try to remove these types of titles as well

In [None]:
#this code snippet takes significant amount of time
#O(n^2) time. It takes about an hour to run on a decent system

# Tokenise every title once instead of re-splitting it inside the inner loop.
# Assumes the index labels of fashion_data are unique (they are used as keys).
title_words = {idx: title.split() for idx, title in fashion_data['title'].items()}
asin_of = fashion_data['asin'].to_dict()

indices = list(fashion_data.index)
# NOTE: despite its name, duplicate_ID collects the ASINs that are KEPT.
duplicate_ID = []

while len(indices) != 0:
    i = indices.pop()
    duplicate_ID.append(asin_of[i])

    a = title_words[i]

    # Build the list of survivors instead of calling indices.remove(j) while
    # iterating over indices: removing during iteration skips the element
    # that follows each removed one, so some near-duplicates were never compared.
    remaining = []
    for j in indices:
        b = title_words[j]
        length = max(len(a), len(b))
        count = sum(k[0] == k[1] for k in itertools.zip_longest(a, b))

        # fewer than 3 differing positions => same product as i, so drop j
        if (length - count) >= 3:
            remaining.append(j)
    indices = remaining


Explanation:

1. It creates a list called `indices` containing the indices of the rows in the fashion data.
2. It initializes an empty list called `duplicate_ID` to store the duplicate product IDs.
3. It enters a while loop that continues until the `indices` list is empty.
4. Inside the loop, it pops the last index from the `indices` list and assigns it to `i`.
5. It appends the product ID (`fashion_data['asin'].loc[i]`) to the `duplicate_ID` list.
6. It splits the title of the current row into a list of words and assigns it to `a`.
7. It enters a nested loop to compare the title of the current row with the remaining rows in the `indices` list.
8. Inside the nested loop, it splits the title of the current compared row into a list of words and assigns it to `b`.
9. It calculates the maximum length between the lists `a` and `b`.
10. It counts the number of matching words between the two lists using `itertools.zip_longest` and assigns it to `count`.
11. If the difference between the maximum length and the count of matching words is less than 3, it considers the products the same and removes the compared row index from the `indices` list.
12. After the nested loop, the code continues with the next iteration of the outer loop.
13. When the loop finishes, the `indices` list is empty and the `duplicate_ID` list holds the product IDs of the items that are kept (one per group of near-identical titles), despite the list's name.

In summary, the code compares each kept title against all remaining titles using a nested loop (O(n^2) comparisons, so it is slow) and removes the indices of similar products, resulting in the `duplicate_ID` list containing the product IDs of the unique items.

In [None]:
#from whole previous products we will consider only the products that are found in previous cell 
fashion_data = fashion_data.loc[fashion_data['asin'].isin(duplicate_ID)]

In [None]:
print("Number of data points after removing duplicates: ",fashion_data.shape[0])

In [None]:
fashion_data.info()

let's drop null values, if any are present and most null values are present in brand

In [None]:
fashion_data = fashion_data.dropna()

In [None]:
fashion_data.shape[0]

Now that we have removed duplicate data, let's perform some text preprocessing, since
the similarity models below work on word counts and need clean, normalized titles.

In [None]:
fashion_data.to_pickle("/kaggle/working/16k_dataset")

In [None]:
import math
import time
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
import PIL
from pathlib import Path
from PIL import UnidentifiedImageError
from PIL import Image
import requests
from io import BytesIO

**There might be many stop words like 'are, is to..' etc and these should not be present while training the model and also special characters should also be removed from the model.**

In [None]:
New_fashion_data = pd.read_pickle('/kaggle/working/16k_dataset')

In [None]:
#we use the list of stop words that are downloaded from nltk lib
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # the corpus is not installed in this environment: fetch it once and retry
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

def nlp_preprocessing(total_text, index, column):
    """Clean one text value and write it back into ``New_fashion_data``.

    Each whitespace-separated token is stripped of non-alphanumeric
    characters and lower-cased; tokens found in ``stop_words`` are dropped.
    The surviving tokens are joined with single spaces (a trailing space is
    kept, as before) and stored at ``New_fashion_data.loc[index, column]``.

    Parameters
    ----------
    total_text : the raw text; non-string values are left untouched.
    index : index label of the row to update.
    column : name of the column to update.
    """
    if not isinstance(total_text, str):
        return

    string = ""
    for words in total_text.split():
        # remove the special chars like '"#$@!%^&*()_+-~?>< etc. and lower-case
        word = "".join(e for e in words if e.isalnum()).lower()

        # stop-word removal
        if word not in stop_words:
            string += word + " "

    # single .loc assignment: the chained form df[column][index] = ... may
    # write to a temporary copy and is a no-op under pandas copy-on-write
    New_fashion_data.loc[index, column] = string

In [None]:
# Wall-clock timer for the whole preprocessing pass.
start_time = time.perf_counter()
#we take each title and we text-preprocess it
# (nlp_preprocessing stores the cleaned text back into New_fashion_data
#  at the same index label, in the 'title' column)

for index, row in New_fashion_data.iterrows():
    nlp_preprocessing(row['title'], index, 'title')
    
#we print the time it took to preprocess whole titles 
print(time.perf_counter() - start_time, "Sec")

In [None]:
New_fashion_data.head()

In [None]:
New_fashion_data.to_pickle("/kaggle/working/16k_dataset_no_stop_words")

# let's write some utility functions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec
from collections import Counter
from PIL import Image
import requests
from io import BytesIO

def display_img(url, ax, fig):
    """Download the image at ``url`` and draw it on ``ax``.

    Parameters
    ----------
    url : address of the image to fetch.
    ax : matplotlib axes to draw on; when ``None`` the current axes are used.
    fig : the enclosing figure (accepted for interface compatibility, unused).

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    # the timeout keeps a stalled connection from hanging the notebook
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    img = Image.open(BytesIO(response.content))
    # draw on the axes we were given rather than on implicit pyplot state
    target = ax if ax is not None else plt.gca()
    target.imshow(img)

def plot_heatmap(keys, values, labels, url, text):
    """Show a one-row heatmap of ``values`` next to the product image.

    ``keys`` become the x tick labels, ``labels`` the numbers annotated in
    the cells, ``text`` the heatmap title, and ``url`` the image to display.
    """
    layout = gridspec.GridSpec(2, 2, width_ratios=[4, 1], height_ratios=[4, 1])
    fig = plt.figure(figsize=(25, 3))

    # first grid cell: the heatmap (seaborn draws on the current axes)
    plt.subplot(layout[0])
    heat_ax = sns.heatmap(np.array([values]), annot=np.array([labels]))
    heat_ax.set_xticklabels(keys)
    heat_ax.set_title(text)

    # second grid cell: the product image, without grid or ticks
    image_ax = plt.subplot(layout[1])
    image_ax.grid(False)
    image_ax.set_xticks([])
    image_ax.set_yticks([])

    display_img(url, image_ax, fig)

    plt.show()

def plot_heatmap_image(doc_id, vec1, vec2, url, text, model):
    """Plot the words of ``vec2`` as a heatmap, highlighting those shared with ``vec1``.

    Parameters
    ----------
    doc_id : row of the feature matrix used to look up the cell annotations
        for the 'tfidf' and 'idf' models.
    vec1, vec2 : word -> count mappings of the query and candidate titles.
        Neither mapping is modified.
    url : image URL forwarded to ``plot_heatmap``.
    text : heatmap title forwarded to ``plot_heatmap``.
    model : one of 'bag_of_words', 'tfidf', 'idf'; selects the annotations.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    shared = set(vec1.keys()) & set(vec2.keys())

    keys = list(vec2.keys())
    # heat value: the candidate's count for shared words, 0 for the others
    # (computed on the side so the caller's vec2 is left untouched)
    values = [vec2[k] if k in shared else 0 for k in keys]

    def _weights(vectorizer, features):
        # annotation per word: its weight in row doc_id, 0 if out of vocabulary
        vocabulary = vectorizer.vocabulary_
        return [features[doc_id, vocabulary[k]] if k in vocabulary else 0 for k in keys]

    if model == 'bag_of_words':
        labels = values
    elif model == 'tfidf':
        labels = _weights(tfidf_title_vectorizer, tfidf_title_features)
    elif model == 'idf':
        labels = _weights(idf_title_vectorizer, idf_title_features)
    else:
        raise ValueError(f"unknown model {model!r}; expected 'bag_of_words', 'tfidf' or 'idf'")

    plot_heatmap(keys, values, labels, url, text)

def text_to_vector(text):
    """Return a Counter mapping each run of word characters in ``text`` to its frequency."""
    return Counter(re.findall(r'\w+', text))

def get_result(doc_id, content_a, content_b, url, model):
    """Turn both titles into word counts and plot the candidate against the query.

    ``content_a`` is the query title, ``content_b`` the candidate title
    (also used as the plot title); the rest is passed through unchanged.
    """
    query_counts = text_to_vector(content_a)
    candidate_counts = text_to_vector(content_b)
    plot_heatmap_image(doc_id, query_counts, candidate_counts, url, content_b, model)


Explanation:

1. The `display_img` function takes a URL as input, downloads the image from the URL, and displays it.
2. The `plot_heatmap` function takes lists of keys, values, and labels, along with an image URL and text as inputs. It plots a heatmap with the given data and displays the image.
3. The `plot_heatmap_image` function takes the document ID, two vector dictionaries, an image URL, text, and a model type as inputs. It finds the intersection of keys in both vectors, sets the values of non-intersecting keys to zero, and then calls the `plot_heatmap` function to plot the heatmap and display the image.
4. The `text_to_vector` function takes a text string as input, splits it into words, and counts the occurrences of each word. It returns a dictionary with words as keys and their respective counts as values.
5. The `get_result` function takes the document ID, two content strings, an image URL, and a model type as inputs. It converts the content strings into vector dictionaries using the `text_to_vector` function and then calls the `plot_heatmap_image` function to plot the heatmap and display the image.

Overall, these utility functions are used to visualize the comparison between two text documents by plotting a heatmap of common words and displaying an associated image. It provides a visual representation of the similarity between the documents based on the words they share.

1. Bag of Words (BoW) Based Product Similarity

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
title_vectorizer = CountVectorizer()
title_features = title_vectorizer.fit_transform(New_fashion_data['title'])

from sklearn.metrics import pairwise_distances

def bag_of_words_model(doc_id, num_results):
    """Show the ``num_results`` titles closest to row ``doc_id`` in bag-of-words space.

    ``doc_id`` is a positional row number of ``title_features`` (and of
    ``New_fashion_data``). For every hit a heatmap and the product image
    are plotted and the ASIN, brand, title and distance are printed.
    """
    # pairwise_distances is called with its default metric (Euclidean)
    pairwise_dist = pairwise_distances(title_features, title_features[doc_id]).flatten()
    indices = np.argsort(pairwise_dist)[0:num_results]
    pdists = pairwise_dist[indices]
    df_indices = list(New_fashion_data.index[indices])

    # Take the query title from doc_id itself: when several rows are at
    # distance 0, the first hit is not guaranteed to be the query row.
    query_title = New_fashion_data['title'].iloc[doc_id]

    for i in range(len(indices)):
        get_result(indices[i], query_title, New_fashion_data['title'].loc[df_indices[i]], New_fashion_data['medium_image_url'].loc[df_indices[i]], 'bag_of_words')
        print('ASIN:', New_fashion_data['asin'].loc[df_indices[i]])
        print('Brand:', New_fashion_data['brand'].loc[df_indices[i]])
        print('Title:', New_fashion_data['title'].loc[df_indices[i]])
        # the value is a distance (smaller = more similar), not a similarity
        print('Euclidean Distance From The Query Image:', pdists[i])
        print('-.'*60)

bag_of_words_model(921, 20)


Explanation:

1. The code uses the `CountVectorizer` from scikit-learn to convert the titles of fashion data into a feature matrix (`title_features`), where each row represents a document and each column represents a word in the corpus. The shape of `title_features` is the number of data points (rows) by the number of words in the corpus (columns).
2. The `pairwise_distances` function from scikit-learn is called with its default metric, so it calculates the Euclidean distance between the feature vector of the input apparel and that of every apparel in the data. A smaller distance means a more similar title.
3. The indices of the smallest distances (`num_results`) are sorted and stored in `indices`, and the corresponding distances are stored in `pdists`.

        np.argsort will return the indices of the num_results smallest distances
        pdists will store the num_results smallest distances
        
        
4. The loop iterates over the indices and calls the `get_result` function to plot a heatmap and display an associated image for each similar apparel. It also prints the ASIN, brand, title, and Euclidean similarity with the query image.
5. Finally, the `bag_of_words_model` function is called with a specific document ID (921) and the number of similar results desired (20).
6. In the output heat map each value represents the count value of the label word, the color represents the intersection with inputs title

change the index and number of results and check for yourself.

Overall, this code performs a bag-of-words model using the Euclidean distance metric to find similar products based on their titles. It visualizes the results by plotting heatmaps and displaying images of the similar apparels.

2. TF IDF Based Product Similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=1 keeps every term, exactly like the previous min_df=0, but newer
# scikit-learn releases reject an integer min_df below 1 during parameter validation.
tfidf_title_vectorizer = TfidfVectorizer(min_df=1)
tfidf_title_features = tfidf_title_vectorizer.fit_transform(New_fashion_data['title'])

from sklearn.metrics import pairwise_distances

def tfidf_model(doc_id, num_results):
    """Show the ``num_results`` titles closest to row ``doc_id`` in TF-IDF space.

    ``doc_id`` is a positional row number of ``tfidf_title_features`` (and
    of ``New_fashion_data``). For every hit a heatmap and the product image
    are plotted and the ASIN, brand, title and distance are printed.
    """
    # pairwise_distances is called with its default metric (Euclidean)
    pairwise_dist = pairwise_distances(tfidf_title_features, tfidf_title_features[doc_id]).flatten()
    indices = np.argsort(pairwise_dist)[0:num_results]
    pdists = pairwise_dist[indices]
    df_indices = list(New_fashion_data.index[indices])

    # Take the query title from doc_id itself: when several rows are at
    # distance 0, the first hit is not guaranteed to be the query row.
    query_title = New_fashion_data['title'].iloc[doc_id]

    for i in range(len(indices)):
        get_result(indices[i], query_title, New_fashion_data['title'].loc[df_indices[i]], New_fashion_data['medium_image_url'].loc[df_indices[i]], 'tfidf')
        print('ASIN:', New_fashion_data['asin'].loc[df_indices[i]])
        print('Brand:', New_fashion_data['brand'].loc[df_indices[i]])
        print('Title:', New_fashion_data['title'].loc[df_indices[i]])
        # the value is a distance (smaller = more similar), not a similarity
        print('Euclidean Distance From The Query Image:', pdists[i])
        print('-.'*60)

tfidf_model(921, 20)


Explanation:

1. The code uses the `TfidfVectorizer` from scikit-learn to convert the titles of fashion data into a feature matrix (`tfidf_title_features`), where each row represents a document and each column represents a word in the corpus. The shape of `tfidf_title_features` is the number of data points (rows) by the number of words in the corpus (columns).
2. The `pairwise_distances` function from scikit-learn is called with its default metric, so it calculates the Euclidean distance between the TF-IDF feature vector of the input apparel and that of every apparel in the data. A smaller distance means a more similar title.
3. The indices of the smallest distances (`num_results`) are sorted and stored in `indices`, and the corresponding distances are stored in `pdists`.
4. The loop iterates over the indices and calls the `get_result` function to plot a heatmap and display an associated image for each similar apparel. It also prints the ASIN, brand, title, and Euclidean similarity with the query image.
5. Finally, the `tfidf_model` function is called with a specific document ID (921) and the number of similar results desired (20).

Overall, this code performs a TF-IDF model using the Euclidean distance metric to find similar products based on their titles. It visualizes the results by plotting heatmaps and displaying images of the similar apparels.

3. IDF Based Product Similarity i.e., let's drop TF and see

In [None]:

idf_title_vectorizer = CountVectorizer()
idf_title_features = idf_title_vectorizer.fit_transform(New_fashion_data['title'])
idf_title_features.get_shape()

def n_containing(word):
    """Count the titles in ``New_fashion_data`` that contain ``word`` as a whole token."""
    hits = 0
    for title in New_fashion_data['title']:
        if word in title.split():
            hits += 1
    return hits

def idf(word):
    """Inverse document frequency: log(number of titles / titles containing ``word``)."""
    total_docs = New_fashion_data.shape[0]
    return math.log(total_docs / n_containing(word))

# np.float was only an alias of the builtin float and has been removed from
# NumPy; np.float64 is the same dtype.
idf_title_features = idf_title_features.astype(np.float64)

# Replace every non-zero count by the IDF of its column's term, so a cell
# holds idf(term) when the title contains the term and 0 otherwise.
for term, col in idf_title_vectorizer.vocabulary_.items():
    idf_val = idf(term)
    for j in idf_title_features[:, col].nonzero()[0]:
        idf_title_features[j, col] = idf_val

def idf_model(doc_id, num_results):
    """Show the ``num_results`` titles closest to row ``doc_id`` in IDF-weighted space.

    ``doc_id`` is a positional row number of ``idf_title_features`` (and of
    ``New_fashion_data``). For every hit a heatmap and the product image
    are plotted and the ASIN, brand, title and distance are printed.
    """
    # pairwise_distances is called with its default metric (Euclidean)
    pairwise_dist = pairwise_distances(idf_title_features, idf_title_features[doc_id]).flatten()
    indices = np.argsort(pairwise_dist)[0:num_results]
    pdists = pairwise_dist[indices]
    df_indices = list(New_fashion_data.index[indices])

    # Take the query title from doc_id itself: when several rows are at
    # distance 0, the first hit is not guaranteed to be the query row.
    query_title = New_fashion_data['title'].iloc[doc_id]

    for i in range(len(indices)):
        get_result(indices[i], query_title, New_fashion_data['title'].loc[df_indices[i]], New_fashion_data['medium_image_url'].loc[df_indices[i]], 'idf')
        print('ASIN:', New_fashion_data['asin'].loc[df_indices[i]])
        print('Brand:', New_fashion_data['brand'].loc[df_indices[i]])
        print('Title:', New_fashion_data['title'].loc[df_indices[i]])
        print('Euclidean Distance From The Query Image:', pdists[i])
        print('-.'*60)

idf_model(921, 20)


Explanation:

1. The code uses the `CountVectorizer` from scikit-learn to convert the titles of fashion data into a feature matrix (`idf_title_features`), where each row represents a document and each column represents a word in the corpus. The shape of `idf_title_features` is the number of data points (rows) by the number of words in the corpus (columns).
2. The `n_containing` function calculates the number of documents in the corpus that contain a specific word.
3. The `idf` function calculates the inverse document frequency (IDF) for a given word. It is computed as the logarithm of the total number of documents divided by the number of documents that contain the word.
4. The feature matrix `idf_title_features` is converted to float type.
5. The loop iterates over the vocabulary of the `idf_title_vectorizer`, calculates the IDF value for each word, and replaces the corresponding count values in the feature matrix with the IDF values.
6. The `idf_model` function is defined to find similar apparel based on the IDF-weighted title features. It calculates the pairwise Euclidean distances (the default metric of `pairwise_distances`) between the IDF-weighted feature vectors of the titles.
7. The indices of the smallest distances (`num_results`) are sorted and stored in `indices`, and the corresponding distances are stored in `pdists`.
8. The loop iterates over the indices and calls the `get_result` function to plot a heatmap and display an associated image for each similar apparel. It also prints the ASIN, brand, title, and Euclidean distance from the query image.
9. Finally, the `idf_model` function is called with a specific document ID (921) and the number of similar results desired (20).

Overall, this code performs an IDF model using Euclidean distance to find similar products based on their titles. It applies IDF weighting to the count-based title features, calculates the pairwise distances, and visualizes the results using heatmaps and images.


# let's use word2vec

what is word2vec?
Word2Vec is a widely used technique in natural language processing (NLP) that aims to represent words as dense vector embeddings in a continuous vector space. It was introduced by Tomas Mikolov et al. in 2013 and has since become a popular method for learning word representations.

The main idea behind Word2Vec is to learn word embeddings by training a neural network on large amounts of text data. There are two primary architectures used in Word2Vec: Continuous Bag of Words (CBOW) and Skip-gram.

1. Continuous Bag of Words (CBOW): In this architecture, the model predicts the current word based on the context of surrounding words. The input to the model is a set of context words, and the output is the target word. CBOW is useful when we want to predict missing words or estimate the probability of a word given its context.

2. Skip-gram: This architecture is the inverse of CBOW. It predicts the surrounding context words given a target word. The input to the model is a target word, and the output is a set of context words. Skip-gram is useful when we want to find words that are semantically related or to perform tasks such as word analogy.

Both CBOW and Skip-gram models are trained by feeding pairs of target and context words into a neural network. The network is trained to maximize the likelihood of predicting the correct context words given the target word. During training, the model adjusts the word vectors in such a way that similar words have similar vector representations in the embedding space.

The resulting word embeddings capture semantic and syntactic relationships between words. Words with similar meanings or that often appear in similar contexts will have similar vector representations. These word embeddings can be used in various NLP tasks such as sentiment analysis, machine translation, information retrieval, and more.

Word2Vec has been influential in the field of NLP because it provides a computationally efficient and effective way to learn high-dimensional word representations from large text corpora, enabling machines to better understand and process natural language.

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

model = KeyedVectors.load_word2vec_format('/kaggle/input/word-2-vec/GoogleNews-vectors-negative300.bin', binary=True)

#or use 

#model = KeyedVectors.load_word2vec_format('/kaggle/input/amazon-apparel-dataset/word2vec_model', binary=True)
# both are same it's just size difference

new utility functions

In [None]:
# another utility function

def get_word_vec(sentence, doc_id, m_name):
    """Return one 300-d vector per word of ``sentence``, in word order.

    Parameters
    ----------
    sentence : whitespace-separated title.
    doc_id : row of ``idf_title_features`` used for the IDF weights.
    m_name : 'avg' for the plain word2vec vector, 'weighted' for the
        word2vec vector scaled by the word's IDF weight in row ``doc_id``.

    Returns
    -------
    numpy array with exactly one row per word. Words with no usable vector
    (not in ``vocab``, or, in 'weighted' mode, not in the IDF vocabulary)
    get a zero row, so row k always corresponds to word k.
    """
    vec = []
    for word in sentence.split():
        has_embedding = word in vocab
        if has_embedding and m_name == 'weighted' and word in idf_title_vectorizer.vocabulary_:
            weight = idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[word]]
            vec.append(weight * model[word])
        elif has_embedding and m_name == 'avg':
            vec.append(model[word])
        else:
            # keep a placeholder instead of dropping the word, otherwise the
            # rows stop lining up with the words of the sentence
            vec.append(np.zeros(shape=(300,)))
    return np.array(vec)

def get_distance(vec1, vec2):
    """Return the matrix of Euclidean distances between the rows of ``vec1`` and ``vec2``.

    Entry [r, c] is the norm of ``vec1[r] - vec2[c]``.
    """
    return np.array([
        np.array([np.linalg.norm(left - right) for right in vec2])
        for left in vec1
    ])

def heat_map_w2v(sentence1, sentence2, url, doc_id1, doc_id2, model):
    """Plot the word-by-word distance heatmap of two titles next to the product image.

    ``model`` is the vector mode name ('avg' or 'weighted') handed to
    ``get_word_vec``; ``sentence2`` is also used as the heatmap title.
    """
    # per-word vectors of both titles, then their pairwise distance matrix
    word_dists = get_distance(
        get_word_vec(sentence1, doc_id1, model),
        get_word_vec(sentence2, doc_id2, model),
    )

    layout = gridspec.GridSpec(2, 2, width_ratios=[4,1], height_ratios=[2,1])
    fig = plt.figure(figsize=(15,15))

    # first grid cell: annotated heatmap (seaborn draws on the current axes)
    plt.subplot(layout[0])
    heat_ax = sns.heatmap(np.round(word_dists,4), annot=True)
    heat_ax.set_title(sentence2)

    # second grid cell: product image without grid or ticks
    image_ax = plt.subplot(layout[1])
    image_ax.grid(False)
    image_ax.set_xticks([])
    image_ax.set_yticks([])
    display_img(url, image_ax, fig)

    plt.show()

vocab = model.key_to_index.keys() # if you are using google's word2vec

#vocab = model.keys() #if you are using the 56 MB one

def build_avg_vec(sentence, num_features, doc_id, m_name):
    """Return a single ``num_features``-long vector summarising ``sentence``.

    The vectors of the words found in ``vocab`` are summed ('avg': plain
    vector; 'weighted': vector times the word's IDF weight in row
    ``doc_id``, only for words in the IDF vocabulary) and the sum is
    divided by the total number of words in the sentence.
    """
    tokens = sentence.split()
    total = np.zeros((num_features), dtype="float32")

    for token in tokens:
        if token not in vocab:
            continue
        if m_name == 'weighted' and token in idf_title_vectorizer.vocabulary_:
            total = np.add(total, idf_title_features[doc_id, idf_title_vectorizer.vocabulary_[token]] * model[token])
        elif m_name == 'avg':
            total = np.add(total, model[token])

    # every word counts towards the divisor, including those without a vector
    if len(tokens) > 0:
        total = np.divide(total, len(tokens))
    return total


The utility functions in the above code serves the following purposes:

1. `get_word_vec`: Given a sentence, document ID, and model name ('avg' or 'weighted'), it returns a numpy array of word vectors. Each row of the array represents the word vector (weighted or average) of a word in the sentence.

2. `get_distance`: Given two arrays of word vectors (`vec1` and `vec2`), it calculates the pairwise Euclidean distance between the vectors and returns the distance matrix.

3. `heat_map_w2v`: This function visualizes the distance matrix as a heatmap using Seaborn. It also displays an image related to the recommended apparel.

4. `build_avg_vec`: Given a sentence, number of features, document ID, and model name, it builds and returns the average word vector of the sentence.

These utility functions are used in the code to calculate and visualize the word vector representations and distances between product titles, allowing for recommendation and comparison based on semantic similarity.

1. Average Word2Vec Based Product Similarity

In [None]:
doc_id = 0
w2v_title = []
for i in New_fashion_data['title']:
    w2v_title.append(build_avg_vec(i, 300, doc_id, 'avg'))
    doc_id += 1
w2v_title = np.array(w2v_title)

def avg_w2v_model(doc_id, num_results):
    """Display the titles closest to a query title under average Word2Vec.

    Distances are computed with ``pairwise_distances`` between row ``doc_id``
    of ``w2v_title`` and every row of ``w2v_title``; the ``num_results``
    smallest are shown. For each one a word-level heat map is drawn via
    ``heat_map_w2v`` and the ASIN, brand, title and distance are printed.

    Args:
        doc_id: Row position of the query title in ``w2v_title``.
        num_results: Number of nearest titles to display.
    """
    query_vec = w2v_title[doc_id].reshape(1,-1)
    flat_dist = pairwise_distances(w2v_title, query_vec).flatten()

    indices = np.argsort(flat_dist)[0:num_results]
    pdists = np.sort(flat_dist)[0:num_results]
    # Translate row positions into DataFrame index labels for .loc lookups.
    df_indices = list(New_fashion_data.index[indices])

    for rank, (row_pos, label) in enumerate(zip(indices, df_indices)):
        # The first hit (smallest distance) is used as the reference title.
        heat_map_w2v(
            New_fashion_data['title'].loc[df_indices[0]],
            New_fashion_data['title'].loc[label],
            New_fashion_data['medium_image_url'].loc[label],
            indices[0],
            row_pos,
            'avg',
        )
        for caption, column in (('ASIN:', 'asin'), ('Brand:', 'brand'), ('Title:', 'title')):
            print(caption, New_fashion_data[column].loc[label])
        print('Euclidean Distance From The Query Image:', pdists[rank])
        print('-.'*60)

# Show the 20 closest titles (average Word2Vec) for the product at row position 921.
avg_w2v_model(921, 20)


This code performs the following tasks:

1. It creates an array `w2v_title` to store the average word vector representations of each title in the fashion dataset.

2. The function `avg_w2v_model` takes a document ID and the desired number of results as input. It calculates the pairwise distances between the word vectors of the given document and all other documents using the `pairwise_distances` function. It then retrieves the indices and distances of the `num_results` smallest distances.

3. For each result, it displays a heatmap comparing the title of the given document with the title of the recommended document. It also prints additional information such as ASIN, brand, title, and the Euclidean distance from the query image.

Overall, this code utilizes average word vector representations and pairwise distances to find similar fashion items based on their titles. It visualizes the similarities and provides additional details for the recommended items.

2. IDF Weighted Word2Vec Based Product Similarity

In [None]:
# One IDF-weighted Word2Vec vector per title, stacked in DataFrame row order.
# enumerate() supplies the row position that build_avg_vec uses to look up the
# title's weights in idf_title_features, replacing the hand-maintained counter
# and keeping loop variables out of the notebook's global namespace.
# NOTE(review): assumes idf_title_features rows are in the same order as
# New_fashion_data['title'] — confirm where the vectorizer is fitted.
w2v_title_weight = np.array([
    build_avg_vec(title, 300, row_pos, 'weighted')
    for row_pos, title in enumerate(New_fashion_data['title'])
])


def weighted_w2v_model(doc_id, num_results):
    """Display the titles closest to a query title under IDF-weighted Word2Vec.

    Distances are computed with ``pairwise_distances`` between row ``doc_id``
    of ``w2v_title_weight`` and every row of it; the ``num_results`` smallest
    are shown. Before plotting, every selected title is split into words,
    padded with empty strings up to the longest selected title, and re-joined
    with single spaces (so shorter titles end in trailing spaces).

    Args:
        doc_id: Row position of the query title in ``w2v_title_weight``.
        num_results: Number of nearest titles to display.
    """
    query_vec = w2v_title_weight[doc_id].reshape(1, -1)
    flat_dist = pairwise_distances(w2v_title_weight, query_vec).flatten()

    indices = np.argsort(flat_dist)[0:num_results]
    pdists = np.sort(flat_dist)[0:num_results]
    # Translate row positions into DataFrame index labels for .loc lookups.
    df_indices = list(New_fashion_data.index[indices])

    tokenised = [New_fashion_data['title'].loc[label].split() for label in df_indices]
    widest = max(map(len, tokenised))
    padded = [' '.join(words + [''] * (widest - len(words))) for words in tokenised]

    # The first hit (smallest distance) is used as the reference title.
    title1 = padded[0]

    for rank, title2 in enumerate(padded):
        label = df_indices[rank]
        heat_map_w2v(title1, title2, New_fashion_data['medium_image_url'].loc[label], indices[0], indices[rank], 'weighted')
        for caption, column in (('ASIN:', 'asin'), ('Brand:', 'brand'), ('Title:', 'title')):
            print(caption, New_fashion_data[column].loc[label])
        print('Euclidean Distance From The Query Image:', pdists[rank])
        print('-.' * 60)


# Show the 20 closest titles (IDF-weighted Word2Vec) for the product at row position 921.
weighted_w2v_model(921, 20)


This code performs the following tasks:

1. It creates an array `w2v_title_weight` to store the weighted word vector representations of each title in the fashion dataset.

2. The function `weighted_w2v_model` takes a document ID and the desired number of results as input. It calculates the pairwise distances between the weighted word vectors of the given document and all other documents using the `pairwise_distances` function. It then retrieves the indices and distances of the `num_results` smallest distances.

3. It prepares the titles of the recommended items for visualization by splitting them into individual words and padding them with empty strings to match the maximum number of words among all titles.

4. For each result, it displays a heatmap comparing the title of the given document with the title of the recommended document. It also prints additional information such as ASIN, brand, title, and the Euclidean distance from the query image.

Overall, this code uses weighted word vector representations and pairwise distances to find similar fashion items based on their titles. It visualizes the similarities and provides additional details for the recommended items.

Weighted Similarity Using Brand & Color

In [None]:
from scipy.sparse import hstack
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: with pandas Copy-on-Write the in-place call only touches a
# temporary object, leaving NaN brands in the frame, and the `.replace` below
# would then fail on a float.
New_fashion_data['brand'] = New_fashion_data['brand'].fillna("Not Given")

# Spaces inside each value are replaced with hyphens before vectorising.
# NOTE(review): CountVectorizer's default token pattern may still split on
# '-', so multi-word brands might not stay a single token — verify if that matters.
brands = [x.replace(" ", "-") for x in New_fashion_data['brand'].values]
types = [x.replace(" ", "-") for x in New_fashion_data['product_type_name'].values]
colors = [x.replace(" ", "-") for x in New_fashion_data['color'].values]

# One bag-of-words count matrix per attribute, each with its own vocabulary.
brand_vectorizer = CountVectorizer()
brand_features = brand_vectorizer.fit_transform(brands)

type_vectorizer = CountVectorizer()
type_features = type_vectorizer.fit_transform(types)

color_vectorizer = CountVectorizer()
color_features = color_vectorizer.fit_transform(colors)

# Column-wise concatenation: one row per product, brand | type | color columns.
# CSR format allows the row indexing done in idf_w2v_brand.
extra_features = hstack((brand_features, type_features, color_features)).tocsr()

import plotly
import plotly.figure_factory as ff

def heat_map_w2v_brand(sentance1, sentance2, url, doc_id1, doc_id2, df_id1, df_id2, model):
    """Compare two products: attribute table, word-distance heat map and image.

    Renders, in order:
      1. a plotly table with the ASIN, brand, color and product type of both products;
      2. a seaborn heat map of ``get_distance`` applied to the word vectors of
         the two titles (rows labelled with the words of ``sentance1``,
         columns with the words of ``sentance2``);
      3. the image at ``url`` next to the heat map, via ``display_img``.

    Args:
        sentance1: Title of the reference product.
        sentance2: Title of the product being compared; also used as the plot title.
        url: Image URL shown beside the heat map.
        doc_id1, doc_id2: Row positions used for ``get_word_vec`` and for the
            ``brands`` / ``colors`` / ``types`` lists.
        df_id1, df_id2: DataFrame index labels used to look up the ASINs.
        model: Mode string forwarded to ``get_word_vec``. Note that inside this
            function the name shadows the module-level embedding ``model``.
    """
    word_dist = get_distance(
        get_word_vec(sentance1, doc_id1, model),
        get_word_vec(sentance2, doc_id2, model),
    )

    def _attribute_row(df_id, doc_id):
        # One table row: ASIN from the frame, the rest from the hyphenated lists.
        return [New_fashion_data['asin'].loc[df_id], brands[doc_id], colors[doc_id], types[doc_id]]

    data_matrix = [
        ['Asin', 'Brand', 'Color', 'Product type'],
        _attribute_row(df_id1, doc_id1),
        _attribute_row(df_id2, doc_id2),
    ]

    table = ff.create_table(
        data_matrix,
        index=True,
        colorscale=[[0, '#1d004d'], [.5, '#f2e5ff'], [1, '#f2e5d1']],
    )
    plotly.offline.iplot(table, filename='simple_table')

    # 25 x 15 grid: heat map on the left ten columns, image on the right five.
    grid = gridspec.GridSpec(25, 15)
    fig = plt.figure(figsize=(25, 5))

    heat_ax = plt.subplot(grid[:, :-5])
    heat_ax = sns.heatmap(np.round(word_dist, 6), annot=True, ax=heat_ax)

    column_words = sentance2.split()
    row_words = sentance1.split()
    heat_ax.set_xticks(np.arange(len(column_words)))
    heat_ax.set_xticklabels(column_words)
    heat_ax.set_yticks(np.arange(len(row_words)))
    heat_ax.set_yticklabels(row_words)
    heat_ax.set_title(sentance2)

    # Bare axes (no grid, no ticks) to host the product image.
    image_ax = plt.subplot(grid[:, 10:16])
    image_ax.grid(False)
    image_ax.set_xticks([])
    image_ax.set_yticks([])

    display_img(url, image_ax, fig)

    plt.show()


def idf_w2v_brand(doc_id, w1, w2, num_results):
    """Display the nearest products under a blend of title and attribute distance.

    Two distance columns are computed against row ``doc_id``: one over
    ``w2v_title_weight`` and one over ``extra_features``. They are combined as
    ``(w1 * title + w2 * attributes) / (w1 + w2)`` and the ``num_results``
    smallest combined distances are shown with ``heat_map_w2v_brand``,
    followed by the ASIN, brand, title and combined distance.

    Args:
        doc_id: Row position of the query product.
        w1: Weight of the title-vector distance.
        w2: Weight of the ``extra_features`` distance.
        num_results: Number of nearest products to display.
    """
    title_dist = pairwise_distances(w2v_title_weight, w2v_title_weight[doc_id].reshape(1, -1))
    attribute_dist = pairwise_distances(extra_features, extra_features[doc_id].reshape(1, -1))
    # Weighted mean of the two distance columns, flattened to 1-D for sorting.
    blended = ((w1 * title_dist + w2 * attribute_dist) / float(w1 + w2)).flatten()

    indices = np.argsort(blended)[0:num_results]
    pdists = np.sort(blended)[0:num_results]
    # Translate row positions into DataFrame index labels for .loc lookups.
    df_indices = list(New_fashion_data.index[indices])

    # The first hit (smallest distance) is used as the reference title.
    title1 = New_fashion_data['title'].loc[df_indices[0]]

    for rank, (row_pos, label) in enumerate(zip(indices, df_indices)):
        title2 = New_fashion_data['title'].loc[label]
        heat_map_w2v_brand(title1, title2, New_fashion_data['medium_image_url'].loc[label], indices[0], row_pos, df_indices[0], label, 'weighted')
        for caption, column in (('ASIN:', 'asin'), ('Brand:', 'brand'), ('Title:', 'title')):
            print(caption, New_fashion_data[column].loc[label])
        print('Euclidean Distance From The Query Image:', pdists[rank])
        print('-.' * 60)


This code performs the following tasks:

1. It preprocesses the 'brand', 'product_type_name', and 'color' columns of the fashion dataset. It replaces any null values in the 'brand' column with the string "Not Given" and replaces spaces in each column with hyphens.

2. It uses `CountVectorizer` to create count-based features for the preprocessed brand, product type, and color values.

3. It combines the count-based features into a single sparse matrix using `hstack`.

4. The function `heat_map_w2v_brand` takes two input titles, image URL, document IDs, and other parameters. It calculates the word vectors and pairwise distances between the two titles and visualizes them using heatmaps and an image display.

5. The function `idf_w2v_brand` calculates the pairwise distances between the word vectors of the given document and all other documents using a weighted combination of word vector distances (`idf_w2v_dist`) and extra features distances (`ex_feat_dist`). It retrieves the indices and distances of the `num_results` smallest distances and visualizes them using the `heat_map_w2v_brand` function.

Overall, this code adds additional features to the fashion dataset based on brand, product type, and color. It then uses these features and word vectors to calculate pairwise distances and visualize similarities between fashion items.

In [None]:
# w1 - weight of the title-vector distance = 5
# w2 - weight of the brand / product type / color distance = 5

idf_w2v_brand(921, 5, 5, 20)
# In each heat map shown, cell (i, j) contains the Euclidean distance between word i of one title and word j of the other.

In [None]:
# w1 - weight of the title-vector distance = 5
# w2 - weight of the brand / product type / color distance = 50 (attributes dominate the ranking)

idf_w2v_brand(921, 5, 50, 20)
# In each heat map shown, cell (i, j) contains the Euclidean distance between word i of one title and word j of the other.

# OK! All the text-related work is done.
Let's proceed with the images. It is better to run the steps below on a local machine, since both downloading the images and passing them through the CNN to extract features take a lot of time.

In [None]:
New_fashion_data = pd.read_pickle("/kaggle/working/16k_dataset_no_stop_words")

import os
from PIL import Image
import requests
from io import BytesIO

# Download every product image once and store it as <asin>.jpeg.
image_dir = '../Images/'  # store it in a folder called Images
# Create the target folder up front so the first save cannot fail on a missing directory.
os.makedirs(image_dir, exist_ok=True)

for asin, url in zip(New_fashion_data['asin'], New_fashion_data['medium_image_url']):
    # A timeout keeps one stalled connection from hanging the whole download.
    response = requests.get(url, timeout=30)
    # Stop on HTTP errors here instead of handing an error page to PIL.
    response.raise_for_status()

    img = Image.open(BytesIO(response.content))
    # JPEG cannot store alpha or palette modes; convert those to RGB before saving.
    if img.mode not in ('L', 'RGB', 'CMYK'):
        img = img.convert('RGB')
    img.save(image_dir + asin + '.jpeg')

In [None]:
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras import applications
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import requests
from PIL import Image
import pandas as pd
import pickle
from IPython.display import display, Image, SVG

In [None]:
#dimensions of our images.
img_width, img_height = 224, 224

top_model_weights_path = 'bottleneck_fc_model.h5'
train_data_dir = 'Images/'
# Number of images saved by the download step. The original line had no value
# (a SyntaxError); 16042 is the image count this notebook's 16k dataset is
# sized for — update it if your download produced a different number of files.
nb_train_samples = 16042
epochs = 50
# One image per batch, so nb_train_samples // batch_size steps cover every image.
batch_size = 1


def save_bottlebeck_features():
    """Extract VGG16 convolutional features for the images and save them to disk.

    Images are read from ``train_data_dir`` in a fixed (unshuffled) order,
    rescaled to [0, 1], resized to ``(img_width, img_height)`` and passed
    through VGG16 without its classifier head. The per-image feature maps are
    flattened to one row each and written, together with the matching ASINs,
    to two ``.npy`` files one directory above the working directory.

    Reads the module-level settings ``train_data_dir``, ``img_width``,
    ``img_height``, ``batch_size`` and ``nb_train_samples``.
    """
    import os  # local import: only needed to parse the generator's file names

    datagen = ImageDataGenerator(rescale=1. / 255)

    #build the VGG16 network
    model = applications.VGG16(include_top=False, weights='imagenet')
    generator = datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)

    # The ASIN is the file name without its directory and extension. Parsing it
    # with os.path (rather than slicing fixed character counts) works for any
    # sub-folder name and any extension length.
    asins = [os.path.splitext(os.path.basename(name))[0] for name in generator.filenames]

    # Model.predict accepts generators directly; predict_generator has been
    # removed from current Keras releases.
    bottleneck_features_train = model.predict(generator, steps=nb_train_samples // batch_size)
    # Flatten each image's feature map into one row; deriving the shape from the
    # array avoids hard-coding the image count and feature length.
    bottleneck_features_train = bottleneck_features_train.reshape(
        (bottleneck_features_train.shape[0], -1))

    # be sure to give the proper path of the directory where you want the files below to be saved
    # Passing the path (not an open file object) lets numpy open and close the file itself.
    np.save('../16k_Products_Data_CNN.npy', bottleneck_features_train)
    np.save('../16k_Products_Data_CNN_asins.npy', np.array(asins))
    

import os

# Same locations that save_bottlebeck_features writes to (one directory up).
# The load paths previously omitted the '../' prefix and pointed at files the
# extraction step never created.
features_path = '../16k_Products_Data_CNN.npy'
asins_path = '../16k_Products_Data_CNN_asins.npy'

# Feature extraction is expensive: only run it when the saved arrays are missing.
# Delete the two .npy files to force a fresh extraction.
if not (os.path.exists(features_path) and os.path.exists(asins_path)):
    save_bottlebeck_features()

#load the features and corresponding ASINS info
bottleneck_features_train = np.load(features_path)

asins = np.load(asins_path)
asins = list(asins)

Let's look at product similarity based on visual features.

In [None]:
# ASINs in DataFrame row order, used to map a row position to its ASIN.
# NOTE(review): the original read from `apparel_data`, which is not defined in
# the visible notebook; New_fashion_data is the frame the images were
# downloaded from, so it is used here — confirm no separate frame was intended.
df_asins = list(New_fashion_data['asin'])

#get similar products using CNN features (VGG-16)
def get_similar_products_cnn(doc_id, num_results):
    """Display the products whose VGG16 features are closest to a query product.

    The query's ASIN is looked up from its DataFrame row position and located
    in ``asins`` to find its row in ``bottleneck_features_train``. Distances
    from that row to every feature row are computed with
    ``pairwise_distances`` and the ``num_results`` smallest are shown: image,
    title, distance and Amazon URL.

    Args:
        doc_id: Row position of the query product in ``New_fashion_data``.
        num_results: Number of nearest products to display.

    Raises:
        ValueError: If the query product's ASIN is not present in ``asins``.
    """
    # Re-map from DataFrame row position to the row in the feature matrix.
    doc_id = asins.index(df_asins[doc_id])
    flat_dist = pairwise_distances(bottleneck_features_train, bottleneck_features_train[doc_id].reshape(1,-1)).flatten()

    indices = np.argsort(flat_dist)[0:num_results]
    pdists  = np.sort(flat_dist)[0:num_results]

    for i in range(len(indices)):
        rows = New_fashion_data[['medium_image_url','title']].loc[New_fashion_data['asin']==asins[indices[i]]]
        for indx, row in rows.iterrows():
            # `Image` must be IPython.display.Image here (it takes url= / embed=), not PIL's.
            display(Image(url=row['medium_image_url'], embed=True))
            print('Product Title: ', row['title'])
            print('Euclidean Distance From The Query Image:', pdists[i])
            print('Amazon URL: www.amazon.com/dp/'+ asins[indices[i]])


# Show the 20 visually closest products for the product at row position 921.
get_similar_products_cnn(921, 20)