In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('./Fashion.csv', usecols=['id','gender','masterCategory','subCategory','articleType','baseColour','season','year','usage','productDisplayName'])

# We will replace all empty data with the None string
df.productDisplayName.fillna('None', inplace=True)
df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [3]:
df.productDisplayName.value_counts()

productDisplayName
Lucera Women Silver Earrings                       82
Lucera Women Silver Pendant                        56
Lucera Women Silver Ring                           50
Catwalk Women Black Heels                          48
Q&Q Men Black Dial Watch                           42
                                                   ..
Spykar Men Navy Blue Trevor Brief                   1
Nike Mens White Polo T-shirt                        1
Belmonte Men Solid Blue Shirts                      1
Puma Men's Toe Crusher Black T-shirt                1
Fossil Women Pink Dial Chronograph Watch ES3050     1
Name: count, Length: 31136, dtype: int64

In [5]:
print(f"Number of records with None: {len(df[df.productDisplayName=='None'])}")

Number of records with None: 7


In [6]:
# Load your dataset into a list of texts
texts = df.productDisplayName.tolist()

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on your dataset
vectorizer.fit(texts)

# Transform the texts into vectors
text_vectors = vectorizer.transform(texts)

In [7]:
def similarity_tfidf(input_text):
    # Get the input text from the request
    # input_text = request.json['text']

    # Transform the input text into a vector
    input_vector = vectorizer.transform([input_text])

    # Compute the similarity scores between the input and all texts
    similarity_scores = text_vectors.dot(input_vector.T).toarray()

    # Get the indices of the top three most similar texts
    top_indices = similarity_scores.argsort(axis=0)[-3:][::-1].flatten()

    # Get the top three most similar texts and their scores
    results = []
    for idx in top_indices:
        text = texts[idx]
        score = similarity_scores[idx, 0]
        results.append({'text': text, 'score': score})

    # Return the most similar text as a response
    return {'most_similar_texts': results}

In [8]:
similarity_tfidf('Navy Blue Shirt')

{'most_similar_texts': [{'text': 'Nike Men Navy Blue T-shirt',
   'score': 0.8014861256998236},
  {'text': 'Nike Men Navy Blue T-shirt', 'score': 0.8014861256998236},
  {'text': 'Puma Men Navy Blue T-shirt', 'score': 0.7960720876525432}]}

In [9]:
similarity_tfidf('Gold Watch')

{'most_similar_texts': [{'text': 'Q&Q Women Gold Dial Watch',
   'score': 0.8227180734626993},
  {'text': 'Titan Women Gold Watch', 'score': 0.712016388546559},
  {'text': 'Titan Women Gold Watch', 'score': 0.712016388546559}]}

In [10]:
similarity_tfidf('paint brush')

{'most_similar_texts': [{'text': 'Converse Unisex Brush Print Green Casual Shoes',
   'score': 0.4301585310494203},
  {'text': 'Baggit Women Hero Brush Black Wallet',
   'score': 0.42128234509467444},
  {'text': 'Baggit Women Hero Brush Brown Wallet',
   'score': 0.41550798232829356}]}

In [11]:
# Define the path to the GloVe word vectors file
glove_file = './glove.6B/glove.6B.50d.txt'

# Read the word vectors from the file
word_vectors = {}
with open(glove_file, 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype=np.float32)
        word_vectors[word] = vector

In [12]:
# Tokenize and compute word embeddings for each text in the dataset
embedding_size = 50
text_embeddings = np.zeros((len(texts), 50))
for idx, text in enumerate(texts):
    tokens = text.lower().split()
    embeddings = np.array([word_vectors[token] for token in tokens if token in word_vectors.keys()])
    if embeddings.size > 1:
        text_embedding = np.mean(embeddings, axis=0).reshape((1, embedding_size))
        text_embeddings[idx, :] = text_embedding

print(text_embeddings.shape)

(44446, 50)


In [13]:
def similarity_pretrained(input_text):
    # Tokenize the input text
    input_tokens = input_text.lower().split()

    # Compute the average word embedding for the input text
    input_embedding = np.mean([word_vectors[token] for token in input_tokens if token in word_vectors.keys()], axis=0)

    print(input_embedding.reshape(1, -1).shape)
    # Compute the cosine similarity between the input embedding and all text embeddings
    similarity_scores = cosine_similarity(input_embedding.reshape(1, -1), text_embeddings)

    # Get the indices of the top three most similar texts
    top_indices = similarity_scores.argsort(axis=1)[0][-3:][::-1]

    # Get the top three most similar texts and their scores
    results = []
    for idx in top_indices:
        text = texts[idx]
        score = similarity_scores[0, idx]
        results.append({'text': text, 'score': score})

    # Return the most similar text as a response
    return {'most_similar_texts': results}

In [14]:
similarity_pretrained('Navy Blue Shirt')

(1, 50)


{'most_similar_texts': [{'text': 'Fabindia Men Striped Navy Blue Shirt',
   'score': 0.974948162068894},
  {'text': 'Spykar Men Navy Blue  Shirt', 'score': 0.9689218819663892},
  {'text': 'Spykar Men Ranger Navy Blue Shirt', 'score': 0.9642384108873676}]}

In [15]:
similarity_pretrained('Gold Watch')

(1, 50)


{'most_similar_texts': [{'text': 'Titan Women Gold Watch',
   'score': 0.944164548689556},
  {'text': 'Titan Women Gold Watch', 'score': 0.944164548689556},
  {'text': 'Titan Women Gold Watch', 'score': 0.944164548689556}]}

In [16]:
similarity_pretrained('paint brush')

(1, 50)


{'most_similar_texts': [{'text': 'Colorbar I-Define Moss Green Eye Pencil 004',
   'score': 0.827217819573097},
  {'text': 'Just Natural Unisex Charcoal Rain Jacket',
   'score': 0.786268494870539},
  {'text': 'Cobblerz Women Charcoal Grey Wedges', 'score': 0.783575809749073}]}