In [123]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [124]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/yjalil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yjalil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [125]:
product_features = ['Uniqe Id', 'Product Name', 'Category','About Product',
       'Technical Details']

In [126]:
df_products = pd.read_csv('./data/product_details.csv', usecols=product_features)

In [127]:
df_products.isna().sum()

Uniqe Id               0
Product Name           0
Category             830
About Product        273
Technical Details    790
dtype: int64

# Missing Values

In [128]:
df_products.dropna(how='any', inplace=True)

In [129]:
df_products.isna().sum()

Uniqe Id             0
Product Name         0
Category             0
About Product        0
Technical Details    0
dtype: int64

# Data Prep

In [130]:
df_products['Product Details'] = df_products["Category"] + ' ' + df_products["About Product"] + ' ' + df_products["Technical Details"]

In [131]:
df_products.drop(["Category",'About Product','Technical Details'], axis=1, inplace=True)

In [132]:
def preprocessing(sentence):
    # $CHALLENGIFY_BEGIN

    # Removing whitespaces
    sentence = sentence.strip()
    # Lowercasing
    sentence = sentence.lower()
    # Removing numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    # Removing punctuation
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '')
    # Tokenizing
    tokenized = word_tokenize(sentence)
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in tokenized]
    cleaned_sentence = " ".join(lemmatized)
    return cleaned_sentence


In [133]:
df_products['Product Details'] = df_products['Product Details'].apply(preprocessing)

# Tf Idf

In [134]:
tfidf = TfidfVectorizer(stop_words="english", min_df = 4)
tfidf_matrix = tfidf.fit_transform(df_products["Product Details"])

In [135]:
tfidf_matrix.shape

(8246, 9205)

In [136]:
similarity = cosine_similarity(tfidf_matrix)

In [137]:
similarity

array([[1.        , 0.04504323, 0.1666925 , ..., 0.07865713, 0.11953294,
        0.00519734],
       [0.04504323, 1.        , 0.0384108 , ..., 0.01660082, 0.0163182 ,
        0.0457304 ],
       [0.1666925 , 0.0384108 , 1.        , ..., 0.02811741, 0.00895818,
        0.06932615],
       ...,
       [0.07865713, 0.01660082, 0.02811741, ..., 1.        , 0.00591829,
        0.02070813],
       [0.11953294, 0.0163182 , 0.00895818, ..., 0.00591829, 1.        ,
        0.00616431],
       [0.00519734, 0.0457304 , 0.06932615, ..., 0.02070813, 0.00616431,
        1.        ]])

In [138]:
df_products['Uniqe Id'].iloc[0]

'66d49bbed043f5be260fa9f7fbff5957'

In [139]:
current_product = df_products['Uniqe Id'].iloc[0]

def get_index_from(id):
    return df_products[df_products['Uniqe Id'] == id].index[0]

product_index = get_index_from(current_product)

In [140]:
product_index

1

In [141]:
similar_products = list(enumerate(similarity[product_index]))

In [142]:
similar_products_sorted = sorted(similar_products, key = lambda x:x[1], reverse = True)

In [150]:
df_products.index

Index([    1,     2,     3,     4,     6,     8,    10,    11,    13,    14,
       ...
        9989,  9990,  9992,  9993,  9994,  9995,  9996,  9998,  9999, 10001],
      dtype='int64', length=8246)

In [144]:
similar_products_sorted

[(1, 1.0000000000000002),
 (6953, 0.5668782309391142),
 (2394, 0.5231465246794399),
 (1758, 0.22893868546265297),
 (1230, 0.1792010543156651),
 (7110, 0.16489465093197714),
 (2938, 0.1611974121218507),
 (4783, 0.15525432288551952),
 (7975, 0.13472494357937126),
 (1639, 0.12982960518935993),
 (7010, 0.12879122154515982),
 (5225, 0.12731621222406944),
 (1322, 0.12490114357454765),
 (3431, 0.11928782592909394),
 (1625, 0.11898007255631023),
 (6069, 0.11833095322056385),
 (5672, 0.11586630921967388),
 (1643, 0.11478840435276406),
 (3100, 0.11354081804259078),
 (7822, 0.11313429237632106),
 (2494, 0.11211354158186483),
 (7849, 0.11025110965000165),
 (2206, 0.10806653897043103),
 (2198, 0.10805595889204124),
 (6647, 0.10803651908778497),
 (2575, 0.10725290151315062),
 (191, 0.1069862833718629),
 (5936, 0.10665224106288679),
 (2334, 0.10646266723383105),
 (786, 0.10629576811922581),
 (3593, 0.1061955753826167),
 (7432, 0.1043219850311919),
 (1459, 0.10355201566391493),
 (6688, 0.1031816639299

In [153]:
print(f"Current product => {df_products.iloc[similar_products_sorted[0][0],1]}")
for product in similar_products_sorted[1:6]:
    print(df_products.iloc[product[0],1])

Current product => 3Doodler Create Flexy 3D Printing Filament Refill Bundle (X5 Pack, Over 1000'. of Extruded Plastics! - Innovate
3Doodler Create 4 Pack Plastic Bundle with Licensed Farnsworth House Stencil (3D Pen Not Included)
3Doodler Start Product Design Themed Activity Kit (3D Pen Not Included)
Tech 4 Kids 3D Magic Imagi Pen
DohVinci 6-Pack Drawing Compound - Mixed Colors
VTech Write and Learn Creative Center
