In [417]:
import pandas as pd
import numpy as np

In [418]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

In [419]:
# Load the raw order and product exports; both paths are relative to the
# notebook's working directory.
ordersExport, productsExport = (
    pd.read_csv(csv_path)
    for csv_path in ('orders_export.csv', 'products_export.csv')
)

In [420]:
# Order columns of interest: Name (customer id), Lineitem quantity, Lineitem name, Lineitem sku (product id)

In [421]:
# Narrow the order export down to the four line-item columns used for analysis.
df_order = ordersExport.loc[
    :, ['Name', 'Lineitem quantity', 'Lineitem name', 'Lineitem sku']
]
print(df_order.shape)
df_order.head()

(118, 4)


Unnamed: 0,Name,Lineitem quantity,Lineitem name,Lineitem sku
0,#2891,1,Betwa Kurta - XL,HOD0011
1,#2891,1,Bhagirathi pants - XXL,HOD0444
2,#2891,1,Kosi Kurta - XL,HOD0005
3,#2892,1,Panzara kurta - XXL,HOD0126
4,#2893,1,Betwa Kurta - S,HOD0008


In [422]:
# Product columns considered: Handle, Title, Body (HTML), Custom Product Type, Tags, Variant SKU, SEO Title

In [423]:
# Keep only the descriptive product columns plus the SKU that identifies each row.
df_product = productsExport.loc[
    :, ['Handle', 'Title', 'Custom Product Type', 'Tags', 'Variant SKU']
]
print(df_product.shape)
df_product.head()

(874, 5)


Unnamed: 0,Handle,Title,Custom Product Type,Tags,Variant SKU
0,ishya-blockprinted-kurta-set,Ishya Blockprinted Kurta (Set of 2),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0772
1,ishya-blockprinted-kurta-set,,,,HOD0773
2,ishya-blockprinted-kurta-set,,,,HOD0774
3,ishya-blockprinted-kurta-set,,,,HOD0775
4,ishya-blockprinted-kurta-set,,,,HOD0776


In [424]:
# Keep only the rows that actually carry tags (in the preview above, the extra
# variant rows of a handle have empty Title/Type/Tags).
# .copy() detaches the result from df_product: the cells below assign new
# values to columns of final_product, and doing that on a filtered slice is
# what triggers pandas' SettingWithCopyWarning (currently hidden by the global
# warnings filter).
final_product = df_product[df_product['Tags'].notna()].copy()
print(final_product.shape)
final_product

(163, 5)


Unnamed: 0,Handle,Title,Custom Product Type,Tags,Variant SKU
0,ishya-blockprinted-kurta-set,Ishya Blockprinted Kurta (Set of 2),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0772
10,ahaana-blockprinted-kurta-set,Ahaana Blockprinted Kurta (Set of 2),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0766
21,seher-blockprinted-kurta-set,Seher Blockprinted Kurta (Set of 2),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0760
29,mihira-blockprinted-kurta-set,Mihira Blockprinted Kurta (Set of 3),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0754
41,bahar-blockprinted-kurta-set,Bahar Blockprinted Kurta (Set of 3),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0748
...,...,...,...,...,...
844,a-line-white-and-blue-kurta,Sutlej Kurta,Kurta,"Category_Kurta, Category_Women, Collection_Nad...",HOD0025
850,long-straight-peach-and-white-kurta,Chenab Kurta,Kurta,"Category_Kurta, Category_Women, Collection_Nad...",HOD0019
856,long-straight-white-kurta,Alaknanda Kurta,Kurta,"Category_Kurta, Category_Women, Collection_Nad...",HOD0013
862,straight-blue-white-kurta,Betwa Kurta,Kurta,"Category_Kurta, Category_Women, Collection_Nad...",HOD0007


## Cleaning Handle

In [425]:
# Inspect the raw, hyphen-separated handles before they are normalised.
final_product['Handle']

0             ishya-blockprinted-kurta-set
10           ahaana-blockprinted-kurta-set
21            seher-blockprinted-kurta-set
29           mihira-blockprinted-kurta-set
41            bahar-blockprinted-kurta-set
                      ...                 
844            a-line-white-and-blue-kurta
850    long-straight-peach-and-white-kurta
856              long-straight-white-kurta
862              straight-blue-white-kurta
868                        peachful-desire
Name: Handle, Length: 163, dtype: object

In [426]:
# Break every handle into a list of characters, blanking out hyphens and
# lower-casing each character; the list is joined back into a string in the
# following cell.
final_product['Handle'] = final_product['Handle'].map(
    lambda handle: [ch.replace("-", "").lower() for ch in handle]
)
final_product['Handle']

0      [i, s, h, y, a, , b, l, o, c, k, p, r, i, n, t...
10     [a, h, a, a, n, a, , b, l, o, c, k, p, r, i, n...
21     [s, e, h, e, r, , b, l, o, c, k, p, r, i, n, t...
29     [m, i, h, i, r, a, , b, l, o, c, k, p, r, i, n...
41     [b, a, h, a, r, , b, l, o, c, k, p, r, i, n, t...
                             ...                        
844    [a, , l, i, n, e, , w, h, i, t, e, , a, n, d, ...
850    [l, o, n, g, , s, t, r, a, i, g, h, t, , p, e,...
856    [l, o, n, g, , s, t, r, a, i, g, h, t, , w, h,...
862    [s, t, r, a, i, g, h, t, , b, l, u, e, , w, h,...
868         [p, e, a, c, h, f, u, l, , d, e, s, i, r, e]
Name: Handle, Length: 163, dtype: object

In [427]:
# Glue the per-character pieces back together into one hyphen-free token.
final_product['Handle'] = final_product['Handle'].map(''.join)

In [428]:
# Check the result: each handle should now be a single lowercase token.
final_product['Handle']

0           ishyablockprintedkurtaset
10         ahaanablockprintedkurtaset
21          seherblockprintedkurtaset
29         mihirablockprintedkurtaset
41          baharblockprintedkurtaset
                    ...              
844            alinewhiteandbluekurta
850    longstraightpeachandwhitekurta
856            longstraightwhitekurta
862            straightbluewhitekurta
868                    peachfuldesire
Name: Handle, Length: 163, dtype: object

## Cleaning Title

In [429]:
# Inspect the raw product titles before they are normalised.
final_product['Title']

0      Ishya Blockprinted Kurta  (Set of 2)
10     Ahaana Blockprinted Kurta (Set of 2)
21      Seher Blockprinted Kurta (Set of 2)
29     Mihira Blockprinted Kurta (Set of 3)
41      Bahar Blockprinted Kurta (Set of 3)
                       ...                 
844                            Sutlej Kurta
850                            Chenab Kurta
856                         Alaknanda Kurta
862                             Betwa Kurta
868                              Kosi Kurta
Name: Title, Length: 163, dtype: object

In [430]:
# Break every title into a list of characters, blanking out spaces and
# lower-casing each character; the list is joined back into a string in the
# following cell.
final_product['Title'] = final_product['Title'].map(
    lambda title: [ch.replace(" ", "").lower() for ch in title]
)
final_product['Title']

0      [i, s, h, y, a, , b, l, o, c, k, p, r, i, n, t...
10     [a, h, a, a, n, a, , b, l, o, c, k, p, r, i, n...
21     [s, e, h, e, r, , b, l, o, c, k, p, r, i, n, t...
29     [m, i, h, i, r, a, , b, l, o, c, k, p, r, i, n...
41     [b, a, h, a, r, , b, l, o, c, k, p, r, i, n, t...
                             ...                        
844                  [s, u, t, l, e, j, , k, u, r, t, a]
850                  [c, h, e, n, a, b, , k, u, r, t, a]
856         [a, l, a, k, n, a, n, d, a, , k, u, r, t, a]
862                     [b, e, t, w, a, , k, u, r, t, a]
868                        [k, o, s, i, , k, u, r, t, a]
Name: Title, Length: 163, dtype: object

In [431]:
# Glue the per-character pieces back together into one space-free token.
final_product['Title'] = final_product['Title'].map(''.join)
# final_product['Title'] = final_product['Title'].apply(lambda x: [str.lower(i.replace("(setof2)", "")) for i in x])
# final_product['Title'] = final_product['Title'].apply(lambda x: [str.lower(i.replace("(setof3)", "")) for i in x])
# final_product['Title'] = final_product['Title'].apply(lambda x: ''.join(x))

In [432]:
# Preview the first ten normalised titles (note the "(setofN)" suffix is kept).
final_product['Title'][0:10]

0      ishyablockprintedkurta(setof2)
10    ahaanablockprintedkurta(setof2)
21     seherblockprintedkurta(setof2)
29    mihirablockprintedkurta(setof3)
41     baharblockprintedkurta(setof3)
55      keyablockprintedkurta(setof3)
70                         ektakaftan
76                        barnakaftan
82                   saukhayadakaftan
88                       chesnakaftan
Name: Title, dtype: object

## Custom Product Type Cleaning

In [433]:
# final_product['Custom Product Type'] = final_product['Custom Product Type'].tolist()
# final_product['Custom Product Type']

In [434]:
# final_product['Custom Product Type'] = final_product['Custom Product Type'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
# final_product['Custom Product Type']

In [435]:
# final_product['Custom Product Type'] = final_product['Custom Product Type'].apply(lambda x: ''.join(x))

In [436]:
# final_product['Custom Product Type']

## Cleaning Tags 

In [437]:
# Show the raw tag string of the row labelled 0 (a label lookup: the frame
# still carries the row labels of the product export at this point).
final_product['Tags'][0]

'Category_Kurta Sets, Category_Women, Collection_Rozana, kurta for Women, Neck Type_V Neck, Price_3K to 4K'

In [438]:
# Break every tag string into characters, blanking out underscores and spaces
# and lower-casing each character. Commas are left in place here; they are
# turned into token separators once the characters are re-joined.
final_product['Tags'] = final_product['Tags'].apply(
    lambda tags: [ch.replace("_", "").replace(" ", "").lower() for ch in tags]
)

In [439]:
# Inspect the intermediate per-character lists (commas are still present).
final_product['Tags']

0      [c, a, t, e, g, o, r, y, , k, u, r, t, a, , s,...
10     [c, a, t, e, g, o, r, y, , k, u, r, t, a, , s,...
21     [c, a, t, e, g, o, r, y, , k, u, r, t, a, , s,...
29     [c, a, t, e, g, o, r, y, , k, u, r, t, a, , s,...
41     [c, a, t, e, g, o, r, y, , k, u, r, t, a, , s,...
                             ...                        
844    [c, a, t, e, g, o, r, y, , k, u, r, t, a, ,, ,...
850    [c, a, t, e, g, o, r, y, , k, u, r, t, a, ,, ,...
856    [c, a, t, e, g, o, r, y, , k, u, r, t, a, ,, ,...
862    [c, a, t, e, g, o, r, y, , k, u, r, t, a, ,, ,...
868    [c, a, t, e, g, o, r, y, , k, u, r, t, a, ,, ,...
Name: Tags, Length: 163, dtype: object

In [440]:
# Re-join the characters, then swap every comma for a space so that each tag
# becomes its own whitespace-separated token.
final_product['Tags'] = final_product['Tags'].map(
    lambda chars: ''.join(
        ch.replace(",", " ").lower() for ch in ''.join(chars)
    )
)

In [441]:
# Check the normalised tag string of the row labelled 0: one token per tag.
final_product['Tags'][0]

'categorykurtasets categorywomen collectionrozana kurtaforwomen necktypevneck price3kto4k'

# Combining the attributes

In [442]:
# Build one space-separated document per product from handle, title and tags.
# A row with a missing part yields NaN from the concatenation, which is then
# replaced by an empty string.
final_product['description'] = (
    final_product['Handle']
    .str.cat([final_product['Title'], final_product['Tags']], sep=" ")
    .fillna('')
)

In [443]:
# Show the combined handle/title/tags document of the row labelled 0.
final_product['description'][0]

'ishyablockprintedkurtaset ishyablockprintedkurta(setof2) categorykurtasets categorywomen collectionrozana kurtaforwomen necktypevneck price3kto4k'

In [444]:
import re

def clean_description(text):
    """Normalise a description string.

    Drops apostrophes, collapses every run of whitespace into a single space
    (trimming both ends) and lower-cases the result. All other characters,
    including digits and punctuation, are kept.
    """
    without_apostrophes = text.replace("'", "")
    collapsed = ' '.join(without_apostrophes.split())
    return collapsed.lower()

# Store the normalised text next to the raw description and spot-check the
# row labelled 0.
final_product['clean_description'] = final_product['description'].map(clean_description)
final_product['clean_description'][0]

'ishyablockprintedkurtaset ishyablockprintedkurta(setof2) categorykurtasets categorywomen collectionrozana kurtaforwomen necktypevneck price3kto4k'

In [450]:
# Preview the frame with the two new text columns added.
final_product.head()

Unnamed: 0,Handle,Title,Custom Product Type,Tags,Variant SKU,description,clean_description
0,ishyablockprintedkurtaset,ishyablockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0772,ishyablockprintedkurtaset ishyablockprintedkur...,ishyablockprintedkurtaset ishyablockprintedkur...
10,ahaanablockprintedkurtaset,ahaanablockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0766,ahaanablockprintedkurtaset ahaanablockprintedk...,ahaanablockprintedkurtaset ahaanablockprintedk...
21,seherblockprintedkurtaset,seherblockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0760,seherblockprintedkurtaset seherblockprintedkur...,seherblockprintedkurtaset seherblockprintedkur...
29,mihirablockprintedkurtaset,mihirablockprintedkurta(setof3),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0754,mihirablockprintedkurtaset mihirablockprintedk...,mihirablockprintedkurtaset mihirablockprintedk...
41,baharblockprintedkurtaset,baharblockprintedkurta(setof3),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0748,baharblockprintedkurtaset baharblockprintedkur...,baharblockprintedkurtaset baharblockprintedkur...


In [446]:
# Bag-of-words features over unigrams and bigrams, with English stop words
# removed.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the previous min_df=0 produced; recent scikit-learn releases
# reject an integer min_df below 1 during parameter validation.
# NOTE(review): the raw 'description' column is vectorised here, while the
# 'clean_description' column built earlier is not used - confirm which one is
# intended.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(final_product['description'])

In [447]:
# Pairwise cosine similarity between all product documents; with the second
# argument omitted the matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix)

In [448]:
# Similarity of the first product (matrix row 0) to every product, itself included.
cosine_sim[0]

array([1.        , 0.76470588, 0.76470588, 0.64705882, 0.64705882,
       0.64705882, 0.10114435, 0.10114435, 0.10114435, 0.10114435,
       0.10114435, 0.10114435, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.10114435, 0.10114435, 0.10114435, 0.10114435,
       0.10114435, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.05292561,
       0.05292561, 0.10585122, 0.11128298, 0.09701425, 0.10114435,
       0.10114435, 0.10585122, 0.10114435, 0.11764706, 0.05292561,
       0.05292561, 0.05292561, 0.05292561, 0.05057217, 0.05564149,
       0.05057217, 0.05564149, 0.05564149, 0.05057217, 0.05292

In [451]:
# Replace the sparse export row labels with 0..n-1 so that row positions match
# the rows of cosine_sim; the old labels are kept in an 'index' column.
# The guard makes the cell safe to re-run: a second unguarded reset_index()
# would add a 'level_0' column, and a further one would raise.
if 'index' not in final_product.columns:
    final_product = final_product.reset_index()

In [452]:
# Inspect the re-indexed frame; the former row labels now live in 'index'.
final_product

Unnamed: 0,index,Handle,Title,Custom Product Type,Tags,Variant SKU,description,clean_description
0,0,ishyablockprintedkurtaset,ishyablockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0772,ishyablockprintedkurtaset ishyablockprintedkur...,ishyablockprintedkurtaset ishyablockprintedkur...
1,10,ahaanablockprintedkurtaset,ahaanablockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0766,ahaanablockprintedkurtaset ahaanablockprintedk...,ahaanablockprintedkurtaset ahaanablockprintedk...
2,21,seherblockprintedkurtaset,seherblockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0760,seherblockprintedkurtaset seherblockprintedkur...,seherblockprintedkurtaset seherblockprintedkur...
3,29,mihirablockprintedkurtaset,mihirablockprintedkurta(setof3),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0754,mihirablockprintedkurtaset mihirablockprintedk...,mihirablockprintedkurtaset mihirablockprintedk...
4,41,baharblockprintedkurtaset,baharblockprintedkurta(setof3),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0748,baharblockprintedkurtaset baharblockprintedkur...,baharblockprintedkurtaset baharblockprintedkur...
...,...,...,...,...,...,...,...,...
158,844,alinewhiteandbluekurta,sutlejkurta,Kurta,categorykurta categorywomen collectionnadiyank...,HOD0025,alinewhiteandbluekurta sutlejkurta categorykur...,alinewhiteandbluekurta sutlejkurta categorykur...
159,850,longstraightpeachandwhitekurta,chenabkurta,Kurta,categorykurta categorywomen collectionnadiyank...,HOD0019,longstraightpeachandwhitekurta chenabkurta cat...,longstraightpeachandwhitekurta chenabkurta cat...
160,856,longstraightwhitekurta,alaknandakurta,Kurta,categorykurta categorywomen collectionnadiyank...,HOD0013,longstraightwhitekurta alaknandakurta category...,longstraightwhitekurta alaknandakurta category...
161,862,straightbluewhitekurta,betwakurta,Kurta,categorykurta categorywomen collectionnadiyank...,HOD0007,straightbluewhitekurta betwakurta categorykurt...,straightbluewhitekurta betwakurta categorykurt...


In [462]:
# SKU of every product row; get_recommendations() looks results up in here
# by row position.
sku = final_product.loc[:, 'Variant SKU']
sku

0      HOD0772
1      HOD0766
2      HOD0760
3      HOD0754
4      HOD0748
        ...   
158    HOD0025
159    HOD0019
160    HOD0013
161    HOD0007
162    HOD0001
Name: Variant SKU, Length: 163, dtype: object

In [455]:
# Reverse lookup: normalised title -> row label of final_product.
indices = pd.Series(
    data=final_product.index,
    index=final_product['Title'],
)
indices

Title
ishyablockprintedkurta(setof2)       0
ahaanablockprintedkurta(setof2)      1
seherblockprintedkurta(setof2)       2
mihirablockprintedkurta(setof3)      3
baharblockprintedkurta(setof3)       4
                                  ... 
sutlejkurta                        158
chenabkurta                        159
alaknandakurta                     160
betwakurta                         161
kosikurta                          162
Length: 163, dtype: int64

In [463]:
def get_recommendations(title, top_n=5):
    """Return the SKUs of the products most similar to ``title``.

    Parameters
    ----------
    title : str
        Product title. It is normalised the same way as the ``Title`` column
        (spaces removed, lower-cased), so both "Chenab Kurta" and
        "chenabkurta" resolve to the same product.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Entries of the module-level ``sku`` Series for the most similar
        products, ordered from most to least similar. The queried product
        itself is never included.

    Raises
    ------
    KeyError
        If the normalised title is not present in ``indices``.
    """
    lookup_key = title.replace(" ", "").lower()
    idx = indices[lookup_key]

    # A title shared by several products makes the lookup return a Series;
    # use the first matching row in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Drop the queried row explicitly rather than assuming it sorts first:
    # a product with an identical description ties at the top score and may
    # sort ahead of it, which would make the product recommend itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    product_indices = [position for position, _ in sim_scores[:top_n]]
    return sku.iloc[product_indices]

In [464]:
# Recommendations for the first kurta set. Despite the variable name, the
# result is a Series of Variant SKUs.
title_with_index = get_recommendations("ishyablockprintedkurta(setof2)")
title_with_index

1    HOD0766
2    HOD0760
3    HOD0754
4    HOD0748
5    HOD0742
Name: Variant SKU, dtype: object

In [481]:
# Recommendations for the Chenab kurta, flattened to a plain list of SKUs.
title_with_index = get_recommendations("chenabkurta")
skulist = list(title_with_index)
skulist

['HOD0001', 'HOD0181', 'HOD0013', 'HOD0145', 'HOD0049']

In [482]:
# For every recommended SKU, pull the matching row(s) as a one-column
# ('Title') DataFrame; titlelist therefore holds DataFrames, not strings.
titlelist = [
    final_product.loc[final_product['Variant SKU'] == sku_code, ['Title']]
    for sku_code in skulist
]
titlelist

[         Title
 162  kosikurta,
             Title
 99  amravatikurta,
               Title
 160  alaknandakurta,
             Title
 126  manjirakurta,
          Title
 154  beaskurta]