In [1]:
import pandas as pd
import numpy as np

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

In [3]:
# all lightfm imports 
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# imports re for text cleaning 
import re
from datetime import datetime, timedelta

# Loading Dataset

In [4]:
# Read the order and product CSV exports from the current working directory.
ordersExport, productsExport = (
    pd.read_csv(csv_path)
    for csv_path in ('orders_export.csv', 'products_export.csv')
)

In [5]:
# Keep only the order columns used later: order name, quantity, line-item name and SKU.
df_order = ordersExport.loc[
    :, ['Name', 'Lineitem quantity', 'Lineitem name', 'Lineitem sku']
]
print(df_order.shape)
df_order.head()

(118, 4)


Unnamed: 0,Name,Lineitem quantity,Lineitem name,Lineitem sku
0,#2891,1,Betwa Kurta - XL,HOD0011
1,#2891,1,Bhagirathi pants - XXL,HOD0444
2,#2891,1,Kosi Kurta - XL,HOD0005
3,#2892,1,Panzara kurta - XXL,HOD0126
4,#2893,1,Betwa Kurta - S,HOD0008


In [6]:
# Keep only the product columns used later: handle, title, type, tags and variant SKU.
df_product = productsExport.loc[
    :, ['Handle', 'Title', 'Custom Product Type', 'Tags', 'Variant SKU']
]
print(df_product.shape)
df_product.head()

(874, 5)


Unnamed: 0,Handle,Title,Custom Product Type,Tags,Variant SKU
0,ishya-blockprinted-kurta-set,Ishya Blockprinted Kurta (Set of 2),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0772
1,ishya-blockprinted-kurta-set,,,,HOD0773
2,ishya-blockprinted-kurta-set,,,,HOD0774
3,ishya-blockprinted-kurta-set,,,,HOD0775
4,ishya-blockprinted-kurta-set,,,,HOD0776


In [7]:
# Keep only the rows that carry tags. `.copy()` makes this an independent frame so
# the cleaning cells below can assign columns without writing into a slice of
# `df_product` (which triggers SettingWithCopyWarning and may silently not stick).
final_product = df_product[df_product['Tags'].notna()].copy()
print(final_product.shape)
final_product

(163, 5)


Unnamed: 0,Handle,Title,Custom Product Type,Tags,Variant SKU
0,ishya-blockprinted-kurta-set,Ishya Blockprinted Kurta (Set of 2),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0772
10,ahaana-blockprinted-kurta-set,Ahaana Blockprinted Kurta (Set of 2),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0766
21,seher-blockprinted-kurta-set,Seher Blockprinted Kurta (Set of 2),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0760
29,mihira-blockprinted-kurta-set,Mihira Blockprinted Kurta (Set of 3),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0754
41,bahar-blockprinted-kurta-set,Bahar Blockprinted Kurta (Set of 3),Kurta Sets,"Category_Kurta Sets, Category_Women, Collectio...",HOD0748
...,...,...,...,...,...
844,a-line-white-and-blue-kurta,Sutlej Kurta,Kurta,"Category_Kurta, Category_Women, Collection_Nad...",HOD0025
850,long-straight-peach-and-white-kurta,Chenab Kurta,Kurta,"Category_Kurta, Category_Women, Collection_Nad...",HOD0019
856,long-straight-white-kurta,Alaknanda Kurta,Kurta,"Category_Kurta, Category_Women, Collection_Nad...",HOD0013
862,straight-blue-white-kurta,Betwa Kurta,Kurta,"Category_Kurta, Category_Women, Collection_Nad...",HOD0007


# Helper Functions

In [8]:
def generate_int_id(dataframe, id_col_name):
    """Return a copy of ``dataframe`` with a 0..n-1 integer id column added.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    id_col_name : str
        Name of the id column to create. If a column with this name already
        exists it is overwritten instead of producing a duplicate label.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with a fresh RangeIndex and ``id_col_name``
        holding ``0, 1, ..., len(dataframe) - 1`` in row order.
    """
    # Assign straight to the requested name: going through a fixed temporary
    # column name would clobber a real column that happens to share that name.
    return dataframe.reset_index(drop=True).assign(
        **{id_col_name: np.arange(len(dataframe))})



def create_features(dataframe, features_name, id_col_name):
    """Pair each row id with that row's list of feature tokens.

    The columns listed in ``features_name`` are stringified and joined with
    commas row by row, and the joined text is split on commas again, so a cell
    that already holds comma-separated text contributes several tokens.

    Returns a list of ``(id, [token, ...])`` tuples in row order, where the id
    is taken from ``dataframe[id_col_name]``.
    """
    def _join_row(row):
        return ','.join(row.map(str))

    joined_rows = dataframe[features_name].apply(_join_row, axis=1)
    token_lists = joined_rows.str.split(',')
    row_ids = dataframe[id_col_name]
    return [(row_id, tokens) for row_id, tokens in zip(row_ids, token_lists)]



def generate_feature_list(dataframe, features_name):
    """Flatten the feature tokens of every row into one Series.

    The columns listed in ``features_name`` are stringified and joined with
    commas row by row; the joined text is then split on commas and all tokens
    are concatenated in row order (duplicates are kept).

    Parameters
    ----------
    dataframe : pandas.DataFrame
    features_name : list of str
        Columns whose values make up the features.

    Returns
    -------
    pandas.Series
        Object-dtype Series of tokens with a fresh 0..n-1 index.
    """
    joined_rows = dataframe[features_name].apply(
        lambda row: ','.join(row.map(str)), axis=1)
    # Flatten directly. Expanding each row with `apply(pd.Series)` and stacking
    # builds a NaN-padded wide frame (one Series per row, very slow) and relies
    # on `stack` discarding that padding.
    tokens = [token for text in joined_rows for token in text.split(',')]
    return pd.Series(tokens, dtype=object)


def calculate_auc_score(lightfm_model, interactions_matrix, question_features, professional_features,
                        train_interactions=None, num_threads=4):
    """Return the mean of the per-user AUC values computed by ``lightfm.evaluation.auc_score``.

    Parameters
    ----------
    lightfm_model
        Fitted model, forwarded to ``auc_score``.
    interactions_matrix
        Interactions to evaluate against.
    question_features
        Forwarded to ``auc_score`` as ``item_features``.
    professional_features
        Forwarded to ``auc_score`` as ``user_features``.
    train_interactions : optional
        Forwarded to ``auc_score`` unchanged; the default ``None`` keeps the
        previous behaviour. Supply the training matrix when
        ``interactions_matrix`` is a held-out split.
    num_threads : int, default 4
        Number of threads handed to ``auc_score`` (previously hard-coded).

    Returns
    -------
    float
        Mean of the array returned by ``auc_score``.
    """
    per_user_auc = auc_score(
        lightfm_model, interactions_matrix,
        train_interactions=train_interactions,
        item_features=question_features,
        user_features=professional_features,
        num_threads=num_threads)
    return per_user_auc.mean()

# Cleaning 

In [9]:
# Normalise the handle into a single token: drop the hyphens and lower-case it.
# Vectorised string methods replace the earlier per-character list/join round trip.
final_product['Handle'] = (
    final_product['Handle']
    .str.replace('-', '', regex=False)
    .str.lower()
)
final_product['Handle']

0           ishyablockprintedkurtaset
10         ahaanablockprintedkurtaset
21          seherblockprintedkurtaset
29         mihirablockprintedkurtaset
41          baharblockprintedkurtaset
                    ...              
844            alinewhiteandbluekurta
850    longstraightpeachandwhitekurta
856            longstraightwhitekurta
862            straightbluewhitekurta
868                    peachfuldesire
Name: Handle, Length: 163, dtype: object

In [10]:
# Normalise the title into a single token: drop the spaces and lower-case it.
# Vectorised string methods replace the earlier per-character list/join round trip.
final_product['Title'] = (
    final_product['Title']
    .str.replace(' ', '', regex=False)
    .str.lower()
)
final_product['Title'][0:10]

0      ishyablockprintedkurta(setof2)
10    ahaanablockprintedkurta(setof2)
21     seherblockprintedkurta(setof2)
29    mihirablockprintedkurta(setof3)
41     baharblockprintedkurta(setof3)
55      keyablockprintedkurta(setof3)
70                         ektakaftan
76                        barnakaftan
82                   saukhayadakaftan
88                       chesnakaftan
Name: Title, dtype: object

In [11]:
# Turn "Category_Kurta Sets, Category_Women" into "categorykurtasets categorywomen":
# strip underscores and spaces inside each tag, then use a space (instead of the
# comma) as the tag separator and lower-case everything. One vectorised chain
# replaces five per-character passes over the column.
final_product['Tags'] = (
    final_product['Tags']
    .str.replace('_', '', regex=False)
    .str.replace(' ', '', regex=False)
    .str.replace(',', ' ', regex=False)
    .str.lower()
)
final_product['Tags'][0]

'categorykurtasets categorywomen collectionrozana kurtaforwomen necktypevneck price3kto4k'

## Combining

In [12]:
# Build one space-separated text per product from handle, title and tags;
# rows where any of the three parts is missing end up as an empty string.
final_product['description'] = (
    final_product['Handle']
    .str.cat([final_product['Title'], final_product['Tags']], sep=' ')
    .fillna('')
)
final_product['description'][0]

'ishyablockprintedkurtaset ishyablockprintedkurta(setof2) categorykurtasets categorywomen collectionrozana kurtaforwomen necktypevneck price3kto4k'

In [13]:
import re

def clean_description(text):
    """Convert a whitespace-separated description into a lower-case, comma-separated one.

    Apostrophes are removed, runs of whitespace are treated as a single
    separator, the tokens are joined with commas and the result is lower-cased.
    """
    without_apostrophes = text.replace("'", "")
    tokens = without_apostrophes.split()
    return ','.join(tokens).lower()

# Run every description through clean_description to get the comma-separated form.
final_product['clean_description'] = final_product['description'].map(clean_description)
final_product['clean_description'][0]

'ishyablockprintedkurtaset,ishyablockprintedkurta(setof2),categorykurtasets,categorywomen,collectionrozana,kurtaforwomen,necktypevneck,price3kto4k'

In [14]:
# Preview the cleaned product frame, including the two derived description columns.
final_product.head()

Unnamed: 0,Handle,Title,Custom Product Type,Tags,Variant SKU,description,clean_description
0,ishyablockprintedkurtaset,ishyablockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0772,ishyablockprintedkurtaset ishyablockprintedkur...,"ishyablockprintedkurtaset,ishyablockprintedkur..."
10,ahaanablockprintedkurtaset,ahaanablockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0766,ahaanablockprintedkurtaset ahaanablockprintedk...,"ahaanablockprintedkurtaset,ahaanablockprintedk..."
21,seherblockprintedkurtaset,seherblockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0760,seherblockprintedkurtaset seherblockprintedkur...,"seherblockprintedkurtaset,seherblockprintedkur..."
29,mihirablockprintedkurtaset,mihirablockprintedkurta(setof3),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0754,mihirablockprintedkurtaset mihirablockprintedk...,"mihirablockprintedkurtaset,mihirablockprintedk..."
41,baharblockprintedkurtaset,baharblockprintedkurta(setof3),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0748,baharblockprintedkurtaset baharblockprintedkur...,"baharblockprintedkurtaset,baharblockprintedkur..."


## Adding Product ID 

In [15]:
# Add a 0..n-1 integer id per product row; generate_int_id also resets the row index,
# so after this cell index labels and 'product_id_num' coincide.
final_product = generate_int_id(final_product, 'product_id_num')
final_product.head()

Unnamed: 0,Handle,Title,Custom Product Type,Tags,Variant SKU,description,clean_description,product_id_num
0,ishyablockprintedkurtaset,ishyablockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0772,ishyablockprintedkurtaset ishyablockprintedkur...,"ishyablockprintedkurtaset,ishyablockprintedkur...",0
1,ahaanablockprintedkurtaset,ahaanablockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0766,ahaanablockprintedkurtaset ahaanablockprintedk...,"ahaanablockprintedkurtaset,ahaanablockprintedk...",1
2,seherblockprintedkurtaset,seherblockprintedkurta(setof2),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0760,seherblockprintedkurtaset seherblockprintedkur...,"seherblockprintedkurtaset,seherblockprintedkur...",2
3,mihirablockprintedkurtaset,mihirablockprintedkurta(setof3),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0754,mihirablockprintedkurtaset mihirablockprintedk...,"mihirablockprintedkurtaset,mihirablockprintedk...",3
4,baharblockprintedkurtaset,baharblockprintedkurta(setof3),Kurta Sets,categorykurtasets categorywomen collectionroza...,HOD0748,baharblockprintedkurtaset baharblockprintedkur...,"baharblockprintedkurtaset,baharblockprintedkur...",4


In [21]:
# Slim product table for the recommenders: SKU, integer id, cleaned title and the
# comma-separated tag string (exposed as 'item_tag_name').
final_productdf = (
    final_product
    .loc[:, ['Variant SKU', 'product_id_num', 'Title', 'clean_description']]
    .rename(columns={'clean_description': 'item_tag_name'})
)
final_productdf

Unnamed: 0,Variant SKU,product_id_num,Title,item_tag_name
0,HOD0772,0,ishyablockprintedkurta(setof2),"ishyablockprintedkurtaset,ishyablockprintedkur..."
1,HOD0766,1,ahaanablockprintedkurta(setof2),"ahaanablockprintedkurtaset,ahaanablockprintedk..."
2,HOD0760,2,seherblockprintedkurta(setof2),"seherblockprintedkurtaset,seherblockprintedkur..."
3,HOD0754,3,mihirablockprintedkurta(setof3),"mihirablockprintedkurtaset,mihirablockprintedk..."
4,HOD0748,4,baharblockprintedkurta(setof3),"baharblockprintedkurtaset,baharblockprintedkur..."
...,...,...,...,...
158,HOD0025,158,sutlejkurta,"alinewhiteandbluekurta,sutlejkurta,categorykur..."
159,HOD0019,159,chenabkurta,"longstraightpeachandwhitekurta,chenabkurta,cat..."
160,HOD0013,160,alaknandakurta,"longstraightwhitekurta,alaknandakurta,category..."
161,HOD0007,161,betwakurta,"straightbluewhitekurta,betwakurta,categorykurt..."


# Content Based Recommender 

In [54]:
# Bag-of-words (unigrams + bigrams) over each product's tag string.
# min_df must be an int >= 1 or a float in [0.0, 1.0]; the integer 0 used before is
# outside that range and is rejected by parameter validation in recent scikit-learn.
# min_df=1 keeps every term, which is what 0 was meant to do.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(final_productdf['item_tag_name'])

In [55]:
# Pairwise cosine similarity between all product rows of count_matrix
# (with a single argument the matrix is compared against itself).
cosine_sim = cosine_similarity(count_matrix)

In [56]:
# SKU column in product-row order; positions line up with the rows of cosine_sim.
sku = final_productdf.loc[:, 'Variant SKU']
sku

0      HOD0772
1      HOD0766
2      HOD0760
3      HOD0754
4      HOD0748
        ...   
158    HOD0025
159    HOD0019
160    HOD0013
161    HOD0007
162    HOD0001
Name: Variant SKU, Length: 163, dtype: object

In [57]:
# Title -> row position lookup for the content-based recommender.
# Built from final_productdf (the frame count_matrix / cosine_sim were computed
# from) and storing explicit positions, so a lookup always indexes the right
# cosine_sim row. If a title occurs more than once only its first row is kept,
# so indices[title] is always a scalar.
indices = pd.Series(np.arange(len(final_productdf)), index=final_productdf['Title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

Title
ishyablockprintedkurta(setof2)       0
ahaanablockprintedkurta(setof2)      1
seherblockprintedkurta(setof2)       2
mihirablockprintedkurta(setof3)      3
baharblockprintedkurta(setof3)       4
                                  ... 
sutlejkurta                        158
chenabkurta                        159
alaknandakurta                     160
betwakurta                         161
kosikurta                          162
Length: 163, dtype: int64

# Hybrid Recommender (LightFM)

In [28]:
# Reminder of the order line-item layout before joining it to the product table.
df_order.head()

Unnamed: 0,Name,Lineitem quantity,Lineitem name,Lineitem sku
0,#2891,1,Betwa Kurta - XL,HOD0011
1,#2891,1,Bhagirathi pants - XXL,HOD0444
2,#2891,1,Kosi Kurta - XL,HOD0005
3,#2892,1,Panzara kurta - XXL,HOD0126
4,#2893,1,Betwa Kurta - S,HOD0008


In [29]:
# Attach product id / title / tags to every order line whose SKU is present in
# final_productdf, and strip the leading '#' from the order name.
# NOTE(review): the shapes printed in this notebook go from 118 order lines to 32
# merged rows, so most lines are dropped by the inner join - presumably because
# final_productdf keeps a single 'Variant SKU' per product; confirm this is intended.
df_merge = (
    df_order
    .merge(final_productdf, how='inner', left_on='Lineitem sku', right_on='Variant SKU')
    .assign(Name=lambda merged: merged['Name'].str.replace('#', ''))
)
print(df_merge.shape)
df_merge

(32, 8)


Unnamed: 0,Name,Lineitem quantity,Lineitem name,Lineitem sku,Variant SKU,product_id_num,Title,item_tag_name
0,2898,1,Green Car Cotton Shirt - S,HOD0607,HOD0607,40,greencarcottonshirt,"greencarcottonshirt,greencarcottonshirt,catego..."
1,2898,1,Light Blue Cotton Shirt - S,HOD0575,HOD0575,48,lightbluecottonshirt,"lightbluecottonshirt,lightbluecottonshirt,cate..."
2,2932,1,Light Blue Cotton Shirt - S,HOD0575,HOD0575,48,lightbluecottonshirt,"lightbluecottonshirt,lightbluecottonshirt,cate..."
3,2898,1,Olive Green Cotton Shirt - S,HOD0579,HOD0579,47,olivegreencottonshirt,"olivegreencottonshirt,olivegreencottonshirt,ca..."
4,2898,1,Red Flower Cotton Shirt - S,HOD0615,HOD0615,38,redflowercottonshirt,"redflowercottonshirt,redflowercottonshirt,cate..."
5,2898,1,Maroon Scooter Cotton Shirt - S,HOD0635,HOD0635,33,maroonscootercottonshirt,"maroonscootercottonshirt,maroonscootercottonsh..."
6,2906,1,Pavana dupatta,HOD0544,HOD0544,84,pavanadupatta,"pavanadupatta,pavanadupatta,categorydupatta,ca..."
7,2907,1,Vaigai kurta - XS,HOD0103,HOD0103,139,vaigaikurta,"vaigaikurta,vaigaikurta,categorykurta,category..."
8,2907,1,Vaighai dupatta,HOD0542,HOD0542,87,vaighaidupatta,"vaighaidupatta,vaighaidupatta,categorydupatta,..."
9,2939,1,Vaighai dupatta,HOD0542,HOD0542,87,vaighaidupatta,"vaighaidupatta,vaighaidupatta,categorydupatta,..."


In [31]:
# One row per order: the tag strings of all its line items joined with commas
# ('user_tag_name'), plus a 0..n-1 integer id ('order_id_num').
user_tag = (
    df_merge
    .groupby('Name')['item_tag_name']
    .apply(','.join)
    .reset_index()
    .rename(columns={'item_tag_name': 'user_tag_name'})
    .pipe(generate_int_id, 'order_id_num')
)
user_tag

Unnamed: 0,Name,user_tag_name,order_id_num
0,2898,"greencarcottonshirt,greencarcottonshirt,catego...",0
1,2906,"pavanadupatta,pavanadupatta,categorydupatta,ca...",1
2,2907,"vaigaikurta,vaigaikurta,categorykurta,category...",2
3,2909,"pinkcottonshirt,pinkcottonshirt,categorymen,ca...",3
4,2910,"taptipant,taptipants,categorypants,categorywom...",4
5,2921,"baharblockprintedkurtaset,baharblockprintedkur...",5
6,2923,"pranhitadupatta,pranhitadupatta,categorydupatt...",6
7,2928,"narmadadupatta,chaliyardupatta,categorydupatta...",7
8,2929,"longstraightbrowncolor,dodakurta,categorykurta...",8
9,2932,"lightbluecottonshirt,lightbluecottonshirt,cate...",9


In [32]:
# Bring the per-order tag string and integer order id back onto every order line.
final_merge = df_merge.merge(user_tag, how='inner', on='Name')
print(final_merge.shape)
final_merge.head()

(32, 10)


Unnamed: 0,Name,Lineitem quantity,Lineitem name,Lineitem sku,Variant SKU,product_id_num,Title,item_tag_name,user_tag_name,order_id_num
0,2898,1,Green Car Cotton Shirt - S,HOD0607,HOD0607,40,greencarcottonshirt,"greencarcottonshirt,greencarcottonshirt,catego...","greencarcottonshirt,greencarcottonshirt,catego...",0
1,2898,1,Light Blue Cotton Shirt - S,HOD0575,HOD0575,48,lightbluecottonshirt,"lightbluecottonshirt,lightbluecottonshirt,cate...","greencarcottonshirt,greencarcottonshirt,catego...",0
2,2898,1,Olive Green Cotton Shirt - S,HOD0579,HOD0579,47,olivegreencottonshirt,"olivegreencottonshirt,olivegreencottonshirt,ca...","greencarcottonshirt,greencarcottonshirt,catego...",0
3,2898,1,Red Flower Cotton Shirt - S,HOD0615,HOD0615,38,redflowercottonshirt,"redflowercottonshirt,redflowercottonshirt,cate...","greencarcottonshirt,greencarcottonshirt,catego...",0
4,2898,1,Maroon Scooter Cotton Shirt - S,HOD0635,HOD0635,33,maroonscootercottonshirt,"maroonscootercottonshirt,maroonscootercottonsh...","greencarcottonshirt,greencarcottonshirt,catego...",0


In [33]:
# Flat token lists (duplicates included) used below to declare the feature names.
user_feature_list = generate_feature_list(
    dataframe=user_tag, features_name=['user_tag_name'])

item_feature_list = generate_feature_list(
    dataframe=final_productdf, features_name=['item_tag_name'])

In [34]:
# Store, per row, the (id, [tokens]) tuple produced by create_features.
user_tag['user_features'] = create_features(
    dataframe=user_tag, features_name=['user_tag_name'], id_col_name='order_id_num')
final_productdf['item_features'] = create_features(
    dataframe=final_productdf, features_name=['item_tag_name'], id_col_name='product_id_num')

In [35]:
# Register the order ids, product ids and all feature tokens with the LightFM Dataset.
dataset = Dataset()
dataset.fit(
    users=set(user_tag['order_id_num']),
    items=set(final_productdf['product_id_num']),
    user_features=user_feature_list,
    item_features=item_feature_list)

In [36]:
# (order id, product id) pair for every purchased line item.
final_merge['user_item_id_tuple'] = list(
    zip(final_merge['order_id_num'], final_merge['product_id_num']))

In [37]:
# Build the interaction and weight matrices from the (order id, product id) pairs.
interactions, weights = dataset.build_interactions(final_merge['user_item_id_tuple'])

In [39]:
# Build the item and user feature matrices from the (id, [tokens]) tuples created above.
item_features = dataset.build_item_features(final_productdf['item_features'])
user_features = dataset.build_user_features(user_tag['user_features'])

In [40]:
# LightFM model with WARP loss, 150 components and random_state fixed to 2019.
model = LightFM(
    no_components=150,
    learning_rate=0.05,
    loss='warp',
    random_state=2019)

# Fit for 5 epochs on the interactions, passing both feature matrices and the
# weights returned by build_interactions as sample weights.
model.fit(
    interactions,
    item_features=item_features,
    user_features=user_features, sample_weight=weights,
    epochs=5, num_threads=4, verbose=True)

Epoch: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 104.23it/s]


<lightfm.lightfm.LightFM at 0x2bff8f07dc0>

In [41]:
# NOTE(review): `interactions` is the same matrix the model was fitted on above, so this
# is a training-set AUC, not a held-out estimate.
calculate_auc_score(model, interactions, item_features, user_features)

0.87400055

In [86]:
from IPython.display import display_html
def display_side_by_side(*args):
    """Render the given DataFrames next to each other in the notebook output.

    Each frame is converted with ``DataFrame.to_html`` and every opening
    ``<table`` tag gets ``style="display:inline"`` so the tables flow
    horizontally. Only opening tags are rewritten: a blanket replacement of
    the word "table" would also corrupt ``</table>`` and any cell text that
    happens to contain "table".
    """
    html_str = ''.join(df.to_html() for df in args)
    display_html(html_str.replace('<table', '<table style="display:inline"'), raw=True)

def recommend_item(input):
    """Show recommendations for an order id or for a product title.

    Parameters
    ----------
    input : int or str
        * int (Python or numpy integer, not bool): an ``order_id_num``. Up to
          three previously purchased products are displayed, then the 8
          highest-scoring products from the LightFM ``model`` that are not
          already part of the order.
        * anything else: used as a key into ``indices`` (cleaned title). The
          SKUs of the 5 products most similar by ``cosine_sim`` are printed.

    Notes
    -----
    Relies on notebook globals: ``final_merge``, ``final_productdf``,
    ``user_tag``, ``model``, ``item_features``, ``user_features``, ``indices``,
    ``cosine_sim`` and ``sku``. Nothing is returned; results are
    displayed/printed. An unknown title raises ``KeyError`` from the lookup.
    """
    is_order_id = isinstance(input, (int, np.integer)) and not isinstance(input, bool)

    if is_order_id:
        purchased_ids = final_merge.loc[
            final_merge['order_id_num'] == input, 'product_id_num']

        # Only the first three purchases are shown to keep the output short.
        df_previous_items = final_productdf.loc[
            final_productdf['product_id_num'].isin(purchased_ids[:3])]
        print('User Id (' + str(input) + "): Previous Item Purchased")
        display_side_by_side(
            df_previous_items[['product_id_num', 'item_features']],
            user_tag.loc[user_tag.order_id_num == input][['order_id_num', 'user_tag_name']])

        # Exclude every product already in the order (not just the three shown
        # above), otherwise purchased items can be recommended again.
        df_use_for_prediction = final_productdf.loc[
            ~final_productdf['product_id_num'].isin(purchased_ids)]
        items_id_for_predict = df_use_for_prediction['product_id_num'].values.tolist()

        # NOTE(review): the raw order/product ids are passed to predict, which assumes
        # they match the ids the LightFM Dataset assigned internally - TODO confirm
        # against dataset.mapping().
        scores = model.predict(
            int(input),
            items_id_for_predict,
            item_features=item_features,
            user_features=user_features)

        # assign() returns a new frame, so no column is written onto a slice of
        # final_productdf.
        df_use_for_prediction = (
            df_use_for_prediction
            .assign(scores=scores)
            .sort_values(by='scores', ascending=False)[:8])
        print()
        print('User Id (' + str(input) + "): Recommended Item: ")
        display(df_use_for_prediction[['product_id_num', 'item_features', 'Variant SKU']])

    else:
        idx = indices[input]
        # A title that occurs more than once yields a Series; use its first row.
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]

        # Drop the query product explicitly. Skipping "the first" sorted entry is
        # wrong whenever another product ties with it or its own similarity is 0.
        sim_scores = [
            (position, score)
            for position, score in enumerate(cosine_sim[idx])
            if position != idx
        ]
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:5]
        product_indices = [position for position, _ in sim_scores]

        print(sku.iloc[product_indices])






In [87]:
# Content-based path: a string argument is looked up as a cleaned product title.
recommend_item("chenabkurta")

162    HOD0001
99     HOD0181
160    HOD0013
126    HOD0145
154    HOD0049
Name: Variant SKU, dtype: object


In [82]:
# LightFM path: an int argument is treated as an order_id_num.
recommend_item(7)

User Id (7): Previous Item Purchased


Unnamed: 0,product_id_num,item_features
92,92,"(92, [narmadadupatta, chaliyardupatta, categorydupatta, categorywomen, collectionnadiyankinare, colororange, price1kto2k, sale, stylekotadoriya])"

Unnamed: 0,order_id_num,user_tag_name
7,7,"narmadadupatta,chaliyardupatta,categorydupatta,categorywomen,collectionnadiyankinare,colororange,price1kto2k,sale,stylekotadoriya"



User Id (7): Recommended Item: 


Unnamed: 0,product_id_num,item_features,Variant SKU
4,4,"(4, [baharblockprintedkurtaset, baharblockprin...",HOD0748
3,3,"(3, [mihirablockprintedkurtaset, mihirablockpr...",HOD0754
0,0,"(0, [ishyablockprintedkurtaset, ishyablockprin...",HOD0772
5,5,"(5, [keyablockprintedkurtaset, keyablockprinte...",HOD0742
1,1,"(1, [ahaanablockprintedkurtaset, ahaanablockpr...",HOD0766
2,2,"(2, [seherblockprintedkurtaset, seherblockprin...",HOD0760
48,48,"(48, [lightbluecottonshirt, lightbluecottonshi...",HOD0575
47,47,"(47, [olivegreencottonshirt, olivegreencottons...",HOD0579
