In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("/kaggle/input/bigbasket-entire-product-list-28k-datapoints/BigBasket Products.csv")
df.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         27555 non-null  int64  
 1   product       27554 non-null  object 
 2   category      27555 non-null  object 
 3   sub_category  27555 non-null  object 
 4   brand         27554 non-null  object 
 5   sale_price    27555 non-null  float64
 6   market_price  27555 non-null  float64
 7   type          27555 non-null  object 
 8   rating        18929 non-null  float64
 9   description   27440 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 2.1+ MB


## Handling Missing Values

In [4]:
df.drop(columns = ['index'], axis = 1)
df = df[
    (~df['product'].isnull()) &
    (~df['brand'].isnull())
].reset_index(drop = True)
df['index'] = df.index
avg_ratings = df.groupby(by = "category")['rating'].mean()

In [5]:
'''
Imputing average ratings for 'Eggs, Meat & Fish', 'Fruits & Vegetables' 
by considering 'Snacks & Branded Foods' -->3.98 and 'Gourmet & World Food'-->3.98 as closest categories
'''
avg_ratings['Eggs, Meat & Fish'] = 3.98
avg_ratings['Fruits & Vegetables'] = 3.98

avg_ratings

category
Baby Care                   4.023790
Bakery, Cakes & Dairy       3.911128
Beauty & Hygiene            3.930655
Beverages                   4.084676
Cleaning & Household        3.956667
Eggs, Meat & Fish           3.980000
Foodgrains, Oil & Masala    4.062198
Fruits & Vegetables         3.980000
Gourmet & World Food        3.984410
Kitchen, Garden & Pets      3.734715
Snacks & Branded Foods      3.983313
Name: rating, dtype: float64

In [6]:
def impute_ratings(x):
    if(np.isnan(x['rating'])):
        return avg_ratings[x['category']]
    else:
        return x['rating']

df['rating'] = df.apply(lambda x : impute_ratings(x), axis = 1)

In [7]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [8]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

#to avoid all non alpha-numeric characters
tokenizer = RegexpTokenizer(r'\w+')

#lemmatization to covert the plurals into the original word(lot of plurals were observed to be as separate categories)
lemmatizer = WordNetLemmatizer()

def create_categorical_soup(x):
    category = tokenizer.tokenize(x['category'])
    sub_category = tokenizer.tokenize(x['sub_category'])
    _type = tokenizer.tokenize(x['type'])

    final_words = [lemmatizer.lemmatize(word.lower()) for word in list(category + sub_category + _type)]
    
    #return ' '.join([x.lower() for x in list(category + sub_category + _type)])
    return ' '.join(final_words)

df['categorical_soup'] = df.apply(lambda x : create_categorical_soup(x), axis = 1) 

In [9]:
from gensim.models import Word2Vec

sequences = [x.split(' ') for x in df['categorical_soup']]

model = Word2Vec(sentences = sequences, vector_size = 32, min_count = 1, window = 3, sg = 1, seed = 42)

In [10]:
#form the embeddings and store them in the dataframe
def combined_embedding(x):
    words = x.split(' ')
    return np.array([model.wv[word] for word in words]).mean(axis = 0).astype(np.float32)

df['embedding'] = df['categorical_soup'].map(lambda x : combined_embedding(x))

## Content based Recommender System

In [11]:
#make the similarity score function
def similarity_score(a, b):
    return a @ b

queries = [
    #(idx, name) tuples
    (6734, 'Pet Belly Container Set - Silver'),
    (1,	'Water Bottle - Orange'),
    (23, 'Cleanse Green Tea - Whole Leaf Loose Tea'),
    (5132,	'Super Snack - Pudina Party'),
    (799, 'Harippa Roasted Seeds - Pumpkin, Cheesy Onion'),
    (20568,	'Heat to Eat - Spicy Vegetable Biryani'),
    (115, 'Sapota - Organically Grown')
]

for query in queries:
    #get the query product's embedding
    idx, name = query
    q_embedding = df.at[idx, 'embedding']
    
    #compute the scores and store it in a pandas dataframe
    scores = df['embedding'].apply(lambda x : similarity_score(q_embedding, x))
    pdt_idx = scores.sort_values(ascending = False).index
    
    #top 10 recommendations(top 10 most similar products)
    print('------------------------------------------------------')
    print(f"query is : {name}")
    print("The Top 10 Recommendations are :")
    print('\n'.join(df.loc[pdt_idx[:10], 'product'].to_list()))

------------------------------------------------------
query is : Pet Belly Container Set - Silver
The Top 10 Recommendations are :
Steel Storage/Lunch Container - No.13, Pink, Klip It
Steel Storage/Lunch Container - No.20, Blue, Klip It
Storage Steel Airtight Round Container - Klip Lock
Steel Storage Storage Container/Bowl Set - Rose Gold Finish
Steel Storage/Lunch Container - No.10, Grey, Klip It
Stainless Steel Tea/Sugar/Coffee Container - Mirror Finish
Stainless Steel Tea Sugar Canister/Container Set
Steel Deep Dabba/Storage Container - No. 14, Ubha Russian
Turn & Lock Set
Oil Dispenser
------------------------------------------------------
query is : Water Bottle - Orange
The Top 10 Recommendations are :
Glass Bottle With Removable Fabric Cover - Grey, BB1317
Trendy Stainless-Steel Bottle With Sipper Cap Dark Red Colour Pxp 1002 Dq
Pacific PET Fridge Plastic Water Bottle - Red
Bottle - Bri Glass, With Cover, Yellow & Red Flowers
Spray Glass Water Bottle With Cork - Violet
Glass Bo

## Building the model for pricing detection

In [12]:
df['diff'] = (df['market_price'] != df['sale_price']).astype(float)

In [13]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, f1_score
from xgboost import XGBClassifier

In [24]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

X = df[['embedding','rating', 'market_price']]
Y = df[['diff']]

def create_final_numpy(x):
    embeddings = np.vstack(x['embedding'].values)
    x = np.concatenate((embeddings, 
        np.expand_dims(x['rating'].to_numpy(), axis = 1),
        np.expand_dims(x['market_price'].to_numpy(), axis = 1),
    ), axis = 1).astype(np.float32)
    return x

models = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, Y)):

    x_train = X.iloc[train_idx, :].reset_index(drop = True)
    x_valid = X.iloc[valid_idx, :].reset_index(drop = True)
    x_train = create_final_numpy(x_train)
    x_valid = create_final_numpy(x_valid)

    y_train = Y.iloc[train_idx, :].reset_index(drop = True)
    y_valid = Y.iloc[valid_idx, :].reset_index(drop = True)
    
    y_train_target = y_train['diff'].to_numpy()
    y_valid_target = y_valid['diff'].to_numpy()

    model = XGBClassifier(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.02,
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        colsample_bytree = 0.4,
        random_state=42
    )
    
    model.fit(x_train, y_train_target)
    models.append(model)
    
    preds_train = model.predict(x_train)
    preds_valid = model.predict(x_valid)
    
    print(f"---------------fold :{fold}-------------------")
    print("train metrics--------------------------")
    
    print(f"precision : {precision_score(y_train_target, preds_train)}")
    print(f"recall : {recall_score(y_train_target, preds_train)}")
    print(f"F1 Score :{f1_score(y_train_target, preds_train)}")
    
    print("validation metrics---------------------")
    print(f"precision : {precision_score(y_valid_target, preds_valid)}")
    print(f"recall : {recall_score(y_valid_target, preds_valid)}")
    print(f"F1 Score :{f1_score(y_valid_target, preds_valid)}")

---------------fold :0-------------------
train metrics--------------------------
precision : 0.779259726935522
recall : 0.8098089067497745
F1 Score :0.7942406692406693
validation metrics---------------------
precision : 0.7480680061823802
recall : 0.7973640856672158
F1 Score :0.7719298245614036
---------------fold :1-------------------
train metrics--------------------------
precision : 0.7796610169491526
recall : 0.8074831053238833
F1 Score :0.7933282053358163
validation metrics---------------------
precision : 0.7642328723062077
recall : 0.7679379444085327
F1 Score :0.7660809285829437
---------------fold :2-------------------
train metrics--------------------------
precision : 0.7765261914139425
recall : 0.809891554387118
F1 Score :0.792858004584389
validation metrics---------------------
precision : 0.7632497619803237
recall : 0.786976439790576
F1 Score :0.7749315289189626
---------------fold :3-------------------
train metrics--------------------------
precision : 0.77494181536074