# Import Libraries

In [1]:
import pandas as pd
import nltk
import re

In [2]:
# Load the clothing listings CSV into a DataFrame (path is relative to the current working directory).
df = pd.read_csv('clothing_data.csv')

In [3]:
# (rows, columns) of the frame as loaded, before any de-duplication.
df.shape

(26960, 4)

In [4]:
# Eyeball 20 randomly chosen rows. No random_state is passed, so a different sample is shown on every run.
df.sample(20)

Unnamed: 0,brand,description,url,img
16591,Fashion market,Sweater Man Boys 2 In1 Colour Nevy Blue Grey S...,https://www.amazon.in/Fashion-market-Sweater-C...,https://m.media-amazon.com/images/I/41tF7GIgYN...
4470,Mirayya,Women Regular Fit Black Cotton Blend Trousers,https://www.flipkart.com/mirayya-regular-fit-w...,https://rukminim1.flixcart.com/image/612/612/x...
9738,BLIVE,Men's Round Neck Full Sleeve T-Shirt | Printed...,https://www.amazon.in/BLIVE-Sleeve-T-Shirt-Pri...,https://m.media-amazon.com/images/I/51QeCqn2+9...
24562,Khadija's,Women's Banarasi Kora Orgenza Silk Saree With ...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/31j+CM8uF8...
22393,Generic,"khanak Fashion Women`S Afgani Suit Set,Grey Fl...",https://www.amazon.in/khanak-Fashion-Women%60S...,https://m.media-amazon.com/images/I/41T4hXOClR...
12753,CAMOCOAT,Anti Leech Socks (Mud Grey),https://www.amazon.in/Camocoat-Anti-Leech-Sock...,https://m.media-amazon.com/images/I/71opb02vk2...
3825,M7 By Metronaut,Solid Men Dark Blue Sports Shorts,https://www.flipkart.com/m7-metronaut-solid-me...,https://rukminim1.flixcart.com/image/612/612/x...
15042,IZOD,Men's Merino Sweater,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/71nRux2Jza...
2910,Urbano Fashion,Men Slim Mid Rise Dark Blue Jeans,https://www.flipkart.com/urbano-fashion-slim-m...,https://rukminim1.flixcart.com/image/612/612/k...
26266,arriva fab,women's kutchi work embroidered Exclusive wear...,https://www.amazon.in/arriva-fab-Peacock-Embro...,https://m.media-amazon.com/images/I/91JYyTKEnY...


### Removing Duplicate Rows

In [5]:
# Drop rows whose 'description' repeats an earlier row (drop_duplicates keeps the first occurrence by default).
# Note: this rebinds `df`, so the frame as loaded by read_csv is no longer available after this cell.
df = df.drop_duplicates(subset='description')

In [6]:
# Keep only the columns the recommender needs. The explicit .copy() makes this an
# independent frame, so adding a column to it in a later cell does not raise pandas'
# SettingWithCopyWarning (which fires when writing into a frame derived by selection).
clothing_df = df[['description','url','img']].copy()

In [7]:
# Independent copy of the three selected columns.
# NOTE(review): the later cells in this notebook keep working on `clothing_df`, not on this copy —
# confirm whether `clothing_df_copy` is still needed.
clothing_df_copy = clothing_df.copy()

In [8]:
# Row/column count after de-duplicating on 'description'.
clothing_df_copy.shape

(1944, 3)

In [9]:
# Drop rows that are identical across all three columns. The shape printed before and after
# this cell is (1944, 3) in both cases, so this removes nothing for the current data.
clothing_df_copy = clothing_df_copy.drop_duplicates()

In [10]:
# Re-check the shape to see whether the full-row de-duplication removed anything.
clothing_df_copy.shape

(1944, 3)

### Text Preprocessing

In [11]:
from nltk.corpus import stopwords    # Import the stopwords module from nltk.corpus
from nltk.stem import WordNetLemmatizer   # Import the WordNetLemmatizer class from nltk.stem
nltk.download('stopwords')    # Download the stopwords dataset
nltk.download('wordnet')    # Download the WordNet data that WordNetLemmatizer looks words up in
lemmatizer = WordNetLemmatizer()    # Create an instance of the WordNetLemmatizer class (shared by the cleaning code below)

[nltk_data] Downloading package stopwords to C:\Users\Harshit
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Harshit
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Build the stop-word set once. stopwords.words('english') returns a list, so evaluating it
# inside the per-word filter would rebuild and linearly scan that list for every token.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def clean_text(text:str) -> str:
    """Normalise a product description for bag-of-words matching.

    Every non-letter character is replaced by a space, the text is lowercased and
    split on whitespace, English stop words are dropped, each remaining word is
    lemmatized with the notebook-level ``lemmatizer``, and the words are joined
    back together with single spaces.

    Parameters
    ----------
    text : str
        Raw description. A non-string value (for example the float NaN pandas
        uses for an empty CSV cell) is treated as an empty description.

    Returns
    -------
    str
        The cleaned, space-separated words; ``''`` if nothing survives cleaning.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; a missing description has no words.
        return ''
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Filter stop words first, then lemmatize what is left.
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in _ENGLISH_STOP_WORDS)

In [13]:
# Add a 'cleaned_description' column holding clean_text(description) for every row;
# the original 'description' column is left untouched.
# .assign returns a new frame instead of writing into `clothing_df` in place, which avoids
# pandas' SettingWithCopyWarning (clothing_df was created by selecting columns from df)
# and gives the same result if this cell is re-run.
clothing_df = clothing_df.assign(cleaned_description=clothing_df['description'].apply(clean_text))

In [14]:
# Display the frame with the new 'cleaned_description' column (pandas elides the middle rows).
clothing_df

Unnamed: 0,description,url,img,cleaned_description
0,Men Regular Fit Printed Casual Shirt,https://www.flipkart.com/solbiza-men-printed-c...,https://rukminim1.flixcart.com/image/612/612/x...,men regular fit printed casual shirt
1,Men Regular Fit Checkered Spread Collar Casual...,https://www.flipkart.com/roadster-men-checkere...,https://rukminim1.flixcart.com/image/612/612/k...,men regular fit checkered spread collar casual...
2,Men Solid Polo Neck Green T-Shirt,https://www.flipkart.com/3bros-solid-men-polo-...,https://rukminim1.flixcart.com/image/612/612/x...,men solid polo neck green shirt
3,"Pack of 2 Men Striped Round Neck Dark Blue, Re...",https://www.flipkart.com/blive-striped-men-rou...,https://rukminim1.flixcart.com/image/612/612/x...,pack men striped round neck dark blue red shirt
4,Men Printed Round Neck Light Blue T-Shirt,https://www.flipkart.com/nb-nicky-boy-printed-...,https://rukminim1.flixcart.com/image/612/612/x...,men printed round neck light blue shirt
...,...,...,...,...
24616,BANARASI SAREE,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/41mGiLUh4Z...,banarasi saree
24617,Paithani Kanjeevaram Banarasi Silk Saree with ...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/515jszyd9L...,paithani kanjeevaram banarasi silk saree unsti...
24618,URVASHI RAUTELA A NEW EMBRODERY SEQUENCE DESIGNER,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/51lDrsAQkj...,urvashi rautela new embrodery sequence designer
24619,SUPERHIT TREDING VICHITRA SILK SAREE & JACQUAR...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,https://m.media-amazon.com/images/I/71zUg3Lk+M...,superhit treding vichitra silk saree jacquard ...


### Vectorizing Textual Data

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 5000 terms, with scikit-learn's built-in
# English stop-word list applied. It is only configured here; fitting happens where it is used.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
def recommend_clothes(text:str,top_num:int):
    """Return the catalogue items whose descriptions are most similar to a query.

    The query is normalised with ``clean_text`` (the same cleaning applied to the
    catalogue), vectorized together with ``clothing_df['cleaned_description']``
    using the notebook-level ``cv`` CountVectorizer, and compared to every
    catalogue row by cosine similarity.

    Parameters
    ----------
    text : str
        Free-text query, e.g. ``'track pants'``.
    top_num : int
        Maximum number of items to return. Values <= 0 yield an empty list.

    Returns
    -------
    list of dict
        One dict per recommended row of ``clothing_df`` (column name -> value),
        ordered from most to least similar; ties keep catalogue order.

    Notes
    -----
    ``cv`` is re-fitted on every call (module-level side effect).
    """
    descriptions = clothing_df['cleaned_description']
    n_items = len(descriptions)
    if n_items == 0 or top_num <= 0:
        return []

    # Reuse the catalogue's cleaning so query and descriptions are normalised identically.
    query = clean_text(text)

    # Fit on catalogue + query with the query as the LAST row. Its position is therefore
    # known (n_items) and never has to be looked up by string equality, which would pick
    # a catalogue row instead whenever a description is identical to the query.
    corpus = pd.concat([descriptions, pd.Series([query])], ignore_index=True)
    matrix = cv.fit_transform(corpus)

    # Score the query row against the catalogue rows only: a single 1 x n_items result on
    # the sparse matrix, rather than a dense all-pairs similarity matrix. Because the query
    # is not among the candidates it can never be returned as its own recommendation.
    scores = cosine_similarity(matrix[n_items:], matrix[:n_items])[0]

    # sorted() is stable, so equally scored items stay in catalogue order.
    ranked_positions = sorted(range(n_items), key=lambda pos: scores[pos], reverse=True)

    # Positions line up with clothing_df's row order, hence .iloc rather than label lookup.
    return [clothing_df.iloc[pos].to_dict() for pos in ranked_positions[:top_num]]


In [18]:
# Example query: ask for the 5 catalogue items closest to 'track pants'.
cl = recommend_clothes('track pants',5)

In [24]:
# Show the recommendations: a list of dicts, one per recommended clothing_df row.
cl

[{'description': 'Men Striped Green Track Pants',
  'url': 'https://www.flipkart.com/pivl-striped-men-green-track-pants/p/itmb3502a99455a9?pid=TKPGFDB5Z2B42MXM&lid=LSTTKPGFDB5Z2B42MXMIZYHNL&marketplace=FLIPKART&store=clo%2Fvua%2Fjlk%2F6ql&srno=b_1_15&otracker=browse&fm=organic&iid=57582162-361e-4bb4-bd85-17b3a7f0f026.TKPGFDB5Z2B42MXM.SEARCH&ppt=None&ppn=None&ssid=ojq1ykf3s00000001684618943056',
  'img': 'https://rukminim1.flixcart.com/image/612/612/l55nekw0/track-pant/m/6/9/xxl-pv-5201-saze-pivl-original-imagfw94wtjpg49d.jpeg?q=70',
  'cleaned_description': 'men striped green track pant'},
 {'description': 'Men Solid Black Track Pants',
  'url': 'https://www.flipkart.com/jugular-solid-men-black-track-pants/p/itmc449c75277ee3?pid=TKPFKG92FZKJ6S2E&lid=LSTTKPFKG92FZKJ6S2EPRM9SW&marketplace=FLIPKART&store=clo%2Fvua%2Fjlk%2F6ql&srno=b_1_16&otracker=browse&fm=organic&iid=57582162-361e-4bb4-bd85-17b3a7f0f026.TKPFKG92FZKJ6S2E.SEARCH&ppt=None&ppn=None&ssid=ojq1ykf3s00000001684618943056',
  'img

In [25]:
import pickle

# Save the 'clothing_df' DataFrame using pickle.
# The `with` block closes the file handle (flushing everything to disk) as soon as the
# dump finishes, instead of leaving an open handle for the garbage collector to clean up.
with open('clothing_df.pkl', 'wb') as pkl_file:
    pickle.dump(clothing_df, pkl_file)
