In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# loading the data from the csv file
df=pd.read_csv('fashion.csv')

In [3]:
df.shape

(2906, 10)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2906 entries, 0 to 2905
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ProductId     2906 non-null   int64 
 1   Gender        2906 non-null   object
 2   Category      2906 non-null   object
 3   SubCategory   2906 non-null   object
 4   ProductType   2906 non-null   object
 5   Colour        2906 non-null   object
 6   Usage         2906 non-null   object
 7   ProductTitle  2906 non-null   object
 8   Image         2906 non-null   object
 9   ImageURL      2906 non-null   object
dtypes: int64(1), object(9)
memory usage: 227.2+ KB


In [5]:
df.head()

Unnamed: 0,ProductId,Gender,Category,SubCategory,ProductType,Colour,Usage,ProductTitle,Image,ImageURL
0,42419,Girls,Apparel,Topwear,Tops,White,Casual,Gini and Jony Girls Knit White Top,42419.jpg,http://assets.myntassets.com/v1/images/style/p...
1,34009,Girls,Apparel,Topwear,Tops,Black,Casual,Gini and Jony Girls Black Top,34009.jpg,http://assets.myntassets.com/v1/images/style/p...
2,40143,Girls,Apparel,Topwear,Tops,Blue,Casual,Gini and Jony Girls Pretty Blossom Blue Top,40143.jpg,http://assets.myntassets.com/v1/images/style/p...
3,23623,Girls,Apparel,Topwear,Tops,Pink,Casual,Doodle Kids Girls Pink I love Shopping Top,23623.jpg,http://assets.myntassets.com/v1/images/style/p...
4,47154,Girls,Apparel,Bottomwear,Capris,Black,Casual,Gini and Jony Girls Black Capris,47154.jpg,http://assets.myntassets.com/v1/images/style/p...


In [6]:
#check null values
df.isnull().sum()

ProductId       0
Gender          0
Category        0
SubCategory     0
ProductType     0
Colour          0
Usage           0
ProductTitle    0
Image           0
ImageURL        0
dtype: int64

In [7]:
# Remove products whose titles contain the same words, ignoring letter case and word order
import pandas as pd

def normalize_title(title):
    """Return an order-insensitive comparison key for a product title.

    The title is lower-cased and split on whitespace, and its words are
    joined back together in sorted order with single spaces. Titles that
    consist of the same words in a different order or letter case therefore
    map to the same key.
    """
    return ' '.join(sorted(title.lower().split()))

# Build an order- and case-insensitive key for every product title
title_keys = df['ProductTitle'].apply(normalize_title)

# Drop, in place, every row whose key already appeared in an earlier row, so
# only the first product for each set of title words is kept. Rows are
# removed by label, which relies on df's row labels being unique (they are a
# RangeIndex straight after loading).
repeated_rows = df.index[title_keys.duplicated(keep='first')]
df.drop(index=repeated_rows, inplace=True)

# Show the de-duplicated DataFrame
print(df)


      ProductId Gender  Category SubCategory   ProductType Colour   Usage  \
0         42419  Girls   Apparel     Topwear          Tops  White  Casual   
1         34009  Girls   Apparel     Topwear          Tops  Black  Casual   
2         40143  Girls   Apparel     Topwear          Tops   Blue  Casual   
3         23623  Girls   Apparel     Topwear          Tops   Pink  Casual   
4         47154  Girls   Apparel  Bottomwear        Capris  Black  Casual   
...         ...    ...       ...         ...           ...    ...     ...   
2901      51755  Women  Footwear       Shoes  Casual Shoes  Black  Casual   
2902      47630  Women  Footwear       Shoes         Flats   Blue  Casual   
2903      32836  Women  Footwear       Shoes         Flats   Pink  Casual   
2904      35821  Women  Footwear       Shoes         Heels  Black  Casual   
2905      18553  Women  Footwear       Shoes         Heels   Blue  Casual   

                                     ProductTitle      Image  \
0          

In [8]:
# Renumber the rows 0..n-1 in place; the previous row labels are kept in a
# new 'index' column.
df.reset_index(inplace=True)
# Separate copy of the renumbered frame
df_with_index = df.copy()

# Preview the first rows, including the new 'index' column
df.head()

Unnamed: 0,index,ProductId,Gender,Category,SubCategory,ProductType,Colour,Usage,ProductTitle,Image,ImageURL
0,0,42419,Girls,Apparel,Topwear,Tops,White,Casual,Gini and Jony Girls Knit White Top,42419.jpg,http://assets.myntassets.com/v1/images/style/p...
1,1,34009,Girls,Apparel,Topwear,Tops,Black,Casual,Gini and Jony Girls Black Top,34009.jpg,http://assets.myntassets.com/v1/images/style/p...
2,2,40143,Girls,Apparel,Topwear,Tops,Blue,Casual,Gini and Jony Girls Pretty Blossom Blue Top,40143.jpg,http://assets.myntassets.com/v1/images/style/p...
3,3,23623,Girls,Apparel,Topwear,Tops,Pink,Casual,Doodle Kids Girls Pink I love Shopping Top,23623.jpg,http://assets.myntassets.com/v1/images/style/p...
4,4,47154,Girls,Apparel,Bottomwear,Capris,Black,Casual,Gini and Jony Girls Black Capris,47154.jpg,http://assets.myntassets.com/v1/images/style/p...


In [9]:
#total number of rows and columns
df.shape

(2131, 11)

In [10]:
# The image file name and image URL are not part of the text features built
# below, so remove both columns in place
df.drop(['Image', 'ImageURL'], axis='columns', inplace=True)

In [11]:
# Columns regarded as relevant product attributes for the recommender.
# NOTE(review): the cell that builds `combined_features` spells these columns
# out by hand (in a different order) rather than reading this list — confirm
# whether this variable is still needed.
selected_features=['Gender','Category','SubCategory','ProductType','Colour','Usage','ProductTitle']
selected_features

['Gender',
 'Category',
 'SubCategory',
 'ProductType',
 'Colour',
 'Usage',
 'ProductTitle']

In [12]:
# Show which columns are available before combining them
print(df.columns)

# Join the text columns into one space-separated string per product: start
# from Gender and append the remaining columns in the order listed.
combined_features = df['Gender']
for column_name in ['ProductTitle', 'Category', 'SubCategory', 'ProductType', 'Colour', 'Usage']:
    combined_features = combined_features + ' ' + df[column_name]

# Show the combined strings and how many there are
print(combined_features)
combined_features.shape

Index(['index', 'ProductId', 'Gender', 'Category', 'SubCategory',
       'ProductType', 'Colour', 'Usage', 'ProductTitle'],
      dtype='object')
0       Girls Gini and Jony Girls Knit White Top Appar...
1       Girls Gini and Jony Girls Black Top Apparel To...
2       Girls Gini and Jony Girls Pretty Blossom Blue ...
3       Girls Doodle Kids Girls Pink I love Shopping T...
4       Girls Gini and Jony Girls Black Capris Apparel...
                              ...                        
2126    Women Catwalk Women Black Shoes Footwear Shoes...
2127    Women Carlton London Women Blue Shoes Footwear...
2128    Women Grendha Women Flori Pink Sandals Footwea...
2129    Women Enroute Women Black Heels Footwear Shoes...
2130    Women Catwalk Women Mary Janes Blue Flats Foot...
Length: 2131, dtype: object


(2131,)

In [13]:
#convert textual data into numerical data
vectorizer=TfidfVectorizer()



In [14]:
feature_vectors=vectorizer.fit_transform(combined_features)

In [15]:
feature_vectors.shape


(2131, 1225)

In [16]:
# Pairwise cosine similarity between all product feature vectors.
# Called with a single matrix, scikit-learn compares its rows against
# themselves, giving one row and one column per product.
similarity = cosine_similarity(feature_vectors)

In [17]:
# Round the similarity values to two decimal places
similarity= np.round(similarity, decimals=2)

In [18]:
np.savetxt('similarity.csv', similarity, delimiter=',')

In [19]:
#you can save it to csv and open it to analyze it

# Create a DataFrame from the similarity matrix
similarity_df = pd.DataFrame(similarity, index=df.index, columns=df.index)

# Save DataFrame to CSV file
csv_path = 'similarity_matrix.csv'
similarity_df.to_csv(csv_path)

print("Cosine similarity matrix saved to:", csv_path)


Cosine similarity matrix saved to: similarity_matrix.csv


In [20]:
similarity_df.shape
#we have to check the similarity with each product

(2131, 2131)

In [21]:
#creating a list with all the product titles given in the dataset
list_of_titles=df['ProductTitle'].tolist()


In [25]:
# Function to get recommendations for a specific product
def get_recommendations(product, similarity_scores, df, num_recommendations=10):
    """Return the products most similar to the title closest to ``product``.

    Parameters
    ----------
    product : str
        Title to look up. It is fuzzy-matched against ``df['ProductTitle']``
        with ``difflib.get_close_matches`` and the best match is used.
    similarity_scores : 2-D indexable (the notebook passes a NumPy array)
        Pairwise similarity scores. Rows and columns are looked up by
        position, so they must be in the same order as the rows of ``df``.
    df : pandas.DataFrame
        Product table with a ``ProductTitle`` column.
    num_recommendations : int, default 10
        Maximum number of results to return.

    Returns
    -------
    list of (title, score) tuples
        Sorted by descending score; ties keep their original row order.
        The matched product itself is part of the results. An empty list is
        returned when no title is close enough to ``product``.
    """
    titles = df['ProductTitle'].tolist()

    close_matches = difflib.get_close_matches(product, titles)
    if not close_matches:
        # Nothing resembles the query: report "no results" instead of
        # failing with an IndexError on close_matches[0].
        return []

    # Work with row positions rather than index labels. The similarity matrix
    # is addressed by position, so using df's labels here would pick the
    # wrong row (or fail) whenever df's index is not exactly 0..n-1.
    position = titles.index(close_matches[0])

    ranked = sorted(
        enumerate(similarity_scores[position]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [(titles[row], score) for row, score in ranked[:num_recommendations]]

# Ask the user which product to look up
product_title = input('Enter a Product_title:')

# Fetch up to ten of the closest products for the entered title
recommended_products = get_recommendations(product_title, similarity, df, num_recommendations=10)

# Print a numbered list, one product per line, with its score to two decimals
print("Recommended Products:")
for rank, (title, score) in enumerate(recommended_products, start=1):
    print(f"{rank}. '{title}' (Similarity Score: {score:.2f})")




Enter a Product_title:KKR Boys Fangear Tee
Recommended Products:
1. 'KKR Boys Fangear Tee' (Similarity Score: 1.00)
2. 'KKR Boys Fangear Polo Jersey' (Similarity Score: 0.70)
3. 'RCB Boys Fangear Tee' (Similarity Score: 0.70)
4. 'Reebok Kids Kkr Boy's Premium Blue Jerseys' (Similarity Score: 0.42)
5. 'Nike Kids Boys Blue T-shirt' (Similarity Score: 0.42)
6. 'Doodle Boys Printed Blue T-shirt' (Similarity Score: 0.37)
7. 'Gini and Jony Boys Blue T-shirt' (Similarity Score: 0.36)
8. 'Madagascar3 Boys Blue Printed T-Shirt' (Similarity Score: 0.35)
9. 'Madagascar 3 Boys Blue Printed T-shirt' (Similarity Score: 0.34)
10. 'Gini and Jony Boys Printed Blue T-shirt' (Similarity Score: 0.34)
