## General Resources

### Libraries

In [1]:
# Data importation and manipulation
import os 
import pandas as pd
import numpy as np

# Data visualization, exploratory and result analysis
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap


In [2]:
import warnings

# Suppress 'deprecated' warnings, added in final run to clean notebook when printing into pdf
warnings.filterwarnings("ignore")

### Dataset

#### Import

In [3]:
def import_csv_dataset(file_path):
    df = pd.read_csv(file_path, header=0, encoding='latin-1')
    return df

In [4]:
flipkart_data = r"Dataset\flipkart_com-products.csv"
flipkart_data_3 =r"flipkart_data_df3.csv"

flipkart_data_df = import_csv_dataset(flipkart_data)
flipkart_data_df3 = import_csv_dataset(flipkart_data_3)

flipkart_id_name = flipkart_data_df[['pid', 'product_name']]


## Recommendation System

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Josh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def preprocess_text(text):
    # Initialize lemmatizer and stopwords
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english')) - {"not", "no"}  # Keep negations
    
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove special characters and digits but keep words
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Lemmatization 
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    
    return text

In [8]:
# Apply preprocessing
flipkart_data_df3['description'] = flipkart_data_df3['description'].apply(preprocess_text)

In [None]:
tfidf = TfidfVectorizer(stop_words="english", max_features=500)
tfidf_matrix = tfidf.fit_transform(flipkart_data_df3['description'])

print(tfidf_matrix.shape)

(20000, 300)


In [None]:
cosine_sim = cosine_similarity(tfidf_matrix,
                               tfidf_matrix)

print(cosine_sim.shape)
cosine_sim[1]

(20000, 20000)


In [14]:
from scipy.sparse import csr_matrix

# Convert to a sparse matrix to reduce size
cosine_sim_sparse = csr_matrix(cosine_sim)

print(cosine_sim_sparse.shape)  # Same shape but much smaller memory footprint

(20000, 20000)


In [15]:
import pickle

# Save the sparse matrix
with open('cosine_sim_sparse.pkl', 'wb') as f:
    pickle.dump(cosine_sim_sparse, f)

In [None]:
import pickle

# Ensure the directory exists
folder_path = r"RecommendationFile"
os.makedirs(folder_path, exist_ok=True)

# Load cosine_sim
with open(os.path.join(folder_path, "cosine_sim.pkl"), "rb") as f:
    cosine_sim = pickle.load(f)

# Split into 50 parts
num_parts = 50
split_arrays = np.array_split(cosine_sim, num_parts)

# Save each part separately inside the folder
for i, part in enumerate(split_arrays):
    file_path = os.path.join(folder_path, f'cosine_sim_part_{i}.pkl')
    with open(file_path, 'wb') as f:
        pickle.dump(part, f)

print("Cosine similarity matrix split and saved successfully in 'RecommendationFile/' folder.")


Cosine similarity matrix split and saved successfully in 'RecommendationFile/' folder.


In [37]:
def get_top_5_similar(index, df, df_name, cosine_sim):
    # Get the PID of the given index
    pid = df.loc[index, 'pid']
    
    # Get the product name from flipkart_id_name using pid
    product_name = df_name.loc[df_name['pid'] == pid, 'product_name'].values[0]
    
    # Get similarity scores for the given index and sort them in descending order
    sim_scores = list(enumerate(cosine_sim[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:6]  # Exclude itself
    
    # Get the top 5 similar product indices
    top_indices = [i[0] for i in sim_scores]

    # Get corresponding product IDs and names
    top_pids = df.loc[top_indices, 'pid'].values
    top_names = df_name.loc[df_name['pid'].isin(top_pids), 'product_name'].values

    # Print results
    print(f"Product ID: {pid}")
    print(f"Product Name: {product_name}")
    for i, (top_pid, top_name) in enumerate(zip(top_pids, top_names), start=1):
        print(f"Top {i}: {top_pid} - {top_name}")

    return pid, product_name, top_pids, top_names

In [71]:
folder_path = "RecommendationFile"

# Load all parts
num_parts = 50
cosine_sim_parts = []

for i in range(num_parts):
    file_path = os.path.join(folder_path, f'cosine_sim_part_{i}.pkl')
    with open(file_path, 'rb') as f:
        cosine_sim_parts.append(pickle.load(f))

# Merge into a single array
cosine_sim = np.vstack(cosine_sim_parts)

print("Cosine similarity matrix successfully reconstructed.")


Cosine similarity matrix successfully reconstructed.


In [None]:
# # Import the cosine similarity matrix
# cosine_sim_path = r"cosine_sim.pkl"

# with open(cosine_sim_path, 'rb') as f:
#     cosine_sim = pickle.load(f)

In [45]:
flipkart_id_name

Unnamed: 0,pid,product_name
0,SRTEH2FF9KEDEFGF,Alisha Solid Women's Cycling Shorts
1,SBEEH3QGU7MFYJFY,FabHomeDecor Fabric Double Sofa Bed
2,SHOEH4GRSUBJGZXE,AW Bellies
3,SRTEH2F6HUZMQ6SJ,Alisha Solid Women's Cycling Shorts
4,PSOEH3ZYDMSYARJ5,Sicons All Purpose Arnica Dog Shampoo
...,...,...
19995,STIE7KFJAKSTDY9G,WallDesign Small Vinyl Sticker
19996,STIE9F5URNQGJCGH,Wallmantra Large Vinyl Stickers Sticker
19997,STIE7VAYDKQZEBSD,Elite Collection Medium Acrylic Sticker
19998,STIE8YSVEPPCZ42Y,Elite Collection Medium Acrylic Sticker


In [63]:
pid, product_name, top_pids, top_names = get_top_5_similar(1, flipkart_data_df3, flipkart_id_name, cosine_sim)

Product ID: SBEEH3QGU7MFYJFY
Product Name: FabHomeDecor Fabric Double Sofa Bed
Top 1: SBEEH3QGYGHFUEXN - FabHomeDecor Fabric Double Sofa Bed
Top 2: SBEEH3QGAYAEPRCG - FabHomeDecor Fabric Double Sofa Bed
Top 3: SBEEH3QGWRGG3J6Q - FabHomeDecor Fabric Double Sofa Bed
Top 4: SOFEGDV3HGY3AB43 - Ethnic Handicrafts Solid Wood Single Bed
Top 5: BDDEH29EWHWRAPWG - Comfort Couch Engineered Wood 3 Seater Sofa
