### Import all the necessary dependencies

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("model_building.csv")
df.head()

Unnamed: 0,title_x,average_rating,rating_number,price,parent_asin,rating,title_y,text,user_id,category_Grocery
0,instant compostable espresso capsules lungo me...,4.3,85,8.49,B0C2W77WJX,4,fresh tasting smelling slightly acidic light l...,happen instant pod dual coffee maker spots nes...,AF2BLE54TEMGZ546U763ZHZRXC4A,1
1,instant compostable espresso capsules lungo me...,4.3,85,8.49,B0C2W77WJX,5,dynamic flavor interesting flavor profile body...,tried leggero light roast lungo medium roast d...,AF2BLE54TEMGZ546U763ZHZRXC4A,1
2,instant compostable espresso capsules lungo me...,4.3,85,8.49,B0C2W77WJX,4,pricey much flavor,great roast ppl arent bitter heavy taste like ...,AEUDZQDVSZYCHEXQSXLB6NWQTMHA,1
3,edible markersfood coloring markersfood colori...,4.3,1193,8.99,B07PK9L29R,5,fun,much fun color cookies artist really work well...,AGECC4F4CDL2AVODIRNCF3V63BEQ,1
4,edible markersfood coloring markersfood colori...,4.3,1193,8.99,B07PK9L29R,5,perfect touch,perfect adding creative touches taste coloring...,AFF6LERKD46F2RLIKAMQTAQPOIWA,1


In [3]:
print(df.shape)

(110596, 10)


## Data Preprocessing: Tokenization
### Objective: Convert text data into tokens for further analysis.

#### Steps:

- Import Libraries: Utilize nltk for natural language processing tasks.
- Download Necessary Resources: Ensure the ‘punkt’ tokenizer models are available.
- Data Conversion: Cast the ‘text’ column to string type to avoid type-related errors.
- Tokenization: Apply word_tokenize from nltk to split the text into individual words or tokens.
- Store Tokens: Save the tokenized data in a new column ‘tokenized_reviews’ for easy access.

In [4]:
# Ensure nltk is downloaded and imported
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Convert any non-string data to string
df['text'] = df['text'].astype(str)

# Tokenize the text data
tokenized_data = df['text'].apply(lambda x: word_tokenize(x))

df['tokenized_reviews'] = tokenized_data

df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hrishikesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,title_x,average_rating,rating_number,price,parent_asin,rating,title_y,text,user_id,category_Grocery,tokenized_reviews
0,instant compostable espresso capsules lungo me...,4.3,85,8.49,B0C2W77WJX,4,fresh tasting smelling slightly acidic light l...,happen instant pod dual coffee maker spots nes...,AF2BLE54TEMGZ546U763ZHZRXC4A,1,"[happen, instant, pod, dual, coffee, maker, sp..."
1,instant compostable espresso capsules lungo me...,4.3,85,8.49,B0C2W77WJX,5,dynamic flavor interesting flavor profile body...,tried leggero light roast lungo medium roast d...,AF2BLE54TEMGZ546U763ZHZRXC4A,1,"[tried, leggero, light, roast, lungo, medium, ..."
2,instant compostable espresso capsules lungo me...,4.3,85,8.49,B0C2W77WJX,4,pricey much flavor,great roast ppl arent bitter heavy taste like ...,AEUDZQDVSZYCHEXQSXLB6NWQTMHA,1,"[great, roast, ppl, arent, bitter, heavy, tast..."
3,edible markersfood coloring markersfood colori...,4.3,1193,8.99,B07PK9L29R,5,fun,much fun color cookies artist really work well...,AGECC4F4CDL2AVODIRNCF3V63BEQ,1,"[much, fun, color, cookies, artist, really, wo..."
4,edible markersfood coloring markersfood colori...,4.3,1193,8.99,B07PK9L29R,5,perfect touch,perfect adding creative touches taste coloring...,AFF6LERKD46F2RLIKAMQTAQPOIWA,1,"[perfect, adding, creative, touches, taste, co..."


In [5]:
# now as the data (text - reviews data) is tokenised it's time to drop it the main 'text' column
df = df.drop('text', axis = 1)
df.head()

Unnamed: 0,title_x,average_rating,rating_number,price,parent_asin,rating,title_y,user_id,category_Grocery,tokenized_reviews
0,instant compostable espresso capsules lungo me...,4.3,85,8.49,B0C2W77WJX,4,fresh tasting smelling slightly acidic light l...,AF2BLE54TEMGZ546U763ZHZRXC4A,1,"[happen, instant, pod, dual, coffee, maker, sp..."
1,instant compostable espresso capsules lungo me...,4.3,85,8.49,B0C2W77WJX,5,dynamic flavor interesting flavor profile body...,AF2BLE54TEMGZ546U763ZHZRXC4A,1,"[tried, leggero, light, roast, lungo, medium, ..."
2,instant compostable espresso capsules lungo me...,4.3,85,8.49,B0C2W77WJX,4,pricey much flavor,AEUDZQDVSZYCHEXQSXLB6NWQTMHA,1,"[great, roast, ppl, arent, bitter, heavy, tast..."
3,edible markersfood coloring markersfood colori...,4.3,1193,8.99,B07PK9L29R,5,fun,AGECC4F4CDL2AVODIRNCF3V63BEQ,1,"[much, fun, color, cookies, artist, really, wo..."
4,edible markersfood coloring markersfood colori...,4.3,1193,8.99,B07PK9L29R,5,perfect touch,AFF6LERKD46F2RLIKAMQTAQPOIWA,1,"[perfect, adding, creative, touches, taste, co..."


## TF-IDF Vectorization

#### To convert the tokenized reviews into a numerical format that can be used for machine learning models, we use the `TfidfVectorizer` from `sklearn.feature_extraction.text`. This process is known as TF-IDF vectorization.

### Code Explanation

- **Tokenized Reviews**: We start with the `tokenized_reviews` column which contains the tokenized text data.
- **String Conversion**: Each list of tokens is joined into a single string.
- **Hyperparameters**: We define hyperparameters such as `ngram_range` to include unigrams, bigrams, trigrams and `min_df` to set the minimum document frequency for terms.
- **TF-IDF Vectorizer Initialization**: The vectorizer is initialized with the defined hyperparameters.
- **Fitting and Transforming**: The vectorizer is then fitted to the tokenized strings and transforms them into a TF-IDF matrix.
- **Matrix Shape**: Finally, we print the shape of the TF-IDF matrix to understand the dimensions of our feature space.

## For UNIGRAM

In [6]:
# initialising for UNIGRAM parameter
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'tokenized_reviews' column contains the tokenized reviews
tokenized_reviews = df['tokenized_reviews']

# Convert tokenized reviews into strings
tokenized_reviews_str = tokenized_reviews.apply(lambda x: ' '.join(x))

# Initialize TF-IDF vectorizer with hyperparameters
tfidf_vectorizer_unigram = TfidfVectorizer(min_df=5, max_features=1000, strip_accents='unicode', 
                                   analyzer='word', ngram_range=(1, 1), stop_words='english')

# Fit and transform the tokenized reviews
tfidf_matrix_unigram = tfidf_vectorizer_unigram.fit_transform(tokenized_reviews_str)

# Print the shape of the TF-IDF matrix
print("Shape of TF-IDF matrix:", tfidf_matrix_unigram.shape)


Shape of TF-IDF matrix: (110596, 1000)


### Here’s a breakdown of what these vectors represent:

- TF-IDF: Stands for Term Frequency-Inverse Document Frequency. It’s a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
- Vector Format: Each vector is in the format (document_index, word_id) tf-idf_score.
- document_index: The index of the document in the corpus.
- word_id: The unique identifier for a word within the corpus.
- tf-idf_score: The TF-IDF score for that word in the specified document.
- Purpose: These vectors are used to convert text data into a numerical form that can be used for various tasks such as clustering, classification, and information retrieval.

#### For example, the vector (0, 8688) 0.13354025856957072 means that in the first document of the corpus (index 0), the word with ID 8688 has a TF-IDF score of approximately 0.134. This score represents the relative importance of this word in that particular document compared to the entire corpus. High scores indicate more importance, and vice versa. The vectors are typically sparse, meaning most of the values are zero, as a word does not appear in most documents.

In [7]:
print(tfidf_matrix_unigram)

  (0, 833)	0.19988324457419349
  (0, 101)	0.13618657213644478
  (0, 685)	0.13685877764510995
  (0, 385)	0.11787480024439587
  (0, 876)	0.07708812427502859
  (0, 335)	0.16514864102056648
  (0, 802)	0.11348320174931192
  (0, 172)	0.10221784605550495
  (0, 43)	0.1328151749357382
  (0, 558)	0.12413250395585446
  (0, 86)	0.13107646919734461
  (0, 851)	0.09219194785467436
  (0, 149)	0.1182761181879643
  (0, 223)	0.10472181958737305
  (0, 911)	0.12407589991831008
  (0, 676)	0.20267375568361673
  (0, 482)	0.3223766461398797
  (0, 742)	0.3324244229481629
  (0, 531)	0.48620797378900976
  (0, 483)	0.04927763825732075
  (0, 892)	0.09855283356708616
  (0, 642)	0.1328584149446982
  (0, 978)	0.11064549756638102
  (0, 307)	0.09370128609582781
  (0, 93)	0.07095592053032089
  :	:
  (110593, 108)	0.43299344806133416
  (110593, 267)	0.32073230128299807
  (110593, 69)	0.25582266740755594
  (110593, 846)	0.3177553455710735
  (110593, 936)	0.2397595869171905
  (110593, 600)	0.2973272132423977
  (110593, 714)

### Sparse Matrix Conversion
The code snippet is converting a dense TF-IDF matrix to a sparse matrix format using the `csr_matrix` class from the `scipy.sparse` module. This is done to optimize memory usage when dealing with large matrices.

### Output Explanation
- **Type of tfidf_matrix_sparse**: Confirms the matrix is now in CSR format.
- **Shape of tfidf_matrix_sparse**: The matrix dimensions reflect the number of documents (rows) and the maximum number of features (columns) considered in the TF-IDF vectorization.
- **Number of non-zero entries**: Indicates the total count of non-zero values in the matrix, which represents the actual data points that are stored in memory.


In [8]:
from scipy.sparse import csr_matrix

# Convert to a sparse matrix (if not already)
tfidf_matrix_sparse_unigram = csr_matrix(tfidf_matrix_unigram)

# Print the type and shape of the sparse TF-IDF matrix
print("Type of tfidf_matrix_sparse:", type(tfidf_matrix_sparse_unigram))
print("Shape of tfidf_matrix_sparse:", tfidf_matrix_sparse_unigram.shape)
print("Number of non-zero entries:", tfidf_matrix_sparse_unigram.nnz)

Type of tfidf_matrix_sparse: <class 'scipy.sparse._csr.csr_matrix'>
Shape of tfidf_matrix_sparse: (110596, 1000)
Number of non-zero entries: 1334939


#### Model building using sparsed vectors -  UNIGRAM

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy.sparse.linalg import svds


# Normalize the TF-IDF matrix
normalized_tfidf_matrix_sparse_unigram = normalize(tfidf_matrix_sparse_unigram, norm='l2', axis=1)

# Initialize Truncated SVD with desired number of components
n_components = 100  # Adjust the number of components as needed
# Compute Truncated SVD
u, s, vt = svds(normalized_tfidf_matrix_sparse_unigram, k=n_components)

# Construct the reduced TF-IDF matrix
tfidf_matrix_reduced_unigram = np.dot(u, np.diag(s))


# Get recommendations based on cosine similarity matrix in chunks
def get_recommendations_in_chunks_unigram(product_index, matrix, n=5, threshold=0.2, chunk_size=1000):
    num_docs = matrix.shape[0]
    sim_scores = np.zeros(num_docs)

    for chunk_start in range(0, num_docs, chunk_size):
        chunk_end = min(chunk_start + chunk_size, num_docs)
        sim_chunk = cosine_similarity(matrix[product_index:product_index+1], 
                                      matrix[chunk_start:chunk_end])
        sim_scores[chunk_start:chunk_end] = sim_chunk.flatten()

    # Filter out low similarity scores based on threshold
    sim_indices = [(idx, score) for idx, score in enumerate(sim_scores) if score > threshold]
    # Sort the products based on similarity scores
    sim_indices = sorted(sim_indices, key=lambda x: x[1], reverse=True)
    # Get the top similar products
    top_similar_products = sim_indices[:n]  # Limit to top N
    return top_similar_products

#### Getting recommendation for the above mode


In [10]:
from tabulate import tabulate

# Example usage
product_index = 0
top_similar_products = get_recommendations_in_chunks_unigram(product_index, 
                                                             tfidf_matrix_reduced_unigram)

# Prepare the data for tabular display
table_data = []
for index, score in top_similar_products:
    title = df.iloc[index]['title_x']
    price = df.iloc[index]['price']
    avg_rating = df.iloc[index]['average_rating']
    table_data.append([title, price, avg_rating, score])

# Display the recommendations in a tabular format
print("Top 5 recommended products:")
print(tabulate(table_data, headers=['Title', 'Price', 'Avg Rating', 
                                    'Cosine Similarity Score'], tablefmt='pretty'))


Top 5 recommended products:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------------------------+
|                                                                                 Title                                                                                 | Price | Avg Rating | Cosine Similarity Score |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------------------------+
| instant compostable espresso capsules lungo medium roast 10 plantbased capsules makers instant pot ecofriendly 100 organic arabica capsules compostable freshness bag | 8.49  |    4.3     |           1.0           |
|                   holland valley coffee keurig kcup coffee maker compatible high caffeine roast 100 or

##### NDGC Score - UNIGRAM

In [11]:
from sklearn.metrics import ndcg_score

# Extract the indices and scores from the recommendations
top_indices_unigram = [index for index, score in top_similar_products]
top_scores_unigram = [score for index, score in top_similar_products]

# Assuming 'df' is your DataFrame and 'average_rating' is a column in your dataset
# Prepare relevance scores based on the average_rating of the recommended products
relevance_scores = df.loc[top_indices_unigram, 'average_rating'].tolist()

# Normalize relevance scores since NDCG in sklearn assumes relevance scores, not ratings directly
max_rating = max(relevance_scores)
normalized_relevance_scores = [score / max_rating for score in relevance_scores]

# Reshape for ndcg_score function
true_relevance = np.asarray([normalized_relevance_scores])
predicted_relevance = np.asarray([top_scores_unigram])

# Calculate NDCG score
ndcg = ndcg_score(true_relevance, predicted_relevance)
print("NDCG Score:", ndcg)


NDCG Score: 0.9875426533550047


### Using bi gram
#### For this model, all the above pre-processing and feature engineering were performed similarly below for bigram.

In [12]:
from sklearn.decomposition import TruncatedSVD


# Initialize TF-IDF vectorizer to use only bigrams
tfidf_vectorizer_bigram = TfidfVectorizer(min_df=5, max_features=1000, strip_accents='unicode', 
                                           analyzer='word', ngram_range=(1, 2), stop_words='english')

# Fit and transform the tokenized reviews
tfidf_matrix_bigram = tfidf_vectorizer_bigram.fit_transform(tokenized_reviews_str)

# Convert to a sparse matrix
tfidf_matrix_sparse_bigram = csr_matrix(tfidf_matrix_bigram)

# Normalize the TF-IDF matrix
normalized_tfidf_matrix_sparse_bigram = normalize(tfidf_matrix_sparse_bigram)

# Initialize Truncated SVD
n_components = 100
svd = TruncatedSVD(n_components=n_components)

# Apply Truncated SVD
tfidf_matrix_reduced_bigram = svd.fit_transform(normalized_tfidf_matrix_sparse_bigram)

# Function to get recommendations based on cosine similarity
def get_recommendations_in_chunks(product_index, matrix, n=5, threshold=0.2, chunk_size=1000):
    num_docs = matrix.shape[0]
    sim_scores = np.zeros(num_docs)

    for chunk_start in range(0, num_docs, chunk_size):
        chunk_end = min(chunk_start + chunk_size, num_docs)
        sim_chunk = cosine_similarity(matrix[product_index:product_index+1], matrix[chunk_start:chunk_end])
        sim_scores[chunk_start:chunk_end] = sim_chunk.flatten()

    sim_indices = np.argwhere(sim_scores > threshold).flatten()
    top_similar_products_bigram = sorted([(idx, sim_scores[idx]) for idx in sim_indices], key=lambda x: x[1], reverse=True)[:n]
    return top_similar_products_bigram



#### Getting recommendation and NDGC score- 

In [26]:
# Example usage
product_index = 0
top_similar_products_bigram = get_recommendations_in_chunks(product_index, tfidf_matrix_reduced_bigram)

# Prepare the data for tabular display
table_data = []
for index, score in top_similar_products_bigram:
    title = df.iloc[index]['title_x']
    price = df.iloc[index]['price']
    avg_rating = df.iloc[index]['average_rating']
    table_data.append([title, price, avg_rating, score])

# Display the recommendations in a tabular format
print("Top 5 recommended products:")
print(tabulate(table_data, headers=['Title', 'Price', 'Avg Rating', 'Cosine Similarity Score'], tablefmt='pretty'))

# Evaluation
# product_index = 0
top_similar_products_bigram = get_recommendations_in_chunks(product_index, tfidf_matrix_reduced_bigram)

top_indices_bigram = [index for index, score in top_similar_products_bigram]
top_scores_bigram = [score for index, score in top_similar_products_bigram]
relevance_scores = df.loc[top_indices_bigram, 'average_rating'].tolist()

max_rating = max(relevance_scores)
normalized_relevance_scores = [score / max_rating for score in relevance_scores]

true_relevance = np.asarray([normalized_relevance_scores])
predicted_relevance = np.asarray([top_scores_bigram])

ndcg = ndcg_score(true_relevance, predicted_relevance)
print("NDCG Score:", ndcg)


Top 5 recommended products:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------------------------+
|                                                                                 Title                                                                                 | Price | Avg Rating | Cosine Similarity Score |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------------------------+
| instant compostable espresso capsules lungo medium roast 10 plantbased capsules makers instant pot ecofriendly 100 organic arabica capsules compostable freshness bag | 8.49  |    4.3     |   0.9999999999999999    |
|                   holland valley coffee keurig kcup coffee maker compatible high caffeine roast 100 or

### For Trigram

In [14]:

# Initialize TF-IDF vectorizer to use only trigrams
tfidf_vectorizer_trigram = TfidfVectorizer(min_df=5, max_features=1000, strip_accents='unicode', 
                                            analyzer='word', ngram_range=(1, 3), stop_words='english')

# Fit and transform the tokenized reviews
tfidf_matrix_trigram = tfidf_vectorizer_trigram.fit_transform(tokenized_reviews_str)

# Convert to a sparse matrix
tfidf_matrix_sparse_trigram = csr_matrix(tfidf_matrix_trigram)

# Normalize the TF-IDF matrix
normalized_tfidf_matrix_sparse_trigram = normalize(tfidf_matrix_sparse_trigram)

# Initialize Truncated SVD
n_components = 100
svd = TruncatedSVD(n_components=n_components)

# Apply Truncated SVD
tfidf_matrix_reduced_trigram = svd.fit_transform(normalized_tfidf_matrix_sparse_trigram)

# Function to get recommendations based on cosine similarity
def get_recommendations_in_chunks(product_index, matrix, n=5, threshold=0.2, chunk_size=1000):
    num_docs = matrix.shape[0]
    sim_scores = np.zeros(num_docs)

    for chunk_start in range(0, num_docs, chunk_size):
        chunk_end = min(chunk_start + chunk_size, num_docs)
        sim_chunk = cosine_similarity(matrix[product_index:product_index+1], matrix[chunk_start:chunk_end])
        sim_scores[chunk_start:chunk_end] = sim_chunk.flatten()

    sim_indices = np.argwhere(sim_scores > threshold).flatten()
    top_similar_products_trigram = sorted([(idx, sim_scores[idx]) for idx in sim_indices], key=lambda x: x[1], reverse=True)[:n]
    return top_similar_products_trigram



In [25]:
# Example usage
product_index = 0
top_similar_products_trigram = get_recommendations_in_chunks(product_index, tfidf_matrix_reduced_trigram)

# Prepare the data for tabular display
table_data = []
for index, score in top_similar_products_trigram:
    title = df.iloc[index]['title_x']
    price = df.iloc[index]['price']
    avg_rating = df.iloc[index]['average_rating']
    table_data.append([title, price, avg_rating, score])

# Display the recommendations in a tabular format
print("Top 5 recommended products:")
print(tabulate(table_data, headers=['Title', 'Price', 'Avg Rating', 'Cosine Similarity Score'], tablefmt='pretty'))

# Evaluation

top_similar_products_trigram = get_recommendations_in_chunks(product_index, tfidf_matrix_reduced_trigram)

top_indices_trigram = [index for index, score in top_similar_products_trigram]
top_scores_trigram = [score for index, score in top_similar_products_trigram]
relevance_scores = df.loc[top_indices_trigram, 'average_rating'].tolist()

max_rating = max(relevance_scores)
normalized_relevance_scores = [score / max_rating for score in relevance_scores]

true_relevance = np.asarray([normalized_relevance_scores])
predicted_relevance = np.asarray([top_scores_trigram])

ndcg = ndcg_score(true_relevance, predicted_relevance)
print("NDCG Score:", ndcg)


Top 5 recommended products:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------------------------+
|                                                                                 Title                                                                                 | Price | Avg Rating | Cosine Similarity Score |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------------------------+
| instant compostable espresso capsules lungo medium roast 10 plantbased capsules makers instant pot ecofriendly 100 organic arabica capsules compostable freshness bag | 8.49  |    4.3     |   1.0000000000000004    |
|                   holland valley coffee keurig kcup coffee maker compatible high caffeine roast 100 or

## Gensim
### Another way to build recommendation model is through gensim

since i was encountering this error - Unable to allocate 52.8 GiB for an array with shape (110596, 64024) and data type float64
i'm using gensim library,  Gensim has an efficient TF-IDF model that doesn’t require loading the entire corpus into memory at once. Instead, it operates on an iterable representation of your data.

Here’s how you can use Gensim for TF-IDF representation:

Install Gensim (if you haven’t already):
pip install gensim

Prepare Your Corpus:
Ensure your corpus is an iterable (e.g., a list of tokenized reviews).
You can use Python generators to achieve this.
Create a Gensim Dictionary:
Create a dictionary from your tokenized reviews.
The dictionary maps words to unique integer IDs.
Create a Gensim Corpus:
Convert your tokenized reviews into a Gensim corpus using the dictionary.
The corpus is a list of bag-of-words representations (sparse vectors) for each document.
Compute TF-IDF:
Use Gensim’s TfidfModel to compute the TF-IDF scores based on the corpus.
Access the TF-IDF Vectors:
You can access the TF-IDF vectors for individual documents without loading the entire matrix into memory.

### Thoughts behind using gensim and corpora - 
- Importing Gensim: We start by importing Gensim, a versatile Python library renowned for its capabilities in natural language processing tasks such as topic modeling, document similarity analysis, and text summarization. This step ensures we have access to the tools necessary for our task ahead.
- Creating a Dictionary: In this step, we construct a dictionary that serves as a fundamental component in the subsequent processes. This dictionary acts as a mapping between words and their unique integer IDs within our corpus of product reviews. By establishing this mapping, we lay the groundwork for efficient representation and analysis of textual data.
- Creating a Corpus: With our dictionary in place, we proceed to create a corpus, essentially a collection of documents represented in a format suitable for computational analysis. Each document is transformed into a bag-of-words vector, where each word's frequency within the document is recorded. This step transforms our raw textual data into a structured format amenable to further processing.
- TF-IDF Model: Building upon the bag-of-words representation, we leverage Gensim's capabilities to construct a TF-IDF model. TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical measure used to evaluate the importance of a word in a document relative to a corpus. By applying this model to our corpus, we compute TF-IDF scores for each word, capturing their significance within individual documents and across the entire corpus.
- Accessing TF-IDF Vectors: Here, we access the TF-IDF vectors generated by our model, enabling us to explore and utilize the transformed textual data. These vectors encapsulate the essence of each document's content, with each dimension representing a unique word and its corresponding TF-IDF score. By retrieving these vectors, we gain valuable insights into the semantic composition of our product reviews, paving the way for various downstream applications.

In [16]:
import gensim
from gensim import corpora, models

# Assuming 'tokenized_reviews' is an iterable (e.g., list of lists)
dictionary = corpora.Dictionary(tokenized_reviews)
corpus = [dictionary.doc2bow(review) for review in tokenized_reviews]

# Create the TF-IDF model
tfidf_model = models.TfidfModel(corpus)

# Create a list to hold all TF-IDF vectors
all_tfidf_vectors = []

# Iterate over all documents in the corpus
for doc in corpus:
    # Apply the TF-IDF model to get the vector for the current document
    doc_tfidf_vector = tfidf_model[doc]
    # Append the vector to the list
    all_tfidf_vectors.append(doc_tfidf_vector)

# Now 'all_tfidf_vectors' contains the TF-IDF vectors for all documents
# Limit the output to the first 10 TF-IDF vectors
limited_tfidf_vectors = all_tfidf_vectors[:10]

# Now 'limited_tfidf_vectors' contains the TF-IDF vectors for the first 10 documents
print(limited_tfidf_vectors)

[[(0, 0.15822019336594337), (1, 0.07911009668297168), (2, 0.12170821532633551), (3, 0.07493418887314095), (4, 0.11030375926386878), (5, 0.10859182482154411), (6, 0.04947903549083479), (7, 0.10374842137816317), (8, 0.1136241355227434), (9, 0.13596692814909372), (10, 0.13138885661596642), (11, 0.09599582425095556), (12, 0.1565601201334325), (13, 0.08020523861709881), (14, 0.08266694420118104), (15, 0.11723536349072858), (16, 0.18658037825369134), (17, 0.0888713835981412), (18, 0.07936684071231864), (19, 0.07183341245268707), (20, 0.12707538562583412), (21, 0.07401393015048931), (22, 0.032219935791116754), (23, 0.12179428611948441), (24, 0.05987914001758319), (25, 0.120831002481178), (26, 0.09560107520762093), (27, 0.06796589385054766), (28, 0.10209478190637414), (29, 0.11026127062439335), (30, 0.2560739659343325), (31, 0.028176078444046855), (32, 0.11693806256431222), (33, 0.1019816764207001), (34, 0.11392707128848491), (35, 0.1524549702782134), (36, 0.3968735655113111), (37, 0.101757343

#### Applying padding on the vectors to obtain it in equal lengths 
- Importing NumPy: The code begins by importing the NumPy library, which is a fundamental package for scientific computing in Python.
- Max Length Calculation: max_length is determined by finding the longest vector in all_tfidf_vectors, which is a list of TF-IDF vectors. This ensures that all vectors will be padded to the same length.
- Padding Function: pad_vector is a function that takes a vector and the max_length as arguments. It calculates the necessary padding (a list of zeros) to make the vector’s length equal to max_length. The padding is then appended to the original vector, and the padded vector is returned.
- Applying Padding: The list comprehension applies the pad_vector function to each vector in all_tfidf_vectors. The result is padded_vectors, a new list where each TF-IDF vector has been padded with zeros to have the same length, ensuring uniformity for further processing.

In [18]:
# Find the maximum length among all vectors
max_length = max(len(vector) for vector in all_tfidf_vectors)

# Convert TF-IDF vectors to NumPy arrays
all_tfidf_arrays = [np.array([(0, 0)] * max_length) for vector in all_tfidf_vectors]


# Function to convert TF-IDF vectors to arrays
def convert_to_array(vector, max_length):
    array = np.array(vector)
    result = np.zeros((max_length, 2))  # Assuming TF-IDF vectors are represented as tuples (index, value)
    result[:len(array)] = array
    return result

# Apply conversion and padding to each vector
padded_vectors = [convert_to_array(vector, max_length) for vector in all_tfidf_vectors]


In [19]:

limited_padded_vectors = padded_vectors[:5]

# Now 'limited_tfidf_vectors' contains the TF-IDF vectors for the first 5 documents
print(limited_padded_vectors)

[array([[0.        , 0.15822019],
       [1.        , 0.0791101 ],
       [2.        , 0.12170822],
       ...,
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ]]), array([[0.        , 0.11416458],
       [1.        , 0.05708229],
       [2.        , 0.17563835],
       ...,
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ]]), array([[22.        ,  0.05943553],
       [31.        ,  0.0519759 ],
       [47.        ,  0.16353332],
       ...,
       [ 0.        ,  0.        ],
       [ 0.        ,  0.        ],
       [ 0.        ,  0.        ]]), array([[5.70000000e+01, 1.47148706e-01],
       [1.05000000e+02, 1.08863395e-01],
       [1.15000000e+02, 1.94739553e-01],
       ...,
       [0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00]]), array([[3.00000000e+00, 2.32654914e-01],
       [1.13000000e+02, 9.57452830e-02],
       [1

### Model Building and its steps
- Normalization: The TF-IDF matrix, stored in tfidf_matrix_sparse, is normalized using sklearn's normalize function. Normalization ensures that each TF-IDF vector has a unit norm, which can improve the performance of subsequent operations.
- Dimensionality Reduction: Truncated Singular Value Decomposition (SVD) is applied to the normalized TF-IDF matrix to reduce its dimensionality while preserving important information. This step is crucial for managing computational complexity and capturing the most relevant features.
- Chunk-based Similarity Calculation: To handle large datasets efficiently, the code defines a function chunk_similarity that calculates cosine similarity scores between a reference document and chunks of the TF-IDF matrix. This approach enables memory-efficient computation by processing the data in manageable chunks.
- Recommendation Generation: Another function, get_recommendations_in_chunks, utilizes the chunk-based similarity calculation to generate recommendations for a given product index. It calculates cosine similarity scores between the reference product and all other products in the dataset, filtering out low similarity scores based on a threshold. Finally, it returns the top N products with the highest similarity scores as recommendations.

#### Model building using Gensim method - 

In [20]:

'''
we need to reshape the padded_vectors array to collapse the last two dimensions 
into one before passing it to the normalize function.
'''
# Convert the list of padded vectors to a NumPy array
padded_vectors_array = np.array(padded_vectors)

# Reshape padded_vectors to collapse the last two dimensions into one
reshaped_vectors = padded_vectors_array.reshape(padded_vectors_array.shape[0], -1)

# Normalize the reshaped TF-IDF matrix
normalized_tfidf_matrix_sparse = normalize(reshaped_vectors)

# Initialize Truncated SVD with desired number of components
n_components = 100  # Adjust the number of components as needed
svd = TruncatedSVD(n_components=n_components)

# Apply Truncated SVD to the normalized TF-IDF matrix
tfidf_matrix_reduced = svd.fit_transform(normalized_tfidf_matrix_sparse)


# Get recommendations based on cosine similarity matrix in chunks
def get_recommendations_in_chunks(product_index, matrix, n=5, threshold=0.2, chunk_size=1000):
    num_docs = matrix.shape[0]
    sim_scores = np.zeros(num_docs)

    for chunk_start in range(0, num_docs, chunk_size):
        chunk_end = min(chunk_start + chunk_size, num_docs)
        sim_chunk = cosine_similarity(matrix[product_index:product_index+1], matrix[chunk_start:chunk_end])
        sim_scores[chunk_start:chunk_end] = sim_chunk.flatten()

    # Filter out low similarity scores based on threshold
    sim_indices = [(idx, score) for idx, score in enumerate(sim_scores) if score > threshold]
    # Sort the products based on similarity scores
    sim_indices = sorted(sim_indices, key=lambda x: x[1], reverse=True)
    # Get the top similar products
    top_similar_products = sim_indices[:n]  # Limit to top N
    return top_similar_products

### Getting Recommendations

In [21]:
# Example usage
product_index = 0
top_similar_products = get_recommendations_in_chunks(product_index, tfidf_matrix_reduced)

# Prepare the data for tabular display
table_data = []
for index, score in top_similar_products:
    title = df.iloc[index]['title_x']
    price = df.iloc[index]['price']
    avg_rating = df.iloc[index]['average_rating']
    table_data.append([title, price, avg_rating, score])

# Display the recommendations in a tabular format
print("Top 5 recommended products:")
print(tabulate(table_data, headers=['Title', 'Price', 'Avg Rating', 'Cosine Similarity Score'], tablefmt='pretty'))


Top 5 recommended products:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------------------------+
|                                                                                 Title                                                                                 | Price | Avg Rating | Cosine Similarity Score |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------------+-------------------------+
| instant compostable espresso capsules lungo medium roast 10 plantbased capsules makers instant pot ecofriendly 100 organic arabica capsules compostable freshness bag | 8.49  |    4.3     |           1.0           |
|                                    artisana organics raw almond butter 14oz sugar added palm oil vegan

### Evaluation

In [22]:
# Assuming get_recommendations_in_chunks is defined and returns a list of tuples (index, score)
product_index = 0
top_similar_products = get_recommendations_in_chunks(product_index, tfidf_matrix_reduced)

# Extract the indices and scores from the recommendations
top_indices = [index for index, score in top_similar_products]
top_scores = [score for index, score in top_similar_products]

# Assuming 'df' is your DataFrame and 'average_rating' is a column in dataset
# Prepare relevance scores based on the average_rating of the recommended products
relevance_scores = df.loc[top_indices, 'average_rating'].tolist()

# Normalize relevance scores since NDCG in sklearn assumes relevance scores, not ratings directly
max_rating = max(relevance_scores)
normalized_relevance_scores = [score / max_rating for score in relevance_scores]

# Reshape for ndcg_score function
true_relevance = np.asarray([normalized_relevance_scores])
predicted_relevance = np.asarray([top_scores])

# Calculate NDCG score
ndcg = ndcg_score(true_relevance, predicted_relevance)
print("NDCG Score:", ndcg)


NDCG Score: 0.9814682661666775
