## Install

In [1]:
pip install tensorflow_hub transformers tensorflow python-dotenv pandas numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.


## Import Necessary Libraries

In [8]:
import os
import requests
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from transformers import BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity

## Load Data

In [3]:
load_dotenv()  # Load environment variables from .env file
jwt_token = os.getenv('JWT_TOKEN')

# Only attach an Authorization header when a token is actually configured;
# otherwise the f-string would send the literal text "Bearer None".
if jwt_token:
    headers = {'Authorization': f'Bearer {jwt_token}'}
else:
    headers = {}
    print("Warning: JWT_TOKEN is not set; requests will be sent without an Authorization header.")

# Endpoints queried by this notebook, keyed by dataset name.
# NOTE(review): the URL is plain HTTP, so the bearer token is sent unencrypted —
# switch to HTTPS if the server supports it.
api_urls = {
    'products': 'http://161.97.109.65:3000/api/products'
}

def fetch_data(url, headers, timeout=30):
    """Download a JSON payload from ``url`` and return it as a DataFrame.

    Args:
        url: Endpoint to send the GET request to.
        headers: HTTP headers passed along with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A DataFrame built from the decoded JSON body. An empty DataFrame is
        returned (and the error is printed) when the request fails, the
        server answers with an error status, or the body cannot be turned
        into a DataFrame.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad requests
        # ValueError covers both an undecodable JSON body and a payload whose
        # shape pandas cannot convert into a table.
        data = pd.DataFrame(response.json())
    except (requests.RequestException, ValueError) as e:
        print(f'Failed to fetch data from {url}: {str(e)}')
        return pd.DataFrame()

    print(f"Data successfully fetched from {url}")
    print(data.head())  # Display the first few rows of the DataFrame
    return data

# Pull the product catalogue from the API
products = fetch_data(api_urls['products'], headers)

# Report whether the download produced any rows
if products.empty:
    print("Data fetching failed, check errors and retry.")
    # Optionally, add logic to halt further processing if data is crucial
else:
    print("All data fetched successfully.")

Data successfully fetched from http://161.97.109.65:3000/api/products
                        _id              name           category  \
0  6665ecb77aa0dfec0ad43b69         Textbooks              Books   
1  6665ecf07aa0dfec0ad43b6c    School Uniform           Clothing   
2  6665ed0f7aa0dfec0ad43b6f          Backpack        Accessories   
3  6665ed747aa0dfec0ad43b73  Sports Equipment  Sports & Outdoors   
4  6665ed8e7aa0dfec0ad43b76            Laptop        Electronics   

                                         description   price  \
0        Gently used textbooks for various subjects.   15.99   
1  Pre-owned boarding school uniform in good cond...   29.99   
2  Sturdy backpack suitable for boarding school e...   24.99   
3   Used sports gear for extracurricular activities.   49.99   
4      Refurbished laptop ideal for school projects.  199.99   

                   sellerId  \
0  6665e9847aa0dfec0ad43b26   
1  6665e9847aa0dfec0ad43b26   
2  6665e9847aa0dfec0ad43b26   
3  6665e9847

## Data Preprocessing

In [9]:
# Preview the first five rows of the fetched product table
print(products.head(5))

# Instantiate the BERT tokenizer from the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_data(texts, tokenizer, max_length=128):
    """Tokenize a batch of texts, padded/truncated to ``max_length``.

    Args:
        texts: The texts to tokenize. May be an object exposing ``.tolist()``
            (such as the DataFrame column passed by this notebook), any other
            iterable of strings, or a single string (treated as a batch of one).
        tokenizer: Callable that receives the list of texts plus the keyword
            arguments ``max_length``, ``truncation``, ``padding`` and
            ``return_tensors``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    if isinstance(texts, str):
        # list("abc") would split the string into characters, so wrap it instead.
        text_list = [texts]
    elif hasattr(texts, 'tolist'):
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    return tokenizer(
        text_list,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )

# Assuming the data contains a column named 'name'
tokenized_data = tokenize_data(products['name'], tokenizer)
input_ids = tokenized_data['input_ids']
attention_masks = tokenized_data['attention_mask']

# Convert tensors to numpy arrays
input_ids_np = input_ids.numpy()
attention_masks_np = attention_masks.numpy()

# Create dummy binary labels for demonstration purposes
# Example: Assign a label based on a condition, here assuming 'Books' category as 1 and others as 0
# Kept under its own name so it is not confused with the category list `labels` built further down.
binary_labels = (products['category'] == 'Books').astype(int).values

# Split ids, masks and labels in a single call so the three arrays are
# partitioned with the same row selection by construction, instead of relying
# on two separate calls happening to share a seed and a length.
(
    train_input_ids, val_input_ids,
    train_attention_masks, val_attention_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids_np, attention_masks_np, binary_labels,
    test_size=0.2, random_state=42,
)

# Check the shapes of the splits to ensure correctness
print(f"Train input IDs shape: {train_input_ids.shape}")
print(f"Validation input IDs shape: {val_input_ids.shape}")
print(f"Train attention masks shape: {train_attention_masks.shape}")
print(f"Validation attention masks shape: {val_attention_masks.shape}")
print(f"Train labels shape: {train_labels.shape}")
print(f"Validation labels shape: {val_labels.shape}")

# Combine product name and category for Universal Sentence Encoder
titles = products['name'].tolist()
labels = products['category'].tolist()  # category strings, one per product
combined_text = [f"{label} {title}" for label, title in zip(labels, titles)]

# Load the Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Generate embeddings for the combined text
embeddings = embed(combined_text)

# Check the shape of the embeddings
print(embeddings.shape)

                        _id              name           category  \
0  6665ecb77aa0dfec0ad43b69         Textbooks              Books   
1  6665ecf07aa0dfec0ad43b6c    School Uniform           Clothing   
2  6665ed0f7aa0dfec0ad43b6f          Backpack        Accessories   
3  6665ed747aa0dfec0ad43b73  Sports Equipment  Sports & Outdoors   
4  6665ed8e7aa0dfec0ad43b76            Laptop        Electronics   

                                         description   price  \
0        Gently used textbooks for various subjects.   15.99   
1  Pre-owned boarding school uniform in good cond...   29.99   
2  Sturdy backpack suitable for boarding school e...   24.99   
3   Used sports gear for extracurricular activities.   49.99   
4      Refurbished laptop ideal for school projects.  199.99   

                   sellerId  \
0  6665e9847aa0dfec0ad43b26   
1  6665e9847aa0dfec0ad43b26   
2  6665e9847aa0dfec0ad43b26   
3  6665e9847aa0dfec0ad43b26   
4  6665e9847aa0dfec0ad43b26   

                   

In [10]:
# Define the search function
def semantic_search(query, embeddings, data, top_k):
    """Return the rows of ``data`` whose embeddings are most similar to ``query``.

    The query is embedded with the module-level ``embed`` model and compared
    against ``embeddings`` using cosine similarity.

    Args:
        query: Search text.
        embeddings: Precomputed embeddings; entry ``i`` is used to score the
            row at position ``i`` of ``data``.
        data: DataFrame the matching rows are taken from (by position).
        top_k: Maximum number of rows to return.

    Returns:
        The best-matching rows ordered from most to least similar, or ``None``
        when there is nothing to return (including ``top_k <= 0``).
    """
    # Guard non-positive top_k up front: the slice [-0:] selects *every*
    # element, so top_k=0 used to return the whole table instead of nothing.
    if top_k <= 0:
        return None

    # Generate the embedding for the query
    query_embedding = embed([query])

    # Calculate cosine similarities
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # argsort is ascending: reverse it, then keep the first top_k positions
    top_k_indices = np.argsort(similarities)[::-1][:top_k]
    if len(top_k_indices) == 0:
        return None  # Return None if no results found
    return data.iloc[top_k_indices]

In [11]:
# Example usage: look up the ten closest products for a sample query
query = "kursi"
results = semantic_search(query=query, embeddings=embeddings, data=products, top_k=10)

print(results)

                         _id                name             category  \
16  6665eeb47aa0dfec0ad43b9a            Umbrella          Accessories   
29  6665efbf7aa0dfec0ad43bc1         Study Chair            Furniture   
32  6665effa7aa0dfec0ad43bca        Toiletry Kit       Home & Kitchen   
14  6665ee7d7aa0dfec0ad43b94        Art Supplies        Arts & Crafts   
30  6665efd27aa0dfec0ad43bc4         Alarm Clock       Home & Kitchen   
11  6665ee427aa0dfec0ad43b8b  Musical Instrument  Musical Instruments   
0   6665ecb77aa0dfec0ad43b69           Textbooks                Books   
5   6665edc17aa0dfec0ad43b79           Textbooks                Books   
2   6665ed0f7aa0dfec0ad43b6f            Backpack          Accessories   
7   6665edef7aa0dfec0ad43b7f            Backpack          Accessories   

                                          description  price  \
16         Compact umbrella for rainy days on campus.   9.99   
29  Comfortable study chair for long hours of stud...  39.99   
32  