## Install

In [1]:
pip install tensorflow_hub transformers tensorflow python-dotenv pandas numpy scikit-learn

Collecting tensorflow_hub
  Downloading tensorflow_hub-0.16.1-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting tf-keras>=2.14.1 (from tensorflow_hub)
  Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.3-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Col

## Import Necessary Libraries

In [2]:
import os
import requests
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from transformers import BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity

2024-06-13 07:46:57.136319: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-13 07:46:57.192923: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-13 07:46:57.194223: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [3]:
load_dotenv()  # Load environment variables from .env file
jwt_token = os.getenv('JWT_TOKEN')

headers = {'Authorization': f'Bearer {jwt_token}'}

api_urls = {
    'products': 'http://161.97.109.65:3000/api/products'
}

def fetch_data(url, headers):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raises an HTTPError for bad requests
        data = pd.DataFrame(response.json())
        print(f"Data successfully fetched from {url}")
        print(data.head())  # Display the first few rows of the DataFrame
        return data
    except requests.RequestException as e:
        print(f'Failed to fetch data from {url}: {str(e)}')
        return pd.DataFrame()

# Fetch data from APIs
products = fetch_data(api_urls['products'], headers)

# Check if   data was fetched successfully
if not products.empty:
    print("All data fetched successfully.")
else:
    print("Data fetching failed, check errors and retry.")
    # Optionally, add logic to halt further processing if data is crucial

Failed to fetch data from http://161.97.109.65:3000/api/products: 403 Client Error: Forbidden for url: http://161.97.109.65:3000/api/products
Data fetching failed, check errors and retry.


# Data Preprocessing

In [4]:
print(products.head())

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_data(texts, tokenizer, max_length=128):
    return tokenizer(
        texts.tolist(),
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )

# Assuming the data contains a column named 'name'
tokenized_data = tokenize_data(products['name'], tokenizer)
input_ids = tokenized_data['input_ids']
attention_masks = tokenized_data['attention_mask']

# Convert tensors to numpy arrays
input_ids_np = input_ids.numpy()
attention_masks_np = attention_masks.numpy()

# Create dummy binary labels for demonstration purposes
# Example: Assign a label based on a condition, here assuming 'Meja' category as 1 and others as 0
labels = (products['category'] == 'Category1').astype(int).values  # Example binary labels based on category

# Split the data
train_input_ids, val_input_ids, train_labels, val_labels = train_test_split(input_ids_np, labels, test_size=0.2, random_state=42)
train_attention_masks, val_attention_masks = train_test_split(attention_masks_np, test_size=0.2, random_state=42)

# Check the shapes of the splits to ensure correctness
print(f"Train input IDs shape: {train_input_ids.shape}")
print(f"Validation input IDs shape: {val_input_ids.shape}")
print(f"Train attention masks shape: {train_attention_masks.shape}")
print(f"Validation attention masks shape: {val_attention_masks.shape}")
print(f"Train labels shape: {train_labels.shape}")
print(f"Validation labels shape: {val_labels.shape}")

# Combine product name and category for Universal Sentence Encoder
titles = products['name'].tolist()
labels = products['category'].tolist()
combined_text = [f"{label} {title}" for label, title in zip(labels, titles)]

# Load the Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Generate embeddings for the combined text
embeddings = embed(combined_text)

# Check the shape of the embeddings
print(embeddings.shape)

Empty DataFrame
Columns: []
Index: []




KeyError: 'name'

# Create Model

In [7]:
# Define the search function
def semantic_search(query, embeddings, data, top_k):
    # Generate the embedding for the query
    query_embedding = embed([query])

    # Calculate cosine similarities
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # Get the top_k products
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]  # Corrected this line to use argsort
    if len(top_k_indices) == 0:
        return None  # Return None if no results found
    results = data.iloc[top_k_indices]
    return results

# Try Model

In [11]:
# Example usage
query = "kasur"
results = semantic_search(query, embeddings, products, top_k=10)

print(results)

                          _id category     price  \
345  6667ef73b3e75416b2fa7f8c    Kasur  185000.0   
423  6667ef73b3e75416b2fa7fda    Kasur  226320.0   
391  6667ef73b3e75416b2fa7fba    Kasur   61281.0   
324  6667ef73b3e75416b2fa7f77    Kasur   55900.0   
384  6667ef73b3e75416b2fa7fb3    Kasur   64000.0   
340  6667ef73b3e75416b2fa7f87    Kasur   90000.0   
295  6667ef73b3e75416b2fa7f5a    Kasur  498000.0   
356  6667ef73b3e75416b2fa7f97    Kasur  224000.0   
317  6667ef73b3e75416b2fa7f70    Kasur   98000.0   
381  6667ef73b3e75416b2fa7fb0    Kasur  149000.0   

                                                  name  \
345                                       kasur dewasa   
423  Kasur Lantai MOLLORCA 100cm/kasurlipat/kasurva...   
391       Kasur lipat matras kasur lantai (90x170x5cm)   
324               KASUR LIPAT 90x170x5cm...SUPER MURAH   
384              Kasur Lantai Palembang Empuk Termurah   
340  kasur palembang, kasur kapuk, kasur lantai, ka...   
295      Kasur Spring