## Install

In [1]:
pip install tensorflow_hub transformers tensorflow python-dotenv pandas numpy scikit-learn

Collecting tensorflow_hub
  Downloading tensorflow_hub-0.16.1-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting tf-keras>=2.14.1 (from tensorflow_hub)
  Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.3-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Col

## Import Necessary Libraries

In [1]:
import os
import requests
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from transformers import BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import json

2024-06-14 17:34:36.700177: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-14 17:34:36.736534: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-14 17:34:36.737286: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
load_dotenv()  # Load environment variables from .env file
jwt_token = os.getenv('JWT_TOKEN')

# os.getenv returns None when JWT_TOKEN is unset; formatting that into the header
# would send the literal string "Bearer None". Report it and send no
# Authorization header instead, so the failure is visible at its source.
if not jwt_token:
    print("Warning: JWT_TOKEN is not set; requests will be sent without an Authorization header.")
headers = {'Authorization': f'Bearer {jwt_token}'} if jwt_token else {}

# Endpoints queried by this notebook, keyed by dataset name.
# NOTE(review): this URL is plain HTTP, so the bearer token is sent unencrypted —
# confirm whether an HTTPS endpoint is available.
api_urls = {
    'products': 'http://161.97.109.65:3000/api/products'
}

def fetch_data(url, headers, timeout=30):
    """Download a JSON payload from ``url`` and return it as a DataFrame.

    Args:
        url: Endpoint to send the GET request to.
        headers: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        A DataFrame built from the decoded JSON body. An empty DataFrame is
        returned (and the error printed) when the request fails, the status
        code signals an error, the body is not valid JSON, or the payload
        cannot be turned into a DataFrame.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad requests
        # Both JSON decoding and DataFrame construction can raise ValueError
        # (e.g. a non-JSON body, or a dict of scalars such as an error object).
        data = pd.DataFrame(response.json())
    except (requests.RequestException, ValueError) as e:
        print(f'Failed to fetch data from {url}: {str(e)}')
        return pd.DataFrame()

    print(f"Data successfully fetched from {url}")
    print(data.head())  # Display the first few rows of the DataFrame
    return data

# Pull the product catalogue from the API
products = fetch_data(api_urls['products'], headers)

# Report whether the fetch produced any rows
if products.empty:
    print("Data fetching failed, check errors and retry.")
    # Optionally, add logic to halt further processing if data is crucial
else:
    print("All data fetched successfully.")

Data successfully fetched from http://161.97.109.65:3000/api/products
                        _id category     price  \
0  6667ef73b3e75416b2fa7e33     Meja  155000.0   
1  6667ef73b3e75416b2fa7e34     Meja  124000.0   
2  6667ef73b3e75416b2fa7e35     Meja  107000.0   
3  6667ef73b3e75416b2fa7e36     Meja   99500.0   
4  6667ef73b3e75416b2fa7e37     Meja  446000.0   

                                                name  \
0  Damaindah Meja Belajar Kayu Set Kursi / Meja B...   
1  Homedoki Meja / Meja Makan / Meja Komputer / M...   
2  Sakula Meja kantor meja kerja Meja Komputer Pe...   
3  Meja Portable Stand Laptop Meja Laptop Standin...   
4  PiPi Furniture Meja Gaming / Meja komputer / M...   

                   sellerId  \
0  6665e9847aa0dfec0ad43b26   
1  6665e9847aa0dfec0ad43b26   
2  6665e9847aa0dfec0ad43b26   
3  6665e9847aa0dfec0ad43b26   
4  6665e9847aa0dfec0ad43b26   

                                        productImage  __v description  
0  [https://storage.googleapis.co

## Data Preprocessing

In [5]:
# Prepare text data for embedding: each product becomes "<category> <name>"
titles = products['name'].tolist()
categories = products['category'].tolist()
combined_text = [f"{category} {title}" for category, title in zip(categories, titles)]

# Load the Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Generate embeddings for the combined category + name text
embeddings = embed(combined_text)

# Convert embeddings to numpy arrays
embeddings_np = embeddings.numpy()

# Binary target for demonstration purposes: 1 when the product's category equals
# TARGET_CATEGORY, otherwise 0. 'Category1' is a placeholder value; replace it
# with a category that actually occurs in products['category'].
TARGET_CATEGORY = 'Category1'
labels = (products['category'] == TARGET_CATEGORY).astype(int).values

# A target with a single class gives the classifier nothing to learn, and the
# reported accuracy would be meaningless — make that visible instead of silent.
if np.unique(labels).size < 2:
    print(
        f"Warning: every label is identical for TARGET_CATEGORY={TARGET_CATEGORY!r}; "
        "the binary classifier cannot learn from this target."
    )

# Split the data (fixed random_state keeps the split reproducible)
train_embeddings, val_embeddings, train_labels, val_labels = train_test_split(
    embeddings_np, labels, test_size=0.2, random_state=42
)

# Check the shapes of the splits to ensure correctness
print(f"Train embeddings shape: {train_embeddings.shape}")
print(f"Validation embeddings shape: {val_embeddings.shape}")
print(f"Train labels shape: {train_labels.shape}")
print(f"Validation labels shape: {val_labels.shape}")

2024-06-14 17:38:07.883762: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 34133760 exceeds 10% of free system memory.
2024-06-14 17:38:07.895569: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 34133760 exceeds 10% of free system memory.
2024-06-14 17:38:07.914072: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 34133760 exceeds 10% of free system memory.
2024-06-14 17:38:07.935316: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 34133760 exceeds 10% of free system memory.
2024-06-14 17:38:07.956591: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 34133760 exceeds 10% of free system memory.
2024-06-14 17:38:10.486529: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string
	 [[{{node input

Train embeddings shape: (1082, 512)
Validation embeddings shape: (271, 512)
Train labels shape: (1082,)
Validation labels shape: (271,)


## Create Model

In [6]:
# Create TensorFlow datasets from the embeddings
def create_tf_dataset(embeddings, labels, batch_size=32, shuffle=True, buffer_size=1024):
    """Wrap embeddings and labels in a batched ``tf.data.Dataset``.

    Args:
        embeddings: Feature array; sliced along its first axis.
        labels: Target array; sliced along its first axis together with
            ``embeddings``.
        batch_size: Number of examples per batch. Defaults to 32.
        shuffle: Whether to shuffle examples before batching. Defaults to True.
        buffer_size: Size of the shuffle buffer. Defaults to 1024; pass the
            number of examples for a full shuffle of a larger dataset.

    Returns:
        A dataset yielding ``(embeddings_batch, labels_batch)`` pairs.
    """
    dataset = tf.data.Dataset.from_tensor_slices((embeddings, labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=buffer_size)
    return dataset.batch(batch_size)

# Wrap the train / validation splits as batched tf.data pipelines
train_dataset = create_tf_dataset(train_embeddings, train_labels)
val_dataset = create_tf_dataset(val_embeddings, val_labels)

# Define the model: three Dense -> Dropout -> BatchNormalization stages
# (256, 128, 64 units) followed by a single sigmoid output unit.
embedding_dim = train_embeddings.shape[1]
layer_stack = []
for units in (256, 128, 64):
    if not layer_stack:
        # Only the first layer declares the input shape
        layer_stack.append(
            tf.keras.layers.Dense(units, activation='relu', input_shape=(embedding_dim,))
        )
    else:
        layer_stack.append(tf.keras.layers.Dense(units, activation='relu'))
    layer_stack.append(tf.keras.layers.Dropout(0.5))
    layer_stack.append(tf.keras.layers.BatchNormalization())
layer_stack.append(tf.keras.layers.Dense(1, activation='sigmoid'))  # For binary classification

model = tf.keras.Sequential(layer_stack)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model for 10 epochs, evaluating on the validation split each epoch
history = model.fit(train_dataset, epochs=10, validation_data=val_dataset)

Epoch 1/10


2024-06-14 17:38:36.570175: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [1082]
	 [[{{node Placeholder/_1}}]]
2024-06-14 17:38:36.570429: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [1082]
	 [[{{node Placeholder/_1}}]]


Epoch 2/10

2024-06-14 17:38:38.346449: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [271]
	 [[{{node Placeholder/_1}}]]
2024-06-14 17:38:38.346745: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype int64 and shape [271]
	 [[{{node Placeholder/_1}}]]


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
def semantic_search(query, embed_model, trained_model, embeddings, data, top_k=10):
    """Return the rows of ``data`` whose embeddings are most similar to ``query``.

    Args:
        query: Free-text search string.
        embed_model: Callable that takes a list of strings and returns an
            object exposing ``.numpy()`` (the query embedding).
        trained_model: Not used by the search; kept so existing callers that
            pass it keep working.
        embeddings: Matrix of candidate embeddings, one row per candidate.
            Assumed to be aligned positionally with the rows of ``data`` —
            TODO confirm at the call site.
        data: DataFrame the results are selected from (by position).
        top_k: Maximum number of rows to return.

    Returns:
        Up to ``top_k`` rows of ``data``, ordered from most to least similar
        by cosine similarity. An empty frame when ``top_k`` is not positive
        or ``data`` has no rows.
    """
    # A slice of [-0:] would select every row, so handle non-positive top_k explicitly.
    if top_k <= 0 or len(data) == 0:
        return data.iloc[0:0]

    # Generate the embedding for the query using the embed_model
    query_embedding = embed_model([query]).numpy()

    # Calculate cosine similarities between the query and every candidate
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # Highest similarity first, truncated to top_k
    top_k_indices = np.argsort(similarities)[::-1][:top_k]

    # Select from the `data` argument rather than a notebook-level global
    return data.iloc[top_k_indices]

## Try Model

In [14]:
# Example usage: retrieve the ten products closest to a free-text query
query = "rice cooker miyako"
results = semantic_search(
    query,
    embed_model=embed,
    trained_model=model,
    embeddings=embeddings_np,
    data=products,
    top_k=10,
)

print(results)

                           _id     category     price  \
862   6667ef73b3e75416b2fa8191  Rice Cooker   40000.0   
1006  6667ef76b3e75416b2fa8221  Rice Cooker  442000.0   
875   6667ef73b3e75416b2fa819e  Rice Cooker  258900.0   
948   6667ef73b3e75416b2fa81e7  Rice Cooker  209900.0   
1073  6667ef76b3e75416b2fa8264  Rice Cooker    6500.0   
1029  6667ef76b3e75416b2fa8238  Rice Cooker  233000.0   
961   6667ef73b3e75416b2fa81f4  Rice Cooker  210000.0   
866   6667ef73b3e75416b2fa8195  Rice Cooker  169900.0   
1005  6667ef76b3e75416b2fa8220  Rice Cooker  233000.0   
955   6667ef73b3e75416b2fa81ee  Rice Cooker  229292.0   

                                                 name  \
862                             Panci magiccom Miyako   
1006                           Mito - Rice Cooker R11   
875                    Miyako Rice Cooker MCM-507 SBC   
948          Miyako Rice Cooker 0,6L type 606A / 606B   
1073                 sendok nasi megicom /rice cooker   
1029              miyako mcm 5

In [15]:
# Persist the trained model in two parts: learned weights (HDF5) and
# architecture description (JSON)
model.save_weights("semanticmodel_weights.h5")

model_json = model.to_json()
with open("semanticmodel_config.json", "w") as config_file:
    config_file.write(model_json)

In [16]:
# Load model & weights: rebuild the architecture from the saved JSON config,
# then restore the saved weights into it
with open("semanticmodel_config.json", "r") as json_file:
    loaded_model_json = json_file.read()

model1 = tf.keras.models.model_from_json(loaded_model_json)
model1.load_weights("semanticmodel_weights.h5")

# Compile the loaded model (model1). Compiling `model` here would leave the
# restored model uncompiled and re-compile the in-memory one instead.
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])