## Install

In [1]:
pip install tensorflow_hub transformers tensorflow python-dotenv pandas numpy scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Import Necessary Libraries

In [2]:
import os
import requests
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv
from transformers import BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import json




  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [3]:
# Read the API credential from the environment (populated from a local .env file).
load_dotenv()
jwt_token = os.getenv('JWT_TOKEN')

# Only attach the Authorization header when a token is actually configured;
# otherwise the f-string would send the literal value "Bearer None".
if jwt_token:
    headers = {'Authorization': f'Bearer {jwt_token}'}
else:
    headers = {}
    print("Warning: JWT_TOKEN is not set; requests will be sent without an Authorization header.")

# Endpoints queried by this notebook, keyed by dataset name.
# NOTE(review): the URL uses plain http, so the bearer token travels unencrypted -
# confirm whether an https endpoint is available.
api_urls = {
    'products': 'http://161.97.109.65:3000/api/products'
}

def fetch_data(url, headers, timeout=30):
    """Fetch a JSON payload with a GET request and return it as a DataFrame.

    Args:
        url: Endpoint to request.
        headers: HTTP headers sent with the request.
        timeout: Value passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the notebook indefinitely.

    Returns:
        A DataFrame built from the decoded JSON body. An empty DataFrame is
        returned (and the error printed) when the request fails, the server
        answers with an error status, or the body cannot be converted.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad requests
        data = pd.DataFrame(response.json())
    except (requests.RequestException, ValueError) as e:
        # ValueError covers bodies that are not valid JSON or that pandas
        # cannot arrange into a table.
        print(f'Failed to fetch data from {url}: {str(e)}')
        return pd.DataFrame()

    print(f"Data successfully fetched from {url}")
    print(data.head())  # Display the first few rows of the DataFrame
    return data

# Pull the product catalogue from the API
products = fetch_data(api_urls['products'], headers)

# Report whether the fetch produced any rows.
# Optionally, add logic to halt further processing if data is crucial.
print(
    "Data fetching failed, check errors and retry."
    if products.empty
    else "All data fetched successfully."
)

Data successfully fetched from http://161.97.109.65:3000/api/products
                        _id category     price  \
0  6667ef73b3e75416b2fa7e33     Meja  155000.0   
1  6667ef73b3e75416b2fa7e34     Meja  124000.0   
2  6667ef73b3e75416b2fa7e35     Meja  107000.0   
3  6667ef73b3e75416b2fa7e36     Meja   99500.0   
4  6667ef73b3e75416b2fa7e37     Meja  446000.0   

                                                name  \
0  Damaindah Meja Belajar Kayu Set Kursi / Meja B...   
1  Homedoki Meja / Meja Makan / Meja Komputer / M...   
2  Sakula Meja kantor meja kerja Meja Komputer Pe...   
3  Meja Portable Stand Laptop Meja Laptop Standin...   
4  PiPi Furniture Meja Gaming / Meja komputer / M...   

                   sellerId  \
0  6665e9847aa0dfec0ad43b26   
1  6665e9847aa0dfec0ad43b26   
2  6665e9847aa0dfec0ad43b26   
3  6665e9847aa0dfec0ad43b26   
4  6665e9847aa0dfec0ad43b26   

                                        productImage  __v description  
0  [https://storage.googleapis.co

## Data Preprocessing

In [4]:
# Category treated as the positive class when building the binary labels below.
# NOTE(review): 'Category1' is a placeholder; the fetched rows shown above carry
# categories such as 'Meja', so every label may end up 0 - confirm the intended target.
TARGET_CATEGORY = 'Category1'

# Prepare text data for embedding: prefix each product name with its category
titles = products['name'].tolist()
categories = products['category'].tolist()
combined_text = [f"{category} {title}" for category, title in zip(categories, titles)]

# Load the Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Generate embeddings for the combined category + name strings
embeddings = embed(combined_text)

# Convert embeddings to numpy arrays
embeddings_np = embeddings.numpy()

# Binary labels: 1 where the product belongs to TARGET_CATEGORY, else 0
labels = (products['category'] == TARGET_CATEGORY).astype(int).values

# A single-class label vector makes the downstream classifier trivially "accurate".
if np.unique(labels).size < 2:
    print(
        f"Warning: all labels are identical for target category {TARGET_CATEGORY!r}; "
        "the classifier trained on them will not be meaningful."
    )

# Split the data
train_embeddings, val_embeddings, train_labels, val_labels = train_test_split(
    embeddings_np, labels, test_size=0.2, random_state=42
)

# Check the shapes of the splits to ensure correctness
print(f"Train embeddings shape: {train_embeddings.shape}")
print(f"Validation embeddings shape: {val_embeddings.shape}")
print(f"Train labels shape: {train_labels.shape}")
print(f"Validation labels shape: {val_labels.shape}")













Train embeddings shape: (1082, 512)
Validation embeddings shape: (271, 512)
Train labels shape: (1082,)
Validation labels shape: (271,)


## Create Model

In [5]:
# Create TensorFlow datasets from the embeddings
def create_tf_dataset(embeddings, labels, batch_size=32, shuffle=True):
    """Wrap embeddings and labels in a batched ``tf.data.Dataset``.

    Args:
        embeddings: Feature rows; must support ``len()``.
        labels: One label per embedding row.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle examples before batching.

    Returns:
        A dataset yielding ``(embeddings, labels)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((embeddings, labels))
    if shuffle:
        # A buffer covering the whole dataset gives a uniform shuffle; a fixed
        # buffer smaller than the data only shuffles within a sliding window.
        dataset = dataset.shuffle(buffer_size=max(len(embeddings), 1))
    return dataset.batch(batch_size)

train_dataset = create_tf_dataset(train_embeddings, train_labels)
val_dataset = create_tf_dataset(val_embeddings, val_labels)

# Define the model: three Dense -> Dropout -> BatchNormalization stages followed
# by a single sigmoid unit. The input is declared with an explicit Input object
# instead of `input_shape` on the first Dense layer, which Keras warns about.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(train_embeddings.shape[1],)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1, activation='sigmoid')  # For binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_dataset, validation_data=val_dataset, epochs=10)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.5327 - loss: 0.8380 - val_accuracy: 1.0000 - val_loss: 0.5605
Epoch 2/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6598 - loss: 0.6437 - val_accuracy: 1.0000 - val_loss: 0.4394
Epoch 3/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7575 - loss: 0.5118 - val_accuracy: 1.0000 - val_loss: 0.3383
Epoch 4/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8573 - loss: 0.3946 - val_accuracy: 1.0000 - val_loss: 0.2394
Epoch 5/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9227 - loss: 0.2980 - val_accuracy: 1.0000 - val_loss: 0.1773
Epoch 6/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9448 - loss: 0.2323 - val_accuracy: 1.0000 - val_loss: 0.1217
Epoch 7/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━

In [6]:
def semantic_search(query, embed_model, trained_model, embeddings, data, top_k=10):
    """Return the rows of ``data`` whose embeddings are most similar to ``query``.

    Args:
        query: Search text.
        embed_model: Callable that takes a list of strings and returns an
            object exposing ``.numpy()``.
        trained_model: Accepted for interface compatibility; not used, since
            the ranking relies only on cosine similarity between embeddings.
        embeddings: Embedding matrix with one row per row of ``data``.
        data: DataFrame to select results from.
        top_k: Maximum number of rows to return; values <= 0 return no rows.

    Returns:
        Up to ``top_k`` rows of ``data``, ordered from most to least similar.
    """
    # Generate the embedding for the query using the embed_model
    query_embedding = embed_model([query]).numpy()

    # Calculate cosine similarities
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # Rank from most to least similar, then keep the first top_k positions.
    # Slicing after the reversal makes top_k=0 yield nothing rather than everything.
    top_k_indices = np.argsort(similarities)[::-1][:max(top_k, 0)]

    # Select from the DataFrame passed in, not from a notebook-level global.
    return data.iloc[top_k_indices]

## Try Model

In [11]:
# Run a sample query against the product embeddings and show the ten best matches
query = "rice cooker miyako"
results = semantic_search(
    query=query,
    embed_model=embed,
    trained_model=model,
    embeddings=embeddings_np,
    data=products,
    top_k=10,
)

print(results)

                          _id category     price  \
144  6667ef73b3e75416b2fa7ec3     Meja  318000.0   
209  6667ef73b3e75416b2fa7f04     Meja  200000.0   
176  6667ef73b3e75416b2fa7ee3     Meja  240000.0   
53   6667ef73b3e75416b2fa7e68     Meja  119000.0   
184  6667ef73b3e75416b2fa7eeb     Meja  436000.0   
197  6667ef73b3e75416b2fa7ef8     Meja  107000.0   
168  6667ef73b3e75416b2fa7edb     Meja  204000.0   
230  6667ef73b3e75416b2fa7f19     Meja  195500.0   
266  6667ef73b3e75416b2fa7f3d     Meja   58000.0   
94   6667ef73b3e75416b2fa7e91     Meja  336000.0   

                                                  name  \
144                         MEJA MINIMALIS RANGKA BESI   
209  120CM Meja Belajar kayu Meja Tulis Minimalis M...   
176                     Meja Lipat Susun 80Ã—40Ã—75 cm   
53                         Meja komputer lesehan LS-01   
184  e-tife Meja Kantor/ Meja Kerja /Meja Komputer/...   
197                  Meja Kerja/meja belajar minimalis   
168  Meja Belajar Min

In [8]:
# Save model architecture as JSON
model_path = os.path.join('../config', 'semanticmodel_config.json')
# Create the target directory first: open() does not create missing parent folders.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model_json = model.to_json()
with open(model_path, "w") as json_file:
    json_file.write(model_json)

# Save model weights in HDF5 format
weights_path = os.path.join('../weights', 'semanticmodel.weights.h5')
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
model.save_weights(weights_path)

In [9]:
# Load model & weights
model_path = os.path.join('../config', 'semanticmodel_config.json')
with open(model_path, "r") as json_file:
    loaded_model_json = json_file.read()

# Rebuild the architecture from JSON, then restore the saved weights into it
model1 = tf.keras.models.model_from_json(loaded_model_json)
model1.load_weights(os.path.join('../weights', 'semanticmodel.weights.h5'))

# Compile the loaded model (model1), not the original in-memory `model`
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])