## Install

In [9]:
pip install --upgrade tensorflow tensorflow-hub pandas numpy scikit-learn python-dotenv

Note: you may need to restart the kernel to use updated packages.


## Import Necessary Libraries

In [10]:
import os
import requests
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from dotenv import load_dotenv

## Load Data

In [11]:
load_dotenv()  # Load environment variables from .env file
jwt_token = os.getenv('JWT_TOKEN')

headers = {'Authorization': f'Bearer {jwt_token}'}

api_urls = {
    'products': 'http://161.97.109.65:3000/api/products'
}

def fetch_data(url, headers):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raises an HTTPError for bad requests
        data = pd.DataFrame(response.json())
        print(f"Data successfully fetched from {url}")
        print(data.head())  # Display the first few rows of the DataFrame
        return data
    except requests.RequestException as e:
        print(f'Failed to fetch data from {url}: {str(e)}')
        return pd.DataFrame()

# Fetch data from APIs
products = fetch_data(api_urls['products'], headers)

# Check if data was fetched successfully
if not products.empty:
    print("All data fetched successfully.")
else:
    print("Data fetching failed, check errors and retry.")
    # Optionally, add logic to halt further processing if data is crucial

Data successfully fetched from http://161.97.109.65:3000/api/products
                        _id    name         category description  price  \
0  6665935c2b9108ea2b463dc2  bababa  asdadaasdadaasd  1234567890     25   

                   sellerId  \
0  666545a42b9108ea2b463d87   

                                        productImage  __v  
0  [https://storage.googleapis.com/kelas-app-test...    0  
All data fetched successfully.


In [12]:
titles = products['name'].tolist()
labels = products['category'].tolist()

In [14]:
# Combine title and label for better semantic understanding
combined_text = [f"{label} {title}" for label, title in zip(labels, titles)]

# Load the Universal Sentence Encoder
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Generate embeddings for the combined descriptions
embeddings = embed(combined_text)

# Handling deprecated warnings by updating to current functions
tf.io.gfile.makedirs instead of tf.gfile.MakeDirs
tf.compat.v2.saved_model.load instead of tf.saved_model.load_v2

SyntaxError: invalid syntax (1653464730.py, line 11)

In [None]:
# Define the search function
def semantic_search(query, embeddings, data, top_k):
    # Generate the embedding for the query
    query_embedding = embed([query])

    # Calculate cosine similarities
    similarities = cosine_similarity(query_embedding, embeddings).flatten()
    # Get the top_k products
    top_k_indices = np.where(similarities > 0.3)[0][-top_k:][::-1]
    if len(top_k_indices) == 0:
        return None  # Return None if no results found
    results = data.iloc[top_k_indices]
    return results

In [None]:
# Example usage
query = "kasur lipat"
results = semantic_search(query, embeddings, datas, top_k=20)

print(results)

NameError: name 'datas' is not defined

In [None]:
query_embedding = embed([query])

In [None]:
similarities = cosine_similarity(query_embedding, embeddings).flatten()
similarities = similarities[similarities>0.3]

In [None]:
similarities

In [None]:
top_k_indices = similarities.argsort()[-5:][::-1]
top_k_indices

In [None]:
pip install tensorflow_text

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from annoy import AnnoyIndex
from sklearn.cluster import KMeans

# Load data
def load_data(file_path):
    data = pd.read_csv(file_path)
    data = data[['Label', 'Title', 'Harga', 'Asal Kota']]  # Selecting the relevant columns
    return data

# Preprocess the data
def preprocess(data):
    data['Title'] = data['Title'].str.lower().str.replace('[^\w\s]', '', regex=True)
    data['Asal Kota'] = data['Asal Kota'].str.lower().str.replace('[^\w\s]', '', regex=True)
    return data

# Load the pre-trained model from TensorFlow Hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Generate embeddings
def get_embeddings(data):
    embeddings = embed(data['Title'])
    return embeddings.numpy()

# Build Annoy index for efficient similarity search
def build_index(embeddings):
    dimension = embeddings.shape[1]  # Dimensions of embeddings
    index = AnnoyIndex(dimension, 'angular')
    for i, vector in enumerate(embeddings):
        index.add_item(i, vector)
    index.build(10)  # More trees, more precision
    return index

# Cluster data for recommendations
def cluster_data(embeddings, num_clusters=10):
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
    return kmeans.labels_

# Search and recommend products
def search_and_recommend(query, index, data, embeddings, kmeans_labels, num_results=10):
    query_embedding = embed([query]).numpy()[0]
    indices = index.get_nns_by_vector(query_embedding, num_results)
    primary_results = data.iloc[indices]

    # Recommend additional items from the same cluster
    if not primary_results.empty:
        cluster_label = primary_results['Cluster'].mode()[0]
        additional_suggestions = data[data['Cluster'] == cluster_label].sample(n=5)
        return pd.concat([primary_results, additional_suggestions]).drop_duplicates()
    return primary_results

In [None]:
# Main function to execute the process
def main(file_path):
    data = load_data(file_path)
    data = preprocess(data)
    embeddings = get_embeddings(data)
    index = build_index(embeddings)
    data['Cluster'] = cluster_data(embeddings)

    # Example search
    query = "kasur kos"
    results = search_and_recommend(query, index, data, embeddings, data['Cluster'])
    return results

In [None]:
# Hasil Rekomendasi
main('/content/shopee.csv')

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Sample data (product titles and corresponding recommendations)
product_titles = data['Title']

recommendations = list(data['Label'])

# Tokenize product titles
tokenizer = Tokenizer()
tokenizer.fit_on_texts(product_titles)
sequences = tokenizer.texts_to_sequences(product_titles)

# Padding sequences
max_sequence_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Convert recommendations to one-hot encoding
recommendation_classes = sorted(list(set(recommendations)))
recommendations_one_hot = np.zeros((len(recommendations), len(recommendation_classes)))
for i, rec in enumerate(recommendations):
    idx = recommendation_classes.index(rec)
    recommendations_one_hot[i, idx] = 1

# Build a simple Sequential model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=16, input_length=max_sequence_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(len(recommendation_classes), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(padded_sequences, recommendations_one_hot, epochs=10, batch_size=1)

# Predict recommendations for new product titles
new_product_titles = ["Meja Belajar Anak"]
new_sequences = tokenizer.texts_to_sequences(new_product_titles)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length, padding='post')
predictions = model.predict(new_padded_sequences)

# Get the top 20 predictions
top_indices = np.argsort(predictions, axis=1)[:, -20:][0]
top_predictions = predictions[0][top_indices]

# Output the top 20 predictions
output_predictions = []
for i, idx in enumerate(top_indices):
    output_predictions.append((recommendation_classes[idx], top_predictions[i]))

print(output_predictions)

In [None]:
predicted_class_index = np.argmax(predictions)

# Use the predicted class index to recommend a product
recommended_product = recommendations[predicted_class_index]

print("Recommended Product:", recommended_product)

In [None]:
# Step 1: Load the CSV File
df = data
df.head()

# Step 2: Preprocess Data
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['Title'].values)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(df['Title'].values)
padded_sequences = pad_sequences(sequences, padding='post')

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['Title'].values, test_size=0.2, random_state=42)

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print("Shape of X_train:", X_train.shape)
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)

# Step 3: Build and Train the Model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, GlobalAveragePooling1D

embedding_dim = 50
max_length = len(padded_sequences[0])

inputs = Input(shape=(max_length,))
x = Embedding(input_dim=5000, output_dim=embedding_dim, input_length=max_length)(inputs)
x = LSTM(64, return_sequences=True)(x)
x = GlobalAveragePooling1D()(x)
x = Dropout(0.5)(x)
outputs = Dense(len(label_encoder.classes_), activation='softmax')(x)

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(
    X_train, y_train_encoded,
    epochs=10,
    validation_data=(X_test, y_test_encoded),
    batch_size=16,
    verbose=1
)

# Step 4: Make Recommendations
def recommend_products(title, model, tokenizer, label_encoder, num_recommendations=3):
    sequence = tokenizer.texts_to_sequences([title])
    padded_sequence = pad_sequences(sequence, maxlen=len(padded_sequences[0]), padding='post')
    predictions = model.predict(padded_sequence).flatten()
    top_indices = predictions.argsort()[-num_recommendations:][::-1]

    recommended_titles = [label_encoder.inverse_transform([i])[0] for i in top_indices]

    return recommended_titles

# Example: Recommend products for a given title
title = "Meja kerja kayu untuk kantor"
recommendations = recommend_products(title, model, tokenizer, label_encoder)
print("Recommended product titles for the title '{}': {}".format(title, recommendations))


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, concatenate
from tensorflow.keras.callbacks import EarlyStopping


# Encode the product titles and labels
title_encoder = LabelEncoder()
data['Title_encoded'] = title_encoder.fit_transform(data['Title'])

label_encoder = LabelEncoder()
data['Label_encoded'] = label_encoder.fit_transform(data['Label'])

# Normalize the product IDs
scaler = MinMaxScaler()
data['Product ID'] = scaler.fit_transform(data['Product ID'].values.reshape(-1, 1))

# Prepare arrays
product_ids = data['Product ID'].values
titles = data['Title_encoded'].values
labels = data['Label_encoded'].values

# Split data into train and test sets
train_product_ids, test_product_ids, train_titles, test_titles, train_labels, test_labels = train_test_split(product_ids, titles, labels, test_size=0.2, random_state=42)

# Define inputs
input_product = Input(shape=(1,), name='Product')
input_title = Input(shape=(1,), name='Title')
input_label = Input(shape=(1,), name='Label')

# Define embedding layers
embedding_product = Embedding(input_dim=len(data['Product ID']) + 1, output_dim=50)(input_product)
embedding_title = Embedding(input_dim=len(data['Title_encoded']) + 1, output_dim=50)(input_title)
embedding_label = Embedding(input_dim=len(data['Label_encoded']) + 1, output_dim=50)(input_label)

# Flatten embedding layers
flat_product = Flatten()(embedding_product)
flat_title = Flatten()(embedding_title)
flat_label = Flatten()(embedding_label)

# Concatenate embeddings
concat = concatenate([flat_product, flat_title, flat_label])

# Add dense layers
dense_1 = Dense(256, activation='relu')(concat)
dropout_1 = Dropout(0.5)(dense_1)
dense_2 = Dense(128, activation='relu')(dropout_1)
dropout_2 = Dropout(0.5)(dense_2)
output = Dense(1, activation='linear')(dropout_2)

# Define the model
model = Model(inputs=[input_product, input_title, input_label], outputs=output)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# Reshape the data to fit the model input
train_product_ids = train_product_ids.reshape(-1, 1)
test_product_ids = test_product_ids.reshape(-1, 1)
train_titles = train_titles.reshape(-1, 1)
test_titles = test_titles.reshape(-1, 1)
train_labels = train_labels.reshape(-1, 1)
test_labels = test_labels.reshape(-1, 1)

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
model.fit([train_product_ids, train_titles, train_labels], train_titles, epochs=100, batch_size=64, verbose=1, validation_data=([test_product_ids, test_titles, test_labels], test_titles), callbacks=[early_stopping])

# Function to get product recommendations based on model predictions
def get_recommendations(product_id, title, label, model, top_n=10):
    # Create an array of product IDs to predict
    all_product_ids = np.array(product_ids).reshape(-1, 1)
    all_titles = np.array(titles).reshape(-1, 1)
    all_labels = np.array(labels).reshape(-1, 1)

    # Predict the titles for all products
    predicted_titles = model.predict([all_product_ids, all_titles, all_labels]).flatten()

    # Get the indices of the top N predictions
    top_indices = np.argsort(predicted_titles)[-top_n:]

    # Return the top N product IDs
    return product_ids[top_indices]

# Function to search for products and get recommendations
def search_and_recommend(search_term, data, model, top_n=10):
    # Filter the products based on the search term
    filtered_data = data[data['Title'].str.contains(search_term, case=False, na=False)]

    if filtered_data.empty:
        return pd.DataFrame(columns=['Recommended Product Titles'])

    # Use the first product ID from the filtered results for recommendations
    sample_product_id = filtered_data['Product ID'].values[0]
    sample_title = filtered_data['Title_encoded'].values[0]
    sample_label = filtered_data['Label_encoded'].values[0]
    recommended_product_ids = get_recommendations(sample_product_id, sample_title, sample_label, model, top_n=top_n)

    # Get the recommended product titles
    recommended_products_titles = data[data['Product ID'].isin(recommended_product_ids)]['Title']

    # Ensure the recommended products are similar to the search term
    recommended_products_titles = recommended_products_titles[recommended_products_titles.str.contains(search_term, case=False, na=False)]

    # Create a DataFrame to display the recommended product titles as a column
    recommendations_df = pd.DataFrame(recommended_products_titles.tolist(), columns=['Recommended Product Titles'])

    return recommendations_df

# Example usage: search for "Meja Belajar" and get recommendations
search_term = "Meja Belajar"
recommendations_df = search_and_recommend(search_term, data, model)

# Display the recommendations
print(recommendations_df)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Define a function to clean the data
def clean_data(df):
    df = df.dropna(subset=['Title'])  # Drop rows with missing titles
    df = df.reset_index(drop=True)    # Reset index after dropping rows
    return df

# Clean the data
data = clean_data(data)

# Vectorize the product titles using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Title'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get product recommendations based on title similarity
def get_recommendations(title, cosine_sim=cosine_sim):
    # Find the index of the product that matches the title
    idx = data[data['Title'].str.contains(title, case=False, na=False)].index[0]

    # Get the pairwise similarity scores of all products with the specified product
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the products based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top 10 most similar products
    sim_scores = sim_scores[1:11]

    # Get the product indices
    product_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar product titles
    return data['Title'].iloc[product_indices]

# Example usage: search for "Meja Belajar" and get recommendations
search_term = "Meja kayu"
recommendations = get_recommendations(search_term)

# Display the recommendations
print("Recommended Product Titles:")
for i, title in enumerate(recommendations):
    print(f"{i + 1}. {title}")


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Dense, Input
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity

# Prepare data
titles = data['Title'].tolist()
labels = data['Label'].tolist()

# Combine title and label for better semantic understanding
combined_text = [f"{label} {title}" for label, title in zip(labels, titles)]

# Tokenize the combined text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_text)
sequences = tokenizer.texts_to_sequences(combined_text)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Pad sequences
max_sequence_length = max(len(seq) for seq in sequences)
data_padded = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Define the model
embedding_dim = 50

model = Sequential()
model.add(Input(shape=(max_sequence_length,)))
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(embedding_dim, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error')
model.fit
model.summary()

# Generate embeddings for the product descriptions
embeddings = model.predict(data_padded)

# Define the search function
def semantic_search(query, embeddings, data, tokenizer, model, max_sequence_length, top_k=10):
    # Tokenize and pad the query
    query_seq = tokenizer.texts_to_sequences([query])
    query_padded = pad_sequences(query_seq, maxlen=max_sequence_length, padding='post')

    # Generate the embedding for the query
    query_embedding = model.predict(query_padded)

    # Calculate cosine similarities
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # Get the top_k products
    top_k_indices = np.argsort(similarities)[-top_k:][::-1]
    results = data.iloc[top_k_indices]
    return results



In [None]:
# Example usage
query = "meja"
results = semantic_search(query, embeddings, data, tokenizer, model, max_sequence_length, top_k=20)

print(results)