<a href="https://colab.research.google.com/github/himanshudas13/semantic-meaning-classifier/blob/main/Quora_semantic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
organizations_quora_question_pairs_dataset_path = kagglehub.dataset_download('organizations/quora/question-pairs-dataset')
thanakomsn_glove6b300dtxt_path = kagglehub.dataset_download('thanakomsn/glove6b300dtxt')
numberninja13_semantic_similarity_keras_default_1_path = kagglehub.model_download('numberninja13/semantic-similarity/Keras/default/1')

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install  nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
STOP_WORDS = stopwords.words("english")

In [None]:
df = pd.read_csv('/kaggle/input/question-pairs-dataset/questions.csv')
df.shape

(404351, 6)

In [None]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
df.value_counts('is_duplicate')

is_duplicate
0    255045
1    149306
Name: count, dtype: int64

In [None]:
# The classes are imbalanced (255,045 non-duplicates vs 149,306 duplicates), so
# balance them by undersampling the majority class down to the minority size.
class_0 = df[df['is_duplicate'] == 0 ]   ## majority class (not duplicates)
class_1 = df[df['is_duplicate'] == 1 ]   ## minority class (duplicates)


# Draw as many majority rows as there are minority rows; the fixed seed keeps the draw reproducible.
class_0_undersampled = class_0.sample(n = len(class_1) , random_state = 42)

# Stack both classes, then shuffle the rows (sample(frac=1) returns every row in random order).
sampled_df = pd.concat([class_0_undersampled, class_1], axis = 0).sample(frac=1, random_state=42)

sampled_df



Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
139827,139827,276792,276793,What is the importance of smart objects in Pho...,What is a smart object in Photoshop?,1
161175,161175,318664,318665,How do I check who blocked me on Instagram?,Is there a way to find out who blocked me on I...,1
227921,227921,449170,449171,How essential is Brijmohan Agrawal's role in C...,How important is Brijmohan Agrawal's role in C...,1
72941,72941,144954,144955,Why do some completely straight forward questi...,Why are some questions on Quora flagged as nee...,1
378149,378149,739656,739657,What are the features of java8?,Which course is suitable for MBA either HR and...,0
...,...,...,...,...,...,...
120436,120436,238672,238673,Do Ray-Bans come with a case?,Where can I buy Ray-Ban 4195s at a cheap price?,0
295641,295641,580738,580739,What qualification is required for work in SEBI?,What are the qualifications required to get a ...,1
231035,231035,455253,455254,How can we design a building which uses day li...,Can I choose 安菁莉 or 安荧莉 as my Chinese name?,0
50379,50379,100280,100281,Who are some bisexual/lesbian top models?,I have come out as bisexual but I think I migh...,0


In [None]:
##data pre processing

# Contraction -> expansion lookup. Built once when the cell runs instead of on
# every call, because preprocess() is applied to every question in the sample.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}


def preprocess(q):
    """Normalise one raw question string for tokenization.

    Steps, in order: lower-case and trim; spell out a few symbols
    (%, $, ₹, €, @); drop the '[math]' marker; abbreviate round thousands /
    millions / billions as k / m / b; expand English contractions; strip HTML
    markup; replace every non-word character with a space.

    Parameters
    ----------
    q : str or any
        The question text. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as "no text". Any other non-string value is
        converted with ``str()``.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing or blank input.
    """
    # A missing cell must not be stringified into the literal tokens "nan"/"none".
    if q is None or (isinstance(q, float) and q != q):
        return ''

    # Lowercase and remove leading and trailing spaces
    q = str(q).lower().strip()
    if not q:
        return ''

    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    # NOTE(review): only the opening '[math]' marker is removed; the closing
    # '[/math]' survives until the punctuation step and leaves a stray 'math'
    # token. Confirm whether that is intended.
    q = q.replace('[math]', '')

    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    # Largest magnitude first, so e.g. nine trailing zeros become 'b' rather than 'm' + '000'.
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Decontracting words: exact whole-token matches first ...
    q = ' '.join(_CONTRACTIONS.get(word, word) for word in q.split())
    # ... then generic suffixes for anything the table did not catch
    # (e.g. a contraction with punctuation attached to it).
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Removing HTML tags. The parser is named explicitly so the result does not
    # depend on which optional parser libraries are installed.
    q = BeautifulSoup(q, "html.parser").get_text()

    # Remove punctuations (raw string: '\W' is not a valid str escape)
    q = re.sub(r'\W', ' ', q).strip()

    return q

In [None]:
preprocess("I've already! wasn't <b>done</b>?")

'i have already  was not done'

In [None]:
sampled_df['question1'] = sampled_df['question1'].apply(preprocess)
sampled_df['question2'] = sampled_df['question2'].apply(preprocess)

In [None]:
sampled_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
139827,139827,276792,276793,what is the importance of smart objects in pho...,what is a smart object in photoshop,1
161175,161175,318664,318665,how do i check who blocked me on instagram,is there a way to find out who blocked me on i...,1
227921,227921,449170,449171,how essential is brijmohan agrawal s role in c...,how important is brijmohan agrawal s role in c...,1
72941,72941,144954,144955,why do some completely straight forward questi...,why are some questions on quora flagged as nee...,1
378149,378149,739656,739657,what are the features of java8,which course is suitable for mba either hr and...,0


In [None]:
# Sanity checks: per-column missing-value counts for the balanced sample and the
# number of fully duplicated rows in the raw frame. Only the last expression of
# a cell is rendered, so the null counts are printed explicitly instead of
# being computed and silently discarded.
print(sampled_df.isnull().sum())
df.duplicated().sum()

0

In [None]:
#removing stopwords from our dataset

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# English stop-word set, filled on first use and reused for every later call.
_STOP_WORDS_CACHE = {}


def remove_stopwords(text):
    """Remove English stop words from ``text``.

    The text is split with NLTK's ``word_tokenize`` and every token found in
    NLTK's English stop-word list is dropped. Tokens are compared as-is (no
    case folding happens here), so the input is expected to be lower-cased
    already.

    Parameters
    ----------
    text : str
        The text to filter.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    stop_words = _STOP_WORDS_CACHE.get("english")
    if stop_words is None:
        # Build the set once; this function is applied row by row to two full
        # columns, and the list itself never changes between calls.
        stop_words = _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))

    return " ".join(token for token in word_tokenize(text) if token not in stop_words)


sampled_df['question1'] = sampled_df["question1"].apply(remove_stopwords)
sampled_df['question2'] = sampled_df["question2"].apply(remove_stopwords)


In [None]:
sampled_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
139827,139827,276792,276793,importance smart objects photoshop,smart object photoshop,1
161175,161175,318664,318665,check blocked instagram,way find blocked instagram,1
227921,227921,449170,449171,essential brijmohan agrawal role chhattisgarh ...,important brijmohan agrawal role chhattisgarh ...,1
72941,72941,144954,144955,completely straight forward questions get mark...,questions quora flagged needing improvement ne...,1
378149,378149,739656,739657,features java8,course suitable mba either hr marketing,0


In [None]:
#dropping the unwanted columns
final_df = sampled_df.drop(columns = ['id' , 'qid1' , 'qid2'])


In [None]:
final_df.head()

Unnamed: 0,question1,question2,is_duplicate
139827,importance smart objects photoshop,smart object photoshop,1
161175,check blocked instagram,way find blocked instagram,1
227921,essential brijmohan agrawal role chhattisgarh ...,important brijmohan agrawal role chhattisgarh ...,1
72941,completely straight forward questions get mark...,questions quora flagged needing improvement ne...,1
378149,features java8,course suitable mba either hr marketing,0


In [None]:
# Model inputs are the two cleaned question columns; the target is the duplicate flag.
x = final_df[['question1' , 'question2']]
y = final_df[['is_duplicate']]


from sklearn.model_selection import train_test_split
# Hold out 20% of the balanced sample; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Attention, GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam

In [None]:
def _normalize_questions(frame):
    """Return a copy of ``frame`` whose question columns are lower-cased strings.

    Any value that is not a ``str`` (for example a missing cell) becomes ``""``.
    """
    def _clean(value):
        return value.lower() if isinstance(value, str) else ""

    return frame.assign(
        question1=frame["question1"].map(_clean),
        question2=frame["question2"].map(_clean),
    )


# The train/test split was already taken from `x`, so cleaning `x` alone never
# reaches the frames that get tokenized; normalise all three. `assign` returns
# new frames instead of writing into a slice of `final_df`.
x = _normalize_questions(x)
X_train = _normalize_questions(X_train)
X_test = _normalize_questions(X_test)

**MODEL BUILDING**

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("thanakomsn/glove6b300dtxt")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/glove6b300dtxt


In [None]:
# Location and dimensionality of the pre-trained GloVe vectors.
embedding_dim = 300
glove_file = "/kaggle/input/glove6b300dtxt/glove.6B.300d.txt"

# Each line of the file is "<token> <coef_1> ... <coef_n>"; map token -> float32 vector.
embeddings_index = {}
with open(glove_file, encoding="utf-8") as glove_handle:
    for entry in glove_handle:
        fields = entry.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype="float32")


# Build the vocabulary from the training questions first, then the test questions.
tokenizer = Tokenizer()
for frame in (X_train, X_test):
    tokenizer.fit_on_texts(frame["question1"].tolist() + frame["question2"].tolist())
word_index = tokenizer.word_index





In [None]:
# Encode both question columns of each split as lists of integer token ids,
# using the tokenizer fitted above (question1 first, then question2).
q1_sequences_train, q2_sequences_train = (
    tokenizer.texts_to_sequences(X_train[column]) for column in ("question1", "question2")
)
q1_sequences_test, q2_sequences_test = (
    tokenizer.texts_to_sequences(X_test[column]) for column in ("question1", "question2")
)

In [None]:
print(len(q1_sequences_test))
print(len(y_test))

59723
59723


In [None]:
# Longest tokenized question over both columns of both splits. All four
# sequence sets have to be measured: padding to max_len truncates anything longer.
max_len1 = max(len(seq) for seq in q1_sequences_train)
max_len2 = max(len(seq) for seq in q2_sequences_train)
max_len3 = max(len(seq) for seq in q1_sequences_test)
max_len4 = max(len(seq) for seq in q2_sequences_test)
max_len = max(max_len1, max_len2, max_len3, max_len4)



In [None]:
def _pad_to_max_len(sequences):
    """Pad token-id sequences at the end (``padding="post"``) to ``max_len`` columns."""
    return pad_sequences(sequences, maxlen=max_len, padding="post")


q1_padded_train = _pad_to_max_len(q1_sequences_train)
q2_padded_train = _pad_to_max_len(q2_sequences_train)
q1_padded_test = _pad_to_max_len(q1_sequences_test)
q2_padded_test = _pad_to_max_len(q2_sequences_test)

# The models take the two questions as a pair of inputs: [question1, question2].
x1_train = [q1_padded_train, q2_padded_train]
x1_test = [q1_padded_test, q2_padded_test]



In [None]:
# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows with no GloVe entry keep their all-zero initial value.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [None]:
# Positional Encoding (Sinusoidal)
def get_sinusoidal_encoding(max_len, embedding_dim):
    """Build a sinusoidal positional-encoding table.

    Even columns hold ``sin(position * rate)`` and odd columns hold
    ``cos(position * rate)``, where the rates decay geometrically from 1
    towards 1/10000 across the embedding axis.

    Parameters
    ----------
    max_len : int
        Number of positions (rows).
    embedding_dim : int
        Width of each encoding vector (columns); may be odd.

    Returns
    -------
    tf.Tensor
        A float32 tensor of shape ``(max_len, embedding_dim)``.
    """
    # Imported here because the notebook only imports tensorflow in a later cell.
    import tensorflow as tf

    position = np.arange(max_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, embedding_dim, 2) * -(np.log(10000.0) / embedding_dim))

    pos_enc = np.zeros((max_len, embedding_dim))
    pos_enc[:, 0::2] = np.sin(position * div_term)
    # With an odd embedding_dim there is one fewer odd column than even columns.
    pos_enc[:, 1::2] = np.cos(position * div_term[: embedding_dim // 2])

    return tf.convert_to_tensor(pos_enc, dtype=tf.float32)

In [None]:

    input_q1 = Input(shape=(max_len,))
    input_q2 = Input(shape=(max_len,))

    # Shared embedding layer (frozen)
    embedding_layer = Embedding(
        input_dim=len(word_index) + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False,
    )

# Embedding layers for both questions
    embedded_q1 = embedding_layer(input_q1)
    embedded_q2 = embedding_layer(input_q2)

# LSTM layers
    lstm_q1 = Bidirectional(LSTM(64, return_sequences=True))(embedded_q1)
    lstm_q2 = Bidirectional(LSTM(64, return_sequences=True))(embedded_q2)

# Attention mechanism
    attention_q1 = GlobalMaxPooling1D()(lstm_q1)
    attention_q2 = GlobalMaxPooling1D()(lstm_q2)

# Combine features
    combined = Concatenate()([attention_q1, attention_q2])

# Dense layers
    dense = Dense(128, activation="relu")(combined)
    dropout = Dropout(0.3)(dense)
    output = Dense(1, activation="sigmoid")(dropout)

    # Create the model
    model = Model(inputs=[input_q1, input_q2], outputs=output)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])




model.summary()



In [None]:
# Positional Encoding (Sinusoidal)
def get_sinusoidal_encoding(max_len, embedding_dim):
    """Build a sinusoidal positional-encoding table as a float32 tensor.

    Even columns hold ``sin(position * rate)`` and odd columns hold
    ``cos(position * rate)``, where the rates decay geometrically from 1
    towards 1/10000 across the embedding axis.

    Parameters
    ----------
    max_len : int
        Number of positions (rows).
    embedding_dim : int
        Width of each encoding vector (columns); may be odd.

    Returns
    -------
    tf.Tensor
        A float32 tensor of shape ``(max_len, embedding_dim)``.
    """
    # Imported here because the notebook only imports tensorflow in a later cell.
    import tensorflow as tf

    position = np.arange(max_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, embedding_dim, 2) * -(np.log(10000.0) / embedding_dim))

    pos_enc = np.zeros((max_len, embedding_dim))
    pos_enc[:, 0::2] = np.sin(position * div_term)
    # With an odd embedding_dim there is one fewer odd column than even columns.
    pos_enc[:, 1::2] = np.cos(position * div_term[: embedding_dim // 2])

    return tf.convert_to_tensor(pos_enc, dtype=tf.float32)

In [None]:
# Transformer Block (Self-Attention + Feedforward Layer)
def transformer_block(inputs, embedding_dim, num_heads, ff_dim, dropout_rate=0.1):
    """Apply one transformer encoder block to ``inputs``.

    The block is self-attention followed by a two-layer feed-forward network.
    Each sub-layer is followed by dropout, a residual addition and layer
    normalisation. New layers are created on every call.

    Parameters
    ----------
    inputs : tensor
        Sequence tensor to encode; it is added to the attention output, so its
        last dimension has to match ``embedding_dim``.
    embedding_dim : int
        ``key_dim`` of the attention layer and output width of the feed-forward network.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Hidden width of the feed-forward network.
    dropout_rate : float, default 0.1
        Dropout rate applied after each sub-layer.

    Returns
    -------
    tensor
        The block output, same shape as ``inputs``.
    """
    # Sub-layer 1: self-attention (query and value are both `inputs`)
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(inputs, inputs)
    attended = Dropout(dropout_rate)(attended)
    attention_normed = LayerNormalization(epsilon=1e-6)(inputs + attended)

    # Sub-layer 2: position-wise feed-forward network, projected back to embedding_dim
    hidden = Dense(ff_dim, activation='relu')(attention_normed)
    projected = Dense(embedding_dim)(hidden)
    projected = Dropout(dropout_rate)(projected)

    return LayerNormalization(epsilon=1e-6)(attention_normed + projected)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, LayerNormalization, MultiHeadAttention, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.models import Model

# Transformer-based duplicate-question classifier: shared frozen GloVe embedding,
# one transformer block per question, a cross-attention layer between the two
# encodings, then max pooling and a dense head.
num_heads = 4  # Number of attention heads
ff_dim = 256  # Feedforward dimension
# Inputs: one padded token-id sequence of length max_len per question
input_q1 = Input(shape=(max_len,))
input_q2 = Input(shape=(max_len,))

# Shared Embedding Layer, initialised from the GloVe matrix and frozen.
# Positional encoding is currently disabled (see the commented-out lines below).
embedding_layer = Embedding(input_dim=len(word_index) + 1,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)
# Variant with sinusoidal positional encoding added to the embeddings (not in use):
# embedded_q1 = embedding_layer(input_q1) + get_sinusoidal_encoding(max_len, embedding_dim)
# embedded_q2 = embedding_layer(input_q2) + get_sinusoidal_encoding(max_len, embedding_dim)

embedded_q1 = embedding_layer(input_q1)
embedded_q2 = embedding_layer(input_q2)

# Transformer Encoder Blocks. Each transformer_block call creates its own layers,
# so the two question encoders do not share weights.
transformer_q1 = transformer_block(embedded_q1, embedding_dim, num_heads, ff_dim)
transformer_q2 = transformer_block(embedded_q2, embedding_dim, num_heads, ff_dim)

# Cross-Attention between Question 1 and Question 2:
# question 1 is the query, question 2 is the value; the result is max-pooled over the sequence axis.
cross_attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(transformer_q1, transformer_q2)
cross_attention = GlobalMaxPooling1D()(cross_attention)

# Feature Fusion and Dense Layers: concatenate the pooled encoding of each question
# with the pooled cross-attention output, then classify with a sigmoid unit.
combined = Concatenate()([GlobalMaxPooling1D()(transformer_q1), GlobalMaxPooling1D()(transformer_q2), cross_attention])
dense = Dense(128, activation="relu")(combined)
dropout = Dropout(0.3)(dense)
output = Dense(1, activation="sigmoid")(dropout)

# Define Model (this rebinds the global name `model`)
model = Model(inputs=[input_q1, input_q2], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Model Summary
model.summary()

In [None]:
X_train.shape

(238889, 2)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',    # Monitor validation loss
    patience=5,            # Stop training after 5 epochs with no improvement
    restore_best_weights=True  # Restore the weights of the best epoch
)

# Train the model with early stopping.
# NOTE(review): the test split doubles as validation data here, so early stopping
# (and the restored "best" weights) are selected on the same data that is later
# used for the final evaluation. Confirm whether a separate validation split is wanted.
# The fit result is stored under the name `histoty` (sic).


histoty = model.fit([x1_train[0], x1_train[1]], y_train, validation_data=([x1_test[0], x1_test[1]], y_test), epochs=100, batch_size=128 , callbacks=[early_stopping])

Epoch 1/100
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 88ms/step - accuracy: 0.6415 - loss: 0.6451 - val_accuracy: 0.6997 - val_loss: 0.5759
Epoch 2/100
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 82ms/step - accuracy: 0.7116 - loss: 0.5567 - val_accuracy: 0.7223 - val_loss: 0.5410
Epoch 3/100
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 82ms/step - accuracy: 0.7278 - loss: 0.5357 - val_accuracy: 0.7268 - val_loss: 0.5386
Epoch 4/100
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 82ms/step - accuracy: 0.7290 - loss: 0.5329 - val_accuracy: 0.7225 - val_loss: 0.5369
Epoch 5/100
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 82ms/step - accuracy: 0.7412 - loss: 0.5144 - val_accuracy: 0.7334 - val_loss: 0.5244
Epoch 6/100
[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 82ms/step - accuracy: 0.7457 - loss: 0.5096 - val_accuracy: 0.7300 - val_loss:

In [None]:
# NOTE(review): in a top-to-bottom run `model` was last assigned by the
# transformer cell, so this file would hold the transformer model rather than
# the BiLSTM one. Confirm which model is meant to be saved here.
model.save("semantic_LSTM.keras")


In [None]:
model.save("semantic_TRNSFRM.keras")


In [None]:
from keras.models import load_model
model = load_model('/kaggle/input/semantic-similarity/keras/default/1/semantic_TRNSFRM.keras')

In [None]:
loss, accuracy = model.evaluate([x1_test[0], x1_test[1]], y_test, batch_size=128)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")


[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 30ms/step - accuracy: 0.7348 - loss: 0.5227
Test Loss: 0.5244, Test Accuracy: 0.7334


In [None]:
y_pred = model.predict([x1_test[0], x1_test[1]])


[1m1867/1867[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Assuming all preprocessing functions, embedding_matrix, and model creation are already done

def preprocess_sentences(sentence1, sentence2, tokenizer, max_len):
    """Turn two raw sentences into padded token-id arrays for the model.

    Each sentence goes through the same text cleaning that was applied to the
    training questions (``preprocess`` followed by ``remove_stopwords``), is
    encoded with ``tokenizer`` and padded at the end to ``max_len``.

    Parameters
    ----------
    sentence1, sentence2 : str
        The raw sentences to compare.
    tokenizer : fitted Keras ``Tokenizer``
        The tokenizer used to encode the training data.
    max_len : int
        Target sequence length.

    Returns
    -------
    tuple
        ``(padded_seq1, padded_seq2)``, one padded array per sentence.
    """
    def _encode(sentence):
        # Mirror the training pipeline so that inference sees the same token stream.
        cleaned = remove_stopwords(preprocess(sentence))
        sequences = tokenizer.texts_to_sequences([cleaned])
        return pad_sequences(sequences, maxlen=max_len, padding="post")

    padded_seq1 = _encode(sentence1)
    padded_seq2 = _encode(sentence2)

    return padded_seq1, padded_seq2

def predict_semantics(sentence1, sentence2, tokenizer, model, max_len, threshold=0.5):
    """Predict whether two sentences are semantically the same.

    Parameters
    ----------
    sentence1, sentence2 : str
        The sentences to compare.
    tokenizer : fitted Keras ``Tokenizer``
        Passed through to ``preprocess_sentences``.
    model : object with a ``predict`` method
        Called with ``[padded_seq1, padded_seq2]``; its first output value is
        read as the probability that the sentences match.
    max_len : int
        Sequence length the model was built for.
    threshold : float, default 0.5
        Minimum probability for the pair to be reported as the same.

    Returns
    -------
    str
        ``"Sentence is too long."`` if either sentence has more than
        ``max_len`` whitespace-separated words, otherwise
        ``"Semantically Same"`` or ``"Semantically Different"``.
    """
    # Check if sentence length exceeds max_len
    if len(sentence1.split()) > max_len or len(sentence2.split()) > max_len:
        return "Sentence is too long."

    # Preprocess sentences
    padded_seq1, padded_seq2 = preprocess_sentences(sentence1, sentence2, tokenizer, max_len)

    # Predict with the model
    prediction = model.predict([padded_seq1, padded_seq2])

    # predict() returns an array; take its first element as a plain float
    # rather than relying on the truth value of an array comparison.
    probability = float(np.asarray(prediction).reshape(-1)[0])

    # Convert prediction to "same" or "different"
    if probability >= threshold:
        return "Semantically Same"
    return "Semantically Different"

# Assuming the tokenizer, embedding_matrix, and model are already defined and trained
# Example usage:

sentence1 = "Can I get a job?"
sentence2 = "Will I be employed?"

# Predict
result = predict_semantics(sentence1, sentence2, tokenizer, model, max_len)
print(result)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Semantically Same
