<a href="https://colab.research.google.com/github/kk412027247/nlp/blob/main/compare_two_sentence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained BERT tokenizer and its TensorFlow encoder.
# Both are created from the same checkpoint name so they stay paired, and both
# are kept as module-level objects because get_sentence_embedding() uses them.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertModel.from_pretrained(model_name)

# Function to get sentence embeddings with pooling over the last hidden state
def get_sentence_embedding(sentence):
    """Embed text by mask-aware mean pooling of BERT's last hidden state.

    Args:
        sentence: The text to embed. It is passed straight to the module-level
            ``tokenizer``; a single string yields a batch of one, and a list
            of strings yields one row per string.

    Returns:
        A ``tf.Tensor`` of shape (batch_size, hidden_size) holding one pooled
        embedding per input sentence.
    """
    # Tokenize; padding=True pads every row of a batch to the longest one.
    inputs = tokenizer(sentence, return_tensors='tf', padding=True, truncation=True, max_length=512)

    # Last hidden states: (batch_size, seq_length, hidden_size)
    last_hidden_state = model(**inputs).last_hidden_state

    # A plain reduce_mean over the sequence axis would also average the
    # vectors at [PAD] positions whenever a batch mixes sentence lengths,
    # skewing the shorter sentences' embeddings. Weight every position by the
    # attention mask so only real tokens contribute.
    mask = tf.cast(inputs['attention_mask'], last_hidden_state.dtype)
    mask = tf.expand_dims(mask, axis=-1)  # (batch_size, seq_length, 1)

    summed = tf.reduce_sum(last_hidden_state * mask, axis=1)
    # Guard the division; the count is clamped to at least one token.
    token_counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)

    return summed / token_counts

# Function to compute cosine similarity
def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between the first rows of two embeddings.

    Args:
        embedding1: Object exposing ``.numpy()`` that yields a 2-D array.
        embedding2: Object exposing ``.numpy()`` that yields a 2-D array.

    Returns:
        The [0][0] entry of the pairwise cosine-similarity matrix.
    """
    # scikit-learn works on numpy arrays, so unwrap both tensors first.
    first, second = (tensor.numpy() for tensor in (embedding1, embedding2))

    # The pairwise matrix has one row per row of `first` and one column per
    # row of `second`; only the first-vs-first entry is returned.
    pairwise = cosine_similarity(first, second)
    return pairwise[0][0]




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.

In [51]:
# Sentence pair to compare: two paraphrases about programming
sentence1 = "I love programming."
sentence2 = "Coding is my passion."

# Embed both sentences (first, then second) and score the pair
embedding1, embedding2 = map(get_sentence_embedding, (sentence1, sentence2))
similarity_score = compute_cosine_similarity(embedding1, embedding2)

print(f"Cosine Similarity: {similarity_score}")

Cosine Similarity: 0.8084195256233215


In [48]:
# Sentence pair to compare: a programming statement and a greeting
sentence1 = "I love programming."
sentence2 = "how are you."

# Embed both sentences (first, then second) and score the pair
embedding1, embedding2 = map(get_sentence_embedding, (sentence1, sentence2))
similarity_score = compute_cosine_similarity(embedding1, embedding2)

print(f"Cosine Similarity: {similarity_score}")

Cosine Similarity: 0.4891205430030823


In [18]:
# Sentence pair to compare: a greeting and its reply
sentence1 = "how are you."
sentence2 = "i am fine, thank you."

# Embed both sentences (first, then second) and score the pair
embedding1, embedding2 = map(get_sentence_embedding, (sentence1, sentence2))
similarity_score = compute_cosine_similarity(embedding1, embedding2)

print(f"Cosine Similarity: {similarity_score}")

Cosine Similarity: 0.6672059297561646


In [22]:
# Sentence pair to compare: a programming statement and a C# developer statement
sentence1 = "I love programming."
sentence2 = "I am a c# developer."

# Embed both sentences (first, then second) and score the pair
embedding1, embedding2 = map(get_sentence_embedding, (sentence1, sentence2))
similarity_score = compute_cosine_similarity(embedding1, embedding2)

print(f"Cosine Similarity: {similarity_score}")

Cosine Similarity: 0.7123180627822876


In [23]:
# Sentence pair to compare: a programming statement and a Java statement
sentence1 = "I love programming."
sentence2 = "I like Java."

# Embed both sentences (first, then second) and score the pair
embedding1, embedding2 = map(get_sentence_embedding, (sentence1, sentence2))
similarity_score = compute_cosine_similarity(embedding1, embedding2)

print(f"Cosine Similarity: {similarity_score}")

Cosine Similarity: 0.8276803493499756


In [52]:
# Sentence pair to compare: a programming statement and a JavaScript statement
sentence1 = "I love programming."
sentence2 = "I like javascript."

# Embed both sentences (first, then second) and score the pair
embedding1, embedding2 = map(get_sentence_embedding, (sentence1, sentence2))
similarity_score = compute_cosine_similarity(embedding1, embedding2)

print(f"Cosine Similarity: {similarity_score}")

Cosine Similarity: 0.8029322624206543


In [53]:
# Sentence pair to compare: a programming statement and a greeting
# (same pair as an earlier cell, executed again)
sentence1 = "I love programming."
sentence2 = "how are you."

# Embed both sentences (first, then second) and score the pair
embedding1, embedding2 = map(get_sentence_embedding, (sentence1, sentence2))
similarity_score = compute_cosine_similarity(embedding1, embedding2)

print(f"Cosine Similarity: {similarity_score}")

Cosine Similarity: 0.4891205430030823
