<a href="https://colab.research.google.com/github/komal220504/Capturing_semantic_relationship/blob/main/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers sentence-transformers


In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample sentences to compare against each other.
sentences = [
    "A dog is barking in the yard.",
    "There is a dog making noise outside.",
    "I love eating pizza on weekends.",
    "A canine is howling in the backyard."
]

# Encode every sentence in one call; tensors are requested so the result
# can be passed straight to the similarity helper below.
embeddings = model.encode(sentences, convert_to_tensor=True)

# Pairwise cosine similarity: entry [i][j] compares sentence i with sentence j.
# `util.cos_sim` is used instead of its legacy alias `util.pytorch_cos_sim`.
similarity_matrix = util.cos_sim(embeddings, embeddings)

# Display the score for every ordered pair (self-pairs and mirrored pairs
# included, so len(sentences) ** 2 entries are printed).
for i, first in enumerate(sentences):
    for j, second in enumerate(sentences):
        # .item() turns the 0-dim tensor into a plain float for formatting.
        pair_score = similarity_matrix[i][j].item()
        print(f"Similarity between:\n \"{first}\"\n and\n \"{second}\"\n → {pair_score:.4f}\n")


In [None]:
pip install transformers sentence-transformers torch


In [None]:
from transformers import MarianMTModel, MarianTokenizer
from sentence_transformers import SentenceTransformer, util
import torch

# Step 1: Set up the translation model (French -> English in this example).
# The checkpoint name is derived from the source language code.
src_lang = "fr"
model_name = f'Helsinki-NLP/opus-mt-{src_lang}-en'

# Tokenizer first, then the model, both from the same checkpoint name.
translator_tokenizer, translator_model = (
    MarianTokenizer.from_pretrained(model_name),
    MarianMTModel.from_pretrained(model_name),
)

def translate_to_english(sentences):
    """Translate a batch of sentences to English.

    Uses the module-level ``translator_tokenizer`` and ``translator_model``.

    Args:
        sentences: A list (or tuple) of strings to translate. Whatever else
            the tokenizer accepts (e.g. a single string) is passed through
            unchanged.

    Returns:
        A list of decoded strings, one per generated sequence, with special
        tokens removed. An empty list/tuple yields ``[]``.
    """
    # An empty batch gives the tokenizer nothing to build tensors from, so
    # short-circuit instead of letting it fail.
    if isinstance(sentences, (list, tuple)) and not sentences:
        return []

    # Padding lets sentences of different lengths share one tensor batch.
    inputs = translator_tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    translated = translator_model.generate(**inputs)
    return [translator_tokenizer.decode(t, skip_special_tokens=True) for t in translated]

# Step 2: Load a sentence transformer model (VSM)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 3: Multilingual documents (in French)
documents_fr = [
    "Le chien aboie dans la cour.",      # The dog barks in the yard.
    "J'aime manger de la pizza.",        # I love eating pizza.
    "Un chat dort sur le canapé.",       # A cat is sleeping on the couch.
    "Le soleil brille aujourd'hui."      # The sun is shining today.
]

# Step 4: Translate documents to English for embedding
documents_en = translate_to_english(documents_fr)

# Step 5: Convert the translated documents to semantic vectors
doc_embeddings = embedding_model.encode(documents_en, convert_to_tensor=True)

# Step 6: Query (in English)
query = "A dog is barking outside."
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# Step 7: Compute cosine similarities between the query and every document.
# `util.cos_sim` is used instead of its legacy alias `util.pytorch_cos_sim`;
# [0] selects the single row that belongs to the one query.
cos_scores = util.cos_sim(query_embedding, doc_embeddings)[0]

# Step 8: Show results
print("Query:", query)
print("\nTop relevant documents:")
# torch.topk raises if k exceeds the number of scores, so never ask for
# more results than there are documents.
top_k = min(3, len(documents_fr))
top_results = torch.topk(cos_scores, k=top_k)

# Convert to plain Python floats/ints for list indexing and formatting.
for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
    print(f"\nFrench: {documents_fr[idx]}")
    print(f"English: {documents_en[idx]}")
    print(f"Score: {score:.4f}")


In [None]:
pip install transformers sentence-transformers torch


In [None]:
from transformers import MarianMTModel, MarianTokenizer
from sentence_transformers import SentenceTransformer, util
import torch

# Step 1: Machine Translation setup (French to English).
# Change the code below (e.g. to 'de' or 'es') to pick another source language;
# the checkpoint name is built from it.
source_lang = "fr"
mt_model_name = f"Helsinki-NLP/opus-mt-{source_lang}-en"

# Tokenizer first, then the model, both from the same checkpoint name.
tokenizer, mt_model = (
    MarianTokenizer.from_pretrained(mt_model_name),
    MarianMTModel.from_pretrained(mt_model_name),
)

def translate(sentences):
    """Translate sentences from source_lang to English.

    Uses the module-level ``tokenizer`` and ``mt_model``.

    Args:
        sentences: A list (or tuple) of strings to translate. Whatever else
            the tokenizer accepts (e.g. a single string) is passed through
            unchanged.

    Returns:
        A list of decoded strings, one per generated sequence, with special
        tokens removed. An empty list/tuple yields ``[]``.
    """
    # An empty batch gives the tokenizer nothing to build tensors from, so
    # short-circuit instead of letting it fail.
    if isinstance(sentences, (list, tuple)) and not sentences:
        return []

    # Padding lets sentences of different lengths share one tensor batch.
    tokens = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    translated = mt_model.generate(**tokens)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

# Step 2: Load the VSM Model (Sentence Embeddings)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 3: Example Multilingual Documents (French)
documents_fr = [
    "Le chien court dans le jardin.",         # The dog runs in the garden.
    "Je cuisine une pizza délicieuse.",       # I am cooking a delicious pizza.
    "Le soleil brille fortement aujourd'hui.",# The sun is shining brightly today.
    "Un chat dort sur le canapé."             # A cat sleeps on the couch.
]

# Step 4: Translate to English
documents_en = translate(documents_fr)

# Step 5: Create Vector Representations (VSM) of the translated documents
document_embeddings = embedding_model.encode(documents_en, convert_to_tensor=True)

# Step 6: Query (in English or Translated)
query = "The dog is playing in the yard."
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# Step 7: Compute Similarities between the query and every document.
# `util.cos_sim` is used instead of its legacy alias `util.pytorch_cos_sim`;
# [0] selects the single row that belongs to the one query.
cosine_scores = util.cos_sim(query_embedding, document_embeddings)[0]

# Step 8: Display top matches
# torch.topk raises if k exceeds the number of scores, so never ask for
# more results than there are documents.
top_k = min(3, len(documents_fr))
top_results = torch.topk(cosine_scores, k=top_k)

print("\n Query:", query)
print("\nTop Matching Documents:")
# Convert to plain Python floats/ints for list indexing and formatting.
for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
    print(f"\n→ French: {documents_fr[idx]}")
    print(f"→ English: {documents_en[idx]}")
    print(f"→ Similarity Score: {score:.4f}")
