<a href="https://colab.research.google.com/github/imrishabhyadav/NLP-Polysemy/blob/main/Polysemy_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers nltk
!pip install wordnet
!pip install nltk


import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from transformers import pipeline
import nltk
from nltk.corpus import stopwords

# Download NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Set up pipeline
# Hand the already-loaded objects to the pipeline instead of the checkpoint
# name, so the BERT weights are not loaded a second time.
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

# Load stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not English stopwords.

    Each token is lower-cased only for the membership test against the
    module-level ``stop_words`` set; kept tokens retain their original
    casing and order.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

def get_word_sense(word):
    """Return the definition of the first WordNet synset found for ``word``.

    When WordNet has no synset for the word, the placeholder string
    "No definition found." is returned instead.
    """
    senses = wn.synsets(word)
    if not senses:
        return "No definition found."
    first_sense = senses[0]
    return first_sense.definition()

def contextualize_polysemy(sentence, word):
    """Mask one occurrence of ``word`` in ``sentence`` and query ``fill_mask``.

    Exactly one ``[MASK]`` is inserted. A blanket ``str.replace`` would also
    mask the word inside longer words (e.g. "light" in "lightning") and would
    mask every repetition; with several masks the fill-mask pipeline returns
    one candidate list per mask rather than a flat list, which callers that
    iterate over the predictions do not expect.

    Args:
        sentence: The sentence providing the context.
        word: The word whose slot should be predicted, spelled as it appears
            in ``sentence``.

    Returns:
        The value returned by ``fill_mask`` for the masked sentence.
    """
    import re  # local import so this block does not depend on a file-level import

    # Prefer the first whole-word occurrence (not preceded/followed by a word character).
    whole_word = re.compile(r'(?<!\w)' + re.escape(word) + r'(?!\w)')
    sentence_with_mask, replaced = whole_word.subn('[MASK]', sentence, count=1)
    if not replaced:
        # No whole-word match (e.g. a tokenizer fragment): fall back to the
        # first raw occurrence, still inserting a single mask only.
        sentence_with_mask = sentence.replace(word, '[MASK]', 1)
    predictions = fill_mask(sentence_with_mask)
    return predictions

def _describe_sentence(sentence):
    """Build the per-sentence report: WordNet senses plus masked-word predictions."""
    # Tokenize and remove stopwords
    tokens = remove_stopwords(word_tokenize(sentence))

    # Keep only tokens WordNet knows, paired with their first-listed definition
    words_and_senses = [
        (token, get_word_sense(token)) for token in tokens if wn.synsets(token)
    ]

    # Contextual predictions using BERT. A word repeated in the sentence maps
    # to a single dict key, so the model is queried only once per distinct word.
    contextual_predictions = {}
    for token, _ in words_and_senses:
        if token not in contextual_predictions:
            contextual_predictions[token] = contextualize_polysemy(sentence, token)

    return {
        'words_and_senses': words_and_senses,
        'contextual_predictions': contextual_predictions,
    }


def detect_polysemy_and_describe(sentence1, sentence2):
    """Describe the content words of two sentences for side-by-side comparison.

    For each sentence, stopwords are dropped and every remaining token that has
    at least one WordNet synset is reported with the definition returned by
    ``get_word_sense`` and with the predictions returned by
    ``contextualize_polysemy`` for that token in that sentence.

    Args:
        sentence1: First sentence to analyse.
        sentence2: Second sentence to analyse.

    Returns:
        A dict with the keys ``'sentence_1_polysemy'`` and
        ``'sentence_2_polysemy'``. Each value is a dict holding
        ``'words_and_senses'`` (a list of ``(word, definition)`` tuples in
        sentence order) and ``'contextual_predictions'`` (a dict mapping each
        word to its predictions).
    """
    return {
        'sentence_1_polysemy': _describe_sentence(sentence1),
        'sentence_2_polysemy': _describe_sentence(sentence2),
    }

# Example sentences: "light" is used as a noun in the first and as an
# adjective in the second.
sentence1 = "Turn the light on"
sentence2 = "The box was very light in weight"

result = detect_polysemy_and_describe(sentence1, sentence2)

# Pretty print: one section per sentence, one sub-section per report field.
for section_name, section in result.items():
    print(f"\n{section_name}:")
    for field_name, field_value in section.items():
        print(f"  {field_name}:")
        if not isinstance(field_value, dict):
            # Plain sequence (the word/sense tuples): one bullet per entry.
            for entry in field_value:
                print(f"    - {entry}")
            continue
        # Mapping of word -> predictions: a heading per word, then its bullets.
        for token, candidates in field_value.items():
            print(f"    {token}:")
            for candidate in candidates:
                print(f"      - {candidate}")


Collecting wordnet
  Using cached wordnet-0.0.1b2.tar.gz (8.8 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoi


sentence_1_polysemy:
  words_and_senses:
    - ('Turn', 'a circular segment of a curve')
    - ('light', '(physics) electromagnetic radiation that can produce a visual sensation')
  contextual_predictions:
    Turn:
      - {'score': 0.3366808593273163, 'token': 2735, 'token_str': 'turn', 'sequence': 'turn the light on'}
      - {'score': 0.10120954364538193, 'token': 12342, 'token_str': 'shine', 'sequence': 'shine the light on'}
      - {'score': 0.04098863527178764, 'token': 2357, 'token_str': 'turned', 'sequence': 'turned the light on'}
      - {'score': 0.03962360695004463, 'token': 2681, 'token_str': 'leave', 'sequence': 'leave the light on'}
      - {'score': 0.03153924643993378, 'token': 2718, 'token_str': 'hit', 'sequence': 'hit the light on'}
    light:
      - {'score': 0.12967431545257568, 'token': 2422, 'token_str': 'light', 'sequence': 'turn the light on'}
      - {'score': 0.06076744943857193, 'token': 17446, 'token_str': 'ignition', 'sequence': 'turn the ignition on'}
 