In [12]:
from collections import defaultdict, Counter
import numpy as np

# Step 1: Calculate Prior Probabilities
def calculate_priors(training_data):
    class_counts = Counter([label for label, _ in training_data])
    total_count = sum(class_counts.values())
    priors = {cls: count / total_count for cls, count in class_counts.items()}
    return priors, class_counts

# Step 2: Calculate Feature Likelihoods
def calculate_likelihoods(training_data, class_counts):
    feature_likelihoods = defaultdict(lambda: defaultdict(lambda: 1))  # Laplace smoothing

    for label, features in training_data:
        for feature in features:
            feature_likelihoods[label][feature] += 1

    # Normalize likelihoods
    for label in feature_likelihoods:
        total_count = sum(feature_likelihoods[label].values())
        for feature in feature_likelihoods[label]:
            feature_likelihoods[label][feature] /= total_count

    return feature_likelihoods

# Step 3: Calculate Posterior Probabilities and Predict Class
def predict(features, priors, feature_likelihoods):
    posteriors = {}

    for label in priors:
        posterior = np.log(priors[label])  # Start with log(P(Class))
        for feature in features:
            if feature in feature_likelihoods[label]:
                posterior += np.log(feature_likelihoods[label][feature])  # Multiply by P(feature|Class)
            else:
                posterior += np.log(1 / (sum(feature_likelihoods[label].values()) + len(features)))  # Smoothing

        posteriors[label] = posterior

    return max(posteriors, key=posteriors.get)  # Return the class with the highest posterior

# Example usage
if __name__ == "__main__":
    # Training data: a list of tuples (class, feature_vector)
    training_data = [
        ('f', ['fish', 'smoked','fish']),
        ('f', ['fish', 'line']),
        ('f', ['fish', 'haul','smoked']),
        ('g', ['guitar', 'jazz','line']),
    ]
    
    # Step 1: Calculate Priors
    priors, class_counts = calculate_priors(training_data)
    
    # Step 2: Calculate Likelihoods for Each Feature
    feature_likelihoods = calculate_likelihoods(training_data, class_counts)
    
    # Test data (feature vector)
    test_features = ['line', 'guitar', 'jazz','jazz',]
    
    # Step 3: Predict Class
    predicted_class = predict(test_features, priors, feature_likelihoods)
    print(f"The predicted class for features {test_features} is '{predicted_class}'")

The predicted class for features ['line', 'guitar', 'jazz', 'jazz'] is 'g'


In [39]:
import math
from collections import defaultdict
from nltk import word_tokenize

# Given data
data = {
    1: ['bass', 'eat', 'amount'],
    2: ['bass', 'lunch', 'excellent'],
    3: ['bass', 'ate','like'],
    4: ['guitar', 'play', 'music'],
    5: ['money', 'interest', 'pay','amount'],
    6: ['guitar','interest','melody'],
    7: ['fish', 'haul','line'],
    8: ['guitar','like','play'],
    9: ['rate']
}

# Corresponding classes (senses)
classes = {
    1: 'fish',
    2: 'fish',
    3: 'fish',
    4: 'instrument',
    5: 'finance',
    6: 'instrument',
    7: 'fish',
    8: 'instrument',
    9: 'finance'
}

# 1) Calculate priors
class_counts = defaultdict(int)
for cls in classes.values():
    class_counts[cls] += 1

# Number of documents
N = len(classes)

# Calculate prior probabilities
priors = {cls: count / N for cls, count in class_counts.items()}

# 2) Calculate the conditional probability of each word with each class
word_counts = defaultdict(lambda: defaultdict(int))

for idx, words in data.items():
    cls = classes[idx]
    for word in words:
        word_counts[cls][word.lower()] += 1  # Convert words to lowercase for consistency

# Total words per class
total_words_per_class = {cls: sum(counts.values()) for cls, counts in word_counts.items()}

# Full vocabulary size
vocab = set(word.lower() for words in data.values() for word in words)
V = len(vocab)

# Calculate conditional probabilities
conditional_probabilities = defaultdict(dict)

for cls, words in word_counts.items():
    for word in vocab:
        count = words[word]  # This will be 0 if the word isn't in the class
        conditional_probabilities[cls][word] = (count + 1) / (total_words_per_class[cls] + V)

# 3) Define the target words and find v (count of words in to-be-found case/test case)
x = input("ENTER target words:")
target_words = word_tokenize(x)
# 4) Score calculation
scores = defaultdict(float)

# Calculate scores for each class
for cls in priors.keys():
    scores[cls] = math.log(priors[cls])  # Initialize with log prior

    for word in target_words:
        vj = word.lower()  # Convert word to lower case for comparison
        if vj in conditional_probabilities[cls]:
            scores[cls] += math.log(conditional_probabilities[cls][vj])
        else:
            # If the word is not found, assume a small probability (Laplace smoothing)
            scores[cls] += math.log(1 / (total_words_per_class[cls] + V))

# Determine the class with the highest score
predicted_class = max(scores, key=scores.get)

# Output the results
print(f"Scores: {scores}")
print(f"Predicted Class for target words {target_words}: {predicted_class}")

ENTER target words: i like to play guitar


Scores: defaultdict(<class 'float'>, {'fish': -17.12376994396716, 'instrument': -14.39974278834181, 'finance': -17.181548476422023})
Predicted Class for target words ['i', 'like', 'to', 'play', 'guitar']: instrument


In [27]:
import math
from collections import defaultdict

# Given data
data = {
    1: ['Bass', 'eat', 'super','pay'],
    2: ['Bass', 'lunch', 'excellent'],
    3: ['Bass', 'ate', 'like'],
    4: ['Bass', 'play', 'music'],
    5: ['Bass', 'interest', 'pay'],
    6: ['fish', 'smoked','fish'],
    7: ['fish', 'line'],
    8: ['line','vocals']
}

# Corresponding classes (senses)
classes = {
    1: 'fish',
    2: 'fish',
    3: 'fish',
    4: 'music',
    5: 'music',
    6: 'fish',
    7: 'fish',
    8: 'music'
}

# 1) Calculate priors
class_counts = defaultdict(int)
for cls in classes.values():
    class_counts[cls] += 1

# Number of documents
N = len(classes)

# Calculate prior probabilities
priors = {cls: count / N for cls, count in class_counts.items()}

# 2) Calculate the conditional probability of each word with each class
word_counts = defaultdict(lambda: defaultdict(int))

for idx, words in data.items():
    cls = classes[idx]
    for word in words:
        word_counts[cls][word] += 1

# Total words per class
total_words_per_class = {cls: sum(counts.values()) for cls, counts in word_counts.items()}

# Calculate conditional probabilities
conditional_probabilities = defaultdict(dict)

for cls, words in word_counts.items():
    for word, count in words.items():
        # Use Laplace smoothing
        conditional_probabilities[cls][word] = (count + 1) / (total_words_per_class[cls] + len(word_counts[cls]))

# 3) Define the target words and find v (count of words in to-be-found case/test case)
target_words = ['Bass', 'line', 'super','smoked','pay' ]

# 4) Score calculation
scores = defaultdict(float)

# Calculate scores for each class
for cls in priors.keys():
    scores[cls] = math.log(priors[cls])  # Initialize with log prior

    for word in target_words:
        vj = word.lower()  # Convert word to lower case for comparison
        if vj in conditional_probabilities[cls]:
            scores[cls] += math.log(conditional_probabilities[cls][vj])
        else:
            # If the word is not found, assume a small probability (Laplace smoothing)
            scores[cls] += math.log(1 / (total_words_per_class[cls] + len(word_counts[cls])))

# Determine the class with the highest score
predicted_class = max(scores, key=scores.get)

# Output the results
print(f"Scores: {scores}")
print(f"Predicted Class for target words {target_words}: {predicted_class}")

Scores: defaultdict(<class 'float'>, {'fish': -13.987897597113363, 'music': -13.134785897402887})
Predicted Class for target words ['Bass', 'line', 'super', 'smoked', 'pay']: music


In [28]:
import math
from collections import defaultdict

# Given data
data = {
    1: ['Bass', 'eat', 'super'],
    2: ['Bass', 'lunch', 'excellent'],
    3: ['Bass', 'ate', 'like'],
    4: ['Bass', 'play', 'music'],
    5: ['Bass', 'interest', 'pay'],
    6: ['fish', 'smoked', 'fish'],
    7: ['fish', 'line'],
    8: ['line', 'vocals']
}

# Corresponding classes (senses)
classes = {
    1: 'fish',
    2: 'fish',
    3: 'fish',
    4: 'music',
    5: 'music',
    6: 'fish',
    7: 'fish',
    8: 'music'
}

# 1) Calculate priors
class_counts = defaultdict(int)
for cls in classes.values():
    class_counts[cls] += 1

# Number of documents
N = len(classes)

# Calculate prior probabilities
priors = {cls: count / N for cls, count in class_counts.items()}

# 2) Calculate the conditional probability of each word with each class
word_counts = defaultdict(lambda: defaultdict(int))

for idx, words in data.items():
    cls = classes[idx]
    for word in words:
        word_counts[cls][word] += 1

# Total words per class
total_words_per_class = {cls: sum(counts.values()) for cls, counts in word_counts.items()}

# Calculate conditional probabilities
conditional_probabilities = defaultdict(dict)

for cls, words in word_counts.items():
    for word, count in words.items():
        # Use Laplace smoothing
        conditional_probabilities[cls][word] = (count + 1) / (total_words_per_class[cls] + len(word_counts[cls]))

# 3) Define the target words and find v (count of words in to-be-found case/test case)
target_words = ['Bass', 'line', 'super', 'smoked']

# 4) Score calculation
scores = defaultdict(float)

# Calculate scores for each class
for cls in priors.keys():
    scores[cls] = math.log(priors[cls])  # Initialize with log prior

    for word in target_words:
        vj = word.lower()  # Convert word to lower case for comparison
        if vj in conditional_probabilities[cls]:
            scores[cls] += math.log(conditional_probabilities[cls][vj])
        else:
            # If the word is not found, assume a small probability (Laplace smoothing)
            scores[cls] += math.log(1 / (total_words_per_class[cls] + len(word_counts[cls])))

# Determine the class with the highest score
predicted_class = max(scores, key=scores.get)

# Output the results
print(f"Scores: {scores}")
print(f"Predicted Class for target words {target_words}: {predicted_class}")


Scores: defaultdict(<class 'float'>, {'fish': -11.102777408957682, 'music': -11.119882876860622})
Predicted Class for target words ['Bass', 'line', 'super', 'smoked']: fish


In [49]:
def word_sense_disambiguation(context, senses, words_in_context):
    # Definitions of senses
    D1 = {"mammal", "flies", "night"}
    D2 = {"sports", "equipment", "hit", "ball", "player", "game"}
    
    # Context from the sentence
    Vj = set(words_in_context)
    
    # Possible senses
    S1 = senses[0]  # mammal
    S2 = senses[1]  # sports equipment
    
    # EV sets for related words in the context
    EV1 = {"player", "animal", "creature", "wings"}
    EV2 = {"sports", "game", "hit", "ball"}
    EV3 = {"equipment", "bat", "ball", "player"}
    
    # Union of all EV sets
    EV_union = EV1.union(EV2).union(EV3)
    
    # Calculate scores for each sense
    score_mammal = len(D1.intersection(EV_union.intersection(Vj)))
    score_sports_equipment = len(D2.intersection(EV_union.intersection(Vj)))
    
    # Determine which sense the word belongs to
    if score_mammal > score_sports_equipment:
        return f"The word is used in the sense of {S1}"
    elif score_sports_equipment > score_mammal:
        return f"The word is used in the sense of {S2}"
    else:
        return "The word sense is ambiguous"

# Example usage
context = "The player swung the bat and hit the ball out of the park."
senses = ["mammal", "sports equipment"]
words_in_context = ["player", "swung", "bat", "hit", "ball", "park"]

result = word_sense_disambiguation(context, senses, words_in_context)
print(result)


The word is used in the sense of sports equipment


In [50]:
def word_sense_disambiguation(context, senses, words_in_context):
    # Definitions of senses
    D1 = {"mammal", "flies", "night", "insects", "hunting"}
    D2 = {"sports", "equipment", "hit", "ball", "player", "game"}
    
    # Context from the sentence
    Vj = set(words_in_context)
    
    # Possible senses
    S1 = senses[0]  # mammal
    S2 = senses[1]  # sports equipment
    
    # EV sets for related words in the context
    EV1 = {"animal", "creature", "wings", "night"}
    EV2 = {"insects", "hunting", "senses"}
    EV3 = {"fly", "silent", "night"}
    
    # Union of all EV sets
    EV_union = EV1.union(EV2).union(EV3)
    
    # Calculate scores for each sense
    score_mammal = len(D1.intersection(EV_union.intersection(Vj)))
    score_sports_equipment = len(D2.intersection(EV_union.intersection(Vj)))
    
    # Determine which sense the word belongs to
    if score_mammal > score_sports_equipment:
        return f"The word is used in the sense of {S1}"
    elif score_sports_equipment > score_mammal:
        return f"The word is used in the sense of {S2}"
    else:
        return "The word sense is ambiguous"

# Example usage
context = "The bat flew silently through the night, hunting insects with its sharp senses."
senses = ["mammal", "sports equipment"]
words_in_context = ["bat", "flew", "silently", "night", "hunting", "insects", "senses"]

result = word_sense_disambiguation(context, senses, words_in_context)
print(result)


The word is used in the sense of mammal


In [52]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer



# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def word_sense_disambiguation(context, senses, words_in_context):
    # Definitions of senses
    D1 = {"mammal", "flies", "night", "insects", "hunting"}
    D2 = {"sports", "equipment", "hit", "ball", "player", "game"}
    
    # Context from the sentence
    Vj = set(words_in_context)
    
    # Possible senses
    S1 = senses[0]  # mammal
    S2 = senses[1]  # sports equipment
    
    # EV sets for related words in the context
    EV1 = {"player", "animal", "creature", "wings"}
    EV2 = {"sports", "game", "hit", "ball"}
    EV3 = {"equipment", "bat", "ball", "player"}
    
    # Union of all EV sets
    EV_union = EV1.union(EV2).union(EV3)
    
    # Calculate scores for each sense
    score_mammal = len(D1.intersection(EV_union.intersection(Vj)))
    score_sports_equipment = len(D2.intersection(EV_union.intersection(Vj)))
    
    # Determine which sense the word belongs to
    if score_mammal > score_sports_equipment:
        return f"The word is used in the sense of {S1}"
    elif score_sports_equipment > score_mammal:
        return f"The word is used in the sense of {S2}"
    else:
        return "The word sense is ambiguous"

# Get input from the user
context = input("Enter a sentence containing the word 'bat': ")

# Tokenize and lemmatize the words in the context
words_in_context = nltk.word_tokenize(context)
lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in words_in_context]

# Example senses for "bat"
senses = ["mammal", "sports equipment"]

# Run the word sense disambiguation
result = word_sense_disambiguation(context, senses, lemmatized_words)
print(result)


Enter a sentence containing the word 'bat':  The player swung the bat and hit the ball out of the park


The word is used in the sense of sports equipment


In [55]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def word_sense_disambiguation(context, senses, words_in_context):
    # Definitions of senses
    D1 = {"mammal", "flies", "night", "insects", "hunting"}
    D2 = {"sports", "equipment", "hit", "ball", "player", "game"}
    
    # Context from the sentence
    Vj = set(words_in_context)
    
    # Possible senses
    S1 = senses[0]  # mammal
    S2 = senses[1]  # sports equipment
    
    # EV sets for related words in the context
    EV1_mammal = {"animal", "creature", "wings", "night", "insects", "fly", "hunting", "silent"}
    EV1_sports = {"player", "sports", "game", "hit", "ball", "equipment", "bat"}
    
    # Calculate scores for each sense
    score_mammal = len(D1.intersection(EV1_mammal.intersection(Vj)))
    score_sports_equipment = len(D2.intersection(EV1_sports.intersection(Vj)))
    
    # Determine which sense the word belongs to
    if score_mammal > score_sports_equipment:
        return f"The word is used in the sense of {S1}"
    elif score_sports_equipment > score_mammal:
        return f"The word is used in the sense of {S2}"
    else:
        return "The word sense is ambiguous"

# Get input from the user
context = input("Enter a sentence containing the word 'bat': ")

# Tokenize and lemmatize the words in the context
words_in_context = word_tokenize(context)
lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in words_in_context]

# Example senses for "bat"
senses = ["mammal", "sports equipment"]

# Run the word sense disambiguation
result = word_sense_disambiguation(context, senses, lemmatized_words)
print(result)


Enter a sentence containing the word 'bat':  The bat flew silently through the night, hunting insects with its sharp senses


The word is used in the sense of mammal
