Q4

In [1]:
from nltk.corpus import senseval

# Load the serve.pos dataset
instances = senseval.instances('hard.pos')

# Print the first instance
print(instances[0])


SensevalInstance(word='hard-a', position=20, context=[('``', '``'), ('he', 'PRP'), ('may', 'MD'), ('lose', 'VB'), ('all', 'DT'), ('popular', 'JJ'), ('support', 'NN'), (',', ','), ('but', 'CC'), ('someone', 'NN'), ('has', 'VBZ'), ('to', 'TO'), ('kill', 'VB'), ('him', 'PRP'), ('to', 'TO'), ('defeat', 'VB'), ('him', 'PRP'), ('and', 'CC'), ('that', 'DT'), ("'s", 'VBZ'), ('hard', 'JJ'), ('to', 'TO'), ('do', 'VB'), ('.', '.'), ("''", "''")], senses=('HARD1',))


In [2]:
def serve_features(instance):
    # Extract the context sentence and target word position
    context = instance.context
    target_index = instance.position
    
    # Define the window size for context words
    window = 3
    
    # Extract the context words within the window around the target word
    features = {}
    for i in range(max(0, target_index - window), target_index):
        features[f"word_before_{i-target_index}"] = context[i]
    for i in range(target_index+1, min(len(context), target_index + window + 1)):
        features[f"word_after_{i-target_index-1}"] = context[i]
        
    return features

In [3]:
# Extract the features and labels for each instance
featuresets = [(serve_features(inst), inst.senses[0]) for inst in instances]

# Split the dataset into training and testing sets
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [4]:
import nltk
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.9145496535796767


Q5

In [5]:
import nltk
from nltk.corpus import movie_reviews

# Define a feature extractor for movie reviews
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in document_words)
    return features

# Load the movie reviews dataset
documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]

# Define the 2000 most frequent words as the feature set
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

# Extract the document features and split the dataset into training and testing sets
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]

# Train a Naive Bayes classifier on the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Print the 30 most informative features
print(classifier.show_most_informative_features(30))


Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.4 : 1.0
        contains(seagal) = True              neg : pos    =      8.7 : 1.0
         contains(mulan) = True              pos : neg    =      8.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.3 : 1.0
         contains(damon) = True              pos : neg    =      5.7 : 1.0
          contains(lame) = True              neg : pos    =      5.6 : 1.0
        contains(wasted) = True              neg : pos    =      5.6 : 1.0
         contains(awful) = True              neg : pos    =      5.4 : 1.0
         contains(flynt) = True              pos : neg    =      5.1 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.1 : 1.0
        contains(poorly) = True              neg : pos    =      4.9 : 1.0
         contains(waste) = True              neg : pos    =      4.9 : 1.0
          contains(jedi) = True              pos : neg    =      4.8 : 1.0