In [1]:
# Import necessary libraries
import nltk 
from nltk.corpus import stopwords
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

In [2]:
# Download necessary datasets
nltk.download('movie_reviews')  # Movie review dataset
nltk.download('stopwords')      # Stopwords for preprocessing
nltk.download('punkt')  # Tokenizer for text processing

[nltk_data] Downloading package movie_reviews to C:\Users\Jarshana
[nltk_data]     Shrestha\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jarshana
[nltk_data]     Shrestha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Jarshana
[nltk_data]     Shrestha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load Dataset
# Extract movie reviews and their associated labels
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

#### Tokenization

In [4]:
# Example of tokenizing one document for clarity
sample_document = documents[0][0]  # Get the first document (words)
print(f"Original Document: {sample_document[:20]}")  # Print the first 20 words for reference

Original Document: ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']


#### Lowercasing

In [5]:
# Convert all words to lowercase
lowercased_words = [word.lower() for word in sample_document]
print(f"Lowercased Words: {lowercased_words[:20]}")

Lowercased Words: ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']


#### Removing Non-Alphabetic Tokens

In [6]:
# Remove words that are not purely alphabetic
alphabetic_words = [word for word in lowercased_words if 
word.isalpha()]
print(f"Alphabetic Words: {alphabetic_words[:20]}")

Alphabetic Words: ['plot', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', 'drink', 'and', 'then', 'drive', 'they', 'get', 'into', 'an', 'accident', 'one', 'of']


#### Stop Word Removal

In [7]:
# Load stopwords and remove them from the dataset
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in alphabetic_words if word not in stop_words]
print(f"Filtered Words (No Stopwords): {filtered_words[:20]}")

Filtered Words (No Stopwords): ['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend', 'continues', 'see', 'life', 'nightmares', 'deal']


#### Feature Extraction

In [15]:
# Create a list of all words in the dataset
all_words = nltk.FreqDist(word.lower() for word in movie_reviews.words() if word.isalpha())
all_words

FreqDist({'the': 76529, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, 'is': 25195, 'in': 21822, 's': 18513, 'it': 16107, 'that': 15924, ...})

In [16]:
# Use the 2000 most common words as features
word_features = list(all_words.keys())[:2000]

# feature extractor function
def document_features(document):
    document_words = set(document)
    features = {word: (word in document_words) for word in word_features}
    return features

# Preprocess all documents
preprocessed_documents = []
for (doc, category) in documents:  
    filtered_words = [word.lower() for word in doc if word.isalpha() and word.lower() not in stop_words] 
    features = document_features(filtered_words) 
    preprocessed_documents.append((features, category))

#### Training a Naive Bayes Classifier

In [17]:
# Split the dataset into training and testing sets (80% training, 20% testing)
train_set = preprocessed_documents[:1600]
test_set = preprocessed_documents[1600:]

In [18]:
# Train the Classifier
classifier = NaiveBayesClassifier.train(train_set)

#### Testing and Evaluating the Model

In [19]:
# Evaluate the Model
print(f"Accuracy: {accuracy(classifier, test_set) * 100:.2f}%")

Accuracy: 74.25%


In [20]:
# Test with New Data
test_review = "This movie was absolutely great, with great performances and a good story."
test_tokens = nltk.word_tokenize(test_review)
test_words = [word.lower() for word in test_tokens if word.isalpha() and word.lower() not in stop_words]
test_features = document_features(test_words)

print(f"Prediction for test review: {classifier.classify(test_features)}")

Prediction for test review: neg


In [21]:
# Display the Most Informative Features
print("\nMost Informative Features:")
classifier.show_most_informative_features(10)


Most Informative Features:
Most Informative Features
                   chick = True              neg : pos    =      8.6 : 1.0
                 frances = True              pos : neg    =      8.3 : 1.0
              undercover = True              neg : pos    =      7.8 : 1.0
              derivative = True              neg : pos    =      7.0 : 1.0
                  inject = True              neg : pos    =      7.0 : 1.0
                 justify = True              neg : pos    =      6.2 : 1.0
                   banal = True              neg : pos    =      5.8 : 1.0
                bothered = True              neg : pos    =      5.8 : 1.0
                     ugh = True              neg : pos    =      5.8 : 1.0
                   waste = True              neg : pos    =      5.7 : 1.0
