# 13th June 2024 - NLP - Module 5

In [1]:
import nltk
from nltk.corpus import movie_reviews

Text Pre Processing

In [2]:
len(movie_reviews.words())

1583820

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
movie_reviews.fileids()[:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

We remove the punctuations:

In [5]:
text = " ".join(movie_reviews.words())
import string
text_filtered = text.translate(str.maketrans('', '', string.punctuation))

Then remove the stopwords and then make all words lowercase.

In [6]:
from nltk import word_tokenize
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
tokens = word_tokenize(text_filtered)
word_filtered = [w.lower() for w in tokens if w not in stopwords]

We can use FreqDist() function on NLTK to have a dictionary of frequency of apperance of word in a text.

In [7]:
counter_dict = nltk.FreqDist(word_filtered)

In [8]:
print(counter_dict.most_common(15))

[('film', 9519), ('one', 5853), ('movie', 5774), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2170), ('would', 2110), ('much', 2050), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1912), ('well', 1906)]


In [9]:
docs = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

The above section of the code can be translated to: 

In every category (we have either pos or neg), take the entire file IDs (every review has own ID), 

Store the word_tokenized version (list of words) for the file ID, and then followed by the positive or negative label in one big list.

In [None]:
docs

Feature Extraction

We create a list containing 3000 most frequent words in the documents.

In [11]:
word_features = [w[0] for w in counter_dict.most_common(3000)]

In [None]:
word_features

We consider existence/non-existence of these words in each of reviews as features by defining the following function.


In [13]:
def search_features(doc):
    words = set(doc)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

We can test the function on the first review (docs was defined before as the list of tuples containing reviews with their corresponding sentiments).

In [None]:
search_features(docs[0][0])

These are only the first 14 words in the word_features variable (containing 3000 words) for the first review. 


Applying the function to all the reviews.


In [15]:
featureset = [(search_features(doc), category) for (doc, category) in docs]

Print the first element from featureset and explore the component inside it

In [16]:
print(featureset[0][0])



Training and testing set

Before we can train and test our algorithm, we should first split our data into training and test sets. 

Since our dataset has been shuffled, the first 1600 shuffled reviews (consisting both positive and negative reviews) will be used as the training set. 

The remaining 20% (400 reviews) will be used to perform the test. 


In [17]:
training_set = featureset[:1600]
testing_set = featureset[1600:]

In [None]:
testing_set

In [19]:
count =0
pcount=0
for value in training_set:
    if value[1] == 'neg':
        count +=1
    elif value[1] == 'pos':
        pcount +=1
print(count)
print(pcount)

1000
600


# training set

In [20]:
ncount =0
pcount=0
for value in training_set:
    if value[1] == 'neg':
        ncount +=1
    elif value[1] == 'pos':
        pcount +=1
print(ncount)
print(pcount)

# Calculate total number of reviews
total_featureset = len(training_set)

# Calculate percentages
percentage_pos = ( pcount/ total_featureset) * 100
percentage_neg = (ncount / total_featureset) * 100
print(percentage_pos)
print(percentage_neg)

1000
600
37.5
62.5


# testing set

In [21]:
ncount =0
pcount=0
for value in testing_set:
    if value[1] == 'neg':
        ncount +=1
    elif value[1] == 'pos':
        pcount +=1
print(ncount)
print(pcount)

# Calculate total number of reviews
total_featureset = len(testing_set)

# Calculate percentages
percentage_pos = ( pcount/ total_featureset) * 100
percentage_neg = (ncount / total_featureset) * 100
print(percentage_pos)
print(percentage_neg)



0
400
100.0
0.0


## using shuffle

In [22]:
import random
random.shuffle(featureset)
training_set = featureset[:1600]
testing_set = featureset[1600:]

# training set after shuffle

In [23]:
print("training set after shuffle")
ncount =0
pcount=0
for value in training_set:
    if value[1] == 'neg':
        ncount +=1
    elif value[1] == 'pos':
        pcount +=1
print(ncount)
print(pcount)

# Calculate total number of reviews
total_featureset = len(training_set)

# Calculate percentages
percentage_pos = ( pcount/ total_featureset) * 100
percentage_neg = (ncount / total_featureset) * 100
print(percentage_pos)
print(percentage_neg)

training set after shuffle
808
792
49.5
50.5


# testing set after shuffle

In [24]:
print("testing set after shuffle")
ncount =0
pcount=0
for value in testing_set:
    if value[1] == 'neg':
        ncount +=1
    elif value[1] == 'pos':
        pcount +=1
print(ncount)
print(pcount)

# Calculate total number of reviews
total_featureset = len(testing_set)

# Calculate percentages
percentage_pos = ( pcount/ total_featureset) * 100
percentage_neg = (ncount / total_featureset) * 100
print(percentage_pos)
print(percentage_neg)


testing set after shuffle
192
208
52.0
48.0


# Training

In [25]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [26]:
print("Classifier's training accuracy is: {}".format(nltk.classify.accuracy(classifier, training_set)* 100))
print("Classifier's testing accuracy is: {}".format(nltk.classify.accuracy(classifier, testing_set)* 100))

Classifier's training accuracy is: 87.5625
Classifier's testing accuracy is: 83.25


### Most informative words

In [27]:
report = classifier.show_most_informative_features(300)

Most Informative Features
               ludicrous = True              neg : pos    =     10.8 : 1.0
             outstanding = True              pos : neg    =      9.4 : 1.0
                   mulan = True              pos : neg    =      9.2 : 1.0
            breathtaking = True              pos : neg    =      8.6 : 1.0
                   damon = True              pos : neg    =      8.4 : 1.0
                  welles = True              neg : pos    =      7.5 : 1.0
                  finest = True              pos : neg    =      6.6 : 1.0
                religion = True              pos : neg    =      6.6 : 1.0
             wonderfully = True              pos : neg    =      6.5 : 1.0
                 freddie = True              neg : pos    =      6.5 : 1.0
                  seagal = True              neg : pos    =      6.5 : 1.0
                 idiotic = True              neg : pos    =      6.4 : 1.0
                  prinze = True              neg : pos    =      6.1 : 1.0

In [28]:
important_features = classifier.most_informative_features(300)


In [None]:
important_features

In [30]:

# Function to get top features with highest pos:neg and neg:pos ratios
def get_top_ratios(classifier, n=300, top_n=10):
    # Get most informative features
    most_informative = classifier.most_informative_features(n)

    top_pos_neg = []
    top_neg_pos = []

    for fname, _ in most_informative:
        pos_ratio = classifier._feature_probdist['pos', fname].prob(True) / \
                    classifier._feature_probdist['neg', fname].prob(True)
        neg_ratio = classifier._feature_probdist['neg', fname].prob(True) / \
                    classifier._feature_probdist['pos', fname].prob(True)

        top_pos_neg.append((fname, pos_ratio))
        top_neg_pos.append((fname, neg_ratio))

    # Sort features by ratios
    top_pos_neg.sort(key=lambda x: x[1], reverse=True)
    top_neg_pos.sort(key=lambda x: x[1], reverse=True)

    # Select top features
    top_pos_neg = top_pos_neg[:top_n]
    top_neg_pos = top_neg_pos[:top_n]

    return top_pos_neg, top_neg_pos

# Get top 10 features with highest pos:neg and neg:pos ratios
top_pos_neg_features, top_neg_pos_features = get_top_ratios(classifier, n=300, top_n=10)

print("Top 10 features with highest pos:neg ratios:")
for feature, ratio in top_pos_neg_features:
    print(f"{feature}: {ratio:.2f}")

print("\nTop 10 features with highest neg:pos ratios:")
for feature, ratio in top_neg_pos_features:
    print(f"{feature}: {ratio:.2f}")


Top 10 features with highest pos:neg ratios:
outstanding: 9.37
mulan: 9.18
breathtaking: 8.60
damon: 8.37
finest: 6.58
religion: 6.56
wonderfully: 6.54
flynt: 5.78
lebowski: 5.78
anger: 5.10

Top 10 features with highest neg:pos ratios:
ludicrous: 10.78
welles: 7.52
freddie: 6.47
seagal: 6.47
idiotic: 6.43
prinze: 6.08
lifeless: 5.77
wasted: 5.55
schumacher: 5.29
inept: 5.20


In [31]:
from nltk.classify import NaiveBayesClassifier

# Assuming you have a training set prepared
# Example: training_set = [({'feature1': True, 'feature2': False}, 'pos'), ...]

# Train your Naive Bayes classifier
classifier = NaiveBayesClassifier.train(training_set)

# Function to get top features with highest pos:neg and neg:pos ratios
def get_top_ratios(classifier, n=300, top_n=25):
    # Get most informative features
    most_informative = classifier.most_informative_features(n)

    top_pos_neg = []
    top_neg_pos = []

    for fname, _ in most_informative:
        pos_ratio = classifier._feature_probdist['pos', fname].prob(True) / \
                    classifier._feature_probdist['neg', fname].prob(True)
        neg_ratio = classifier._feature_probdist['neg', fname].prob(True) / \
                    classifier._feature_probdist['pos', fname].prob(True)

        top_pos_neg.append((fname, pos_ratio))
        top_neg_pos.append((fname, neg_ratio))

    # Sort features by ratios
    top_pos_neg.sort(key=lambda x: x[1], reverse=True)
    top_neg_pos.sort(key=lambda x: x[1], reverse=True)

    # Select top features
    top_pos_neg = top_pos_neg[:top_n]
    top_neg_pos = top_neg_pos[:top_n]

    # Extract feature names
    top_features = [feature for feature, _ in top_pos_neg] + [feature for feature, _ in top_neg_pos]

    return top_features

# Get top 50 features (25 from highest pos:neg and 25 from highest neg:pos ratios)
top_features = get_top_ratios(classifier, n=300, top_n=50)

print("Top 100 features for training:")
print(top_features)


Top 100 features for training:
['outstanding', 'mulan', 'breathtaking', 'damon', 'finest', 'religion', 'wonderfully', 'flynt', 'lebowski', 'anger', 'tucker', 'miles', 'beautifully', 'portrayed', 'terrific', 'fantastic', 'era', 'na', 'spacey', 'ordinary', 'allows', 'nomination', 'colors', 'decades', 'delightful', 'hanks', 'diaz', 'innocence', 'memorable', 'german', 'friendship', 'mature', 'jedi', 'stiller', 'portrayal', 'contrast', 'subtle', 'refreshing', 'nevertheless', 'perfectly', 'ripley', 'tony', 'excellent', 'cameron', 'remarkable', 'joy', 'lonely', 'endearing', 'conventional', 'visually', 'ludicrous', 'welles', 'freddie', 'seagal', 'idiotic', 'prinze', 'lifeless', 'wasted', 'schumacher', 'inept', 'lame', 'poorly', 'alicia', 'embarrassing', 'tedious', 'bore', 'ridiculous', 'sat', 'awful', 'alas', 'garbage', 'whatsoever', 'random', 'inane', 'unfunny', 'worst', 'bland', 'dull', 'stupid', 'breasts', 'mess', 'painfully', 'anywhere', 'waste', 'pointless', 'jolie', 'eve', 'snake', 'laug

In [32]:
word_features = top_features

In [33]:
def search_features(doc):
    words = set(doc)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [34]:
search_features(docs[0][0])

{'outstanding': False,
 'mulan': False,
 'breathtaking': False,
 'damon': False,
 'finest': False,
 'religion': False,
 'wonderfully': False,
 'flynt': False,
 'lebowski': False,
 'anger': False,
 'tucker': False,
 'miles': False,
 'beautifully': False,
 'portrayed': False,
 'terrific': False,
 'fantastic': False,
 'era': False,
 'na': False,
 'spacey': False,
 'ordinary': False,
 'allows': False,
 'nomination': False,
 'colors': False,
 'decades': False,
 'delightful': False,
 'hanks': False,
 'diaz': False,
 'innocence': False,
 'memorable': False,
 'german': False,
 'friendship': False,
 'mature': False,
 'jedi': False,
 'stiller': False,
 'portrayal': False,
 'contrast': False,
 'subtle': False,
 'refreshing': False,
 'nevertheless': False,
 'perfectly': False,
 'ripley': False,
 'tony': False,
 'excellent': False,
 'cameron': False,
 'remarkable': False,
 'joy': False,
 'lonely': False,
 'endearing': False,
 'conventional': False,
 'visually': False,
 'ludicrous': False,
 'welles'

In [35]:
featureset = [(search_features(doc), category) for (doc, category) in docs]

In [36]:
training_set = featureset[:1600]
testing_set = featureset[1600:]

In [37]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [38]:
print("Classifier's training accuracy is: {}".format(nltk.classify.accuracy(classifier, training_set)* 100))
print("Classifier's testing accuracy is: {}".format(nltk.classify.accuracy(classifier, testing_set)* 100))

Classifier's training accuracy is: 81.5625
Classifier's testing accuracy is: 79.25


# Saving

In [39]:
import pickle
save_classifier = open("naive_bayes_model.pkl","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [40]:
classifier_f = open("naive_bayes_model.pkl","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

In [41]:
custom_review = "I hated the restaurant. It was a disaster eating there. Poor service, arrogant waiters."



In [42]:
from nltk import word_tokenize
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = document_features(custom_review_tokens)
print(classifier.classify(custom_review_set))



NameError: name 'document_features' is not defined

In [None]:
prob_result = classifier.prob_classify(custom_review_set)
print(prob_result.max())
print(prob_result.prob("pos"))
print(prob_result.prob("neg"))

NameError: name 'custom_review_set' is not defined