In [1]:
import numpy as np
import nltk

In [4]:
# Print the installed NumPy and NLTK versions so the environment used for this run is recorded.
print(np.__version__)
print(nltk.__version__)

1.14.2
3.2.5


In [22]:
def get_reviews(path, positive=True):
    """Load one review per line from a UTF-8 text file and tag each with a label.

    Args:
        path: Path of the text file to read.
        positive: If truthy, every review is labeled 1; otherwise 0.

    Returns:
        A list of ``(line, label)`` tuples in file order. Lines are kept
        exactly as read, including any trailing newline.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()

    # Every line in a given file shares the same sentiment label.
    sentiment = int(bool(positive))
    return [(line, sentiment) for line in lines]

In [23]:
def extract_reviews():
    """Load the positive and negative review files.

    Returns:
        A ``(positive, negative)`` tuple of lists as produced by
        ``get_reviews``: the first labeled 1, the second labeled 0.
    """
    return (
        get_reviews("AI/nlp/rt-polarity.pos", positive=True),
        get_reviews("AI/nlp/rt-polarity.neg", positive=False),
    )

In [25]:
# Load both labeled review lists once; later cells slice these into train and test sets.
positive, negative = extract_reviews()

In [61]:
# Peek at the first two negative reviews: each entry is a (text, label) tuple.
negative[:2]

[('simplistic , silly and tedious . \n', 0),
 ("it's so laddish and juvenile , only teenage boys could possibly find it funny . \n",
  0)]

In [28]:
# Number of reviews taken from EACH class for training (up to 2 * TRAIN_DATA in total).
TRAIN_DATA = 5000
# Upper slice bound for the test split, measured on the positive list only.
# NOTE(review): the same bound is applied to `negative` below; if the two lists
# differ in length, trailing negative reviews are silently dropped — confirm intended.
TOTAL_DATA = len(positive)

# Training set: the first TRAIN_DATA positives followed by the first TRAIN_DATA negatives (not shuffled).
train_reviews = positive[:TRAIN_DATA] + negative[:TRAIN_DATA]

# Test sets: the reviews after the training slice, kept separate per class.
test_positive = positive[TRAIN_DATA:TOTAL_DATA]
test_negative = negative[TRAIN_DATA:TOTAL_DATA]

In [29]:
def get_vocabulary(train):
    """Collect the distinct whitespace-separated tokens used in the training reviews.

    Args:
        train: Iterable of entries whose first element is the review text
            (e.g. ``(text, label)`` tuples).

    Returns:
        A list of the unique tokens. The order comes from a set and carries
        no meaning.
    """
    unique_tokens = {
        token
        for entry in train
        for token in entry[0].split()
    }
    return list(unique_tokens)

# Vocabulary is built from the training reviews only; extract_features reads this module-level name.
vocabulary = get_vocabulary(train_reviews)

In [30]:
# Peek at five vocabulary entries. The list is derived from a set, so which
# words appear here is arbitrary and may differ between kernel sessions.
vocabulary[:5]

['psychodrama', 'rambunctious', 'dadaist', 'warm-blooded', 'chillingly']

In [31]:
def extract_features(review_text, vocab=None):
    """Build the word-presence feature dict for one review.

    Args:
        review_text: Raw review string; it is tokenized by splitting on
            whitespace.
        vocab: Optional iterable of words to use as feature names. When
            omitted, the notebook-level ``vocabulary`` is used.

    Returns:
        A dict mapping every vocabulary word to ``True`` if that word occurs
        in the review and ``False`` otherwise.
    """
    if vocab is None:
        vocab = vocabulary

    # A set makes each per-word membership check O(1).
    review_words = set(review_text.split())

    # The dict must be returned: without it the function yields None and the
    # classifier receives no features.
    return {word: (word in review_words) for word in vocab}

In [32]:
# Apply extract_features to the labeled training reviews to build the classifier's training input.
train_features = nltk.classify.apply_features(extract_features, train_reviews)

In [60]:
# Fit NLTK's Naive Bayes classifier on the training features; sentiment_calculator reads this module-level name.
trained_classifier = nltk.NaiveBayesClassifier.train(train_features)

In [89]:
def sentiment_calculator(review_text):
    """Predict the sentiment label for a single piece of review text.

    The text is converted with ``extract_features`` and passed to the
    notebook-level ``trained_classifier``.

    Args:
        review_text: Raw review string.

    Returns:
        The label chosen by ``trained_classifier.classify``. In this notebook
        the training labels are 1 (positive) and 0 (negative).
    """
    return trained_classifier.classify(extract_features(review_text))

In [107]:
# Smoke test with a sentence that should read as positive (label 1).
sentiment_calculator("What an amazing movie!")

1

In [108]:
# Smoke test with a sentence that should read as negative (label 0).
sentiment_calculator("What a terrible movie!")

0

In [110]:
def classify_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator):
    """Classify held-out reviews and print per-class and overall accuracy.

    Args:
        test_positive_reviews: Sequence of entries whose first element is the
            text of a review known to be positive.
        test_negative_reviews: Sequence of entries whose first element is the
            text of a review known to be negative.
        sentiment_calculator: Callable taking review text and returning 1 for
            positive or 0 for negative.

    Returns:
        None. Three lines are printed: accuracy on the positive reviews,
        accuracy on the negative reviews, and overall accuracy. A split with
        no reviews is reported as ``n/a`` instead of raising
        ZeroDivisionError.
    """
    positive_results = [sentiment_calculator(review[0]) for review in test_positive_reviews]
    negative_results = [sentiment_calculator(review[0]) for review in test_negative_reviews]

    # A positive review is correct when predicted 1, a negative one when predicted 0.
    true_positives = sum(x == 1 for x in positive_results)
    true_negatives = sum(x == 0 for x in negative_results)

    positive_count = len(positive_results)
    negative_count = len(negative_results)
    total = positive_count + negative_count
    total_accurate = true_positives + true_negatives

    # Each percentage is None when its denominator would be zero.
    percent_true_positive = float(true_positives) / positive_count * 100 if positive_count else None
    percent_true_negative = float(true_negatives) / negative_count * 100 if negative_count else None
    percent_overall = total_accurate * 100 / total if total else None

    def _format_percent(value):
        # Render a percentage with two decimals, or "n/a" when undefined.
        return "n/a" if value is None else "%.2f" % value + "%"

    print("Accuracy on positive reviews = " + _format_percent(percent_true_positive))
    print("Accuracy on negative reviews = " + _format_percent(percent_true_negative))
    print("Overall accuracy = " + _format_percent(percent_overall))

In [112]:
# Evaluate the trained classifier on the reviews that were not used for training.
classify_test_reviews(test_positive, test_negative, sentiment_calculator)