In [16]:
# unigram based naive bayes from scratch.
import nltk 
import numpy 
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from nltk.corpus import movie_reviews 
import random
document = [(movie_reviews.words(file_id),category) for file_id in movie_reviews.fileids() for category in movie_reviews.categories(file_id)]
random.shuffle(document)
train_set, test_set = model_selection.train_test_split(document,test_size = 0.2)
pos_size = 0
neg_size = 0
token_pos_count = {}
token_neg_count = {}
for sample in train_set:
    if sample[1] == 'pos':
        for word in sample[0]:
            if word not in token_pos_count:
                token_pos_count[word] = 1
            else:
                token_pos_count[word] += 1
            pos_size += 1
    else:
        for word in sample[0]:
            if word not in token_neg_count:
                token_neg_count[word] = 1
            else:
                token_neg_count[word] += 1
            neg_size += 1

In [17]:
# Testing with naive bayes
sp = 1
correct_guesses = 0
total_guesses = 0
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
for sample in test_set:
    prob_pos = 0.0
    prob_neg = 0.0
    for word in sample[0]:
        if word in token_pos_count:
            prob_pos += numpy.log((token_pos_count.get(word) + sp) / (pos_size + sp * len(token_pos_count)))
        else:
            prob_pos += numpy.log(sp/(pos_size + sp * len(token_pos_count)))
        if word in token_neg_count:
            prob_neg += numpy.log((token_neg_count.get(word) + sp) / (neg_size + sp * len(token_neg_count)))
        else:
            prob_neg += numpy.log(sp/(neg_size + sp * len(token_neg_count)))

    if sample[1] == 'pos':
        if prob_pos >= prob_neg:
            correct_guesses += 1
            true_positive += 1
        else:
            false_negative += 1
        
    else:
        if prob_pos <= prob_neg:
            correct_guesses += 1
            true_negative += 1
        else:
            false_positive += 1
    total_guesses += 1
    
print('Accuracy', correct_guesses/total_guesses * 100, '%')
print('Precision', true_positive/ (true_positive + false_positive) * 100, '%')
print('Recall', true_positive/ (true_positive + false_negative) * 100, '%')
print('F1-Score', 200 * (true_positive/ (true_positive + false_positive) * true_positive/ (true_positive + false_negative))/(true_positive/ (true_positive + false_positive) + true_positive/ (true_positive + false_negative)), '%')

Accuracy 79.75 %
Precision 86.3905325443787 %
Recall 71.56862745098039 %
F1-Score 78.28418230563003 %


In [4]:
# play with different features. (tf-idf)
import nltk 
import numpy 
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from nltk.corpus import movie_reviews 
import random
document = [(movie_reviews.words(file_id),category) for file_id in movie_reviews.fileids() for category in movie_reviews.categories(file_id)]
random.shuffle(document)
train_set, test_set = model_selection.train_test_split(document,test_size = 0.2)
pos_size = 0
neg_size = 0
token_pos_count = {}
token_neg_count = {}
for sample in train_set:
    if sample[1] == 'pos':
        for word in sample[0]:
            if word not in token_pos_count:
                token_pos_count[word] = 1
            else:
                token_pos_count[word] += 1
            pos_size += 1
    else:
        for word in sample[0]:
            if word not in token_neg_count:
                token_neg_count[word] = 1
            else:
                token_neg_count[word] += 1
            neg_size += 1
    

In [5]:
# Testing with naive bayes
sp = 1
correct_guesses = 0
total_guesses = 0
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
shown = []

for sample in test_set:
    shown_time = {}
    for word in sample[0]:
        if word not in shown_time:
            shown_time[word] = 1
        else:
            shown_time[word] += 1
    shown.append(shown_time)
        
for sample, shown_time in zip(test_set, shown):
    prob_pos = 0.0
    prob_neg = 0.0
    for word in sample[0]:
        if word in token_pos_count and word in shown_time:
            prob_pos += shown_time[word] / len(sample[0]) * numpy.log((token_pos_count.get(word) + sp) / (pos_size + sp * len(token_pos_count))) 
        elif word not in token_pos_count and word in shown_time:
            prob_pos += numpy.log(sp/(pos_size + sp * len(token_pos_count)))
        else:
            prob_pos += 0
        if word in token_neg_count and word in shown_time:
            prob_neg += shown_time[word] / len(sample[0]) * numpy.log((token_neg_count.get(word) + sp) / (neg_size + sp * len(token_neg_count)))
        elif word not in token_neg_count and word in shown_time:
            prob_neg += numpy.log(sp/(neg_size + sp * len(token_neg_count)))
        else:
            prob_neg += 0

    if sample[1] == 'pos':
        if prob_pos >= prob_neg:
            correct_guesses += 1
            true_positive += 1
        else:
            false_negative += 1
        
    else:
        if prob_pos <= prob_neg:
            correct_guesses += 1
            true_negative += 1
        else:
            false_positive += 1
    total_guesses += 1
    
print('Accuracy', correct_guesses/total_guesses * 100, '%')
print('Precision', true_positive/ (true_positive + false_positive) * 100, '%')
print('Recall', true_positive/ (true_positive + false_negative) * 100, '%')
print('F1-Score', 200 * (true_positive/ (true_positive + false_positive) * true_positive/ (true_positive + false_negative))/(true_positive/ (true_positive + false_positive) + true_positive/ (true_positive + false_negative)), '%')

Accuracy 67.0 %
Precision 64.54545454545455 %
Recall 72.44897959183673 %
F1-Score 68.26923076923076 %


In [6]:
# Only select non-puncts.
import nltk 
import numpy 
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from nltk.corpus import movie_reviews 
import random
document = [(movie_reviews.words(file_id),category) for file_id in movie_reviews.fileids() for category in movie_reviews.categories(file_id)]
random.shuffle(document)
train_set, test_set = model_selection.train_test_split(document,test_size = 0.2)
pos_size = 0
neg_size = 0
token_pos_count = {}
token_neg_count = {}
punct_list = [',', '.', '"','\'', ';', ':', '{', '}', '[', ']', '|', '_', '-', '+', '=']
for sample in train_set:
    if sample[1] == 'pos':
        for word in sample[0]:
            if word in punct_list:
                continue
            if word not in token_pos_count:
                token_pos_count[word] = 1
            else:
                token_pos_count[word] += 1
            pos_size += 1
    else:
        for word in sample[0]:
            if word in punct_list:
                continue
            if word not in token_neg_count:
                token_neg_count[word] = 1
            else:
                token_neg_count[word] += 1
            neg_size += 1

In [7]:
# Testing with naive bayes
sp = 1
correct_guesses = 0
total_guesses = 0
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
for sample in test_set:
    prob_pos = 0.0
    prob_neg = 0.0
    for word in sample[0]:
        if word in punct_list:
            continue
        if word in token_pos_count:
            prob_pos += numpy.log((token_pos_count.get(word) + sp) / (pos_size + sp * len(token_pos_count)))
        else:
            prob_pos += numpy.log(sp/(pos_size + sp * len(token_pos_count)))
        if word in token_neg_count:
            prob_neg += numpy.log((token_neg_count.get(word) + sp) / (neg_size + sp * len(token_neg_count)))
        else:
            prob_neg += numpy.log(sp/(neg_size + sp * len(token_neg_count)))

    if sample[1] == 'pos':
        if prob_pos >= prob_neg:
            correct_guesses += 1
            true_positive += 1
        else:
            false_negative += 1
        
    else:
        if prob_pos <= prob_neg:
            correct_guesses += 1
            true_negative += 1
        else:
            false_positive += 1
    total_guesses += 1
    
print('Accuracy', correct_guesses/total_guesses * 100, '%')
print('Precision', true_positive/ (true_positive + false_positive) * 100, '%')
print('Recall', true_positive/ (true_positive + false_negative) * 100, '%')
print('F1-Score', 200 * (true_positive/ (true_positive + false_positive) * true_positive/ (true_positive + false_negative))/(true_positive/ (true_positive + false_positive) + true_positive/ (true_positive + false_negative)), '%')

Accuracy 82.0 %
Precision 84.70588235294117 %
Recall 75.78947368421053 %
F1-Score 80.0 %


In [8]:
# Only select non-puncts.
import nltk 
import numpy 
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from nltk.corpus import movie_reviews 
import random
document = [(movie_reviews.words(file_id),category) for file_id in movie_reviews.fileids() for category in movie_reviews.categories(file_id)]
random.shuffle(document)
train_set, test_set = model_selection.train_test_split(document,test_size = 0.2)
pos_size = 0
neg_size = 0
token_pos_count = {}
token_neg_count = {}
punct_list = [',', '.', '"','\'', ';', ':', '{', '}', '[', ']', '|', '_', '-', '+', '=']
for sample in train_set:
    if sample[1] == 'pos':
        for word in sample[0]:
            if word not in punct_list:
                continue
            if word not in token_pos_count:
                token_pos_count[word] = 1
            else:
                token_pos_count[word] += 1
            pos_size += 1
    else:
        for word in sample[0]:
            if word not in punct_list:
                continue
            if word not in token_neg_count:
                token_neg_count[word] = 1
            else:
                token_neg_count[word] += 1
            neg_size += 1

In [9]:
# Testing with naive bayes
sp = 1
correct_guesses = 0
total_guesses = 0
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
for sample in test_set:
    prob_pos = 0.0
    prob_neg = 0.0
    for word in sample[0]:
        if word not in punct_list:
            continue
        if word in token_pos_count:
            prob_pos += numpy.log((token_pos_count.get(word) + sp) / (pos_size + sp * len(token_pos_count)))
        else:
            prob_pos += numpy.log(sp/(pos_size + sp * len(token_pos_count)))
        if word in token_neg_count:
            prob_neg += numpy.log((token_neg_count.get(word) + sp) / (neg_size + sp * len(token_neg_count)))
        else:
            prob_neg += numpy.log(sp/(neg_size + sp * len(token_neg_count)))

    if sample[1] == 'pos':
        if prob_pos >= prob_neg:
            correct_guesses += 1
            true_positive += 1
        else:
            false_negative += 1
        
    else:
        if prob_pos <= prob_neg:
            correct_guesses += 1
            true_negative += 1
        else:
            false_positive += 1
    total_guesses += 1
    
print('Accuracy', correct_guesses/total_guesses * 100, '%')
print('Precision', true_positive/ (true_positive + false_positive) * 100, '%')
print('Recall', true_positive/ (true_positive + false_negative) * 100, '%')
print('F1-Score', 200 * (true_positive/ (true_positive + false_positive) * true_positive/ (true_positive + false_negative))/(true_positive/ (true_positive + false_positive) + true_positive/ (true_positive + false_negative)), '%')

Accuracy 58.25 %
Precision 57.89473684210527 %
Recall 55.83756345177665 %
F1-Score 56.84754521963824 %


In [10]:
# Only select non-puncts.
import nltk 
import numpy 
from nltk import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from nltk.corpus import movie_reviews 
import random
document = [(movie_reviews.words(file_id),category) for file_id in movie_reviews.fileids() for category in movie_reviews.categories(file_id)]
# random.shuffle(document)
new_document = []
pos_list = ['NN', 'NNS', 'VB', 'VBS', 'VBG', 'RB', 'JJ', 'VBP']
for sample in document:
    string = ''
    for word in sample[0]:
        string += word
        string += ' '
    string = word_tokenize(string)
    local = nltk.pos_tag(string)
    new_local = (local, sample[1])
    new_document.append(new_local)

random.shuffle(new_document)
token_pos_count = {}
token_neg_count = {}
train_set, test_set = model_selection.train_test_split(new_document,test_size = 0.2)
pos_size = 0
neg_size = 0

for sample in train_set:
    if sample[1] == 'pos':
        for word in sample[0]:
            if word[1] not in pos_list:
                continue
            if word[0] not in token_pos_count:
                token_pos_count[word[0]] = 1
            else:
                token_pos_count[word[0]] += 1
            pos_size += 1
    else:
        for word in sample[0]:
            if word[1] not in pos_list:
                continue
            if word[0] not in token_neg_count:
                token_neg_count[word[0]] = 1
            else:
                token_neg_count[word[0]] += 1
            neg_size += 1

In [12]:
# Testing with naive bayes
sp = 1
correct_guesses = 0
total_guesses = 0
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
for sample in test_set:
    prob_pos = 0.0
    prob_neg = 0.0
    for word in sample[0]:
        if word[1] not in pos_list:
            continue
        if word[0] in token_pos_count:
            prob_pos += numpy.log((token_pos_count.get(word[0]) + sp) / (pos_size + sp * len(token_pos_count)))
        else:
            prob_pos += numpy.log(sp/(pos_size + sp * len(token_pos_count)))
        if word[0] in token_neg_count:
            prob_neg += numpy.log((token_neg_count.get(word[0]) + sp) / (neg_size + sp * len(token_neg_count)))
        else:
            prob_neg += numpy.log(sp/(neg_size + sp * len(token_neg_count)))

    if sample[1] == 'pos':
        if prob_pos >= prob_neg:
            correct_guesses += 1
            true_positive += 1
        else:
            false_negative += 1
        
    else:
        if prob_pos <= prob_neg:
            correct_guesses += 1
            true_negative += 1
        else:
            false_positive += 1
    total_guesses += 1
    
print('Accuracy', correct_guesses/total_guesses * 100, '%')
print('Precision', true_positive/ (true_positive + false_positive) * 100, '%')
print('Recall', true_positive/ (true_positive + false_negative) * 100, '%')
print('F1-Score', 200 * (true_positive/ (true_positive + false_positive) * true_positive/ (true_positive + false_negative))/(true_positive/ (true_positive + false_positive) + true_positive/ (true_positive + false_negative)), '%')

Accuracy 81.75 %
Precision 84.04907975460122 %
Recall 74.45652173913044 %
F1-Score 78.96253602305475 %
