In [60]:
import os
import re
import math
from nltk.tokenize import sent_tokenize, word_tokenize

all_words = set()

def create_corp(path, flag_test = False):
    all_texts = ''
    all_filenames = os.listdir(path)
    files_to_read = all_filenames[math.floor(len(all_filenames) * 0.1):] if flag_test else all_filenames[:math.floor(len(all_filenames) * 0.9)]

    for filename in files_to_read:
       with open(os.path.join(path, filename), 'r', encoding="utf8") as f:
           all_texts += f.read()

    res = []
    for sent in sent_tokenize(all_texts):
        words = word_tokenize(re.sub('[^А-Яа-я]', " ", sent.lower()))
        all_words.update(words)
        res.append(words)

    return res

def text_2_features(text):
    return {word: (word in text) for word in all_words}

def process(sentences, mood):
    return [(text_2_features(sent), mood) for sent in sentences]


corpus = process(create_corp("bayes-corpus/pos"), "positive") + process(create_corp("bayes-corpus/neg"), "negative")

In [61]:
from nltk import NaiveBayesClassifier

model = NaiveBayesClassifier.train(corpus)

In [62]:
model.show_most_informative_features(10)

Most Informative Features
                  брандо = True           positi : negati =     44.8 : 1.0
                корлеоне = True           positi : negati =     35.6 : 1.0
                   пьюзо = True           positi : negati =     34.9 : 1.0
                    дона = True           positi : negati =     29.5 : 1.0
                   семьи = True           positi : negati =     28.1 : 1.0
                   марио = True           positi : negati =     25.7 : 1.0
                   мафии = True           positi : negati =     25.7 : 1.0
                  майкла = True           positi : negati =     24.1 : 1.0
                    форд = True           positi : negati =     21.1 : 1.0
                  марлон = True           positi : negati =     20.5 : 1.0


In [63]:
print(model.classify({'мне': True, 'не': True, 'очень': True}))
print(model.classify({'мне': True, 'не': False, 'очень': True, 'понравилось': True}))

negative
positive


In [64]:
pd = model.prob_classify(({'мне': True, 'не': False, 'очень': True, 'понравилось': True}))
print(pd.prob('positive'))
print(pd.prob('negative'))

0.6972101088087337
0.30278989119126526


In [65]:
test_corpus = process(create_corp("bayes-corpus/pos", True), "positive") + process(create_corp("bayes-corpus/neg", True), "negative")
correct = 0;

for sent, mood in test_corpus:
     if model.classify(sent) == mood:
         correct += 1

print(correct/len(test_corpus))


0.5429193037974683
