In [60]:
import math
import os
import re
from collections import Counter

import numpy

In [73]:
# Directory that holds the review files for every (split, label) pair.
PATHS = {
    'TRAIN' : {
        'POSITIVE' : './IMDB_dataset/train/pos',
        'NEGATIVE' : './IMDB_dataset/train/neg'
    },
    'TEST' : {
        'POSITIVE' : './IMDB_dataset/test/pos',
        'NEGATIVE' : './IMDB_dataset/test/neg'
    }
}

# Number of directory entries found under each path, with the same nesting as PATHS.
documents_amount = {
    split : {label : len(os.listdir(folder)) for label, folder in folders.items()}
    for split, folders in PATHS.items()
}

# Number of entries per split, summed over both labels.
total_documents = {
    split : sum(per_label.values())
    for split, per_label in documents_amount.items()
}

# Per-label model state, filled in by the training cell:
#   'WORDS'     - list that update_words_list() extends with training words
#   'FREQUENCY' - word -> count mapping written by update_frequency()
#   'PRIOR'     - log prior of the label (0 until training runs)
#   'docs'      - not read or written anywhere in the visible code
classes = {
    label : {'docs' : {}, 'WORDS' : [], 'FREQUENCY' : {}, 'PRIOR': 0}
    for label in ('POSITIVE', 'NEGATIVE')
}

# Quick sanity check of the dataset size per split.
for tipo, total in total_documents.items():
    print(f"Total de textos ({tipo}):\n{total}")

Total de textos (TRAIN):
25000
Total de textos (TEST):
25000


In [74]:
# A token is either a run of letters, hyphens and apostrophes (Latin-1 accented
# letters included) or a single punctuation mark among . , ; ! ?
regex = r"[-'a-zA-ZÀ-ÖØ-öø-ÿ]+|[.,;!?]"
# Fraction (0.05 == 5%) of the entries of a directory that update_words_list() reads.
CORPUS_USE_PERCENTAGE = 0.05

def get_document_words(document_path):
    """Read one document and return its words after negation handling.

    The text is lower-cased, split into tokens with ``regex`` and passed to
    ``negation_handling``; the result is whatever that function returns
    (a set of words, so duplicates inside one document collapse).

    Args:
        document_path: path of the text file to read.

    Returns:
        The set of (possibly ``not_``-prefixed) words of the document.
    """
    # The context manager closes the file deterministically instead of leaving
    # the handle to the garbage collector. The encoding is fixed so the result
    # does not depend on the platform default.
    # NOTE(review): assumes the dataset files are UTF-8 encoded - confirm.
    with open(document_path, 'r', encoding='utf-8') as document:
        content = document.read().lower()
    return negation_handling(re.findall(regex, content))

def update_words_list(path, words_list):
    """Append the words of the first files of a directory to ``words_list``.

    Only the first ``int(n * CORPUS_USE_PERCENTAGE)`` entries of the directory
    listing are read, where ``n`` is the number of entries in ``path``.

    Args:
        path: directory containing one text file per document.
        words_list: list extended in place with the words of each document read.
    """
    # List the directory once: calling os.listdir() inside the loop rescans the
    # whole directory for every file and indexes into a listing that could
    # change between iterations. The order is whatever os.listdir() returns.
    file_names = os.listdir(path)
    files_to_use = max(0, int(len(file_names)*CORPUS_USE_PERCENTAGE))

    for file_name in file_names[:files_to_use]:
        words_list.extend(get_document_words(path+"/"+file_name))

def update_frequency(frequency, words):
    """Store in ``frequency`` how many times each distinct word occurs in ``words``.

    Entries for words present in ``words`` are overwritten with the new count;
    entries for other words are left untouched.

    Args:
        frequency: dict mapping word -> count, updated in place.
        words: list of words to count.
    """
    # Count in a single pass. Calling words.count() once per distinct word
    # rescans the whole list every time, which is quadratic on a large corpus.
    frequency.update(Counter(words))

In [75]:
# Negation handling
punctuationRe = r"[,.;!?]"
negationRe = r"not|no|\w*n't"

def negation_handling(words):
    """Collapse a token sequence into a set of words, marking negated ones.

    Tokens are scanned in order while tracking a "negated" flag:
      * a punctuation token (``punctuationRe``) clears the flag and is dropped;
      * a negation token (``negationRe``) flips the flag and is dropped;
      * any other token is kept, prefixed with ``not_`` while the flag is set.

    Args:
        words: iterable of tokens in document order.

    Returns:
        A set with the kept (and possibly prefixed) words.
    """
    in_negated_scope = False
    marked_words = set()

    for token in words:
        if re.fullmatch(punctuationRe, token) is not None:
            # Punctuation closes the scope of any pending negation.
            in_negated_scope = False
        elif re.fullmatch(negationRe, token) is not None:
            # A second negation inside the same scope cancels the first one.
            in_negated_scope = not in_negated_scope
        else:
            marked_words.add("not_" + token if in_negated_scope else token)

    return marked_words

In [76]:
# Constant added to every word count so unseen words never get probability 0.
LAPLACIAN_SMOOTHING = 1

def likelihood_word_class(word, classe):
    """Return the smoothed likelihood of ``word`` under class ``classe``.

    Computes ``(count + LAPLACIAN_SMOOTHING) / ((LAPLACIAN_SMOOTHING + 1) * N)``
    where ``count`` is the word's entry in the class FREQUENCY table (0 when
    absent) and ``N`` is the length of the class WORDS list.

    NOTE(review): textbook additive smoothing divides by ``N + alpha * |V|``
    (``|V|`` = vocabulary size); the denominator used here is different -
    confirm that this is intended.

    Args:
        word: the word to score.
        classe: key of the class in ``classes``.

    Returns:
        The smoothed likelihood as a float.

    Raises:
        ZeroDivisionError: if the class WORDS list is empty.
    """
    class_stats = classes[classe]

    occurrences = class_stats['FREQUENCY'].get(word, 0)
    class_word_total = len(class_stats['WORDS'])

    numerator = occurrences + LAPLACIAN_SMOOTHING
    denominator = (LAPLACIAN_SMOOTHING + 1) * class_word_total
    return numerator/denominator

def likelihood_doc_class(doc_path, classe):
    """Score a document against one class, in log space.

    Adds up the natural log of ``likelihood_word_class`` for every word that
    ``get_document_words`` returns for the document, then adds the class
    'PRIOR' entry.

    Args:
        doc_path: path of the document to score.
        classe: key of the class in ``classes``.

    Returns:
        The sum of the word log-likelihoods plus the stored class prior.
    """
    log_score = 0

    for token in get_document_words(doc_path):
        token_likelihood = likelihood_word_class(token, classe)
        log_score += math.log(token_likelihood)

    class_prior = classes[classe]['PRIOR']
    return log_score + class_prior

In [77]:
# train:
#     average training time on a 7th-gen Core i7,
#     as (share of corpus used : minutes)
#      10% :  03 min
#      20% :  08 min
#      25% :  13 min
#      33% :  19 min
#      50% : +40 min
#     100% :   ? min (not measured)

# Train: compute the log prior of each class and build its word statistics.
for classe in classes:
    # Reset the per-class state first so that re-running this cell does not
    # append the training words a second time and inflate every count.
    classes[classe]['WORDS'].clear()
    classes[classe]['FREQUENCY'].clear()

    # Log prior = log(share of the training entries that belong to this class).
    classes[classe]['PRIOR'] = math.log(documents_amount['TRAIN'][classe]/total_documents['TRAIN'])
    update_words_list(PATHS['TRAIN'][classe], classes[classe]['WORDS'])
    update_frequency(classes[classe]['FREQUENCY'], classes[classe]['WORDS'])

In [78]:
# test

def test(classe, total_documents):
    """Classify every test document of one class and return the hit rate.

    Each file under ``PATHS['TEST'][classe]`` is scored against every class
    with ``likelihood_doc_class``; the prediction is the class with the
    highest score (on a tie, the first class in ``classes`` order wins).

    Args:
        classe: true class of the documents being evaluated.
        total_documents: number used as the denominator of the percentage.

    Returns:
        ``100 * correct / total_documents``.
    """
    folder = PATHS['TEST'][classe]
    hits = 0

    for file_name in os.listdir(folder):
        doc_path = folder+"/"+file_name
        scores = {
            candidate: likelihood_doc_class(doc_path, candidate)
            for candidate in classes
        }

        predicted = max(scores, key=scores.get)
        if predicted == classe:
            hits += 1

    return 100*hits/total_documents

# Per-class accuracy (in percent), keyed by class name.
accuracy = {}

# Evaluate each class on its own test directory and report as soon as it is done.
for classe in classes:
    accuracy[classe] = test(classe, documents_amount['TEST'][classe])
    print(f"\nAccuracy ({classe}) = {accuracy[classe]}%")

# Unweighted mean of the per-class accuracies.
print(f"\nAccuracy (average) = {sum(accuracy.values())/len(accuracy)}%")


Accuracy (POSITIVE) = 75.744%

Accuracy (NEGATIVE) = 85.384%

Accuracy (average) = 80.564%
