In [1]:
import os
import re
import numpy
import math

In [2]:
# Location of the IMDB review folders, keyed by split and then by class label.
PATHS = {
    'TRAIN' : {
        'POSITIVE' : './IMDB_dataset/train/pos',
        'NEGATIVE' : './IMDB_dataset/train/neg'
    },
    'TEST' : {
        'POSITIVE' : './IMDB_dataset/test/pos',
        'NEGATIVE' : './IMDB_dataset/test/neg'
    }
}

# Number of directory entries found in each folder of PATHS (same nesting).
documents_amount = {
    split : {
        label : len(os.listdir(folder))
        for label, folder in folders.items()
    }
    for split, folders in PATHS.items()
}

# Number of documents per split, summed over both classes.
total_documents = {
    split : sum(per_class.values())
    for split, per_class in documents_amount.items()
}

# Per-class model state. Every class gets its own fresh containers:
#   'WORDS' / 'FREQUENCY' : token list and token -> count map (list-based model)
#   'PRIOR'               : overwritten by the training cell
#   'NGRAMS'              : n-gram -> count maps under '1', '2', '3' and the
#                           running number of counted n-grams under 'TOTAL_<n>'
classes = {
    label : {
        'docs' : {},
        'WORDS' : [],
        'FREQUENCY' : {},
        'PRIOR': 0,
        'NGRAMS': {
            '1': {},
            '2': {},
            '3': {},
            'TOTAL_1': 0,
            'TOTAL_2': 0,
            'TOTAL_3': 0
        }
    }
    for label in ('POSITIVE', 'NEGATIVE')
}

for tipo, total in total_documents.items():
    print(f"Total de textos ({tipo}):\n{total}")

Total de textos (TRAIN):
25000
Total de textos (TEST):
25000


In [3]:
# Token pattern: runs of letters (ASCII and Latin-1 accented), hyphens and
# apostrophes, or a single punctuation mark from ".,;!?".
regex = r"[-'a-zA-ZÀ-ÖØ-öø-ÿ]+|[.,;!?]"
# Fraction of each training folder that the update_words_list* helpers read.
CORPUS_USE_PERCENTAGE = 0.01

def get_document_words(document_path):
    """Read a document and return its negation-handled token list.

    The file is read as UTF-8, lower-cased, split into tokens with `regex`
    and then passed through `negation_handling`.

    Args:
        document_path: path of the text file to read.

    Returns:
        The list returned by `negation_handling` for the document's tokens.
    """
    # The context manager closes the handle deterministically; the previous
    # open(...).read() left that to the garbage collector.
    with open(document_path, 'r', encoding="UTF-8") as document:
        content = document.read().lower()
    return negation_handling(re.findall(regex, content))

def update_words_list(path, words_list):
    """Append the tokens of a sample of the documents in `path` to `words_list`.

    Only the first `CORPUS_USE_PERCENTAGE` fraction of the directory listing
    is read.

    Args:
        path: directory containing the documents.
        words_list: list extended in place with every token read.
    """
    # List the directory once: the previous version called os.listdir() in the
    # loop bound and again on every iteration. os.listdir() returns entries in
    # arbitrary order, so the sampled subset depends on the file system.
    file_names = os.listdir(path)
    sample_size = max(0, int(len(file_names)*CORPUS_USE_PERCENTAGE))

    for file_name in file_names[:sample_size]:
        words_list.extend(get_document_words(os.path.join(path, file_name)))
        
def update_words_list_ngram(path, classe):
    """Accumulate unigram, bigram and trigram counts for one class.

    Reads the first `CORPUS_USE_PERCENTAGE` fraction of the directory listing
    of `path` and updates `classes[classe]['NGRAMS']` in place: the maps under
    '1', '2' and '3' receive per-n-gram counts and 'TOTAL_<n>' is increased by
    the number of n-grams seen. Unigram keys are plain tokens; bigram and
    trigram keys are tuples of tokens. N-grams never span two documents.

    Args:
        path: directory containing the training documents of the class.
        classe: key of the class in `classes` whose counters are updated.
    """
    ngrams = classes[classe]['NGRAMS']

    # List the directory once instead of on every iteration. os.listdir()
    # returns entries in arbitrary order, so the sampled subset depends on the
    # file system.
    file_names = os.listdir(path)
    sample_size = max(0, int(len(file_names)*CORPUS_USE_PERCENTAGE))

    for file_name in file_names[:sample_size]:
        words = get_document_words(os.path.join(path, file_name))

        for order in (1, 2, 3):
            counts = ngrams[str(order)]
            total_key = 'TOTAL_' + str(order)

            # A document shorter than `order` yields an empty range.
            for start in range(len(words) - order + 1):
                if order == 1:
                    gram = words[start]
                else:
                    gram = tuple(words[start:start + order])
                counts[gram] = counts.get(gram, 0) + 1
                ngrams[total_key] += 1

def update_frequency(frequency, words):
    """Store the number of occurrences of each word of `words` in `frequency`.

    Existing entries for words present in `words` are overwritten (not added
    to); entries for other words are left untouched.

    Args:
        frequency: dict updated in place, mapping word -> occurrence count.
        words: iterable of hashable tokens.
    """
    # Count in a single pass; calling words.count() once per distinct word is
    # quadratic in the corpus size and requires `words` to be a list.
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1

    for word, count in counts.items():
        frequency[word] = count

In [4]:
# Negation handling
punctuationRe = r"[,.;!?]"
negationRe = r"not|no|\w*n't"

def negation_handling(words):
    """Mark the tokens that fall inside a negation scope.

    Tokens are scanned left to right. A token that fully matches
    `negationRe` toggles the negation state and is dropped; a token that
    fully matches `punctuationRe` clears the state and is dropped. Every
    other token is kept, prefixed with "not_" while the state is active.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with the kept (and possibly prefixed) tokens.
    """
    marked = []
    in_negation = False

    for token in words:
        if re.fullmatch(punctuationRe, token) is not None:
            # Punctuation ends any open negation scope.
            in_negation = False
        elif re.fullmatch(negationRe, token) is not None:
            # A second negation word inside the scope switches it off again.
            in_negation = not in_negation
        else:
            marked.append("not_" + token if in_negation else token)

    return marked

In [5]:
# Additive smoothing constant used by the likelihood_word_class* functions.
LAPLACIAN_SMOOTHING = 1

def likelihood_word_class(word, classe):
    """Return the smoothed likelihood of `word` under class `classe`.

    Uses the list-based model: the count comes from
    `classes[classe]['FREQUENCY']` (0 when the word is absent) and the
    normaliser from the length of `classes[classe]['WORDS']`.

    NOTE(review): the denominator is (LAPLACIAN_SMOOTHING + 1) * total tokens,
    which is not the usual add-alpha form (total + alpha * vocabulary size) --
    confirm whether this is intentional.

    Raises:
        ZeroDivisionError: if the class has no words.
    """
    class_data = classes[classe]
    occurrences = class_data['FREQUENCY'].get(word, 0)
    corpus_size = len(class_data['WORDS'])

    numerator = occurrences + LAPLACIAN_SMOOTHING
    denominator = (LAPLACIAN_SMOOTHING + 1) * corpus_size
    return numerator/denominator

def likelihood_word_class_ngram(word, classe, n):
    """Return the smoothed likelihood of an n-gram under class `classe`.

    Args:
        word: the n-gram key as stored in `classes[classe]['NGRAMS'][n]`.
        classe: key of the class in `classes`.
        n: n-gram order as a string ('1', '2' or '3'); it selects both the
            count map and the matching 'TOTAL_<n>' counter.

    NOTE(review): the denominator is (LAPLACIAN_SMOOTHING + 1) * total n-grams,
    which is not the usual add-alpha form (total + alpha * vocabulary size) --
    confirm whether this is intentional.

    Raises:
        ZeroDivisionError: if no n-gram of that order was counted for the class.
    """
    ngram_data = classes[classe]['NGRAMS']
    occurrences = ngram_data[n].get(word, 0)
    corpus_size = ngram_data['TOTAL_' + n]

    numerator = occurrences + LAPLACIAN_SMOOTHING
    denominator = (LAPLACIAN_SMOOTHING + 1) * corpus_size
    return numerator/denominator

def likelihood_doc_class(doc_path, classe):
    """Score a document against a class using the list-based word model.

    Args:
        doc_path: path of the document to score.
        classe: key of the class in `classes`.

    Returns:
        The sum of the natural logs of `likelihood_word_class` over every
        token of the document, plus `classes[classe]['PRIOR']`.
    """
    tokens = get_document_words(doc_path)

    # Accumulate strictly left to right so float rounding stays unchanged.
    log_score = 0
    for token in tokens:
        word_probability = likelihood_word_class(token, classe)
        log_score = log_score + math.log(word_probability)

    return log_score + classes[classe]['PRIOR']

def likelihood_doc_class_ngram(doc_path, classe, orders=('1',)):
    """Score a document against a class using the n-gram model.

    Args:
        doc_path: path of the document to score.
        classe: key of the class in `classes`.
        orders: n-gram orders that contribute to the score, given as the
            string keys used in `classes[classe]['NGRAMS']` ('1', '2', '3').
            The default scores unigrams only; pass e.g. ('1', '2') to add
            bigram terms as well.

    Returns:
        The sum of the natural logs of `likelihood_word_class_ngram` over
        every n-gram of the requested orders, plus `classes[classe]['PRIOR']`.
    """
    words = get_document_words(doc_path)
    likelihood = 0

    for order in orders:
        size = int(order)
        # A document shorter than `size` yields an empty range.
        for start in range(len(words) - size + 1):
            # Unigrams are keyed by the bare token, higher orders by a tuple,
            # matching how update_words_list_ngram stores them.
            if size == 1:
                gram = words[start]
            else:
                gram = tuple(words[start:start + size])
            likelihood += math.log(likelihood_word_class_ngram(gram, classe, order))

    return likelihood + classes[classe]['PRIOR']

In [6]:
# train:
#     average time @ core i7, 7th gen
#     (x%  corpus: y min)
#     10% : 03 min
#     20% : 08 min
#     25% : 13 min
#     33% : 19 min
#     50% :+40 min
#    100% :  ? min
#
#     average time @ core i5, 5th gen
#     (x%  corpus: y min)
#     5% : 06 min

for classe in classes:
    # Log prior: share of the training documents that belong to this class.
    classes[classe]['PRIOR'] = math.log(documents_amount['TRAIN'][classe]/total_documents['TRAIN'])

    # Start from empty counters so that re-running this cell does not add a
    # second round of counts on top of the previous run.
    classes[classe]['NGRAMS'] = {
        '1': {},
        '2': {},
        '3': {},
        'TOTAL_1': 0,
        'TOTAL_2': 0,
        'TOTAL_3': 0
    }

    # List-based training path (fills 'WORDS' / 'FREQUENCY', which
    # likelihood_word_class reads); currently disabled.
    #update_words_list(PATHS['TRAIN'][classe], classes[classe]['WORDS'])
    #update_frequency(classes[classe]['FREQUENCY'], classes[classe]['WORDS'])
    update_words_list_ngram(PATHS['TRAIN'][classe], classe)

In [7]:
# test

def test(classe, total_documents):
    """Return the percentage of `classe` test documents classified as `classe`.

    Every file in `PATHS['TEST'][classe]` is scored against each class with
    `likelihood_doc_class`; the predicted class is the one with the highest
    score (the first one in `classes` order on a tie).

    Args:
        classe: key of the true class of the documents being tested.
        total_documents: number used as the denominator of the percentage.

    Returns:
        100 * (correct predictions) / total_documents.
    """
    folder = PATHS['TEST'][classe]
    hits = 0

    for name in os.listdir(folder):
        document = folder+"/"+name
        scores = {
            candidate: likelihood_doc_class(document, candidate)
            for candidate in classes
        }
        predicted = max(scores, key=scores.get)
        if predicted == classe:
            hits += 1

    return 100*hits/total_documents

def test_ngram(classe, total_documents):
    """Return the percentage of `classe` test documents classified as `classe`.

    Same procedure as `test`, but documents are scored with
    `likelihood_doc_class_ngram`. The predicted class is the one with the
    highest score (the first one in `classes` order on a tie).

    Args:
        classe: key of the true class of the documents being tested.
        total_documents: number used as the denominator of the percentage.

    Returns:
        100 * (correct predictions) / total_documents.
    """
    directory = PATHS['TEST'][classe]
    correct_tests = 0

    for entry in os.listdir(directory):
        document = directory+"/"+entry

        scores = {}
        for candidate in classes:
            scores[candidate] = likelihood_doc_class_ngram(document, candidate)

        best = max(scores, key=scores.get)
        if best != classe:
            continue
        correct_tests += 1

    return 100*correct_tests/total_documents

# Per-class accuracy (in percent) of the n-gram classifier on the test split.
accuracy = {}

for classe in classes:
    class_accuracy = test_ngram(classe, documents_amount['TEST'][classe])
    accuracy[classe] = class_accuracy
    print(f"\nAccuracy ({classe}) = {class_accuracy}%")

# Unweighted mean of the per-class accuracies.
mean_accuracy = sum(accuracy.values())/len(accuracy)
print(f"\nAccuracy (average) = {mean_accuracy}%")


Accuracy (POSITIVE) = 68.536%

Accuracy (NEGATIVE) = 77.88%

Accuracy (average) = 73.208%
