# Naive Bayes Legal Document Classifier

## Import common libraries

In [1]:
import numpy as np
import os, sys
import pandas as pd
import glob
from pymongo import MongoClient

print('Libraries loaded')

Libraries loaded


## Start database connection

In [2]:
conn = MongoClient()
db = conn.lexit
collection = db.judgements
print ('Database successfully connected')

Database successfully connected


## Define feature suite factory procedures

In [43]:
### Create category folders 
#folders = create_label_folders(collection)
#print ('PASTAS OK')

### Store record according category
#export_record_to_category()
#print ('Arquivos OK')

def get_features_suite_01(judgement):
    return judgement['ementa'].strip().replace("\n", "")

def get_features_suite_02(judgement):
    return (
        judgement['paginaInternaTitulo'].strip() + 
        judgement['ementa'].strip() + 
        judgement['decisao'].strip() + 
        judgement['tese'].strip() + 
        judgement['observacao'].strip() + 
        judgement['doutrina'].strip()
    ).replace("\n", "")

def get_features_suite_03(judgement):
    target_folder = os.path.join(os.getcwd(), '..', '..', 'txt')
    file_path = os.path.join(target_folder, str(judgement['documentId']) + '.txt')
    if os.path.exists(file_path):
        f = open(file_path, 'r', encoding='utf8')
        return (f.read()).strip()
    return ''


## Dataset construction for feature suites

In [47]:
#current_folder = os.getcwd()
#target_folder = os.path.join(current_folder, '../../db/*.merge.json')
#files = glob.glob(target_folder)
#print ('Found', len(files), ' files:')
#for i in range(0, len(files)):
#    print((os.path.basename(files[i])))


In [44]:
ds_suite_01 = []
ds_suite_02 = []
ds_suite_03 = []
i = 0
total_suite_01 = 0
total_suite_02 = 0
total_suite_03 = 0

for row in collection.find():
    curr_row_suite_01 = get_features_suite_01(row)
    curr_row_suite_02 = get_features_suite_02(row)
    curr_row_suite_03 = get_features_suite_03(row)
    ds_suite_01.append([row['categoria'], get_features_suite_01(row)])  
    ds_suite_02.append([row['categoria'], get_features_suite_02(row)])  
    ds_suite_03.append([row['categoria'], get_features_suite_03(row)])  
    total_suite_01 += len(curr_row_suite_01.split(' '))
    total_suite_02 += len(curr_row_suite_02.split(' '))
    total_suite_03 += len(str(curr_row_suite_03).split(' '))
    i = i + 1

print ('Total judgements:')
print ('- Feature suite [01] word count:', total_suite_01)
print ('--- COUNT words:', total_suite_01)
print ('--- AVG words/document:', round(total_suite_01 / i, 2))

print ('\n- Feature suite [02] word count:')
print ('--- COUNT words:', total_suite_02)
print ('--- AVG words/document:', round(total_suite_02 / i, 2))

print ('\n- Feature suite [03]')
print ('--- COUNT words:', total_suite_03)
print ('--- AVG words/document:', round(total_suite_03 / i, 2))

Total judgements:
- Feature suite [01] word count: 10856860
--- COUNT words: 10856860
--- AVG words/document: 147.12

- Feature suite [02] word count:
--- COUNT words: 16997445
--- AVG words/document: 230.32

- Feature suite [03]
--- COUNT words: 575235270
--- AVG words/document: 7794.73


## Data manipulation

### Imports

In [50]:
import nltk
#nltk.download('punkt')
import string
from collections import defaultdict
from nltk.corpus import stopwords

### Stopwords

In [112]:
stop_words01 = set(stopwords.words('portuguese'))
print ('Total pt-BR stopwords:', len(stop_words01))

stop_words02 = set(stop_words01)
stop_words02.add('turma')
stop_words02.add('art')
stop_words02.add('agr')
stop_words02.add('2ªt')
stop_words02.add('-')
stop_words02.add('1ªt')
stop_words02.add('agravo')
print ('Total pt-BR stopwords appending common legal area words:', len(stop_words02))

# document category abbrev
stop_words03 = set(stop_words02)
for row in collection.find({}, {"_id": 0, "categoria": 1}):
   stop_words03.add(row['categoria'].lower())

print ('Total pt-BR stopwords appending common legal area words and document categories:', len(stop_words03))

Total pt-BR stopwords: 207
Total pt-BR stopwords appending common legal area words: 214
Total pt-BR stopwords appending common legal area words and document categories: 253


### Text preprocessing

In [79]:
def clean_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    return text
                 
def get_tokens(text, stopwords):
    tokens = nltk.word_tokenize(text, 'portuguese')
    tokens = [t for t in tokens if not t in stopwords]
    return tokens

def get_dataset_most_common_words(dataset, stopwords, word_limit=5):    
    tokens = defaultdict(list)

    for doc in dataset:
        doc_label = doc[0]
        doc_tokens = get_tokens(clean_text(doc[1]), stopwords)    
        tokens[doc_label].extend(doc_tokens)

    for category_label, category_tokens in tokens.items():
        fd = nltk.FreqDist(category_tokens)
        print(category_label)
        print(fd.most_common(word_limit), '\n')
        
#get_dataset_most_common_words(ds_suite_01, stop_words01, 1)

### Training and Evaluation

#### Import common libraries

In [103]:
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pickle
import time
from tabulate import tabulate

#### Define common training procedures

In [104]:
def get_splits (docs):
    random.shuffle(docs)
    
    X_train = []
    y_train = []
    
    X_test = []
    y_test = []
    
    pivot = int(.8 * len(docs))
    
    for i in range(0, pivot):
        X_train.append(docs[i][1])
        y_train.append(docs[i][0])
        
    for i in range(pivot, len(docs)):
        X_test.append(docs[i][1])
        y_test.append(docs[i][0])
        
    return X_train, X_test, y_train, y_test

def evaluate_class (classifier, vectorizer, X_test, y_test):
    start_time = time.time()
    X_test_tfidf = vectorizer.transform(X_test)
    y_pred = classifier.predict(X_test_tfidf)
    end_time = time.time() - start_time
    
    precision = metrics.precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = metrics.recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = metrics.f1_score(y_test, y_pred, average='weighted', zero_division=1)
    
    #print("%s\t%f\t%f\t%f\n" % (title, precision, recall, f1, round(end_time, 4)))
    return (precision, recall, f1, round(end_time, 4))
    
def train_and_evaluate_model(dataset, stopwords):
    X_train, X_test, y_train, y_test = get_splits(dataset)
    vectorizer = CountVectorizer(stop_words=stopwords,ngram_range=(1,3), min_df=3, analyzer='word')

    dtm = vectorizer.fit_transform(X_train)
    nbc = MultinomialNB().fit(dtm, y_train)
    train_class = evaluate_class(nbc, vectorizer, X_train, y_train)
    test_class = evaluate_class(nbc, vectorizer, X_test, y_test)
    
    print(
        tabulate([
            ['Train', train_class[0], train_class[1], train_class[2], train_class[3]], 
            ['Test', test_class[0], test_class[1], test_class[2], test_class[3]] 
        ], headers=['', 'Precision', 'Recall', 'F1-score', 'time(s)'])
    )

#### Training

##### [1.1] Training configuration suite (ds_suite_01, stop_words01)

In [105]:
train_and_evaluate_model(ds_suite_01, stop_words01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.834193  0.827789    0.825401    18.3544
Test      0.774846  0.767886    0.76233      4.444


##### [1.2] Training configuration suite (ds_suite_01, stop_words02)

In [113]:
train_and_evaluate_model(ds_suite_01, stop_words02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.816129  0.809157    0.807787    19.8911
Test      0.748879  0.74248     0.738061     4.384


##### [1.3] Training configuration suite (ds_suite_01, stop_words03)

In [114]:
train_and_evaluate_model(ds_suite_01, stop_words03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.812011  0.805159    0.803341    21.4483
Test      0.757553  0.748442    0.744005     6.964


##### [2.1] Training configuration suite (ds_suite_02, stop_words01)

In [115]:
train_and_evaluate_model(ds_suite_02, stop_words01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.824572  0.815458    0.810372    37.9002
Test      0.763342  0.753591    0.743763     9.0522


##### [2.2] Training configuration suite (ds_suite_02, stop_words02)

In [117]:
train_and_evaluate_model(ds_suite_02, stop_words02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.81498   0.807277    0.802747     36.03
Test      0.754554  0.743767    0.73513       8.709


##### [2.3] Training configuration suite (ds_suite_02, stop_words03)

In [118]:
train_and_evaluate_model(ds_suite_02, stop_words03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.807775  0.798062    0.793123     34.072
Test      0.747456  0.73374     0.72522       8.305


##### [3.1] Training configuration suite (ds_suite_03, stop_words01)

In [119]:
train_and_evaluate_model(ds_suite_03, stop_words01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.881444  0.873505    0.866901   1117.76
Test      0.811505  0.807927    0.795075    321.931
