# Naive Bayes Legal Document Classifier

## Import common libraries

In [1]:
import numpy as np
import os, sys
import pandas as pd
import glob
from pymongo import MongoClient

print('Libraries loaded')

Libraries loaded


## Start database connection

In [2]:
conn = MongoClient()
db = conn.lexit
collection = db.judgements
print ('Database successfully connected')

Database successfully connected


## Define feature suite factory procedures

In [3]:
def get_features_suite_01(judgement):
    return judgement['ementa'].strip().replace("\n", "")

def get_features_suite_02(judgement):
    return (
        judgement['paginaInternaTitulo'].strip() + 
        judgement['ementa'].strip() + 
        judgement['decisao'].strip() + 
        judgement['tese'].strip() + 
        judgement['observacao'].strip() + 
        judgement['doutrina'].strip()
    ).replace("\n", "")

def get_features_suite_03(judgement):
    target_folder = os.path.join(os.getcwd(), '..', '..', 'txt')
    file_path = os.path.join(target_folder, str(judgement['documentId']) + '.txt')
    if os.path.exists(file_path):
        f = open(file_path, 'r', encoding='utf8')
        return (f.read()).strip()
    return ''


## Dataset construction for feature suites

In [4]:
ds_suite_01 = []
ds_suite_02 = []
ds_suite_03 = []
i = 0
total_suite_01 = 0
total_suite_02 = 0
total_suite_03 = 0

for row in collection.find():
    curr_row_suite_01 = get_features_suite_01(row)
    curr_row_suite_02 = get_features_suite_02(row)
    curr_row_suite_03 = get_features_suite_03(row)
    
    ds_suite_01.append([row['categoria'], get_features_suite_01(row)])  
    ds_suite_02.append([row['categoria'], get_features_suite_02(row)])  
    ds_suite_03.append([row['categoria'], get_features_suite_03(row)])  
    
    total_suite_01 += len(curr_row_suite_01.split(' '))
    total_suite_02 += len(curr_row_suite_02.split(' '))
    total_suite_03 += len(str(curr_row_suite_03).split(' '))
    i = i + 1

print ('Total documents:', i)

print ('\n- Feature suite [01]', total_suite_01)
print ('--- COUNT words:', total_suite_01)
print ('--- AVG words/document:', round(total_suite_01 / i, 2))

print ('\n- Feature suite [02]')
print ('--- COUNT words:', total_suite_02)
print ('--- AVG words/document:', round(total_suite_02 / i, 2))

print ('\n- Feature suite [03]')
print ('--- COUNT words:', total_suite_03)
print ('--- AVG words/document:', round(total_suite_03 / i, 2))

Total documents: 73798

- Feature suite [01] 10856860
--- COUNT words: 10856860
--- AVG words/document: 147.12

- Feature suite [02]
--- COUNT words: 16997445
--- AVG words/document: 230.32

- Feature suite [03]
--- COUNT words: 575235270
--- AVG words/document: 7794.73


## Data manipulation

### Imports

In [5]:
import nltk
import string
from collections import defaultdict
from nltk.corpus import stopwords

print ('Data manipulation libraries loaded')

Data manipulation libraries loaded


### Stopwords

In [6]:
stop_words01 = set(stopwords.words('portuguese'))
print ('Total pt-BR stopwords:', len(stop_words01))

stop_words02 = set(stop_words01)
stop_words02.add('turma')
stop_words02.add('art')
stop_words02.add('agr')
stop_words02.add('2ªt')
stop_words02.add('-')
stop_words02.add('1ªt')
stop_words02.add('agravo')
print ('Total pt-BR stopwords appending common legal area words:', len(stop_words02))

# document category abbrev
stop_words03 = set(stop_words02)
for row in collection.find({}, {"_id": 0, "categoria": 1}):
   stop_words03.add(row['categoria'].lower())

print ('Total pt-BR stopwords appending common legal area words and document categories:', len(stop_words03))

Total pt-BR stopwords: 207
Total pt-BR stopwords appending common legal area words: 214
Total pt-BR stopwords appending common legal area words and document categories: 253


### Text preprocessing

In [7]:
def clean_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    return text
                 
def get_tokens(text, stopwords):
    tokens = nltk.word_tokenize(text, 'portuguese')
    tokens = [t for t in tokens if not t in stopwords]
    return tokens

def get_dataset_most_common_words(dataset, stopwords, word_limit=5):    
    tokens = defaultdict(list)

    for doc in dataset:
        doc_label = doc[0]
        doc_tokens = get_tokens(clean_text(doc[1]), stopwords)    
        tokens[doc_label].extend(doc_tokens)

    for category_label, category_tokens in tokens.items():
        fd = nltk.FreqDist(category_tokens)
        print(category_label)
        print(fd.most_common(word_limit), '\n')
        
#get_dataset_most_common_words(ds_suite_01, stop_words01, 1)

### Training and Evaluation

#### Import common libraries

In [8]:
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pickle
import time
from tabulate import tabulate

print ('Training libraries loaded')

Training libraries loaded


#### Define common training procedures

In [9]:
def get_splits (docs):
    random.shuffle(docs)
    
    X_train = []
    y_train = []
    
    X_test = []
    y_test = []
    
    pivot = int(.8 * len(docs))
    
    for i in range(0, pivot):
        X_train.append(docs[i][1])
        y_train.append(docs[i][0])
        
    for i in range(pivot, len(docs)):
        X_test.append(docs[i][1])
        y_test.append(docs[i][0])
        
    return X_train, X_test, y_train, y_test

def evaluate_class (classifier, vectorizer, X_test, y_test):
    start_time = time.time()
    
    X_test_tfidf = vectorizer.transform(X_test)
    y_pred = classifier.predict(X_test_tfidf)
    
    end_time = time.time() - start_time
    
    precision = metrics.precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = metrics.recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = metrics.f1_score(y_test, y_pred, average='weighted', zero_division=1)
    
    return (precision, recall, f1, round(end_time, 4))
    
def train_and_evaluate_model(dataset, stopwords, features=10000):
    X_train, X_test, y_train, y_test = get_splits(dataset)
    vectorizer = CountVectorizer(
        stop_words=stopwords,
        ngram_range=(1,3), 
        min_df=3, 
        analyzer='word',
        max_features=features
    )

    dtm = vectorizer.fit_transform(X_train)
    start_time = time.time()
    
    nbc = MultinomialNB().fit(dtm, y_train)
    
    train_class = evaluate_class(nbc, vectorizer, X_train, y_train)
    test_class = evaluate_class(nbc, vectorizer, X_test, y_test)
    
    end_time = time.time() - start_time
    
    print(
        tabulate([
            ['Train', train_class[0], train_class[1], train_class[2], train_class[3]], 
            ['Test', test_class[0], test_class[1], test_class[2], test_class[3]] 
        ], headers=['', 'Precision', 'Recall', 'F1-score', 'time(s)'])
    )
    print('\nTotal execution time (s): ', round(end_time, 4))

#### Training

In [10]:
# vectorizer max features (number of most common words)
vec_feat_01 = 100
vec_feat_02 = 500
vec_feat_03 = 1000
vec_feat_04 = 5000
vec_feat_05 = 10000

# stopwords
sw = stop_words01

##### [1.1] Training configuration suite (ds_suite_01, vec_feat_01)

In [11]:
train_and_evaluate_model(ds_suite_01, sw, vec_feat_01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.705701  0.654223    0.673929     13.788
Test      0.711001  0.657317    0.677914      3.454

Total execution time (s):  18.289


##### [1.2] Training configuration suite (ds_suite_01, vec_feat_02)

In [12]:
train_and_evaluate_model(ds_suite_01, sw, vec_feat_02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.755631  0.679799    0.700332     14.443
Test      0.750486  0.674187    0.695838      3.585

Total execution time (s):  19.126


##### [1.3] Training configuration suite (ds_suite_01, vec_feat_03)

In [13]:
train_and_evaluate_model(ds_suite_01, sw, vec_feat_03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.762055  0.67233     0.690479     14.451
Test      0.753549  0.662398    0.681846      3.506

Total execution time (s):  19.079


##### [1.4] Training configuration suite (ds_suite_01, vec_feat_04)

In [14]:
train_and_evaluate_model(ds_suite_01, sw, vec_feat_04)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.772744  0.687286    0.703276     15.559
Test      0.764566  0.671138    0.689704      3.913

Total execution time (s):  20.683


##### [1.5] Training configuration suite (ds_suite_01, vec_feat_05)

In [15]:
train_and_evaluate_model(ds_suite_01, sw, vec_feat_05)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.779852  0.691453    0.706951     15.309
Test      0.764146  0.67168     0.687914      3.783

Total execution time (s):  20.35


---

##### [2.1] Training configuration suite (ds_suite_02, vec_feat_01)

In [16]:
train_and_evaluate_model(ds_suite_02, sw, vec_feat_01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.7253    0.588943    0.631404     25.838
Test      0.725002  0.593157    0.634592      6.593

Total execution time (s):  33.514


##### [2.2] Training configuration suite (ds_suite_02, vec_feat_02)

In [17]:
train_and_evaluate_model(ds_suite_02, sw, vec_feat_02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.753837  0.627376    0.6625       26.308
Test      0.756734  0.625813    0.662625      6.631

Total execution time (s):  34.076


##### [2.3] Training configuration suite (ds_suite_02, vec_feat_03)

In [18]:
train_and_evaluate_model(ds_suite_02, sw, vec_feat_03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.764495  0.648786    0.679059     26.98
Test      0.768454  0.648035    0.680275      6.775

Total execution time (s):  34.926


##### [2.4] Training configuration suite (ds_suite_02, sw, vec_feat_04)

In [19]:
train_and_evaluate_model(ds_suite_02, sw, vec_feat_04)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.767234  0.655036    0.679165      28.39
Test      0.764272  0.648645    0.674086       7.33

Total execution time (s):  37.085


##### [2.5] Training configuration suite (ds_suite_02, vec_feat_05)

In [20]:
train_and_evaluate_model(ds_suite_02, sw, vec_feat_05)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.770464  0.664352    0.685643     28.409
Test      0.75917   0.649255    0.671878      7.021

Total execution time (s):  36.784


---

##### [3.1] Training configuration suite (ds_suite_03, vec_feat_01)

In [21]:
train_and_evaluate_model(ds_suite_03, sw, vec_feat_01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.593501  0.170382    0.205095    899.104
Test      0.572434  0.168293    0.203712    247.158

Total execution time (s):  1147.8686


##### [3.2] Training configuration suite (ds_suite_03, vec_feat_02)

In [22]:
train_and_evaluate_model(ds_suite_03, sw, vec_feat_02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.785658  0.658644     0.69365    896.644
Test      0.783128  0.667005     0.69988    228.919

Total execution time (s):  1127.4945


##### [3.3] Training configuration suite (ds_suite_03, vec_feat_03)

In [23]:
train_and_evaluate_model(ds_suite_03, sw, vec_feat_03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.825926  0.74181     0.76604     927.153
Test      0.818084  0.730081    0.755453    242.472

Total execution time (s):  1171.9573


##### [3.4] Training configuration suite (ds_suite_03, vec_feat_04)

In [24]:
train_and_evaluate_model(ds_suite_03, sw, vec_feat_04)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.803755  0.752227    0.766102    952.063
Test      0.799782  0.745325    0.760359    241.23

Total execution time (s):  1196.6723


##### [3.5] Training configuration suite (ds_suite_03, vec_feat_05)

In [26]:
train_and_evaluate_model(ds_suite_03, sw, vec_feat_05)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.797271  0.737576    0.754467    942.596
Test      0.790818  0.729404    0.746389    244.52

Total execution time (s):  1190.3266
