# Random Forest Legal Document Classifier

## Import common libraries

In [1]:
import numpy as np
import os, sys
import pandas as pd
import glob
from pymongo import MongoClient

print('Libraries loaded')

Libraries loaded


## Start database connection

In [2]:
conn = MongoClient()
db = conn.lexit
collection = db.judgements

print ('Database successfully connected')

Database successfully connected


## Define feature suite factory procedures

In [3]:
def get_features_suite_01(judgement):
    return judgement['ementa'].strip().replace("\n", "")

def get_features_suite_02(judgement):
    return (
        judgement['paginaInternaTitulo'].strip() + 
        judgement['ementa'].strip() + 
        judgement['decisao'].strip() + 
        judgement['tese'].strip() + 
        judgement['observacao'].strip() + 
        judgement['doutrina'].strip()
    ).replace("\n", "")

def get_features_suite_03(judgement):
    target_folder = os.path.join(os.getcwd(), '..', '..', 'txt')
    file_path = os.path.join(target_folder, str(judgement['documentId']) + '.txt')
    if os.path.exists(file_path):
        f = open(file_path, 'r', encoding='utf8')
        return (f.read()).strip()
    return ''


## Dataset construction for feature suites

In [4]:
ds_suite_01 = []
ds_suite_02 = []
ds_suite_03 = []
i = 0
total_suite_01 = 0
total_suite_02 = 0
total_suite_03 = 0

for row in collection.find():
    curr_row_suite_01 = get_features_suite_01(row)
    curr_row_suite_02 = get_features_suite_02(row)
    curr_row_suite_03 = get_features_suite_03(row)
    
    ds_suite_01.append([row['categoria'], get_features_suite_01(row)])  
    ds_suite_02.append([row['categoria'], get_features_suite_02(row)])  
    ds_suite_03.append([row['categoria'], get_features_suite_03(row)])  
    
    total_suite_01 += len(curr_row_suite_01.split(' '))
    total_suite_02 += len(curr_row_suite_02.split(' '))
    total_suite_03 += len(str(curr_row_suite_03).split(' '))
    i = i + 1

print ('Total documents:', i)

print ('\n- Feature suite [01]', total_suite_01)
print ('--- COUNT words:', total_suite_01)
print ('--- AVG words/document:', round(total_suite_01 / i, 2))

print ('\n- Feature suite [02]')
print ('--- COUNT words:', total_suite_02)
print ('--- AVG words/document:', round(total_suite_02 / i, 2))

print ('\n- Feature suite [03]')
print ('--- COUNT words:', total_suite_03)
print ('--- AVG words/document:', round(total_suite_03 / i, 2))

Total documents: 73798

- Feature suite [01] 10856860
--- COUNT words: 10856860
--- AVG words/document: 147.12

- Feature suite [02]
--- COUNT words: 16997445
--- AVG words/document: 230.32

- Feature suite [03]
--- COUNT words: 575235270
--- AVG words/document: 7794.73


## Data manipulation

### Imports

In [5]:
import nltk
import string
from collections import defaultdict
from nltk.corpus import stopwords

print ('Data manipulation libraries loaded')

Data manipulation libraries loaded


### Stopwords

In [6]:
stop_words01 = set(stopwords.words('portuguese'))
print ('Total pt-BR stopwords:', len(stop_words01))

stop_words02 = set(stop_words01)
stop_words02.add('turma')
stop_words02.add('art')
stop_words02.add('agr')
stop_words02.add('2ªt')
stop_words02.add('-')
stop_words02.add('1ªt')
stop_words02.add('agravo')
print ('Total pt-BR stopwords appending common legal area words:', len(stop_words02))

# document category abbrev
stop_words03 = set(stop_words02)
for row in collection.find({}, {"_id": 0, "categoria": 1}):
   stop_words03.add(row['categoria'].lower())

print ('Total pt-BR stopwords appending common legal area words and document categories:', len(stop_words03))

Total pt-BR stopwords: 207
Total pt-BR stopwords appending common legal area words: 214
Total pt-BR stopwords appending common legal area words and document categories: 253


### Text preprocessing

In [7]:
def clean_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    return text
                 
def get_tokens(text, stopwords):
    tokens = nltk.word_tokenize(text, 'portuguese')
    tokens = [t for t in tokens if not t in stopwords]
    return tokens

def get_dataset_most_common_words(dataset, stopwords, word_limit=5):    
    tokens = defaultdict(list)

    for doc in dataset:
        doc_label = doc[0]
        doc_tokens = get_tokens(clean_text(doc[1]), stopwords)    
        tokens[doc_label].extend(doc_tokens)

    for category_label, category_tokens in tokens.items():
        fd = nltk.FreqDist(category_tokens)
        print(category_label)
        print(fd.most_common(word_limit), '\n')
        
#get_dataset_most_common_words(ds_suite_01, stop_words01, 1)

### Training and Evaluation

#### Import common libraries

In [8]:
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pickle
import time
from tabulate import tabulate

print ('Training libraries loaded')

Training libraries loaded


#### Define common training procedures

In [9]:
def get_splits (docs):
    random.shuffle(docs)
    
    X_train = []
    y_train = []
    
    X_test = []
    y_test = []
    
    pivot = int(.8 * len(docs))
    
    for i in range(0, pivot):
        X_train.append(docs[i][1])
        y_train.append(docs[i][0])
        
    for i in range(pivot, len(docs)):
        X_test.append(docs[i][1])
        y_test.append(docs[i][0])
        
    return X_train, X_test, y_train, y_test

def evaluate_class (classifier, vectorizer, X_test, y_test):
    start_time = time.time()
    
    X_test_tfidf = vectorizer.transform(X_test)
    y_pred = classifier.predict(X_test_tfidf)
    
    end_time = time.time() - start_time
    
    precision = metrics.precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = metrics.recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = metrics.f1_score(y_test, y_pred, average='weighted', zero_division=1)
    
    return (precision, recall, f1, round(end_time, 4))
    
def train_and_evaluate_model(dataset, stopwords, features=10000):
    X_train, X_test, y_train, y_test = get_splits(dataset)
    
    vectorizer = CountVectorizer(
        stop_words=stopwords,
        ngram_range=(1,3), 
        min_df=3, 
        analyzer='word', 
        max_features=features
    )
    
    dtm = vectorizer.fit_transform(X_train)
    start_time = time.time()
    
    rfc = RandomForestClassifier(
        n_estimators=50, 
        random_state=42, 
        n_jobs=4
        #max_depth=6
    ).fit(dtm, y_train)    
    
    train_class = evaluate_class(rfc, vectorizer, X_train, y_train)
    test_class = evaluate_class(rfc, vectorizer, X_test, y_test)
    
    end_time = time.time() - start_time
    
    print(
        tabulate([
            ['Train', train_class[0], train_class[1], train_class[2], train_class[3]], 
            ['Test', test_class[0], test_class[1], test_class[2], test_class[3]] 
        ], headers=['', 'Precision', 'Recall', 'F1-score', 'time(s)'])
    )
    print('\nTotal execution time (s): ', round(end_time, 4))

#### Training

In [10]:
# vectorizer max features (number of most common words)
vec_feat_01 = 100
vec_feat_02 = 500
vec_feat_03 = 1000
vec_feat_04 = 5000
vec_feat_05 = 10000

# stopwords
sw = stop_words01

##### [1.1] Training configuration suite (ds_suite_01, vec_feat_01)

In [11]:
train_and_evaluate_model(ds_suite_01, sw, vec_feat_01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.976428  0.976151    0.976        14.01
Test      0.851063  0.853388    0.846451      3.499

Total execution time (s):  33.3579


##### [1.2] Training configuration suite (ds_suite_01, vec_feat_02)

In [12]:
train_and_evaluate_model(ds_suite_01, sw, vec_feat_02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.983816  0.983638    0.983559     14.466
Test      0.893139  0.89458     0.891392      3.837

Total execution time (s):  42.152


##### [1.3] Training configuration suite (ds_suite_01, vec_feat_03)

In [13]:
train_and_evaluate_model(ds_suite_01, sw, vec_feat_03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.984301  0.984146    0.984077     14.823
Test      0.90009   0.901084    0.898362      3.723

Total execution time (s):  44.643


##### [1.4] Training configuration suite (ds_suite_01, vec_feat_04)

In [14]:
train_and_evaluate_model(ds_suite_01, sw, vec_feat_04)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.985017  0.984874    0.984804     15.09
Test      0.907803  0.908943    0.906029      3.817

Total execution time (s):  56.422


##### [1.5] Training configuration suite (ds_suite_01, vec_feat_05)

In [15]:
train_and_evaluate_model(ds_suite_01, sw, vec_feat_05)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.984873  0.984857    0.9848       15.379
Test      0.907514  0.90874     0.906443      3.955

Total execution time (s):  66.192


---

##### [2.1] Training configuration suite (ds_suite_02, vec_feat_01)

In [16]:
train_and_evaluate_model(ds_suite_02, sw, vec_feat_01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.999915  0.999915    0.999915     26.448
Test      0.889167  0.892954    0.885635      6.473

Total execution time (s):  54.178


##### [2.2] Training configuration suite (ds_suite_02, vec_feat_02)

In [17]:
train_and_evaluate_model(ds_suite_02, sw, vec_feat_02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.999898  0.999898    0.999898     26.756
Test      0.93239   0.935027    0.931014      6.592

Total execution time (s):  58.414


##### [2.3] Training configuration suite (ds_suite_02, vec_feat_03)

In [18]:
train_and_evaluate_model(ds_suite_02, sw, vec_feat_03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.999966  0.999966    0.999966     27.171
Test      0.949159  0.95        0.947549      6.704

Total execution time (s):  59.983


##### [2.4] Training configuration suite (ds_suite_02, vec_feat_04)

In [19]:
train_and_evaluate_model(ds_suite_02, sw, vec_feat_04)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     1         1           1            27.642
Test      0.943836  0.944309    0.941528      7.213

Total execution time (s):  67.3


##### [2.5] Training configuration suite (ds_suite_02, vec_feat_05)

In [20]:
train_and_evaluate_model(ds_suite_02, sw, vec_feat_05)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.999983  0.999983    0.999983     28.447
Test      0.942629  0.943157    0.94036       7.261

Total execution time (s):  69.708


---

##### [3.1] Training configuration suite (ds_suite_03, vec_feat_01)

In [21]:
train_and_evaluate_model(ds_suite_03, sw, vec_feat_01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.979255  0.96692     0.970304    833.048
Test      0.774653  0.775542    0.760228    208.242

Total execution time (s):  1090.267


##### [3.2] Training configuration suite (ds_suite_03, vec_feat_02)

In [22]:
train_and_evaluate_model(ds_suite_03, sw, vec_feat_02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.979188  0.966869    0.970248    856.992
Test      0.943424  0.931707    0.930979    214.159

Total execution time (s):  1110.4426


##### [3.3] Training configuration suite (ds_suite_03, vec_feat_03)

In [11]:
train_and_evaluate_model(ds_suite_03, sw, vec_feat_03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.979209  0.96692     0.97027     825.713
Test      0.959179  0.945799    0.946928    211.936

Total execution time (s):  1072.9289


##### [3.4] Training configuration suite (ds_suite_03, vec_feat_04)

In [12]:
train_and_evaluate_model(ds_suite_03, sw, vec_feat_04)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.979283  0.967089    0.970412    842.048
Test      0.968189  0.954268    0.956545    207.184

Total execution time (s):  1089.7269


##### [3.5] Training configuration suite (ds_suite_03, vec_feat_05)

In [13]:
train_and_evaluate_model(ds_suite_03, sw, vec_feat_05)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.97908   0.966445    0.969915    818.924
Test      0.969421  0.956978    0.958654    203.406

Total execution time (s):  1067.4074
