# Random Forest Legal Document Classifier

## Import common libraries

In [None]:
import numpy as np
import os, sys
import pandas as pd
import glob
from pymongo import MongoClient

print('Libraries loaded')

## Start database connection

In [None]:
# Create a client with pymongo's default connection settings, then select the
# `lexit` database and its `judgements` collection.
# NOTE(review): MongoClient() is believed to connect lazily, so the message
# below does not by itself prove the server is reachable -- confirm.
conn = MongoClient()
db = conn['lexit']
collection = db['judgements']

print('Database successfully connected')

## Define feature suite factory procedures

In [None]:
def get_features_suite_01(judgement):
    """Return the judgement's ``ementa`` text, trimmed and with newlines removed.

    Args:
        judgement: Mapping with an ``'ementa'`` string entry.

    Returns:
        The ``ementa`` value stripped of surrounding whitespace, with every
        ``"\\n"`` deleted (lines are joined without a separator).
    """
    ementa = judgement['ementa'].strip()
    return ementa.replace("\n", "")

def get_features_suite_02(judgement):
    """Return the judgement's descriptive fields merged into a single string.

    Each field is stripped individually, the pieces are concatenated in a
    fixed order with no separator between them, and every ``"\\n"`` is then
    removed from the result.

    Args:
        judgement: Mapping holding string values for ``paginaInternaTitulo``,
            ``ementa``, ``decisao``, ``tese``, ``observacao`` and ``doutrina``.

    Returns:
        The concatenated, newline-free text.
    """
    field_names = (
        'paginaInternaTitulo',
        'ementa',
        'decisao',
        'tese',
        'observacao',
        'doutrina',
    )
    merged = "".join(judgement[name].strip() for name in field_names)
    return merged.replace("\n", "")

def get_features_suite_03(judgement):
    """Return the text stored in the judgement's ``.txt`` file.

    The file is looked up as ``<cwd>/../../txt/<documentId>.txt``.

    Args:
        judgement: Mapping with a ``'documentId'`` entry; its ``str()`` form
            is used as the file name stem.

    Returns:
        The UTF-8 decoded file content with surrounding whitespace stripped,
        or ``''`` when no file exists for the document.
    """
    target_folder = os.path.join(os.getcwd(), '..', '..', 'txt')
    file_path = os.path.join(target_folder, str(judgement['documentId']) + '.txt')
    if not os.path.exists(file_path):
        return ''
    # Use a context manager so the handle is closed right away; this function
    # runs once per document and would otherwise leave every file open.
    with open(file_path, 'r', encoding='utf8') as text_file:
        return text_file.read().strip()


## Dataset construction for feature suites

In [None]:
# Build one dataset per feature suite. Every entry is a [category, text] pair;
# word totals are accumulated alongside for the summary printed below.
ds_suite_01 = []
ds_suite_02 = []
ds_suite_03 = []
i = 0  # number of documents read from the collection
total_suite_01 = 0
total_suite_02 = 0
total_suite_03 = 0

for row in collection.find():
    # Extract each suite exactly once per document and reuse the result;
    # suite 03 reads a file from disk, so a second call would repeat the I/O.
    curr_row_suite_01 = get_features_suite_01(row)
    curr_row_suite_02 = get_features_suite_02(row)
    curr_row_suite_03 = get_features_suite_03(row)

    ds_suite_01.append([row['categoria'], curr_row_suite_01])
    ds_suite_02.append([row['categoria'], curr_row_suite_02])
    ds_suite_03.append([row['categoria'], curr_row_suite_03])

    # split() with no argument splits on any whitespace run, so an empty text
    # counts as zero words and newline-separated words are counted too.
    total_suite_01 += len(curr_row_suite_01.split())
    total_suite_02 += len(curr_row_suite_02.split())
    total_suite_03 += len(curr_row_suite_03.split())
    i += 1

print('Total documents:', i)

for suite_id, suite_total in (
    ('01', total_suite_01),
    ('02', total_suite_02),
    ('03', total_suite_03),
):
    # Guard the average so an empty collection prints 0.0 instead of raising
    # ZeroDivisionError.
    average = round(suite_total / i, 2) if i else 0.0
    print('\n- Feature suite [%s]' % suite_id)
    print('--- COUNT words:', suite_total)
    print('--- AVG words/document:', average)

## Data manipulation

### Imports

In [None]:
import nltk
import string
from collections import defaultdict
from nltk.corpus import stopwords

print ('Data manipulation libraries loaded')

### Stopwords

In [None]:
# Level 1: the Portuguese stopword list shipped with NLTK.
stop_words01 = set(stopwords.words('portuguese'))
print('Total pt-BR stopwords:', len(stop_words01))

# Level 2: level 1 plus a handful of terms that are common in legal texts.
stop_words02 = stop_words01 | {
    'turma',
    'art',
    'agr',
    '2ªt',
    '-',
    '1ªt',
    'agravo',
}
print('Total pt-BR stopwords appending common legal area words:', len(stop_words02))

# Level 3: level 2 plus the lower-cased category label of every stored
# judgement (only the `categoria` field is fetched).
stop_words03 = set(stop_words02)
for row in collection.find({}, {"_id": 0, "categoria": 1}):
    stop_words03.add(row['categoria'].lower())

print('Total pt-BR stopwords appending common legal area words and document categories:', len(stop_words03))

### Text preprocessing

In [None]:
def clean_text(text):
    """Strip ASCII punctuation from ``text`` and lower-case what remains.

    Only the characters in ``string.punctuation`` are removed; letters
    (accented ones included), digits and whitespace are kept.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table).lower()
                 
def get_tokens(text, stopwords):
    """Tokenize ``text`` with NLTK's Portuguese tokenizer and drop stopwords.

    Args:
        text: String to tokenize.
        stopwords: Container of tokens to discard (membership is tested with
            ``in``).

    Returns:
        List of the remaining tokens in their original order.
    """
    return [
        token
        for token in nltk.word_tokenize(text, 'portuguese')
        if token not in stopwords
    ]

def get_dataset_most_common_words(dataset, stopwords, word_limit=5):
    """Print the most frequent tokens of each category in ``dataset``.

    Args:
        dataset: Iterable of items whose index 0 is the category label and
            index 1 is the document text.
        stopwords: Tokens to exclude, forwarded to ``get_tokens``.
        word_limit: How many top tokens to print per category.

    Returns:
        None. For every category (in first-seen order) the label is printed,
        followed by its ``word_limit`` most common ``(token, count)`` pairs.
    """
    tokens_by_label = {}

    # Pool the cleaned, stopword-filtered tokens of all documents per label.
    for entry in dataset:
        label, raw_text = entry[0], entry[1]
        filtered = get_tokens(clean_text(raw_text), stopwords)
        tokens_by_label.setdefault(label, []).extend(filtered)

    for label in tokens_by_label:
        frequencies = nltk.FreqDist(tokens_by_label[label])
        print(label)
        print(frequencies.most_common(word_limit), '\n')
        
#get_dataset_most_common_words(ds_suite_01, stop_words01, 1)

### Training and Evaluation

#### Import common libraries

In [None]:
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pickle
import time
from tabulate import tabulate

print ('Training libraries loaded')

#### Define common training procedures

In [None]:
def get_splits(docs, seed=None):
    """Shuffle labelled documents and split them 80/20 into train and test.

    The shuffle is applied to a copy, so the caller's ``docs`` keeps its
    order; the same dataset can be passed to several runs without one run
    reordering the input of the next.

    Args:
        docs: Sequence of items whose index 0 is the label and index 1 is the
            document text.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` (the default) the module-level
            ``random`` state is used.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of lists, where the first
        ``int(0.8 * len(docs))`` shuffled documents form the training part.
    """
    shuffled = list(docs)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    pivot = int(.8 * len(shuffled))
    train_docs = shuffled[:pivot]
    test_docs = shuffled[pivot:]

    X_train = [doc[1] for doc in train_docs]
    y_train = [doc[0] for doc in train_docs]
    X_test = [doc[1] for doc in test_docs]
    y_test = [doc[0] for doc in test_docs]

    return X_train, X_test, y_train, y_test

def evaluate_class(classifier, vectorizer, X_test, y_test):
    """Score ``classifier`` on a labelled set of raw documents.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        X_test: Raw documents to vectorize and classify.
        y_test: True labels aligned with ``X_test``.

    Returns:
        Tuple ``(precision, recall, f1, seconds)``. The three scores are
        weighted averages computed with ``zero_division=1``; ``seconds`` is
        the vectorize-and-predict time rounded to 4 decimals (scoring itself
        is not timed).
    """
    started = time.time()
    predictions = classifier.predict(vectorizer.transform(X_test))
    elapsed = time.time() - started

    scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    precision, recall, f1 = [
        scorer(y_test, predictions, average='weighted', zero_division=1)
        for scorer in scorers
    ]

    return (precision, recall, f1, round(elapsed, 4))
    
def train_and_evaluate_model(dataset, stopwords, features=10000):
    """Train a random forest on n-gram counts and print train/test metrics.

    The dataset is split by ``get_splits``, vectorized with a
    ``CountVectorizer`` (word 1- to 3-grams, ``min_df=3``), fitted with a
    50-tree ``RandomForestClassifier`` and evaluated on both splits through
    ``evaluate_class``.

    Args:
        dataset: Labelled documents in the form expected by ``get_splits``.
        stopwords: Stop words for the vectorizer. Any iterable of strings is
            accepted (sets included); ``None`` and string values are passed
            through unchanged.
        features: Maximum vocabulary size (``max_features``).

    Returns:
        None. A precision/recall/F1/time table and the total time are printed.
    """
    X_train, X_test, y_train, y_test = get_splits(dataset)

    # scikit-learn validates `stop_words` as 'english', a list or None, and
    # releases with parameter validation reject a set. The stopword
    # collections built in this notebook are sets, so convert them here.
    if stopwords is None or isinstance(stopwords, str):
        stop_word_list = stopwords
    else:
        stop_word_list = list(stopwords)

    vectorizer = CountVectorizer(
        stop_words=stop_word_list,
        ngram_range=(1,3),
        min_df=3,
        analyzer='word',
        max_features=features
    )

    dtm = vectorizer.fit_transform(X_train)

    # The timer starts after vectorization: the reported total covers fitting
    # the forest plus both evaluation passes.
    start_time = time.time()

    rfc = RandomForestClassifier(
        n_estimators=50,
        random_state=42,
        n_jobs=4
        #max_depth=6
    ).fit(dtm, y_train)

    train_class = evaluate_class(rfc, vectorizer, X_train, y_train)
    test_class = evaluate_class(rfc, vectorizer, X_test, y_test)

    end_time = time.time() - start_time

    print(
        tabulate([
            ['Train', train_class[0], train_class[1], train_class[2], train_class[3]],
            ['Test', test_class[0], test_class[1], test_class[2], test_class[3]]
        ], headers=['', 'Precision', 'Recall', 'F1-score', 'time(s)'])
    )
    print('\nTotal execution time (s): ', round(end_time, 4))

#### Training

##### [1.1] Training configuration suite (ds_suite_01, stop_words01)

In [None]:
# Ementa-only texts (suite 01) with the plain NLTK Portuguese stopwords.
train_and_evaluate_model(ds_suite_01, stop_words01)

##### [1.2] Training configuration suite (ds_suite_01, stop_words02)

In [None]:
# Ementa-only texts (suite 01) with Portuguese stopwords plus the extra legal terms.
train_and_evaluate_model(ds_suite_01, stop_words02)

##### [1.3] Training configuration suite (ds_suite_01, stop_words03)

In [None]:
# Ementa-only texts (suite 01) with stopwords, legal terms and category labels removed.
train_and_evaluate_model(ds_suite_01, stop_words03)

##### [2.1] Training configuration suite (ds_suite_02, stop_words01)

In [None]:
# Concatenated descriptive fields (suite 02) with the plain NLTK Portuguese stopwords.
train_and_evaluate_model(ds_suite_02, stop_words01)

##### [2.2] Training configuration suite (ds_suite_02, stop_words02)

In [None]:
# Concatenated descriptive fields (suite 02) with Portuguese stopwords plus the extra legal terms.
train_and_evaluate_model(ds_suite_02, stop_words02)

##### [2.3] Training configuration suite (ds_suite_02, stop_words03)

In [None]:
# Concatenated descriptive fields (suite 02) with stopwords, legal terms and category labels removed.
train_and_evaluate_model(ds_suite_02, stop_words03)

##### [3.1] Training configuration suite (ds_suite_03, stop_words01)

In [None]:
# Texts read from each document's .txt file (suite 03) with the plain NLTK Portuguese stopwords.
train_and_evaluate_model(ds_suite_03, stop_words01)

##### [3.2] Training configuration suite (ds_suite_03, stop_words02)

In [None]:
# Texts read from each document's .txt file (suite 03) with Portuguese stopwords plus the extra legal terms.
train_and_evaluate_model(ds_suite_03, stop_words02)

##### [3.3] Training configuration suite (ds_suite_03, stop_words03)

In [None]:
# Texts read from each document's .txt file (suite 03) with stopwords, legal terms and category labels removed.
train_and_evaluate_model(ds_suite_03, stop_words03)