# Support Vector Machine Legal Document Classifier

## Import common libraries

In [1]:
import numpy as np
import os, sys
import pandas as pd
import glob
from pymongo import MongoClient

print('Libraries loaded')

Libraries loaded


## Start database connection

In [2]:
# Open a client with PyMongo's default connection settings and select the
# `judgements` collection of the `lexit` database.
conn = MongoClient()
db = conn.lexit
collection = db.judgements

# Creating the client does not wait for the server, so issue a cheap `ping`
# first: the success message below is then only printed when the server has
# actually answered (an unreachable server raises here instead of later).
conn.admin.command('ping')

print ('Database successfully connected')

Database successfully connected


## Define feature suite factory procedures

In [3]:
def get_features_suite_01(judgement):
    """Return the judgement's 'ementa' text, trimmed and with newlines removed."""
    ementa = judgement['ementa']
    trimmed = ementa.strip()
    return trimmed.replace("\n", "")

def get_features_suite_02(judgement):
    """Concatenate the judgement's metadata fields into one newline-free string.

    Each field is stripped individually and the pieces are joined without a
    separator, in the order listed below.
    """
    field_names = (
        'paginaInternaTitulo',
        'ementa',
        'decisao',
        'tese',
        'observacao',
        'doutrina',
    )
    combined = "".join(judgement[name].strip() for name in field_names)
    return combined.replace("\n", "")

def get_features_suite_03(judgement):
    """Return the stripped full text stored on disk for a judgement.

    The text is read as UTF-8 from ``<cwd>/../../txt/<documentId>.txt``.

    Args:
        judgement: mapping that provides a 'documentId' entry.

    Returns:
        The file contents with surrounding whitespace stripped, or '' when
        the file does not exist.
    """
    target_folder = os.path.join(os.getcwd(), '..', '..', 'txt')
    file_path = os.path.join(target_folder, str(judgement['documentId']) + '.txt')
    if not os.path.exists(file_path):
        return ''
    # The context manager closes the handle; this function is called once per
    # document, so an unclosed handle would otherwise pile up per call.
    with open(file_path, 'r', encoding='utf8') as f:
        return f.read().strip()


## Dataset construction for feature suites

In [4]:
# Build one [category, text] dataset per feature suite while accumulating
# space-delimited word counts for the summary printed below.
ds_suite_01 = []
ds_suite_02 = []
ds_suite_03 = []
i = 0
total_suite_01 = 0
total_suite_02 = 0
total_suite_03 = 0

for row in collection.find():
    # Extract each suite once and reuse the value for both the dataset and the
    # word count; calling the extractors a second time would redo the work
    # (suite 03 reads a file from disk on every call).
    curr_row_suite_01 = get_features_suite_01(row)
    curr_row_suite_02 = get_features_suite_02(row)
    curr_row_suite_03 = get_features_suite_03(row)

    ds_suite_01.append([row['categoria'], curr_row_suite_01])
    ds_suite_02.append([row['categoria'], curr_row_suite_02])
    ds_suite_03.append([row['categoria'], curr_row_suite_03])

    total_suite_01 += len(curr_row_suite_01.split(' '))
    total_suite_02 += len(curr_row_suite_02.split(' '))
    total_suite_03 += len(str(curr_row_suite_03).split(' '))
    i = i + 1

print ('Total documents:', i)

# Report 0.0 instead of raising ZeroDivisionError when the collection is empty.
avg_suite_01 = round(total_suite_01 / i, 2) if i else 0.0
avg_suite_02 = round(total_suite_02 / i, 2) if i else 0.0
avg_suite_03 = round(total_suite_03 / i, 2) if i else 0.0

# NOTE: unlike the other two headers, this one also echoes the total; it is
# kept as-is so the printed output does not change.
print ('\n- Feature suite [01]', total_suite_01)
print ('--- COUNT words:', total_suite_01)
print ('--- AVG words/document:', avg_suite_01)

print ('\n- Feature suite [02]')
print ('--- COUNT words:', total_suite_02)
print ('--- AVG words/document:', avg_suite_02)

print ('\n- Feature suite [03]')
print ('--- COUNT words:', total_suite_03)
print ('--- AVG words/document:', avg_suite_03)

Total documents: 73798

- Feature suite [01] 10856860
--- COUNT words: 10856860
--- AVG words/document: 147.12

- Feature suite [02]
--- COUNT words: 16997445
--- AVG words/document: 230.32

- Feature suite [03]
--- COUNT words: 575235270
--- AVG words/document: 7794.73


## Data manipulation

### Imports

In [5]:
import nltk
import string
from collections import defaultdict
from nltk.corpus import stopwords

print ('Data manipulation libraries loaded')

Data manipulation libraries loaded


### Stopwords

In [6]:
# First set: the Portuguese stopword list shipped with NLTK.
stop_words01 = set(stopwords.words('portuguese'))
print ('Total pt-BR stopwords:', len(stop_words01))

# Second set: the first set plus a handful of common legal-area tokens.
stop_words02 = stop_words01.union(
    ['turma', 'art', 'agr', '2ªt', '-', '1ªt', 'agravo']
)
print ('Total pt-BR stopwords appending common legal area words:', len(stop_words02))

# Third set: the second set plus every lower-cased document category found in
# the collection (only the 'categoria' field is requested).
stop_words03 = stop_words02.copy()
stop_words03.update(
    row['categoria'].lower()
    for row in collection.find({}, {"_id": 0, "categoria": 1})
)

print ('Total pt-BR stopwords appending common legal area words and document categories:', len(stop_words03))

Total pt-BR stopwords: 207
Total pt-BR stopwords appending common legal area words: 214
Total pt-BR stopwords appending common legal area words and document categories: 253


### Text preprocessing

In [7]:
def clean_text(text):
    """Return ``text`` without ``string.punctuation`` characters, lower-cased."""
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table).lower()
                 
def get_tokens(text, stopwords):
    """Tokenize ``text`` and return the tokens that are not in ``stopwords``.

    Tokenization is delegated to ``nltk.word_tokenize`` called with
    'portuguese'; the original token order is preserved.
    """
    kept = []
    for token in nltk.word_tokenize(text, 'portuguese'):
        if token in stopwords:
            continue
        kept.append(token)
    return kept

def get_dataset_most_common_words(dataset, stopwords, word_limit=5):
    """Print, for every label in ``dataset``, its ``word_limit`` most frequent tokens.

    Each item of ``dataset`` is read as label at index 0 and text at index 1.
    The text is cleaned with ``clean_text`` and tokenized with ``get_tokens``
    before being pooled per label.
    """
    tokens_by_label = defaultdict(list)

    for entry in dataset:
        label = entry[0]
        cleaned = clean_text(entry[1])
        tokens_by_label[label].extend(get_tokens(cleaned, stopwords))

    for label in tokens_by_label:
        frequencies = nltk.FreqDist(tokens_by_label[label])
        print(label)
        print(frequencies.most_common(word_limit), '\n')
        
#get_dataset_most_common_words(ds_suite_01, stop_words01, 1)

### Training and Evaluation

#### Import common libraries

In [8]:
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
import pickle
import time
from tabulate import tabulate

print ('Training libraries loaded')

Training libraries loaded


#### Define common training procedures

In [9]:
def get_splits (docs):
    """Shuffle ``docs`` and split them 80/20 into train and test sets.

    Args:
        docs: iterable of items holding the label at index 0 and the text at
            index 1. It is not modified.

    Returns:
        ``(X_train, X_test, y_train, y_test)``: four lists, where the ``X``
        lists hold texts and the ``y`` lists hold the matching labels. The
        first ``int(.8 * len(docs))`` shuffled items form the train split.
    """
    # Shuffle a copy: shuffling ``docs`` itself would silently reorder the
    # caller's dataset on every call.
    shuffled = list(docs)
    random.shuffle(shuffled)

    pivot = int(.8 * len(shuffled))
    train_docs = shuffled[:pivot]
    test_docs = shuffled[pivot:]

    X_train = [doc[1] for doc in train_docs]
    y_train = [doc[0] for doc in train_docs]
    X_test = [doc[1] for doc in test_docs]
    y_test = [doc[0] for doc in test_docs]

    return X_train, X_test, y_train, y_test

def evaluate_class (classifier, vectorizer, X_test, y_test):
    """Score ``classifier`` on the given texts and labels.

    The texts are transformed with ``vectorizer`` and passed to
    ``classifier.predict``; only those two steps are timed.

    Returns:
        ``(precision, recall, f1, seconds)`` where the three scores use
        weighted averaging with ``zero_division=1`` and ``seconds`` is the
        timed duration rounded to 4 decimals.
    """
    started_at = time.time()
    features = vectorizer.transform(X_test)
    y_pred = classifier.predict(features)
    elapsed = time.time() - started_at

    scoring_options = {'average': 'weighted', 'zero_division': 1}
    scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    precision, recall, f1 = (
        scorer(y_test, y_pred, **scoring_options) for scorer in scorers
    )

    return (precision, recall, f1, round(elapsed, 4))
    
def train_and_evaluate_model(dataset, stopwords, features=10000):
    """Train a hinge-loss SGD classifier on n-gram counts and print its scores.

    The dataset is split with ``get_splits``, the training texts are turned
    into 1- to 3-gram counts by a ``CountVectorizer``, and an
    ``SGDClassifier`` is fitted on them. Both splits are then scored with
    ``evaluate_class`` and the results are printed as a table.

    Args:
        dataset: documents in the form expected by ``get_splits``.
        stopwords: words the vectorizer should ignore. Any collection of
            strings is accepted; ``None`` or a plain string is forwarded
            unchanged.
        features: value passed as the vectorizer's ``max_features``.

    Returns:
        None. The reported total time starts after the vectorizer has been
        fitted, so it covers classifier training plus both evaluations only.
    """
    X_train, X_test, y_train, y_test = get_splits(dataset)

    # CountVectorizer documents ``stop_words`` as 'english', a list or None,
    # and recent scikit-learn releases reject other containers such as the
    # sets built in this file, so normalise collections to a list.
    if stopwords is None or isinstance(stopwords, str):
        stop_words = stopwords
    else:
        stop_words = list(stopwords)

    vectorizer = CountVectorizer(
        stop_words=stop_words,
        ngram_range=(1,3),
        min_df=3,
        analyzer='word',
        max_features=features
    )

    dtm = vectorizer.fit_transform(X_train)
    start_time = time.time()

    # tol=None disables the stopping criterion, so exactly max_iter epochs run.
    svmc = SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None
    ).fit(dtm, y_train)

    train_class = evaluate_class(svmc, vectorizer, X_train, y_train)
    test_class = evaluate_class(svmc, vectorizer, X_test, y_test)

    end_time = time.time() - start_time

    print(
        tabulate([
            ['Train', train_class[0], train_class[1], train_class[2], train_class[3]],
            ['Test', test_class[0], test_class[1], test_class[2], test_class[3]]
        ], headers=['', 'Precision', 'Recall', 'F1-score', 'time(s)'])
    )
    print('\nTotal execution time (s): ', round(end_time, 4))

#### Training

In [22]:
# vectorizer max features (number of most common words)
(vec_feat_01, vec_feat_02, vec_feat_03, vec_feat_04, vec_feat_05) = (
    100, 500, 1000, 5000, 10000
)

# stopwords
sw = stop_words01

##### [1.1] Training configuration suite (ds_suite_01, vec_feat_01)

In [11]:
# Run [1.1]: suite 01 dataset, vec_feat_01 feature cap.
train_and_evaluate_model(dataset=ds_suite_01, stopwords=sw, features=vec_feat_01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.772523   0.78441    0.768716     15.558
Test      0.770555   0.78374    0.76699       3.83

Total execution time (s):  22.408


##### [1.2] Training configuration suite (ds_suite_01, vec_feat_02)

In [25]:
# Run [1.2]: suite 01 dataset, vec_feat_02 feature cap.
train_and_evaluate_model(dataset=ds_suite_01, stopwords=sw, features=vec_feat_02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.88245   0.884549    0.881035    14.8654
Test      0.870351  0.873035    0.868024     3.702

Total execution time (s):  22.1864


##### [1.3] Training configuration suite (ds_suite_01, vec_feat_03)

In [12]:
# Run [1.3]: suite 01 dataset, vec_feat_03 feature cap.
train_and_evaluate_model(dataset=ds_suite_01, stopwords=sw, features=vec_feat_03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.903036  0.902842    0.900298     15.412
Test      0.889318  0.890379    0.887039      3.592

Total execution time (s):  22.914


##### [1.4] Training configuration suite (ds_suite_01, vec_feat_04)

In [26]:
# Run [1.4]: suite 01 dataset, vec_feat_04 feature cap.
train_and_evaluate_model(dataset=ds_suite_01, stopwords=sw, features=vec_feat_04)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.928768  0.928165    0.927185     15.582
Test      0.910169  0.909282    0.907563      3.712

Total execution time (s):  24.353


##### [1.5] Training configuration suite (ds_suite_01, vec_feat_05)

In [13]:
# Run [1.5]: suite 01 dataset, vec_feat_05 feature cap.
train_and_evaluate_model(dataset=ds_suite_01, stopwords=sw, features=vec_feat_05)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.93529   0.934957    0.933924     18.078
Test      0.916427  0.916463    0.914696      4.232

Total execution time (s):  28.6908


---

##### [2.1] Training configuration suite (ds_suite_02, vec_feat_01)

In [14]:
# Run [2.1]: suite 02 dataset, vec_feat_01 feature cap.
train_and_evaluate_model(dataset=ds_suite_02, stopwords=sw, features=vec_feat_01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.792793  0.795877    0.778974     26.775
Test      0.78551   0.790583    0.773016      6.92

Total execution time (s):  37.063


##### [2.2] Training configuration suite (ds_suite_02, vec_feat_02)

In [27]:
# Run [2.2]: suite 02 dataset, vec_feat_02 feature cap.
train_and_evaluate_model(dataset=ds_suite_02, stopwords=sw, features=vec_feat_02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.919702  0.916376    0.914162    28.266
Test      0.907094  0.904607    0.901902     6.9344

Total execution time (s):  39.6624


##### [2.3] Training configuration suite (ds_suite_02, vec_feat_03)

In [15]:
# Run [2.3]: suite 02 dataset, vec_feat_03 feature cap.
train_and_evaluate_model(dataset=ds_suite_02, stopwords=sw, features=vec_feat_03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.947612  0.94739     0.946487     28.519
Test      0.933056  0.933198    0.93121       7.394

Total execution time (s):  40.963


##### [2.4] Training configuration suite (ds_suite_02, vec_feat_04)

In [28]:
# Run [2.4]: suite 02 dataset, vec_feat_04 feature cap.
train_and_evaluate_model(dataset=ds_suite_02, stopwords=sw, features=vec_feat_04)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.971176  0.970748    0.970604     28.203
Test      0.951138  0.950068    0.949759      7.046

Total execution time (s):  41.636


##### [2.5] Training configuration suite (ds_suite_02, vec_feat_05)

In [17]:
# Run [2.5]: suite 02 dataset, vec_feat_05 feature cap.
train_and_evaluate_model(dataset=ds_suite_02, stopwords=sw, features=vec_feat_05)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.978263  0.978031    0.978075     28.988
Test      0.955089  0.953794    0.953837      7.166

Total execution time (s):  43.486


---

##### [3.1] Training configuration suite (ds_suite_03, vec_feat_01)

In [18]:
# Run [3.1]: suite 03 dataset, vec_feat_01 feature cap.
train_and_evaluate_model(dataset=ds_suite_03, stopwords=sw, features=vec_feat_01)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.747142  0.683052    0.691614    849.178
Test      0.753499  0.683062    0.693644    213.129

Total execution time (s):  1067.4357


##### [3.2] Training configuration suite (ds_suite_03, vec_feat_02)

In [24]:
# Run [3.2]: suite 03 dataset, vec_feat_02 feature cap.
train_and_evaluate_model(dataset=ds_suite_03, stopwords=sw, features=vec_feat_02)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.953231  0.928775    0.934997    811.546
Test      0.9498    0.925813    0.93162     200.011

Total execution time (s):  1019.7264


##### [3.3] Training configuration suite (ds_suite_03, vec_feat_03)

In [19]:
# Run [3.3]: suite 03 dataset, vec_feat_03 feature cap.
train_and_evaluate_model(dataset=ds_suite_03, stopwords=sw, features=vec_feat_03)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.95666   0.932772    0.934982    848.277
Test      0.954094  0.929743    0.932845    204.449

Total execution time (s):  1063.832


##### [3.4] Training configuration suite (ds_suite_03, vec_feat_04)

In [21]:
# Run [3.4]: suite 03 dataset, vec_feat_04 feature cap.
train_and_evaluate_model(dataset=ds_suite_03, stopwords=sw, features=vec_feat_04)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.966293  0.957553    0.958272    838.054
Test      0.961686  0.949322    0.951049    205.632

Total execution time (s):  1062.3347


##### [3.5] Training configuration suite (ds_suite_03, vec_feat_05)

In [23]:
# Run [3.5]: suite 03 dataset, vec_feat_05 feature cap.
train_and_evaluate_model(dataset=ds_suite_03, stopwords=sw, features=vec_feat_05)

         Precision    Recall    F1-score    time(s)
-----  -----------  --------  ----------  ---------
Train     0.977178  0.961279    0.96611     847.398
Test      0.973879  0.955149    0.960274    204.935

Total execution time (s):  1074.2136
