<a href="https://colab.research.google.com/github/leobonn1/devai/blob/main/brazilian_court_decisions_tf_idf_vectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [53]:
!pip install datasets
!pip install catboost



In [54]:
from datasets import load_dataset

# Load the court-decision dataset; the cells that follow read its 'train' and
# 'test' splits.
dataset = load_dataset(path='joelniklaus/brazilian_court_decisions')

In [55]:
# Pull the decision text and its judgment label out of each split.
train_texts, train_labels = (
    dataset['train'][column]
    for column in ('decision_description', 'judgment_label')
)
test_texts, test_labels = (
    dataset['test'][column]
    for column in ('decision_description', 'judgment_label')
)

# Print the number of texts and labels per split so a length mismatch is visible.
print(f'\nTrain size: {len(train_texts)} -- {len(train_labels)}')
print(f'Test size: {len(test_texts)} -- {len(test_labels)}')



Train size: 3234 -- 3234
Test size: 405 -- 405


In [56]:
from collections import Counter

# Class balance of each split (how many decisions carry each judgment label).
print(
    f'Train Labels Distribution: {Counter(train_labels)}',
    f'Test Labels Distribution: {Counter(test_labels)}',
    sep='\n',
)

Train Labels Distribution: Counter({'no': 1960, 'partial': 677, 'yes': 597})
Test Labels Distribution: Counter({'no': 234, 'partial': 93, 'yes': 78})


In [57]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the test labels. Both label variables are overwritten
# with their encoded versions.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)

# Same distribution as before, now keyed by the integer codes.
print(
    f'Train Labels Distribution: {Counter(train_labels)}',
    f'Test Labels Distribution: {Counter(test_labels)}',
    sep='\n',
)

Train Labels Distribution: Counter({0: 1960, 1: 677, 2: 597})
Test Labels Distribution: Counter({0: 234, 1: 93, 2: 78})


In [58]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels only while they are still raw strings. Fitting a fresh
# LabelEncoder on labels that are already integers would make
# `label_encoder.classes_` hold the integer codes instead of the original
# class names, so `inverse_transform` could no longer recover the names.
# With this guard the cell is safe to run any number of times.
if isinstance(train_labels[0], str):
  label_encoder = LabelEncoder()
  train_labels = label_encoder.fit_transform(train_labels)
  test_labels = label_encoder.transform(test_labels)

print(f'Train Labels Distribution: {Counter(train_labels)}')
print(f'Test Labels Distribution: {Counter(test_labels)}')

Train Labels Distribution: Counter({0: 1960, 1: 677, 2: 597})
Test Labels Distribution: Counter({0: 234, 1: 93, 2: 78})


In [59]:
# Download the `pt_core_news_sm` spaCy pipeline that `preprocess_texts` loads by name.
!python -m spacy download pt_core_news_sm

import spacy

from tqdm import tqdm

def preprocess_texts(list_texts):
  """Normalize every text in `list_texts` with the spaCy Portuguese pipeline.

  The `pt_core_news_sm` pipeline is loaded (with its 'ner' component disabled)
  on each call. For every text, tokens tagged as punctuation and tokens flagged
  as stop words are dropped; the lemmas of the remaining tokens are lowercased
  and joined with single spaces. A tqdm progress bar tracks the work.

  Args:
    list_texts: sized iterable of strings to normalize.

  Returns:
    A new list of normalized strings, in the same order as the input.
  """
  pipeline = spacy.load('pt_core_news_sm', disable=['ner'])

  def normalize(raw_text):
    # Keep a token only when it is neither punctuation nor a stop word.
    kept_lemmas = (
        token.lemma_.lower()
        for token in pipeline(raw_text)
        if not (token.pos_ == 'PUNCT' or token.is_stop)
    )
    return ' '.join(kept_lemmas)

  progress = tqdm(list_texts, total=len(list_texts), desc='Preprocessing')
  return [normalize(raw_text) for raw_text in progress]

# Normalize both splits with the same function (train first, then test); the
# original text variables are replaced by their preprocessed versions.
train_texts, test_texts = (
    preprocess_texts(split_texts) for split_texts in (train_texts, test_texts)
)

2023-12-11 20:20:16.911113: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-11 20:20:16.911175: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-11 20:20:16.911200: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting pt-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.6.0/pt_core_news_sm-3.6.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can no

Preprocessing: 100%|██████████| 3234/3234 [00:33<00:00, 96.66it/s]
Preprocessing: 100%|██████████| 405/405 [00:04<00:00, 82.16it/s] 


In [65]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Text representation to use; switch by (un)commenting one of the lines below.
#vectorizer_option = 'binary'
#vectorizer_option = 'count'
vectorizer_option = 'tf_idf'

if vectorizer_option == 'binary':
  # Presence/absence of each unigram.
  vectorizer = CountVectorizer(binary=True, max_features=None, ngram_range=(1, 1))
elif vectorizer_option == 'count':
  # Raw unigram counts.
  vectorizer = CountVectorizer(binary=False, max_features=None, ngram_range=(1, 1))
elif vectorizer_option == 'tf_idf':
  # TF-IDF weighted unigrams, capped at 500 features.
  vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=500)
else:
  # Fail here with a clear message instead of leaving `vectorizer` as None and
  # hitting an unrelated error when it is first used.
  raise ValueError(
      f"Unknown vectorizer_option {vectorizer_option!r}; "
      "expected 'binary', 'count' or 'tf_idf'"
  )

print(f'Vectorizer Option: {vectorizer_option}')

Vectorizer Option: tf_idf


In [66]:
# Fit the vectorizer on the training texts only and reuse the fitted vocabulary
# for the test texts; both results are converted to dense arrays.
X_train = vectorizer.fit_transform(train_texts).toarray()
X_test = vectorizer.transform(test_texts).toarray()

# Show the first training document before and after vectorization, followed by
# the size of the learned vocabulary.
print(
    f'\nExample Raw Text: {train_texts[0]}',
    f'\nExample Vectorized Text: {X_train[0]}',
    f'Vocabulary: {len(vectorizer.vocabulary_)}',
    sep='\n',
)


Example Raw Text: direito penal processual penal revisão criminal artigo 621 código processo penal requerente condenado júri popular prática crimes homicídio duplamente qualificado homicídio qualificado tentado pleito refazimento dosimetria pena imposta requerente admissibilidade via revisional precedentes alegação erro processo dosimetria pena comportamento vítima circunstância judicial neutra considerada desfavorável sentenciando precedentes superior tribunal justiça entendimento câmara criminal tribunal justiça afastamento culpabilidade ausência exposição motivos incremento pena-base afastado desvalor valoração atribuída circunstâncias crime mantida fundamentação idônea pena-base reduzida compensação agravante motivação torpe atenuante confissão espontânea pena privativa liberdade redimensionada crime tentado aplicada fração redutora máxima ante distância atos praticados requerente consumação crime pena redimensionada

Example Vectorized Text: [0.         0.         0.         0.  

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoostClassifier
import lightgbm as ltb
from sklearn.neural_network import MLPClassifier
from sklearn import svm

# Models to compare, keyed by the name used in the printed report and in the
# CSV file name. The scikit-learn tree-based models are seeded with the same
# value the MLP already uses, so repeated runs give the same reports; the other
# models keep their library defaults.
classifiers = {
    #'Logistic_Regression': LogisticRegression(class_weight='balanced',
    #                                          max_iter=1000),
    #'Multinomial_NB': MultinomialNB(),
    'KNN': KNeighborsClassifier(n_neighbors=3, weights = 'uniform'),
    'Random_Forest': RandomForestClassifier(random_state=1),
    'Decision_Tree' : DecisionTreeClassifier(random_state=1),
    'Extra_Tree' : ExtraTreesClassifier(random_state=1),
    'Cat_Boost' : CatBoostClassifier(iterations=1000),
    'Light_GBM' : ltb.LGBMClassifier(),
    'MLP' :  MLPClassifier(random_state=1, max_iter=300),
    'SVM' : svm.SVC()
}

In [69]:
# Display the dense test feature matrix built by the vectorizer.
X_test

array([[0.        , 0.        , 0.        , ..., 0.        , 0.09962891,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [76]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import classification_report, ConfusionMatrixDisplay

# Train every configured model on the training features, then report its
# per-class metrics on the test set (printed and saved to
# '<vectorizer_option>_<classifier_name>.csv') together with a confusion matrix.
# NOTE(review): the saved output of this cell ends with an IndexError raised
# while KNN was being evaluated; the cause is not visible in this cell —
# TODO investigate before relying on the results.
for classifier_name, classifier in classifiers.items():
  print('***********************************************')
  print(f'\nClassifier: {classifier_name}')

  classifier.fit(X_train, train_labels)

  y_pred = classifier.predict(X_test)

  # zero_division=0 reports 0 for a metric whose denominator is zero (e.g. a
  # class that is never predicted) without emitting a warning per model.
  report = classification_report(test_labels, y_pred, output_dict=True,
                                 zero_division=0)
  df = pd.DataFrame(report).transpose()
  df.to_csv(f'{vectorizer_option}_{classifier_name}.csv')
  print(df)

  # Title each figure so the matrices of different models can be told apart.
  disp = ConfusionMatrixDisplay.from_predictions(test_labels, y_pred)
  disp.ax_.set_title(f'{classifier_name} ({vectorizer_option})')

  plt.show()


***********************************************

Classifier: KNN


IndexError: ignored

In [78]:
# Look the KNN model up by name rather than through the loop variable
# `classifier`, which holds whichever model the evaluation loop handled last.
classifiers['KNN'].weights

'uniform'

In [79]:
# Ask the KNN model for the neighbor indices of every test sample (indices only,
# no distances). The model is looked up by name rather than through the loop
# variable `classifier`, which holds whichever model the loop handled last.
neigh_ind = classifiers['KNN'].kneighbors(X_test, return_distance=False)

In [80]:
# Display the neighbor indices returned by `kneighbors`.
# NOTE(review): the saved output shows values far outside the 3234-row training
# set, so they cannot be valid row indices — TODO confirm the cause.
neigh_ind

array([[-3016291270276543133,      135734599565552,      135734592426816],
       [     135734592426896,      135734599565552,                  803],
       [     135734592297584,      135734599565552,                 1947],
       ...,
       [     135734599565520,      135734592431856,                  902],
       [     135734599565520,      135734592450032,                  840],
       [-2193175205529966673,      135734599565552,      135734592450096]])

In [82]:
# Number of training labels stored on the fitted KNN model (`_y` is a private
# attribute). The model is looked up by name rather than through the loop
# variable `classifier`, which holds whichever model the loop handled last.
len(classifiers['KNN']._y)

3234

In [85]:
# Spot-check one stored training label on the fitted KNN model. The model is
# looked up by name rather than through the loop variable `classifier`, which
# holds whichever model the loop handled last.
classifiers['KNN']._y[50]

0