In [1]:
!pip install sklearn_crfsuite



In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

In [3]:
import spacy
from spacy.tokenizer import Tokenizer
!python -m spacy download pt_core_news_sm
import pt_core_news_sm
from tqdm import tqdm

sp = pt_core_news_sm.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('pt_core_news_sm')


In [4]:
import os.path

def create_dataset():
    # Abre o arquivo de leis, para formata-lo igual o CRFtoNER
    with open('Datasets/dataset.ptbr_leis.txt', 'r') as f_open, \
         open('Datasets/ptbr_leis.csv', 'w+') as file:

        # Abre todas as linhas do arquivo
        lines = f_open.readlines()

        # Escreve o header do csv
        file.write("Sentence #\tWord\tPOS\tTag\n")

        list_sent = []

        # Percorre linha por linha
        sent_i = 1
        for line in tqdm(lines):

            # Divide a linha em palavras
            info = line.split()

            # Se for menor que duas, é um '\n',
            # Então, é uma nova sentença
            if len(info) != 2:
                
                string_sent = ' '.join([s[0] for s in list_sent])
                
                sent_sp = sp(string_sent)

                string = ""
                i = 0
                for s in sent_sp:
                    if string + s.text == list_sent[i][0]:

                        file.write("Sentence: %d\t" % sent_i)
                        file.write("%s\t" % list_sent[i][0])
                        file.write("%s\t" % s.pos_)
                        file.write("%s\n" % list_sent[i][1])
                        
                        string = ''
                        i += 1
                    else:
                        string += s.text

                list_sent = []
                sent_i += 1

            # Escreve a linha do arquivo
            else:
                 list_sent.append((info[0], info[1]))


if not os.path.isfile('Datasets/ptbr_leis.csv'):
    print("Criando dataset...")
    create_dataset()
    
print("Dataset criado!")

Dataset criado!


In [5]:
#Reading the csv file
df = pd.read_csv('Datasets/ptbr_leis.csv', 
                 encoding = "UTF-8", 
                 sep="\t")

In [6]:
#Display first 10 rows
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,EMENTA,NOUN,O
1,Sentence: 1,:,PUNCT,O
2,Sentence: 1,APELAÇÃO,PROPN,O
3,Sentence: 1,CÍVEL,PROPN,O
4,Sentence: 1,-,PUNCT,O
5,Sentence: 1,AÇÃO,PROPN,O
6,Sentence: 1,DE,ADP,O
7,Sentence: 1,INDENIZAÇÃO,PROPN,O
8,Sentence: 1,POR,ADP,O
9,Sentence: 1,DANOS,PROPN,O


In [7]:
df.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,229277,229219,229277,229277
unique,7827,18020,17,13
top,Sentence: 7258,",",NOUN,O
freq,755,16743,41508,197228


In [8]:
#Displaying the unique Tags
df['Tag'].unique()

array(['O', 'B-ORGANIZACAO', 'I-ORGANIZACAO', 'B-LEGISLACAO',
       'I-LEGISLACAO', 'B-JURISPRUDENCIA', 'I-JURISPRUDENCIA', 'B-PESSOA',
       'I-PESSOA', 'B-TEMPO', 'B-LOCAL', 'I-LOCAL', 'I-TEMPO'],
      dtype=object)

In [9]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-JURISPRUDENCIA,1104
1,B-LEGISLACAO,1920
2,B-LOCAL,611
3,B-ORGANIZACAO,2400
4,B-PESSOA,1525
5,B-TEMPO,1334
6,I-JURISPRUDENCIA,2863
7,I-LEGISLACAO,11119
8,I-LOCAL,806
9,I-ORGANIZACAO,4271


In [10]:
df.groupby('POS').size().reset_index(name='counts')

Unnamed: 0,POS,counts
0,ADJ,12218
1,ADP,28771
2,ADV,7159
3,AUX,2716
4,CCONJ,4109
5,DET,19420
6,INTJ,315
7,NOUN,41508
8,NUM,6465
9,PART,1


In [11]:
#Checking null values, if any.
df.isnull().sum()

Sentence #     0
Word          58
POS            0
Tag            0
dtype: int64

In [12]:
df = df.fillna(method = 'ffill')

In [13]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['Tag'].values.tolist())]
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None

In [14]:
#Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]

In [15]:
#sentence with its pos and tag.
sent = getter.get_text()
print(sent)

[('EMENTA', 'NOUN', 'O'), (':', 'PUNCT', 'O'), ('APELAÇÃO', 'PROPN', 'O'), ('CÍVEL', 'PROPN', 'O'), ('-', 'PUNCT', 'O'), ('AÇÃO', 'PROPN', 'O'), ('DE', 'ADP', 'O'), ('INDENIZAÇÃO', 'PROPN', 'O'), ('POR', 'ADP', 'O'), ('DANOS', 'PROPN', 'O'), ('MORAIS', 'PROPN', 'O'), ('-', 'PUNCT', 'O'), ('PRELIMINAR', 'PROPN', 'O'), ('-', 'PUNCT', 'O'), ('ARGUIDA', 'PROPN', 'O'), ('PELO', 'PROPN', 'O'), ('MINISTÉRIO', 'PROPN', 'B-ORGANIZACAO'), ('PÚBLICO', 'VERB', 'I-ORGANIZACAO'), ('EM', 'ADP', 'O'), ('GRAU', 'PROPN', 'O'), ('RECURSAL', 'PROPN', 'O'), ('-', 'PUNCT', 'O'), ('NULIDADE', 'PROPN', 'O'), ('-', 'PUNCT', 'O'), ('AUSÊNCIA', 'PROPN', 'O'), ('DE', 'ADP', 'O'), ('INTERVENÇÃO', 'PROPN', 'O'), ('DO', 'PROPN', 'O'), ('PARQUET', 'PROPN', 'O'), ('PARQUET', 'PROPN', 'O'), ('INSTÂNCIA', 'PROPN', 'O'), ('A', 'DET', 'O'), ('QUO', 'PROPN', 'O'), ('-', 'PUNCT', 'O'), ('PRESENÇA', 'PROPN', 'O'), ('DE', 'PROPN', 'O'), ('INCAPAZ', 'PROPN', 'O'), ('-', 'PUNCT', 'O'), ('PREJUÍZO', 'PROPN', 'O'), ('EXISTENTE', 

In [16]:
sentences = getter.sentences

In [17]:
def isNumeroRomano(numeral):
    numeral = numeral.upper()
    validRomanNumerals = ["M", "D", "C", "L", "X", "V", "I", "(", ")"]
    valid = True
    for letters in numeral:
        if letters not in validRomanNumerals:
            valid = False
            break
    return valid

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'postag': postag, # 1) Tag
        'word.lower()': word.lower(), # 2) Word
        'word': word, # 13) cap
        'word.istitle()': word.istitle(), # 14) ini
        'word.isdigit()': word.isdigit(), # 15) digit
        'word.isupper()': word.isupper(), # extra
        'isNumeroRomano()': isNumeroRomano(word), #extra
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(), # 3) prevW
            '-1:postag': postag1, # 4) prevT
            '-1:word': word1, # 5) prevCap

        })
    elif i > 1:
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        features.update({
            '-2:word.lower()' : word2.lower(), # 6) prev2W
            '-2:postag': postag2, # 7) prev2T
            '-2:word': word2, # 8) prev2Cap
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(), # 9) nextW
            '+1:postag': postag1, # 10) nextT
            '+1:word': word, # 11) nextCap
        })
    
    elif i < len(sent)-2:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+2:word.lower()': word1.lower(), # 9) nextW
            '+2:postag': postag1, # 10) nextT
            '+2:word': word, # 11) nextCap
        })
    
    else:
        features['EOS'] = True

    return features



def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]
  

In [18]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [20]:
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [21]:
#Predicting on the test set.
y_pred = crf.predict(X_test)

In [22]:
f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

0.9845966655610088


In [23]:
report = flat_classification_report(y_test, y_pred)
print(report)

                  precision    recall  f1-score   support

B-JURISPRUDENCIA       0.94      0.84      0.89       235
    B-LEGISLACAO       0.95      0.91      0.93       363
         B-LOCAL       0.92      0.89      0.90       111
   B-ORGANIZACAO       0.94      0.89      0.91       456
        B-PESSOA       0.97      0.94      0.96       309
         B-TEMPO       0.93      0.76      0.84       241
I-JURISPRUDENCIA       0.93      0.90      0.92       606
    I-LEGISLACAO       0.97      0.95      0.96      2153
         I-LOCAL       0.97      0.90      0.93       162
   I-ORGANIZACAO       0.92      0.91      0.92       832
        I-PESSOA       0.99      0.95      0.97       640
         I-TEMPO       0.99      0.91      0.95       148
               O       0.99      1.00      0.99     38523

       micro avg       0.98      0.98      0.98     44779
       macro avg       0.95      0.90      0.93     44779
    weighted avg       0.98      0.98      0.98     44779

