In [1]:
!pip install sklearn_crfsuite



In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

In [3]:
# Abre o arquivo de leis, para formata-lo igual o CRFtoNER
with open('Datasets/dataset.ptbr_leis.txt', 'r') as f_open, \
     open('Datasets/ptbr_leis.csv', 'w+') as file:
    
    # Abre todas as linhas do arquivo
    lines = f_open.readlines()
    
    # Escreve o header do csv
    file.write("Sentence #\tWord\tPOS\tTag\n")
    
    # Percorre linha por linha
    sent_i = 1
    for line in lines:
        
        # Divide a linha em palavras
        info = line.split()
        
        # Se for menor que duas, é um '\n',
        # Então, é uma nova sentença
        if len(info) != 2:
            sent_i += 1
        
        # Escreve a linha do arquivo
        else:
            
            # No formato:
            # Sentença 
            file.write("Sentence: %d\t" % sent_i)
            file.write("%s\t" % info[0])
            file.write("?\t")
            file.write("%s\n" % info[1])

In [4]:
#Reading the csv file
df = pd.read_csv('Datasets/ptbr_leis.csv', encoding = "UTF-8", sep="\t")

In [5]:
#Display first 10 rows
df.head(10)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,EMENTA,?,O
1,Sentence: 1,:,?,O
2,Sentence: 1,APELAÇÃO,?,O
3,Sentence: 1,CÍVEL,?,O
4,Sentence: 1,-,?,O
5,Sentence: 1,AÇÃO,?,O
6,Sentence: 1,DE,?,O
7,Sentence: 1,INDENIZAÇÃO,?,O
8,Sentence: 1,POR,?,O
9,Sentence: 1,DANOS,?,O


In [6]:
df.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,229277,229219,229277,229277
unique,7827,18020,1,13
top,Sentence: 7258,",",?,O
freq,755,16743,229277,197228


In [7]:
#Displaying the unique Tags
df['Tag'].unique()

array(['O', 'B-ORGANIZACAO', 'I-ORGANIZACAO', 'B-LEGISLACAO',
       'I-LEGISLACAO', 'B-JURISPRUDENCIA', 'I-JURISPRUDENCIA', 'B-PESSOA',
       'I-PESSOA', 'B-TEMPO', 'B-LOCAL', 'I-LOCAL', 'I-TEMPO'],
      dtype=object)

In [8]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-JURISPRUDENCIA,1104
1,B-LEGISLACAO,1920
2,B-LOCAL,611
3,B-ORGANIZACAO,2400
4,B-PESSOA,1525
5,B-TEMPO,1334
6,I-JURISPRUDENCIA,2863
7,I-LEGISLACAO,11119
8,I-LOCAL,806
9,I-ORGANIZACAO,4271


In [9]:
#Checking null values, if any.
df.isnull().sum()

Sentence #     0
Word          58
POS            0
Tag            0
dtype: int64

In [10]:
df = df.fillna(method = 'ffill')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,EMENTA,?,O
1,Sentence: 1,:,?,O
2,Sentence: 1,APELAÇÃO,?,O
3,Sentence: 1,CÍVEL,?,O
4,Sentence: 1,-,?,O


In [11]:
# This is a class te get sentence. The each sentence will be list of tuples with its tag and pos.
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(),
                                                       s['POS'].values.tolist(),
                                                       s['Tag'].values.tolist())]
        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent +=1
            return s
        except:
            return None

In [18]:
#Displaying one full sentence
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]

In [21]:
#sentence with its pos and tag.
sent = getter.get_text()
print(sent)

[('-', '?', 'O'), ('O', '?', 'O'), ('art', '?', 'B-LEGISLACAO'), ('.', '?', 'I-LEGISLACAO'), ('178', '?', 'I-LEGISLACAO'), (',', '?', 'I-LEGISLACAO'), ('II', '?', 'I-LEGISLACAO'), (',', '?', 'I-LEGISLACAO'), ('do', '?', 'I-LEGISLACAO'), ('CPC', '?', 'I-LEGISLACAO'), ('prescreve', '?', 'O'), ('que', '?', 'O'), ('compete', '?', 'O'), ('ao', '?', 'O'), ('Ministério', '?', 'B-ORGANIZACAO'), ('Público', '?', 'I-ORGANIZACAO'), ('intervir', '?', 'O'), ('nas', '?', 'O'), ('causas', '?', 'O'), ('em', '?', 'O'), ('que', '?', 'O'), ('há', '?', 'O'), ('interesses', '?', 'O'), ('de', '?', 'O'), ('incapazes', '?', 'O'), (',', '?', 'O'), ('dispondo', '?', 'O'), ('o', '?', 'O'), ('art', '?', 'O'), ('.', '?', 'O'), ('279', '?', 'O'), ('do', '?', 'O'), ('mesmo', '?', 'O'), ('diploma', '?', 'O'), ('que', '?', 'O'), ('o', '?', 'O'), ('processo', '?', 'O'), ('será', '?', 'O'), ('nulo', '?', 'O'), ('quando', '?', 'O'), ('o', '?', 'O'), ('Ministério', '?', 'B-ORGANIZACAO'), ('Público', '?', 'I-ORGANIZACAO'),

In [19]:
sentences = getter.sentences

sentences[1]

[('II', '?', 'O'), ('.', '?', 'O')]

In [28]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        #'postag': postag,
        #'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            #'-1:postag': postag1,
            #'-1:postag[:2]': postag1[:2],

        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            #'+1:postag': postag1,
            #'+1:postag[:2]': postag1[:2],

        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]
  

In [29]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [31]:
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [32]:
#Predicting on the test set.
y_pred = crf.predict(X_test)

In [33]:
f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

0.9851018990237566


In [34]:
report = flat_classification_report(y_test, y_pred)
print(report)

                  precision    recall  f1-score   support

B-JURISPRUDENCIA       0.92      0.88      0.90       234
    B-LEGISLACAO       0.91      0.92      0.91       418
         B-LOCAL       0.84      0.88      0.86       111
   B-ORGANIZACAO       0.93      0.90      0.92       464
        B-PESSOA       0.99      0.96      0.98       286
         B-TEMPO       0.95      0.92      0.93       280
I-JURISPRUDENCIA       0.90      0.89      0.90       647
    I-LEGISLACAO       0.96      0.94      0.95      2572
         I-LOCAL       0.89      0.93      0.91       163
   I-ORGANIZACAO       0.90      0.94      0.92       808
        I-PESSOA       0.99      0.97      0.98       570
         I-TEMPO       0.96      0.90      0.93       201
               O       0.99      0.99      0.99     41248

       micro avg       0.99      0.99      0.99     48002
       macro avg       0.93      0.92      0.93     48002
    weighted avg       0.99      0.99      0.99     48002

