In [None]:
!pip install sklearn_crfsuite

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

In [None]:
import spacy
from spacy.tokenizer import Tokenizer
!python -m spacy download pt_core_news_sm
import pt_core_news_sm
from tqdm import tqdm

# Load the pt_core_news_sm spaCy pipeline once; create_dataset() calls `sp`
# on each sentence to obtain the POS tag of every token.
sp = pt_core_news_sm.load()

In [None]:
import os.path

def _write_sentence(writer, sent_i, list_sent):
    """Write the CSV rows of one sentence and return how many were written.

    The words are joined into one string, run through the spaCy pipeline
    ``sp`` and the resulting tokens are aligned back onto the original words,
    because spaCy may split a single word into several tokens.

    Args:
        writer: ``csv.writer`` that receives one row per aligned word.
        sent_i: Sentence number used to build the "Sentence: N" column.
        list_sent: Ordered list of ``(word, tag)`` pairs of the sentence.

    Returns:
        Number of rows written. It is smaller than ``len(list_sent)`` when
        the spaCy tokens stop lining up with the original word boundaries.
    """
    doc = sp(''.join(word + ' ' for word, _tag in list_sent))

    pending = ''  # text of the spaCy tokens not yet matched to a word
    i = 0         # index of the word currently waiting for its POS tag
    for token in doc:
        if i >= len(list_sent):
            # Every word already has a row; never index past the sentence.
            break

        if pending + token.text == list_sent[i][0]:
            # The word is complete: it receives the POS of its last token.
            writer.writerow(["Sentence: %d" % sent_i,
                             list_sent[i][0],
                             token.pos_,
                             list_sent[i][1]])
            pending = ''
            i += 1
        else:
            pending += token.text

    return i


def create_dataset():
    """Convert the "word tag" corpus into a tab-separated CSV with POS tags.

    Reads ``Datasets/dataset.ptbr_leis.txt`` (one "word tag" pair per line;
    any line that does not split into exactly two fields ends the current
    sentence) and writes ``Datasets/ptbr_leis.csv`` with the columns
    ``Sentence #``, ``Word``, ``POS`` and ``Tag``.

    Both files are handled as UTF-8, matching the encoding used to read the
    CSV back. Rows are produced with the ``csv`` module so that a token
    containing a double quote is escaped instead of corrupting the file.
    Sentences are numbered consecutively from 1; empty sentences and
    sentences that produced no row do not consume a number. The last
    sentence is written even when the input lacks a trailing blank line.
    """
    import csv

    # Open the corpus and the CSV that will mirror the CRF-for-NER layout.
    with open('Datasets/dataset.ptbr_leis.txt', 'r', encoding='utf-8') as f_open, \
         open('Datasets/ptbr_leis.csv', 'w+', encoding='utf-8', newline='') as file:

        writer = csv.writer(file, delimiter='\t', lineterminator='\n')

        # Read every line up front so tqdm can show the total.
        lines = f_open.readlines()

        # CSV header.
        writer.writerow(["Sentence #", "Word", "POS", "Tag"])

        list_sent = []
        sent_i = 1

        for line in tqdm(lines):
            info = line.split()

            if len(info) == 2:
                # Regular "word tag" line: buffer it for the current sentence.
                list_sent.append((info[0], info[1]))
                continue

            # Anything else (typically a blank line) closes the sentence.
            if list_sent and _write_sentence(writer, sent_i, list_sent):
                sent_i += 1
            list_sent = []

        # Flush the final sentence when the file does not end with a separator.
        if list_sent:
            _write_sentence(writer, sent_i, list_sent)


# The CSV acts as a cache: it is only (re)built when it is not on disk yet.
csv_missing = not os.path.isfile('Datasets/ptbr_leis.csv')
if csv_missing:
    print("Criando dataset...")
    create_dataset()

print("Dataset criado!")

In [None]:
# Read the generated tab-separated file, keeping every cell as plain text.
#  - keep_default_na=False: tokens such as "NA", "null" or "None" are real
#    words/tags here and must not be converted to NaN by the parser.
#  - dtype=str: purely numeric tokens stay strings, so the string methods
#    used during feature extraction always work.
df = pd.read_csv('Datasets/ptbr_leis.csv',
                 encoding="UTF-8",
                 sep="\t",
                 dtype=str,
                 keep_default_na=False)

In [None]:
# Preview the first 10 rows to check the Sentence # / Word / POS / Tag layout.
df.head(10)

In [None]:
# Summary statistics of the token table.
df.describe()

In [None]:
# List the distinct values found in the 'Tag' column.
df['Tag'].unique()

In [None]:
# Number of rows per 'Tag' value, returned as a frame with a 'counts' column.
df.groupby('Tag').size().reset_index(name='counts')

In [None]:
# Number of rows per 'POS' value, returned as a frame with a 'counts' column.
df.groupby('POS').size().reset_index(name='counts')

In [None]:
# Count the missing (null) cells in each column.
df.isnull().sum()

In [None]:
# Forward-fill missing cells with the value of the previous row.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases.
# NOTE(review): a filled 'Word' cell becomes a copy of the preceding token;
# confirm that this is the intended treatment of missing words.
df = df.ffill()

In [None]:
# Helper class that regroups the token table into sentences; each sentence is
# a list of (word, pos, tag) tuples.
class sentence(object):
    """Group a token table into sentences of ``(word, pos, tag)`` tuples.

    Args:
        df: DataFrame with the columns "Sentence #", "Word", "POS" and "Tag",
            one row per token.
    """

    def __init__(self, df):
        # 1-based number of the sentence get_text() will return next.
        self.n_sent = 1
        self.df = df
        self.empty = False
        # sort=False keeps the sentences in the order they appear in df; the
        # default lexicographic sort would put "Sentence: 10" before
        # "Sentence: 2".
        self.grouped = (
            self.df
            .groupby("Sentence #", sort=False)[["Word", "POS", "Tag"]]
            .apply(self._to_tuples)
        )
        # Plain list with one entry (list of tuples) per sentence.
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_tuples(group):
        """Turn the rows of one sentence into a list of (word, pos, tag)."""
        return list(zip(group['Word'].values.tolist(),
                        group['POS'].values.tolist(),
                        group['Tag'].values.tolist()))

    def get_text(self):
        """Return the next sentence and advance the internal counter.

        Returns:
            The list of ``(word, pos, tag)`` tuples stored under
            "Sentence: <n_sent>", or ``None`` when no sentence has that label
            (the counter is then left unchanged).
        """
        key = 'Sentence: {}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing label means "no such sentence"; any other error
            # is a real problem and must surface.
            return None
        self.n_sent += 1
        return s

In [None]:
# Build the sentence getter, then render every sentence as a single string of
# its words joined by spaces. Nothing is displayed by this cell.
# NOTE: `sentences` is rebound to the (word, pos, tag) lists in a later cell.
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]

In [None]:
# Fetch the next sentence from the getter (a list of (word, pos, tag) tuples,
# or None when there is no such sentence) and print it.
sent = getter.get_text()
print(sent)

In [None]:
# Rebind `sentences` to the per-sentence lists of (word, pos, tag) tuples;
# these are the inputs for feature and label extraction below.
sentences = getter.sentences

In [None]:
def isNumeroRomano(numeral):
    """Tell whether ``numeral`` consists only of Roman-numeral symbols.

    The check is case-insensitive and purely character based: every character
    must be one of M, D, C, L, X, V, I or a parenthesis. The order of the
    symbols is not validated, and an empty string yields ``True``.
    """
    allowed = frozenset("MDCLXVI()")
    return all(symbol in allowed for symbol in numeral.upper())

def _token_features(word, postag, prefix):
    """Return the surface/POS features of one token, keys prefixed by ``prefix``."""
    return {
        prefix + 'word.lower()': word.lower(),
        prefix + 'Número Romano': isNumeroRomano(word),
        prefix + 'word.istitle()': word.istitle(),
        prefix + 'word.isdigit()': word.isdigit(),
        prefix + 'word.isupper()': word.isupper(),
        prefix + 'postag': postag,
        prefix + 'postag[:2]': postag[:2],
    }


def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of tuples whose first two items are the word and its
            POS tag.
        i: Index of the token to describe.

    Returns:
        Dict with a constant 'bias', the features of the token itself, the
        same features of the previous token under the '-1:' prefix (or
        'BOS' when ``i`` is the first position) and of the next token under
        the '+1:' prefix (or 'EOS' when ``i`` is the last position).
    """
    features = {'bias': 1.0}
    features.update(_token_features(sent[i][0], sent[i][1], ''))

    if i > 0:
        # Previous token: every key carries the '-1:' prefix, including
        # isdigit, which used to be stored under '+1:word.isdigit()'.
        features.update(_token_features(sent[i-1][0], sent[i-1][1], '-1:'))
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        # Next token: same feature set, so '+1:word.isdigit()' now really
        # describes the following word.
        features.update(_token_features(sent[i+1][0], sent[i+1][1], '+1:'))
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label of every (token, postag, label) triple of ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token of every (token, postag, label) triple of ``sent``."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens
  

In [None]:
# One list of feature dicts (X) and one list of labels (y) per sentence.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [None]:
# Hold out 20% of the sentences for evaluation. The fixed random_state makes
# the split, and therefore the reported scores, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Configure the CRF (L-BFGS training, at most 100 iterations, c1/c2 = 0.1,
# no extra transition features) and fit it on the training sentences.
# NOTE(review): c1/c2 are presumably the L1/L2 regularisation coefficients;
# confirm against the sklearn-crfsuite documentation.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)
crf.fit(X_train, y_train)

In [None]:
# Predict a label sequence for every held-out sentence.
y_pred = crf.predict(X_test)

In [None]:
# Weighted-average F1 computed over the flattened token labels.
# NOTE(review): every label present in y_test takes part in the average;
# confirm whether a majority "outside" tag should be excluded.
f1_score = flat_f1_score(y_test, y_pred, average = 'weighted')
print(f1_score)

In [None]:
# Per-label classification report over the flattened token labels.
report = flat_classification_report(y_test, y_pred)
print(report)