In [4]:
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import numpy as np
from joblib import dump, load

# Processamento dos dados de entrada

É criada um dataframe contendo um "bloco" por linha onde cada bloco possui um tipo. As colunas contém o texto do bloco e o tipo de emenda do bloco.

In [2]:
# Processamento dos Dados
path = "./dados/tagFiles"


tagList = ["I-","E-","B-"]
files = []
for dirpath, dirnames, filenames in os.walk(path):
    for filename in filenames:
        files.append(os.path.normpath(os.path.join(dirpath,filename)))

emendas = pd.DataFrame(columns = ['text','emdType'])
tupEmd = []


for file in files:
    with open(file, encoding = "utf-8") as f:
        emdTxt = []
        previousType = None
        for line in f.readlines():
            token,emdType = line.split()
            if any(x in emdType for x in tagList):
                emdType = emdType[2:]
            if previousType != emdType and previousType != None:
                tupEmd.append([" ".join(emdTxt), previousType])
                emdTxt = []
                
            emdTxt.append(token)
            previousType = emdType
        tupEmd.append([" ".join(emdTxt), emdType])


for index in range(len(tupEmd)):
    emendas.loc[index,'text'] = tupEmd[index][0]
    emendas.loc[index,'emdType'] = tupEmd[index][1]

# Classificador de Emendas ADD

In [4]:
emdAddColumn = emendas['emdType'].where(emendas['emdType'] == 'ADD', "N")

In [21]:
emendasADD = emendas.copy()
emendasADD = emendasADD.drop('emdType',axis = 1)
emendasADD['emdType'] = emdAddColumn

emendasADD.head(10)

Unnamed: 0,text,emdType
0,MPV 870 00001 COMISSÃO MISTA DA MEDIDA PROVISÓ...,N
1,Suprime o inciso XIV e o §2º do art . 21 da Me...,N
2,MPV 870 00002 COMISSÃO MISTA DA MEDIDA PROVISÓ...,N
3,"Suprime o inciso II , do art . 5o , da Medida ...",N
4,MPV 870 00003 COMISSÃO MISTA DA MEDIDA PROVISÓ...,N
5,Altera a gestão do Serviço Florestal Brasileir...,ADD
6,MPV 870 00004 COMISSÃO MISTA DA MEDIDA PROVISÓ...,N
7,"Art . 1º . Os arts . 19 e 57 , II , da Medida ...",N
8,MPV 870 00005 CMARA DOS DEPUTADOS DEPUTADO FED...,N
9,passam a vigorar CD/19979.34213-60 com as segu...,N


In [38]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(emendasADD['text'])
y = emendasADD['emdType']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.66, random_state=42)



clf = SVC(gamma = 'auto')
clf.fit(X_train,y_train)

print(clf.score(X_test,y_test))

y_pred = clf.predict(X_test)

print(confusion_matrix(y_pred,y_test, labels = ['ADD','N']))

0.9029850746268657
[[  0   0]
 [ 26 242]]


# Classificador Global

In [13]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(emendas['text'])
y = emendas['emdType']

['vetores_dicionario']

In [15]:
vectorizer.get_feature_names()

['00',
 '00001',
 '00002',
 '00003',
 '00004',
 '00005',
 '00006',
 '00007',
 '00008',
 '00009',
 '00010',
 '00011',
 '00012',
 '00014',
 '00015',
 '00017',
 '00019',
 '00020',
 '00021',
 '00022',
 '00023',
 '00024',
 '00025',
 '00026',
 '00029',
 '00030',
 '00031',
 '00032',
 '00033',
 '00034',
 '00035',
 '00036',
 '00037',
 '00038',
 '00039',
 '00040',
 '00041',
 '00042',
 '00043',
 '00044',
 '00045',
 '00047',
 '00048',
 '00049',
 '00050',
 '00051',
 '00052',
 '00053',
 '00054',
 '00055',
 '00056',
 '00057',
 '00061',
 '00062',
 '00063',
 '00064',
 '00065',
 '00066',
 '00067',
 '00068',
 '00069',
 '00070',
 '00071',
 '00072',
 '00073',
 '00074',
 '00075',
 '00076',
 '00077',
 '00078',
 '00079',
 '00080',
 '00081',
 '00082',
 '00083',
 '00084',
 '00085',
 '00086',
 '00087',
 '00088',
 '00089',
 '00090',
 '00091',
 '00092',
 '00093',
 '00094',
 '00095',
 '00096',
 '00097',
 '00098',
 '00099',
 '00100',
 '00101',
 '00102',
 '00103',
 '00104',
 '00105',
 '00106',
 '00107',
 '00108',
 '0

## Separação das Bases

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

## Oversampling

In [5]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

## Criação do Modelo

In [6]:
clf = SVC(gamma = 'auto')
clf.fit(X_resampled,y_resampled)

print(clf.score(X_test,y_test))



0.8283582089552238


## Salvando o modelo treinado

In [9]:
from joblib import dump,load

pipeline = Pipeline([('vectorizer', vectorizer), ('clf_emend', clf)])
dump(pipeline,"emend_clf_pipe")

['classificador_de_emendas']

## Predição e Matriz de Confusão

In [20]:
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_pred,y_test, labels = ['ADD','MOD','SUP','O'])
print(cm)

[[ 7  4  0  0]
 [ 1 23  0  0]
 [ 7 10 12  0]
 [ 0  0  1 69]]


## Acurácia das Classes

In [44]:
cmAcc = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

print(cmAcc.diagonal())

[0.63636364 0.95833333 0.4137931  0.98571429]


In [45]:
cmDf = pd.DataFrame(cm,columns = ['ADD','MOD','SUP','O'])
cmDf['types'] = ['ADD','MOD','SUP','O']

In [46]:
cols = cmDf.columns.tolist()
cols.insert(0,cols.pop(4))
cmDf = cmDf[cols]
cmDf['Accuracy'] = cmAcc.diagonal()

['types', 'ADD', 'MOD', 'SUP', 'O']


In [47]:
cmDf.head()

Unnamed: 0,types,ADD,MOD,SUP,O,Accuracy
0,ADD,7,4,0,0,0.636364
1,MOD,1,23,0,0,0.958333
2,SUP,7,10,12,0,0.413793
3,O,0,0,1,69,0.985714


## Output resultados

In [48]:
cmDf.to_csv('./outputs/emdendasPredResultados.csv')

# Output de Emendas

In [24]:
addDF = emendas.copy()
addDF = addDF.iloc[y_test.index]
addDF['emdTypePred'] = y_pred
addDF.head()

Unnamed: 0,text,emdType,emdTypePred
70,MPV 870 00040 CONGRESSO NACIONAL ETIQ UETA APR...,O,O
218,"Dê-se , ao artigo 65 da Medida Provisória nº 8...",MOD,MOD
258,"Os Arts . 37 , 38 , 43 e 44 , da Medida Provis...",MOD,MOD
33,Acrescente-se ao art . 37 da Medida Provisória...,ADD,SUP
42,MPV 870 00024 COMISSÃO MISTA DA MEDIDA PROVISÓ...,O,O


In [29]:
addDF.loc[addDF['emdType'] == "ADD"].to_csv('./outputs/emendasAddPred.csv')

In [25]:
addDF.to_csvaddDF.to_csv('./outputs/emdendasPred.csv')