## Importing Libraries

In [None]:
!pip install cleanlab[all]

In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

from cleanlab import Datalab

# Single seed shared by the train/test split, the CV splitters and the random forests below.
RANDOM_SEED = 214
# Also seed NumPy's legacy global RNG for any code that draws from it implicitly.
np.random.seed(RANDOM_SEED)

  from .autonotebook import tqdm as notebook_tqdm


## Importing data

In [2]:
# Load the one-hot encoded proposition/theme table and preview its first rows.
PLS_THEME_PATH = '../../data/proposicoes_temas_one_hot_encoding.parquet'
df_pls_theme = pd.read_parquet(PLS_THEME_PATH)
df_pls_theme.head()

tema,id,ementa,Administração Pública,"Agricultura, Pecuária, Pesca e Extrativismo","Arte, Cultura e Religião",Cidades e Desenvolvimento Urbano,Comunicações,Defesa e Segurança,Direito Civil e Processual Civil,Direito Penal e Processual Penal,...,Finanças Públicas e Orçamento,Homenagens e Datas Comemorativas,"Indústria, Comércio e Serviços",Meio Ambiente e Desenvolvimento Sustentável,"Política, Partidos e Eleições",Previdência e Assistência Social,Saúde,Trabalho e Emprego,"Viação, Transporte e Mobilidade",Outro
0,14919,"Dispõe sobre a Política Nacional de Salários, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,14920,"Modifica o art. 6º da Lei nº 9.424, de 24 de d...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,14921,Dispõe sobre salário-família e dá outras provi...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,14922,"Modifica a Lei nº 4.117, de 1962, que ""institu...",0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,14923,Concede isenção do imposto sobre produtos indu...,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


Contagem dos temas

In [3]:
# Per-theme totals over rows with a distinct ementa, most frequent theme first.
theme_counts = (
    df_pls_theme
    .drop_duplicates(subset='ementa')
    .drop(columns=['id', 'ementa'])
    .sum(axis=0)
    .sort_values(ascending=False)
)
theme_counts

tema
Direitos Humanos e Minorias                    8544
Trabalho e Emprego                             7775
Saúde                                          7648
Finanças Públicas e Orçamento                  7168
Administração Pública                          6915
Direito Penal e Processual Penal               5704
Educação                                       5267
Indústria, Comércio e Serviços                 4540
Viação, Transporte e Mobilidade                4459
Defesa e Segurança                             3529
Direito Civil e Processual Civil               3413
Meio Ambiente e Desenvolvimento Sustentável    2988
Previdência e Assistência Social               2818
Homenagens e Datas Comemorativas               2683
Economia                                       2513
Direito e Defesa do Consumidor                 2360
Comunicações                                   2254
Cidades e Desenvolvimento Urbano               1995
Outro                                          1973
Energia

Selecionando o tema a ser classificado

In [4]:
# Theme to model as a binary target, and the column name used for it from here on.
BINARY_CLASS = "Homenagens e Datas Comemorativas"
IN_BINARY_CLASS = "in_" + BINARY_CLASS.lower().replace(" ", "_")

# Keep one row per distinct ementa and only the text + target columns.
# The rename happens BEFORE the column selection on purpose: `rename` ignores
# a missing source column by default, so re-running this cell on the already
# reduced frame is a no-op instead of a KeyError on BINARY_CLASS.
df_pls_theme = (
    df_pls_theme
    .drop_duplicates(subset=["ementa"])
    .rename(columns={BINARY_CLASS: IN_BINARY_CLASS})
    [["ementa", IN_BINARY_CLASS]]
)

df_pls_theme.info()

<class 'pandas.core.frame.DataFrame'>
Index: 61934 entries, 0 to 65976
Data columns (total 2 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   ementa                               61934 non-null  object
 1   in_homenagens_e_datas_comemorativas  61934 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ MB


## Data preprocessing
[wip] leis ...

In [5]:
# Features are the raw ementa texts; the target is the binary theme column.
X = df_pls_theme["ementa"]
y = df_pls_theme[IN_BINARY_CLASS]

# Hold out 20% for testing, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
    shuffle=True,
    stratify=y,
)

In [64]:
# Text classification pipeline to be tuned: TF-IDF features fed to a random forest.
tfidf_step = TfidfVectorizer(
    lowercase=True,
    strip_accents='ascii',
    max_features=5000,
    max_df=0.9,
)
forest_step = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_SEED)
clf_pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('clf', forest_step)])

# Grid explored by the search; the `clf__` prefix routes each parameter to the forest step.
params = {
    'clf__n_estimators': [200, 300],
    'clf__max_depth': [None, 10, 30],
}

In [34]:
# 3-fold stratified, seeded CV used to score every parameter combination on F1.
tuning_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)
grid_search = GridSearchCV(
    estimator=clf_pipeline,
    param_grid=params,
    scoring='f1',
    cv=tuning_cv,
    n_jobs=2,
    verbose=1,
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
CPU times: user 1min 6s, sys: 18.5 s, total: 1min 25s
Wall time: 2min 47s


In [47]:
# Best mean cross-validated score (scoring='f1') reached by the grid search.
print(grid_search.best_score_)
# Pipeline with the winning parameters, shown as the cell output.
grid_search.best_estimator_

0.7992734162850049


## Cleaning the dataset

### Manually inspecting the errors

In [67]:
# TF-IDF + random forest pipeline whose cross-validated probabilities are passed to cleanlab.
clean_pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(lowercase=True, strip_accents='ascii', max_features=6000, max_df=0.95),
    ),
    (
        'clf',
        RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_SEED),
    ),
])

In [68]:
# Out-of-fold class probabilities for every row: each row is predicted by a
# model fitted on the other folds of a seeded 5-fold stratified split.
inspection_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
y_proba = cross_val_predict(
    estimator=clean_pipeline,
    X=df_pls_theme['ementa'],
    y=df_pls_theme[IN_BINARY_CLASS],
    cv=inspection_cv,
    method='predict_proba',
    n_jobs=-1,
    verbose=2,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.1min finished


In [72]:
# Wrap the working frame in a cleanlab Datalab, telling it which column holds the given labels.
lab = Datalab(data=df_pls_theme, label_name=IN_BINARY_CLASS)

In [109]:
# Audit the dataset from the out-of-fold probabilities; the log below reports label and outlier checks.
lab.find_issues(pred_probs=y_proba)

Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided pred_probs ...
Audit complete. 1027 issues found in the dataset.




In [103]:
# Summary row for the label check only (overall score and number of flagged rows).
lab.get_issue_summary("label")

Unnamed: 0,issue_type,score,num_issues
0,label,0.994962,312


In [96]:
# Pull the per-row results of the label and outlier checks.
label_issues = lab.get_issues("label")
outlier_issues = lab.get_issues("outlier")
y_clean_labels = label_issues[['predicted_label', 'is_label_issue']]
y_outlier = outlier_issues[['is_outlier_issue']]

# The new columns are aligned on the index, so the frame's index is reset first.
# NOTE(review): this presumes the issue tables are indexed 0..n-1 in row order - confirm.
df_ples_theme_clean = (
    df_pls_theme
    .copy()
    .reset_index(drop=True)
    .assign(
        predicted_label=y_clean_labels['predicted_label'],
        is_label_issue=y_clean_labels['is_label_issue'],
        is_outlier_issue=y_outlier['is_outlier_issue'],
    )
)

In [97]:
# Rows flagged as label issues: text, given label and the label cleanlab suggests.
flagged_rows = df_ples_theme_clean['is_label_issue']
df_ples_theme_clean.loc[flagged_rows, ['ementa', IN_BINARY_CLASS, 'predicted_label']]

tema,ementa,in_homenagens_e_datas_comemorativas,predicted_label
278,"Institui, na República Federativa do Brasil, a...",0,1
286,"Institui o Dia do Evangélico, determinando fer...",0,1
805,Institui o dia 2 de julho como Dia da Libertaç...,0,1
1203,"Denomina ""Aeroporto de Porto Velho / Governado...",0,1
1504,Institui o 12 de agosto como Dia Nacional da J...,0,1
...,...,...,...
60482,Institui o dia nacional do skate,0,1
60499,Cria a premiação “Aluno Nota Dez” e “Escola No...,1,0
60932,"Institui o selo “Quebra-Cabeça”, com a finalid...",1,0
60947,Concede isenção do Imposto sobre Produtos Indu...,1,0


### Automatically fixing the dataset

In [6]:
# TF-IDF + random forest pipeline whose cross-validated probabilities are passed to cleanlab.
# NOTE(review): same configuration as the pipeline in the manual-inspection section;
# presumably redefined so this section can run on its own - confirm or deduplicate.
clean_pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(lowercase=True, strip_accents='ascii', max_features=6000, max_df=0.95),
    ),
    (
        'clf',
        RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_SEED),
    ),
])

In [12]:
# Quick look at the working frame (ementa text + binary target) before the iterative fix.
df_pls_theme

tema,ementa,in_homenagens_e_datas_comemorativas
0,"Dispõe sobre a Política Nacional de Salários, ...",0
1,"Modifica o art. 6º da Lei nº 9.424, de 24 de d...",0
2,Dispõe sobre salário-família e dá outras provi...,0
3,"Modifica a Lei nº 4.117, de 1962, que ""institu...",0
4,Concede isenção do imposto sobre produtos indu...,0
...,...,...
65971,Dispõe sobre a permuta dos agentes de seguranç...,0
65973,"Confere ao município de Laranjal Paulista, loc...",1
65974,Dispõe sobre a obrigatoriedade de plataformas ...,0
65975,"Altera a Lei nº 13.277, de 29 de abril de 2016...",1


In [23]:
# Iterative label cleaning. Each round:
#   1. gets out-of-fold probabilities from clean_pipeline on the current labels,
#   2. records how well those predictions agree with the current labels,
#   3. asks cleanlab which labels look wrong,
#   4. writes the next label column: cleanlab's predicted label where a row was
#      flagged, the current label everywhere else.
metrics = []
N_FIXES = 6
df = df_pls_theme.copy()
df[IN_BINARY_CLASS+'_0'] = df[IN_BINARY_CLASS]  # round 0 starts from the original labels

# Seeded with an integer, so the splitter produces the same folds on every call
# and can be shared by all rounds.
fix_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_SEED)

for i in range(N_FIXES):
    in_binary_class_i = f'{IN_BINARY_CLASS}_{i}'
    in_binary_class_next = f'{IN_BINARY_CLASS}_{i+1}'

    y_proba_i = cross_val_predict(
        estimator=clean_pipeline,
        X=df['ementa'],
        y=df[in_binary_class_i],
        cv=fix_cv,
        method='predict_proba',
        n_jobs=-1,
        verbose=3,
    )

    # Hard predictions = most probable class, scored against this round's labels.
    y_i_true = df[in_binary_class_i]
    y_i = y_proba_i.argmax(axis=1)

    round_metrics = {
        'accuracy': accuracy_score(y_i_true, y_i),
        'precision': precision_score(y_i_true, y_i),
        'recall': recall_score(y_i_true, y_i),
        'f1': f1_score(y_i_true, y_i),
        'confusion_matrix': confusion_matrix(y_i_true, y_i),
        'classification_report': classification_report(y_i_true, y_i),
        'n_fixes': i,
    }
    metrics.append(round_metrics)

    # Find issues against this round's labels and record how many were flagged.
    datalab_i = Datalab(data=df, label_name=in_binary_class_i)
    datalab_i.find_issues(pred_probs=y_proba_i)
    round_metrics['n_issues'] = datalab_i.get_issue_summary("label")['num_issues'][0]

    data_label_issues_i = datalab_i.get_issues("label")
    flagged_i = data_label_issues_i['is_label_issue'].to_numpy()

    # Values are combined positionally (plain arrays), so df's index plays no role here.
    df[in_binary_class_next] = np.where(
        flagged_i,
        data_label_issues_i['predicted_label'].to_numpy(),
        df[in_binary_class_i].to_numpy(),
    )
    df['is_label_issue'] = flagged_i

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.9min finished


Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided pred_probs ...
Audit complete. 1022 issues found in the dataset.


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.6min finished


Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided pred_probs ...
Audit complete. 718 issues found in the dataset.


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.6min finished


Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided pred_probs ...
Audit complete. 727 issues found in the dataset.


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.5min finished


Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided pred_probs ...
Audit complete. 709 issues found in the dataset.


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.4min finished


Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided pred_probs ...
Audit complete. 682 issues found in the dataset.


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.5min finished


Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided pred_probs ...
Audit complete. 684 issues found in the dataset.


**Saving results**

In [36]:
# Persist the per-round metrics as CSV.
# The output folder is created first: without it, a missing ./data directory
# would make the write fail only after the long cleaning loop has finished.
results_dir = Path('./data')
results_dir.mkdir(parents=True, exist_ok=True)

df_metrics_results = pd.DataFrame(metrics)
df_metrics_results.to_csv(
    results_dir / f'{IN_BINARY_CLASS}_conf_learning_metrics_results.csv',
    index=False,
)

In [37]:
# Keep the text plus the label column from the last fixing round, renamed back
# to the original target name.
final_label_column = IN_BINARY_CLASS+f'_{N_FIXES}'
df_clean_labels = (
    df
    [['ementa', final_label_column]]
    .rename(columns={final_label_column: IN_BINARY_CLASS})
)

# Create the output folder if needed so the write cannot fail on a missing ./data directory.
clean_labels_dir = Path('./data')
clean_labels_dir.mkdir(parents=True, exist_ok=True)

df_clean_labels.to_parquet(
    clean_labels_dir / f'{IN_BINARY_CLASS}_conf_learning_clean_labels.parquet',
    index=False
)