# Theme Classification - Human Rights

This notebook is an experiment to trying to detect if a PL (Law Proposal) is related to Human Rights or not using its ementa (summary). 

## Importing libraries

In [4]:
import pandas as pd
import time
import json

# Model selection and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, GridSearchCV

# Text vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

# Classifiers
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier

# Pipeline
from sklearn.pipeline import Pipeline


RANDOM_STATE = 214

## Importing data

In [5]:
df_pls_theme = pd.read_parquet('../data/proposicoes_temas_one_hot_encoding.parquet')

Print basic information.

In [6]:
df_pls_theme.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65977 entries, 0 to 65976
Data columns (total 27 columns):
 #   Column                                       Non-Null Count  Dtype 
---  ------                                       --------------  ----- 
 0   id                                           65977 non-null  int64 
 1   ementa                                       65977 non-null  object
 2   Administração Pública                        65977 non-null  int64 
 3   Agricultura, Pecuária, Pesca e Extrativismo  65977 non-null  int64 
 4   Arte, Cultura e Religião                     65977 non-null  int64 
 5   Cidades e Desenvolvimento Urbano             65977 non-null  int64 
 6   Comunicações                                 65977 non-null  int64 
 7   Defesa e Segurança                           65977 non-null  int64 
 8   Direito Civil e Processual Civil             65977 non-null  int64 
 9   Direito Penal e Processual Penal             65977 non-null  int64 
 10  Direito e 

## Data preprocessing

Droping ementa duplicates to avoid data leakage on training.

In [7]:
df_pls_theme = df_pls_theme.drop_duplicates(subset=['ementa'])
df_pls_theme = df_pls_theme[['ementa','Direitos Humanos e Minorias']]
df_pls_theme = df_pls_theme.rename(columns={'Direitos Humanos e Minorias': 'in_human_rights'})

df_pls_theme.info()

<class 'pandas.core.frame.DataFrame'>
Index: 61934 entries, 0 to 65976
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ementa           61934 non-null  object
 1   in_human_rights  61934 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ MB


In [8]:
df_pls_theme.head(10)

tema,ementa,in_human_rights
0,"Dispõe sobre a Política Nacional de Salários, ...",0
1,"Modifica o art. 6º da Lei nº 9.424, de 24 de d...",0
2,Dispõe sobre salário-família e dá outras provi...,0
3,"Modifica a Lei nº 4.117, de 1962, que ""institu...",0
4,Concede isenção do imposto sobre produtos indu...,0
5,"Institui o sistema Distrital Misto, majoritári...",0
6,Estabelece penalidades pelo uso de telefone ce...,0
7,Estabelece dia da semana para realização de pr...,1
8,"Altera a Lei nº 8.666, de 21 de junho de 1993,...",0
9,"Altera o caput do art. 12 da Lei nº 9.492, de ...",0


## Train test split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    df_pls_theme["ementa"],
    df_pls_theme["in_human_rights"],
    stratify=df_pls_theme["in_human_rights"],
    test_size=0.15,
    random_state=RANDOM_STATE
)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((52643,), (52643,), (9291,), (9291,))

## Model Selection

Perform model selection using cross validation with TF-IDF standard features.

In [8]:
vectorizer = [
    (
        "tfidf",
        TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=2000,
            decode_error="replace",
            strip_accents="ascii",
            lowercase=True,
        ),
    ),
]

models = [
    ("Logistic Regression", LogisticRegression(verbose=0, random_state=RANDOM_STATE)),
    ("SGDClassifier", SGDClassifier(random_state=RANDOM_STATE)),
    ("Linear SVC", LinearSVC(random_state=RANDOM_STATE)),
    ("Random Forest", RandomForestClassifier(random_state=RANDOM_STATE)),
    ("XGBoost", XGBClassifier(random_state=RANDOM_STATE)),
    ("Decision Tree", DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ("Dummy", DummyClassifier(random_state=RANDOM_STATE, strategy="most_frequent")),
]

pipelines = [
    Pipeline(vectorizer + [model]) for model in models
]

metrics = []

Training the models and storing the results

In [9]:
stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

for pipeline in pipelines:

    print(f"Training model: {pipeline.steps[-1][0]}")

    t1 = time.time()
    cross_val_scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=stratified_kfold,
        scoring=["accuracy", "f1", "precision", "recall"],
    )
    training_time = time.time() - t1

    metrics.append(
        {
            "model": pipeline.steps[-1][0],
            "mean_accuracy": cross_val_scores["test_accuracy"].mean(),
            "mean_f1": cross_val_scores["test_f1"].mean(),
            "mean_precision": cross_val_scores["test_precision"].mean(),
            "mean_recall": cross_val_scores["test_recall"].mean(),
            "training_time": training_time,
        }
    )

metrics_df = pd.DataFrame(metrics)
metrics_df.head()

Training model: Logistic Regression
Training model: SGDClassifier
Training model: Linear SVC
Training model: Random Forest
Training model: XGBoost
Training model: Decision Tree


Unnamed: 0,model,mean_accuracy,mean_f1,mean_precision,mean_recall,training_time
0,Logistic Regression,0.910131,0.567427,0.844342,0.427293,14.294176
1,SGDClassifier,0.907756,0.549099,0.842897,0.407188,13.826842
2,Linear SVC,0.912106,0.594151,0.818308,0.4664,16.539962
3,Random Forest,0.91222,0.602017,0.803701,0.481272,231.500479
4,XGBoost,0.911688,0.586566,0.828049,0.454144,58.938207


Saving the metrics

In [10]:
metrics_df.to_csv('../results/binary_classifier_model_selection.csv', index=False)

## Hyperparameter Tuning

In [10]:
model_pipe = Pipeline(
    [
        (
            "tfidf_vec",
            TfidfVectorizer(
                ngram_range=(1, 2),
                max_features=2000,
                decode_error="replace",
                strip_accents="ascii",
                lowercase=True,
            ),
        ),
        ("xgb_clf", XGBClassifier(random_state=RANDOM_STATE)),
    ]
)

params = {
    "xgb_clf__max_depth": [3, 6],
    "xgb_clf__n_estimators": [300, 500],
    "xgb_clf__learning_rate": [0.01, 0.1],
    # "xgb_clf__max_delta_step": [0, 1],
}

kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
grid_search = GridSearchCV(
    model_pipe,
    params,
    cv=kfold,
    scoring=["accuracy", "f1", "precision", "recall"],
    refit="recall",
    n_jobs=1,
    verbose=3,
)

In [26]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3] END xgb_clf__learning_rate=0.01, xgb_clf__max_depth=3, xgb_clf__n_estimators=300; accuracy: (test=0.896) f1: (test=0.428) precision: (test=0.890) recall: (test=0.282) total time=  29.4s
[CV 2/3] END xgb_clf__learning_rate=0.01, xgb_clf__max_depth=3, xgb_clf__n_estimators=300; accuracy: (test=0.893) f1: (test=0.403) precision: (test=0.878) recall: (test=0.261) total time=  29.5s
[CV 3/3] END xgb_clf__learning_rate=0.01, xgb_clf__max_depth=3, xgb_clf__n_estimators=300; accuracy: (test=0.897) f1: (test=0.434) precision: (test=0.885) recall: (test=0.288) total time=  30.2s
[CV 1/3] END xgb_clf__learning_rate=0.01, xgb_clf__max_depth=3, xgb_clf__n_estimators=500; accuracy: (test=0.898) f1: (test=0.450) precision: (test=0.886) recall: (test=0.302) total time=  37.3s
[CV 2/3] END xgb_clf__learning_rate=0.01, xgb_clf__max_depth=3, xgb_clf__n_estimators=500; accuracy: (test=0.895) f1: (test=0.429) precision: (test=0.872) recal

In [15]:
model_pipe = Pipeline(
    [
        (
            "tfidf_vec",
            TfidfVectorizer(
                ngram_range=(1, 2),
                max_features=3000,
                decode_error="replace",
                strip_accents="ascii",
                lowercase=True,
            ),
        ),
        ("rndf_clf", RandomForestClassifier(n_jobs=-1)),
    ]
)

params = {
    "rndf_clf__n_estimators": [500, 800],
    "rndf_clf__max_depth": [None],
    "rndf_clf__min_samples_split": [2],
    "rndf_clf__class_weight": ["balanced", None],
}

kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
grid_search = GridSearchCV(
    model_pipe,
    params,
    cv=kfold,
    scoring=["accuracy", "f1", "precision", "recall"],
    refit="f1",
    n_jobs=1,
    verbose=3,
)

In [16]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END rndf_clf__class_weight=balanced, rndf_clf__max_depth=None, rndf_clf__min_samples_split=2, rndf_clf__n_estimators=500; accuracy: (test=0.915) f1: (test=0.618) precision: (test=0.808) recall: (test=0.500) total time= 1.5min
[CV 2/3] END rndf_clf__class_weight=balanced, rndf_clf__max_depth=None, rndf_clf__min_samples_split=2, rndf_clf__n_estimators=500; accuracy: (test=0.915) f1: (test=0.619) precision: (test=0.805) recall: (test=0.503) total time= 1.7min
[CV 3/3] END rndf_clf__class_weight=balanced, rndf_clf__max_depth=None, rndf_clf__min_samples_split=2, rndf_clf__n_estimators=500; accuracy: (test=0.914) f1: (test=0.616) precision: (test=0.808) recall: (test=0.498) total time= 1.4min
[CV 1/3] END rndf_clf__class_weight=balanced, rndf_clf__max_depth=None, rndf_clf__min_samples_split=2, rndf_clf__n_estimators=800; accuracy: (test=0.913) f1: (test=0.613) precision: (test=0.799) recall: (test=0.497) total time= 2.3min


In [17]:
results = pd.DataFrame(grid_search.cv_results_)
# save results
results.to_csv('../results/binary_classifier_grid_search.csv', index=False)

In [18]:
import pickle

In [22]:
## Save the best model using pickle
best_model = grid_search.best_estimator_

with open('../models/binary_classifier_best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)    

In [31]:
# score in test set

y_pred = best_model.predict(X_test)

results_test = {
    "accuracy": accuracy_score(y_test, y_pred),
    "f1": f1_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
}


In [35]:
# save the results from the best model in test set in the models folder

with open('../models/binary_classifier_best_model_results.json', 'wb') as f:
    results_test_json = json.dumps(results_test)
    f.write(results_test_json.encode('utf-8'))