## Statement Type Classifier

Training classifier to predict if statement is regulative or constitutive.

In [4]:
import pandas as pd

from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score

PATH_DATA = Path("../data/tagger_evaluation")

df = pd.read_excel(PATH_DATA.joinpath("annotated_data.xlsx"), header=1, engine='openpyxl')
df = df.loc[(df['IG syntax (regulative, constitutive)'].isin(["regulative", "constitutive"])) & (~df["Statement"].isna()), :]

In [5]:
X = df['Statement']
y = (df['IG syntax (regulative, constitutive)'] == "regulative").astype(int)

In [13]:
list(X.iloc[y])[0:4]

['The employee is unable to work.',
 'An employer shall provide to each employee employed by the employer paid sick time.',
 'An employer shall provide to each employee employed by the employer paid sick time.',
 'An employer shall provide to each employee employed by the employer paid sick time.']

In [4]:
# split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [3]:
import numpy
numpy.mean(y)

0.29429429429429427

In [56]:
# classification pipeline
cl = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 3)))
])
cl.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(1, 3)))])

In [175]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score

y_test_pred = cl.predict(X_test)
y_train_pred = cl.predict(X_train)

In [176]:
print(f"TRAIN:\nAUC: {roc_auc_score(y_train, y_train_pred)},\nACC: {accuracy_score(y_train, y_train_pred)},\nF1: {f1_score(y_train, y_train_pred)}")
print(f"TEST:\nAUC: {roc_auc_score(y_test, y_test_pred)},\nACC: {accuracy_score(y_test, y_test_pred)},\nF1: {f1_score(y_test, y_test_pred)}")

TRAIN:
AUC: 1.0,
ACC: 1.0,
F1: 1.0
TEST:
AUC: 0.9374999999999999,
ACC: 0.9523809523809523,
F1: 0.9259259259259259


In [6]:
from sklearn.feature_selection import RFECV, SelectKBest, mutual_info_classif

In [50]:
mutual_info_selector = SelectKBest(mutual_info_classif, k='all')
recurse_importance_selector = RFECV(
    estimator=RandomForestClassifier(),
    min_features_to_select=20,
    n_jobs=-1,
    verbose=True,
    step=10,
)

In [51]:
rf = RandomForestClassifier()

In [57]:
selection_pipe = Pipeline(
    [
        (
            'processing',
            cl
        ),
        (
            'feature_selection',
            Pipeline(
                [
                        ("mutual_info_selector", mutual_info_selector),
                        ("recurse_importance_selector", recurse_importance_selector),
                ]
            )
        ),
        (
            'classifier',
            rf
        )
    ]
)

In [58]:
selection_pipe.fit(X_train, y_train)

Fitting estimator with 3770 features.
Fitting estimator with 3760 features.
Fitting estimator with 3750 features.
Fitting estimator with 3740 features.
Fitting estimator with 3730 features.
Fitting estimator with 3720 features.
Fitting estimator with 3710 features.
Fitting estimator with 3700 features.
Fitting estimator with 3690 features.
Fitting estimator with 3680 features.
Fitting estimator with 3670 features.
Fitting estimator with 3660 features.
Fitting estimator with 3650 features.
Fitting estimator with 3640 features.
Fitting estimator with 3630 features.
Fitting estimator with 3620 features.
Fitting estimator with 3610 features.
Fitting estimator with 3600 features.
Fitting estimator with 3590 features.
Fitting estimator with 3580 features.
Fitting estimator with 3570 features.
Fitting estimator with 3560 features.
Fitting estimator with 3550 features.
Fitting estimator with 3540 features.
Fitting estimator with 3530 features.
Fitting estimator with 3520 features.
Fitting esti

Pipeline(steps=[('processing',
                 Pipeline(steps=[('vectorizer',
                                  CountVectorizer(ngram_range=(1, 3)))])),
                ('feature_selection',
                 Pipeline(steps=[('mutual_info_selector',
                                  SelectKBest(k='all',
                                              score_func=<function mutual_info_classif at 0x00000255AB211D30>)),
                                 ('recurse_importance_selector',
                                  RFECV(estimator=RandomForestClassifier(),
                                        min_features_to_select=20, n_jobs=-1,
                                        step=10, verbose=True))])),
                ('classifier', RandomForestClassifier())])

In [62]:
y_test_pred = selection_pipe.predict(X_test)
y_train_pred = selection_pipe.predict(X_train)

print(f"TRAIN:\nAUC: {roc_auc_score(y_train, y_train_pred)},\nACC: {accuracy_score(y_train, y_train_pred)},\nF1: {f1_score(y_train, y_train_pred)}")
print(f"TEST:\nAUC: {roc_auc_score(y_test, y_test_pred)},\nACC: {accuracy_score(y_test, y_test_pred)},\nF1: {f1_score(y_test, y_test_pred)}")

TRAIN:
AUC: 0.9932432432432432,
ACC: 0.9959839357429718,
F1: 0.9931972789115647
TEST:
AUC: 0.9458333333333334,
ACC: 0.9404761904761905,
F1: 0.9019607843137256


In [64]:
# selection_pipe.named_steps['processing'].named_steps.vectorizer.get_feature_names()

In [61]:
from joblib import dump

In [65]:
dump(selection_pipe, 'sentence_type_classifier.joblib')

['sentence_type_classifier.joblib']

In [68]:
import numpy

In [66]:
y_test_pred

array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1])

In [72]:
list(y_test_pred.astype(numpy.bool_))

[False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True]