## Statement Type Classifier

Training classifier to predict if statement is regulative or constitutive.

In [1]:
import pandas as pd

from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

PATH_DATA = Path("../data/tagger_evaluation")

df = pd.read_excel(PATH_DATA.joinpath("annotated_data.xlsx"), header=1)
df = df.loc[(df['IG syntax (regulative, constitutive)'].isin(["regulative", "constitutive"])) & (~df["Statement"].isna()), :]

In [2]:
X = df['Statement']
y = (df['IG syntax (regulative, constitutive)'] == "regulative").astype(int)

In [3]:
# split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
# classification pipeline
cl = Pipeline([
    ("vectorizer", TfidfVectorizer(max_features=50)),
    ('rf', RandomForestClassifier())
])
cl.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer(max_features=50)),
                ('rf', RandomForestClassifier())])

In [175]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score

y_test_pred = cl.predict(X_test)
y_train_pred = cl.predict(X_train)

In [176]:
print(f"TRAIN:\nAUC: {roc_auc_score(y_train, y_train_pred)},\nACC: {accuracy_score(y_train, y_train_pred)},\nF1: {f1_score(y_train, y_train_pred)}")
print(f"TEST:\nAUC: {roc_auc_score(y_test, y_test_pred)},\nACC: {accuracy_score(y_test, y_test_pred)},\nF1: {f1_score(y_test, y_test_pred)}")

TRAIN:
AUC: 1.0,
ACC: 1.0,
F1: 1.0
TEST:
AUC: 0.9374999999999999,
ACC: 0.9523809523809523,
F1: 0.9259259259259259
