In [None]:
import pandas as pd

In [None]:
# Read data/goemotions_1.csv into a DataFrame; later cells read its 'text'
# column and the column named by `label_of_interest`.
df = pd.read_csv(filepath_or_buffer="data/goemotions_1.csv")

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Do not truncate cell contents, so long texts are displayed in full.
pd.set_option('display.max_colwidth', None)
# Name of the label column used as the target `y` in the cells below.
label_of_interest = 'excitement'

# Peek at two rows where the label is 0. The fixed random_state keeps the
# displayed sample identical across "Restart Kernel -> Run All".
df[['text', label_of_interest]].loc[lambda d: d[label_of_interest] == 0].sample(2, random_state=42)

In [None]:
# Features are the raw texts as a plain Python list; the target is the
# selected label column.
X = df['text'].tolist()
y = df[label_of_interest]

# Baseline pipeline: token counts feeding a class-balanced logistic regression.
pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

In [None]:
from sklearn.pipeline import make_union
from whatlies.language import BytePairLanguage

# Second pipeline: the outputs of two BytePairLanguage transformers (one per
# `vs` setting) are combined with make_union and fed to the same
# class-balanced logistic regression used by `pipe`.
# NOTE(review): `vs` is presumably the embedding vocabulary size - confirm
# against the whatlies documentation.
pipe_emb = make_pipeline(
    make_union(*(BytePairLanguage("en", vs=size) for size in (1_000, 100_000))),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

In [None]:
from doubtlab import DoubtLab
from doubtlab.reason import ProbaReason, DisagreeReason, ShortConfidenceReason

In [None]:
# Fit both pipelines on the same texts and labels; the reasons built in the
# next cell are constructed from these two model objects.
pipe.fit(X=X, y=y)
pipe_emb.fit(X=X, y=y)

In [None]:
# Named reasons for doubting a label; each name is forwarded to DoubtLab as a
# keyword argument.
reasons = dict(
    proba=ProbaReason(pipe),
    disagree=DisagreeReason(pipe, pipe_emb),
    short1=ShortConfidenceReason(pipe),
    short2=ShortConfidenceReason(pipe_emb),
)

doubt = DoubtLab(**reasons)
# Used below with df.iloc, i.e. treated as row positions.
# NOTE(review): which rows are returned and in what order is defined by
# DoubtLab.get_indices - confirm against the doubtlab documentation.
indices = doubt.get_indices(X, y)

In [None]:
# Per-row reason outputs as a DataFrame (it is summed row-wise in the last cell).
predicates = doubt.get_predicates(X=X, y=y)

In [None]:
# Row index of the predicates frame, used in the next cell to look rows up in `df`.
# NOTE(review): this is passed to df.iloc, so it assumes the labels are
# integer row positions into X/df - confirm.
idx = predicates.index

In [None]:
# First five rows of `df`, taken in the order given by `idx`, showing only the
# text and the label of interest (head() defaults to 5 rows).
df[['text', label_of_interest]].iloc[idx].head()

In [None]:
# Rows selected by `indices` whose label of interest equals 1, first five only.
# The filter uses `label_of_interest` rather than a hard-coded column name so
# this cell keeps working when the label is changed at the top of the notebook.
df.iloc[indices][['text', label_of_interest]].loc[lambda d: d[label_of_interest] == 1].head(5)

In [None]:
# Keep only the rows whose predicate values sum to more than zero.
predicates.loc[lambda d: d.sum(axis=1) > 0]