In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/goemotions_1.csv")

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [53]:
pd.set_option('display.max_colwidth', None)
label_of_interest = 'excitement'

print(df[['text', label_of_interest]].loc[lambda d: d[label_of_interest] == 0].sample(4).to_markdown())

|       | text                                                                    |   excitement |
|------:|:------------------------------------------------------------------------|-------------:|
| 27233 | my favourite singer ([NAME]) helped write one of the songs so i love it |            0 |
|  1385 | No i didn’t all i know is that i binged 3 seasoms of it.                |            0 |
| 17077 | I liked [NAME]...                                                       |            0 |
| 55699 | A "wise" man once told me: > DO > YOUR > OWN >RESEARCH >!               |            0 |


In [5]:
X, y = list(df['text']), df[label_of_interest]

pipe = make_pipeline(
    CountVectorizer(), 
    LogisticRegression(class_weight='balanced', max_iter=1000)
)

In [6]:
from sklearn.pipeline import make_union
from whatlies.language import BytePairLanguage

pipe_emb = make_pipeline(
    make_union(
        BytePairLanguage("en", vs=1_000), 
        BytePairLanguage("en", vs=100_000)
    ),
    LogisticRegression(class_weight='balanced', max_iter=1000)
)

In [8]:
from doubtlab.ensemble import DoubtEnsemble
from doubtlab.reason import ProbaReason, DisagreeReason, ShortConfidenceReason

In [9]:
pipe.fit(X, y)
pipe_emb.fit(X, y)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('bytepairlanguage-1',
                                                 BytePairLanguage(lang='en',
                                                                  vs=1000)),
                                                ('bytepairlanguage-2',
                                                 BytePairLanguage(lang='en',
                                                                  vs=100000))])),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [54]:
reasons = {
    'proba': ProbaReason(pipe),
    'disagree': DisagreeReason(pipe, pipe_emb),
    'short_pipe': ShortConfidenceReason(pipe),
    'short_pipe_emb': ShortConfidenceReason(pipe_emb),
}

doubt = DoubtEnsemble(**reasons)

In [55]:
predicates = doubt.get_predicates(X, y)

In [56]:
df_sorted = df.iloc[predicates.index][['text', label_of_interest]]
df_label = pd.concat([df_sorted, predicates], axis=1)

In [70]:
print(df_label[['text', label_of_interest]].loc[lambda d: d['excitement'] == 0].head(10).to_markdown(index=False))

| text                             |   excitement |
|:---------------------------------|-------------:|
| Happy Easter everyone!!          |            0 |
| Happy Easter everyone!!          |            0 |
| Happy Easter everyone!!          |            0 |
| Congratulations mate!!           |            0 |
| Yes every time                   |            0 |
| New flavour! I love it!          |            0 |
| Wow! Prayers for everyone there. |            0 |
| Wow! Prayers for everyone there. |            0 |
| Hey vro!                         |            0 |
| Oh my gooooooooood               |            0 |


## Differences 

It's pretty interesting to observe that different predicates really represent different reasons of doubt.

### CountVector Confidence Short

In [73]:
print(df_label
 .sort_values("predicate_short_pipe", ascending=False)
 .head(10)[['text', label_of_interest]]
 .drop_duplicates()
 .to_markdown(index=False))

| text                                                                                                                                      |   excitement |
|:------------------------------------------------------------------------------------------------------------------------------------------|-------------:|
| I am inexplicably excited by [NAME]. I get so excited by how he curls passes                                                              |            0 |
| Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !                                                         |            0 |
| Sounds like a fun game. Our home game around here is .05/.10. Its fun but not very exciting.                                              |            0 |
| So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool                                                         |            0 |
| Wow, your posting history is a real... interesting ride.

### BytePair Confidence Short

In [76]:
print(df_label
 .sort_values("predicate_short_pipe_emb", ascending=False)
 .head(20)[['text', label_of_interest]]
 .drop_duplicates()
 .to_markdown(index=False))

| text                       |   excitement |
|:---------------------------|-------------:|
| Woot woot!                 |            0 |
| WOW!!!                     |            0 |
| Happy birthday!            |            0 |
| Happy Birthday!            |            0 |
| Happy one week anniversary |            0 |
| Happy Birthday!!!          |            0 |
| Pop pop!                   |            0 |
| Enjoy the ride!            |            0 |
| Very interesting!!!        |            0 |
| My exact reaction          |            0 |
| happy birthday dude!       |            0 |
| Enjoy                      |            0 |
| Oh wow!!!                  |            0 |
| This sounds interesting    |            0 |


## CountVector Proba

In [78]:
print(df_label
 .sort_values("predicate_proba", ascending=False)
 .head(10)[['text', label_of_interest]]
 .drop_duplicates()
 .to_markdown(index=False))

| text                                                                                                                          |   excitement |
|:------------------------------------------------------------------------------------------------------------------------------|-------------:|
| Happy Easter everyone!!                                                                                                       |            0 |
| This game is on [NAME]...                                                                                                     |            0 |
| I swear if it's the Cowboys and the Patriots in the Super Bowl I'm going to burn something down.                              |            0 |
| I'm on red pills :)                                                                                                           |            0 |
| Wow. I hope that asst manager will be looking for a new job soon.                                                             | 