In [1]:
import pandas as pd

In [2]:
# Read the CSV export at data/goemotions_1.csv into the working frame.
df = pd.read_csv(filepath_or_buffer="data/goemotions_1.csv")

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Show the full text of every cell instead of pandas' truncated default width.
pd.set_option('display.max_colwidth', None)
# Name of the label column that later cells use as the classification target.
label_of_interest = 'excitement'

# Peek at a few rows labelled 0 for the chosen label. The fixed seed keeps the
# displayed sample identical across Restart Kernel -> Run All.
df[['text', label_of_interest]].loc[lambda d: d[label_of_interest] == 0].sample(4, random_state=42)

Unnamed: 0,text,excitement
37422,Nice ;),0
37430,Date me and fix me. Then date someone who shares my inability to cheat. There's gotta be someone like me out there.,0
25283,that this ridiculous comparison is this detailed and explicit makes me think the whole beating-a-little-girl-thing is part of his disgusting sexual fantasies.,0
35008,I see you saw the new season of gotham,0


In [22]:
# Count how many rows carry each value of the chosen label column.
df.loc[:, label_of_interest].value_counts()

0    68100
1     1900
Name: excitement, dtype: int64

In [8]:
# Features are the raw text strings; the target is the chosen label column.
X = df['text'].tolist()
y = df[label_of_interest]

# Baseline model: token counts fed into a class-weighted logistic regression.
pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

In [9]:
from sklearn.pipeline import make_union
from whatlies.language import BytePairLanguage

# Second model: two BytePairLanguage featurizers with different `vs` settings
# (1,000 and 100,000 -- presumably the vocabulary size; confirm in the whatlies
# docs), joined by a feature union and fed into the same kind of class-weighted
# logistic regression as `pipe`.
pipe_emb = make_pipeline(
    make_union(
        BytePairLanguage(lang="en", vs=1000),
        BytePairLanguage(lang="en", vs=100000),
    ),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

In [12]:
from doubtlab.ensemble import DoubtEnsemble
from doubtlab.reason import ProbaReason, DisagreeReason, ShortConfidenceReason

In [13]:
# Fit both classifiers on the full X, y (the same data is later passed to
# doubt.get_predicates).
pipe.fit(X, y)
# Kept as the cell's last expression so the fitted pipeline's repr is displayed.
pipe_emb.fit(X, y)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('bytepairlanguage-1',
                                                 BytePairLanguage(lang='en',
                                                                  vs=1000)),
                                                ('bytepairlanguage-2',
                                                 BytePairLanguage(lang='en',
                                                                  vs=100000))])),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [14]:
# One entry per doubt reason. The keys reappear later as the `predicate_<key>`
# columns that the comparison cells sort on.
reasons = dict(
    proba=ProbaReason(pipe),
    disagree=DisagreeReason(pipe, pipe_emb),
    short_pipe=ShortConfidenceReason(pipe),
    short_pipe_emb=ShortConfidenceReason(pipe_emb),
)

# Bundle all reasons into a single ensemble.
doubt = DoubtEnsemble(**reasons)

In [15]:
# Run every reason in the ensemble over the data. The result is used below as a
# DataFrame: its index selects rows of `df`, and its `predicate_*` columns are
# concatenated next to the text.
# NOTE(review): the row order presumably puts the most-doubted examples first --
# confirm against the doubtlab documentation.
predicates = doubt.get_predicates(X, y)

In [16]:
# Reorder the text/label columns to follow the row order of `predicates`
# (positional lookup), then attach the predicate columns side by side.
df_sorted = df[['text', label_of_interest]].iloc[predicates.index]
df_label = pd.concat([df_sorted, predicates], axis="columns")

In [17]:
# First ten rows of df_label (in its existing order) that are labelled 0 for the
# chosen label. The filter uses `label_of_interest` rather than a hard-coded
# column name so the cell keeps working when a different label is selected.
(df_label[['text', label_of_interest]]
  .loc[lambda d: d[label_of_interest] == 0]
  .head(10))

Unnamed: 0,text,excitement
6361,Happy Easter everyone!!,0
14190,Happy Easter everyone!!,0
47225,Happy Easter everyone!!,0
25404,Congratulations mate!!,0
22615,Yes every time,0
1847,New flavour! I love it!,0
50341,Wow! Prayers for everyone there.,0
63154,Wow! Prayers for everyone there.,0
22995,Hey vro!,0
66589,Oh my gooooooooood,0


## Differences 

It's interesting to observe that the different predicates really do capture different reasons for doubt.

### CountVector Confidence Short

In [18]:
# Rows ranked highest by the count-vector short-confidence predicate.
# Duplicates are dropped after the top 10 are taken, so fewer than 10 rows may show.
(
    df_label
    .sort_values(by="predicate_short_pipe", ascending=False)
    .iloc[:10]
    .loc[:, ['text', label_of_interest]]
    .drop_duplicates()
)

Unnamed: 0,text,excitement
5676,I am inexplicably excited by [NAME]. I get so excited by how he curls passes,0
42757,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0
24756,Sounds like a fun game. Our home game around here is .05/.10. Its fun but not very exciting.,0
44459,So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool,0
69395,"Wow, your posting history is a real... interesting ride.",0
2001,No different than people making a big deal about their team winning the super bowl. People find it interesting.,0
30921,"Hey congrats!! That's amazing, you've done such amazing progress! Hope you have a great day :)",0
39475,"I just read your list and now I can't wait, either!! Hurry up with the happy, relieved and peaceful onward and upward!! Congratulations😎",0


### BytePair Confidence Short

In [19]:
# Rows ranked highest by the byte-pair short-confidence predicate.
# Duplicates are dropped after the top 20 are taken, so fewer than 20 rows may show.
(
    df_label
    .sort_values(by="predicate_short_pipe_emb", ascending=False)
    .iloc[:20]
    .loc[:, ['text', label_of_interest]]
    .drop_duplicates()
)

Unnamed: 0,text,excitement
56660,Woot woot!,0
60381,WOW!!!,0
42584,Happy Birthday!,0
66821,Happy birthday!,0
35491,Happy one week anniversary,0
44679,Happy Birthday!!!,0
58193,Pop pop!,0
52095,Enjoy the ride!,0
3545,Very interesting!!!,0
9843,My exact reaction,0


### CountVector Proba

In [20]:
# Rows ranked highest by the count-vector probability predicate.
# Duplicates are dropped after the top 10 are taken, so fewer than 10 rows may show.
(
    df_label
    .sort_values(by="predicate_proba", ascending=False)
    .iloc[:10]
    .loc[:, ['text', label_of_interest]]
    .drop_duplicates()
)

Unnamed: 0,text,excitement
6361,Happy Easter everyone!!,0
15952,You're welcome! Anything else on your mind? Best to air it out,0
24295,[NAME] family must be having a field day there today!,0
24326,Keep gg too bro!,0
58235,She’s cool as hell,0
3984,Grim. Thank you for the info though. I learned something today.,0
58250,READ A BOOK YOU FUCKING CROOK.,0
46667,Nimmo? Never heard of the guy. Never seen that infectious smile and everyday hustle before. Of course the Mets know what they have.,0
34688,It's prevalent in everything. WoW was incredible when it came out *because* everyone was garbage.,0
10350,It worked for me too!!,1
