In [2]:
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Read the GoEmotions CSV that every later cell works from.
DATA_PATH = "data/goemotions_1.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# List every column of the loaded frame: the comment text, its metadata,
# and one column per emotion.
df.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [7]:
# Do not truncate long strings when a frame is displayed.
pd.set_option('display.max_colwidth', None)

# Peek at two random rows whose `excitement` label is 0.
not_excited = df['excitement'] == 0
df.loc[not_excited, ['text', 'excitement']].sample(2)

Unnamed: 0,text,excitement
24521,"Only until recently, that being said, I’m really glad. Run the scum over.",0
27915,you got me [NAME],0


In [8]:
# How many rows carry each value of the `excitement` label.
df['excitement'].value_counts()

0    68100
1     1900
Name: excitement, dtype: int64

In [9]:
# Features are the raw comment text; the target is the 0/1 `excitement` column.
X = df['text']
y = df['excitement']

# Token counts feeding a class-balanced logistic regression.
pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

# `fit` is the last expression, so the cell also displays the fitted pipeline.
pipe.fit(X, y)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

### Trick 1: Model Uncertainty

We can try to locate observations with predicted probabilities around 0.5 (e.g. between 0.4 and 0.6; the code below uses the narrower 0.45–0.55 band), i.e. those about which the model is uncertain, and check their labels.

In [10]:
# Predicted probabilities for every row of X: one row per observation,
# one column per class.
# NOTE(review): column order presumably follows `pipe.classes_` — confirm.
pipe.predict_proba(X)

array([[0.81905922, 0.18094078],
       [0.87337234, 0.12662766],
       [0.99887488, 0.00112512],
       ...,
       [0.95766948, 0.04233052],
       [0.89402079, 0.10597921],
       [0.979888  , 0.020112  ]])

In [11]:
# Column 0 of the predicted probabilities, one value per row of `df`.
probas = pipe.predict_proba(X)[:, 0]

# Rows the model is on the fence about: probability strictly inside
# (0.45, 0.55). The mask lines up with `df` positionally because
# probas.shape[0] == df.shape[0].
is_uncertain = (probas > 0.45) & (probas < 0.55)
df.loc[is_uncertain, ['text', 'excitement']].head(7)

Unnamed: 0,text,excitement
8,that's adorable asf,0
46,"If there’s a pattern, yes.",0
107,My fans on patreon will be rewarded soon,0
154,"Ones with close ties to SA, anyway. An escaped apostate won't exactly be itching to run home.",0
158,I really like this ring so I’m glad to hear that.,0
262,OMG THOSE TINY SHOES! *desire to boop snoot intensifies*,0
362,This. I relate to this. So much. Almost too much.,0


Note that observation 262 clearly expresses excitement, yet it is labeled 0. We can explore more observations this way and pick out the mislabeled ones.

### Trick 2: Model Disagreement

In [12]:
# Mask of rows whose given label differs from the model's prediction.
label_mismatch = df['excitement'] != pipe.predict(X)

# (rows, columns) of the disagreeing subset.
df.loc[label_mismatch].shape

(5315, 37)

Using `predict()`, we find the rows whose label differs from the model's prediction: 5315 of them, which is too many to review by hand. The idea is to sort these rows by the confidence the model assigns to the given label. If that confidence is low, the model strongly disagrees with the training data.

In [14]:
def correct_class_confidence(X, y, mod):
    """
    Gives the predicted confidence (or proba) that a fitted model
    assigns to the given label of each observation.

    Parameters
    ----------
    X : inputs accepted by ``mod.predict_proba``.
    y : iterable of labels (list, array, pandas Series, ...), one per
        row of ``X``. Labels are read positionally, so a Series with a
        filtered or shuffled index is handled correctly.
    mod : fitted classifier exposing ``predict_proba`` and ``classes_``.

    Returns
    -------
    list
        For each row, the probability ``mod`` assigns to that row's label.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of predicted rows.
    KeyError
        If a label in ``y`` is not one of ``mod.classes_``.
    """
    probas = mod.predict_proba(X)

    # Iterating (rather than indexing with y[i]) walks the values in order;
    # y[i] on a pandas Series is a label lookup and can pick the wrong row.
    labels = list(y)
    if len(labels) != len(probas):
        raise ValueError(
            f"y has {len(labels)} labels but the model produced "
            f"{len(probas)} rows of probabilities"
        )

    # Build the class -> probability-column lookup once, not once per row.
    column_of = {label: j for j, label in enumerate(mod.classes_)}
    return [proba[column_of[label]] for proba, label in zip(probas, labels)]

In [17]:
# Attach to every row the probability the model gives to that row's label.
scored = df.assign(confidence=correct_class_confidence(X, y, pipe))

# Keep only the rows where the prediction and the label disagree.
disagreeing = scored.loc[pipe.predict(scored['text']) != scored['excitement']]

# Lowest confidence first, then restrict to rows labeled 0 and show the top 20.
ranked = disagreeing[['text', 'excitement', 'confidence']].sort_values("confidence")
ranked.loc[ranked['excitement'] == 0].head(20)

Unnamed: 0,text,excitement,confidence
5676,I am inexplicably excited by [NAME]. I get so excited by how he curls passes,0,0.000148
28707,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0,0.000187
42757,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0,0.000187
24756,Sounds like a fun game. Our home game around here is .05/.10. Its fun but not very exciting.,0,0.000262
44459,So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool,0,0.000595
69395,"Wow, your posting history is a real... interesting ride.",0,0.000719
20823,"Wow, your posting history is a real... interesting ride.",0,0.000719
2001,No different than people making a big deal about their team winning the super bowl. People find it interesting.,0,0.000741
30921,"Hey congrats!! That's amazing, you've done such amazing progress! Hope you have a great day :)",0,0.000813
39475,"I just read your list and now I can't wait, either!! Hurry up with the happy, relieved and peaceful onward and upward!! Congratulations😎",0,0.001129


### Trick 3: Cleanlab Noise Indices

In [19]:
from cleanlab.pruning import get_noise_indices

# Class probabilities from the already-fitted pipeline, predicted on the
# same rows it was trained on.
# NOTE(review): cleanlab's docs describe `psx` as out-of-sample
# (cross-validated) probabilities; these are in-sample — confirm intended.
predicted_probas = pipe.predict_proba(X)

# Ask cleanlab for the suspected label errors, sorted with the
# 'prob_given_label' method.
ordered_label_errors = get_noise_indices(
    s=y,
    psx=predicted_probas,
    sorted_index_method='prob_given_label',
)

In [20]:
# rows that are worth double checking
# (the array returned by get_noise_indices; the notebook feeds it to
# df.iloc, i.e. treats the entries as row positions)
ordered_label_errors

array([ 5676, 28707, 42757, ..., 46368, 32785, 25985])

In [21]:
# Look up the flagged rows by position, keeping the order cleanlab returned.
flagged = df.iloc[ordered_label_errors]
flagged[['text', 'excitement']].head(20)

Unnamed: 0,text,excitement
5676,I am inexplicably excited by [NAME]. I get so excited by how he curls passes,0
28707,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0
42757,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0
24756,Sounds like a fun game. Our home game around here is .05/.10. Its fun but not very exciting.,0
44459,So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool,0
20823,"Wow, your posting history is a real... interesting ride.",0
69395,"Wow, your posting history is a real... interesting ride.",0
2001,No different than people making a big deal about their team winning the super bowl. People find it interesting.,0
30921,"Hey congrats!! That's amazing, you've done such amazing progress! Hope you have a great day :)",0
39475,"I just read your list and now I can't wait, either!! Hurry up with the happy, relieved and peaceful onward and upward!! Congratulations😎",0


### Trick 4: Cleanlab Predictions

In [22]:
from cleanlab.classification import LearningWithNoisyLabels
from sklearn.linear_model import LogisticRegression

# An unfitted pipeline with the same steps as `pipe`, for cleanlab to train.
fresh_pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

# LearningWithNoisyLabels wraps the classifier passed as `clf`.
# NOTE(review): the earlier comment here said any classifier that has
# `sample_weights` can be wrapped — confirm against the cleanlab docs.
lnl = LearningWithNoisyLabels(clf=fresh_pipe)

# Careful: the labels go in through `s=`, not `y=`, and are passed as the
# underlying array via `.values`.
lnl.fit(X=X, s=y.values)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [23]:
# Baseline for comparison: the same pipeline trained directly on the
# labels as given.
new_pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

new_pipe.fit(X, y)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

We then compare the robust model *lnl* with *new_pipe* and see where these two models disagree:

In [27]:
# Rows on which the cleanlab-wrapped model and the plain model predict
# differently; show five of them at random.
models_differ = lnl.predict(X) != new_pipe.predict(X)
df.loc[models_differ, ['text', 'excitement']].sample(5)

Unnamed: 0,text,excitement
573,And now you manage a gym on base ..or..?,0
63844,"Ooooh, we got a badass over here!",1
15642,"No, just a bastard pastry",0
23814,"I mean, if I make it look like an accident, just think of all the ribs I can buy with the insurance money...",1
22394,I love this post so much. Really drives the point home and made me laugh!,0


There are other techniques for detecting bad labels; cleanlab is just one very interesting example. Odds are that effort spent on grid search is better spent on checking for bad labels, so it is best to check the labels before performing a grid search.