
# Assignment 1 — Sexism Detection (EXIST 2023 Task 2)

**Group members:** Jacopo Francesco Amoretti, Roberto Frabetti, Ivo Rambaldi

---

## Delivery checklist
- [ ] Task 1 — Corpus (majority vote aggregation, EN filter, label encoding)
- [ ] Task 2 — Data Cleaning (emoji/hashtag/mention/url/symbols/quotes + lemmatization)
- [ ] Task 3 — Text Encoding (GloVe + OOV handling + embedding matrix)
- [ ] Task 4 — Models (BiLSTM baseline and stacked)
- [ ] Task 5 — Training & Evaluation (≥ 3 seeds, macro F1/Prec/Rec, avg ± std)
- [ ] Task 6 — Transformers (Twitter-roBERTa-base-hate + Trainer)
- [ ] Task 7 — Error Analysis (error patterns, confusion/PR, examples)
- [ ] Task 8 — Report (summary of results, figures, metrics table)



## Setup

Run this once at the beginning. It sets seeds, libraries, and project paths.


In [1]:

# === Basic imports ===

import os
import re
import json
import math
import random
import numpy as np
import pandas as pd
from pathlib import Path

# Visualization/plots
import matplotlib.pyplot as plt

# Metrics
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix

# Optional: progress bar
try:
    from tqdm.auto import tqdm
except Exception:
    tqdm = lambda x: x

# Seed and device
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)

# Project paths (adjust as needed)
DATA_DIR = Path('data')          # Should contain: train.json, val.json, test.json
GLOVE_DIR = Path('glove')        # Files like glove.6B.100d.txt 
ARTIFACTS_DIR = Path('artifacts') # Save vocab, mappings, embedding matrix, etc.
MODELS_DIR = Path('models')
RESULTS_DIR = Path('results')

for d in [ARTIFACTS_DIR, MODELS_DIR, RESULTS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print('Setup complete.')


Matplotlib is building the font cache; this may take a moment.


Setup complete.


  from .autonotebook import tqdm as notebook_tqdm



# Task 1 — Corpus



In [2]:

# == Majority vote on a list of labels ==

from collections import Counter

def majority_vote(labels):
    cnt = Counter(labels)
    top = cnt.most_common() # Most frequent labels in descending order
    if len(top) == 0:
        return None, False
    if len(top) > 1 and top[0][1] == top[1][1]:
        return None, False  # If no clear majority
    return top[0][0], True # Return majority label

# Mapping between textual and numerical label representations
label2id = {'-': 0, 'DIRECT': 1, 'JUDGEMENTAL': 2, 'REPORTED': 3}
id2label = {v:k for k,v in label2id.items()}


In [3]:

# == Load JSON into a DataFrame ==

train_path = Path('data') / 'training.json'
val_path   = Path('data') / 'validation.json'
test_path  = Path('data') / 'test.json'

def loadJson(path: Path):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame.from_dict(data, orient='index') # Each key is an index
    return df

# Load raw splits into DataFrames
df_train_raw = loadJson(train_path)
df_val_raw   = loadJson(val_path)
df_test_raw  = loadJson(test_path)

print('Train raw:', df_train_raw.shape, '| Val raw:', df_val_raw.shape, '| Test raw:', df_test_raw.shape)
df_train_raw.head(2) # Display first 2 rows of training data


Train raw: (6920, 11) | Val raw: (726, 11) | Test raw: (312, 11)


Unnamed: 0,id_EXIST,lang,tweet,number_annotators,annotators,gender_annotators,age_annotators,labels_task1,labels_task2,labels_task3,split
100001,100001,es,"@TheChiflis Ignora al otro, es un capullo.El p...",6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[YES, YES, NO, YES, YES, YES]","[REPORTED, JUDGEMENTAL, -, REPORTED, JUDGEMENT...","[[OBJECTIFICATION], [OBJECTIFICATION, SEXUAL-V...",TRAIN_ES
100002,100002,es,@ultimonomada_ Si comicsgate se parece en algo...,6,"[Annotator_7, Annotator_8, Annotator_9, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 23-45, 18-22]","[NO, NO, NO, NO, YES, NO]","[-, -, -, -, DIRECT, -]","[[-], [-], [-], [-], [OBJECTIFICATION], [-]]",TRAIN_ES


In [4]:


# == Majority voting on labels_task2 and drop ambiguous labels ==

def apply_majority_and_drop(df):
    mv_labels = []
    keep_mask = []
    for _, row in df.iterrows():
        mv, evaluation = majority_vote(row['labels_task2'])  # Apply majority vote to each row
        mv_labels.append(mv)
        keep_mask.append(evaluation)
    df = df.copy()
    df['label'] = mv_labels  # Add final label column
    df = df[pd.Series(keep_mask).values] # Drop rows without clear majority
    return df

# Apply to all dataset splits
df_train_mv = apply_majority_and_drop(df_train_raw)
df_val_mv   = apply_majority_and_drop(df_val_raw)
df_test_mv  = apply_majority_and_drop(df_test_raw)

print('After adding label column and drop ambiguous labels:',
      'Train:', df_train_mv.shape, 'Val:', df_val_mv.shape, 'Test:', df_test_mv.shape)


After adding label column and drop ambiguous labels: Train: (6065, 12) Val: (630, 12) Test: (280, 12)


In [5]:

# == EN filter and column selection ==

keep_columns = ['id_EXIST', 'lang', 'tweet', 'label']

def filter_and_select(df):
    df = df[df['lang'] == 'en'].copy()  # Keep only English tweets
    df = df[keep_columns].copy()        # Select relevant columns
    return df

# Apply filtering to all dataset splits
df_train = filter_and_select(df_train_mv)
df_val   = filter_and_select(df_val_mv)
df_test  = filter_and_select(df_test_mv)

print('EN only:', 'Train:', df_train.shape, 'Val:', df_val.shape, 'Test:', df_test.shape)
df_train.head(3)


EN only: Train: (2873, 4) Val: (150, 4) Test: (280, 4)


Unnamed: 0,id_EXIST,lang,tweet,label
200001,200001,en,FFS! How about laying the blame on the bastard...,-
200002,200002,en,Writing a uni essay in my local pub with a cof...,REPORTED
200003,200003,en,@UniversalORL it is 2021 not 1921. I dont appr...,REPORTED


In [6]:

# == Label encoding ==

def encode_labels(df):
    df = df.copy()
    df['label_id'] = df['label'].map(label2id)  # Convert text labels to numeric IDs
    return df

# Apply label encoding to all dataset splits
df_train = encode_labels(df_train)
df_val   = encode_labels(df_val)
df_test  = encode_labels(df_test)

print(df_train['label'].value_counts())  # Check label distribution
df_train.head(3)     # Inspect encoded DataFrame


label
-              2014
DIRECT          537
REPORTED        184
JUDGEMENTAL     138
Name: count, dtype: int64


Unnamed: 0,id_EXIST,lang,tweet,label,label_id
200001,200001,en,FFS! How about laying the blame on the bastard...,-,0
200002,200002,en,Writing a uni essay in my local pub with a cof...,REPORTED,3
200003,200003,en,@UniversalORL it is 2021 not 1921. I dont appr...,REPORTED,3


In [7]:

# == Save post-Task1 datasets for quick reuse ==

df_train[['id_EXIST','lang','tweet','label','label_id']].to_csv('results/train_task1.csv', index=False)
df_val[['id_EXIST','lang','tweet','label','label_id']].to_csv('results/val_task1.csv', index=False)
df_test[['id_EXIST','lang','tweet','label','label_id']].to_csv('results/test_task1.csv', index=False)
print("Saved post-Task1 datasets to 'results/' directory.")

Saved post-Task1 datasets to 'results/' directory.



# Task 2 — Data Cleaning



In [8]:

# == Text cleaning: basic regex ==

import re

URL = re.compile(r'https?://\S+|www\.\S+') # Match URLs
MENTION = re.compile(r'@\w+') # Match @ symbol
HASHTAG = re.compile(r'#\w+') # Match hashtags
EMOJI = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE) # Match emojis
SPECIAL_QUOTES_REPLACEMENTS = { # Special quotes to normal quotes
    '“': '"', '”': '"', '‘': "'", '’': "'",
    '«': '"', '»': '"', '…': '...'
}

def normalize_quotes(text: str):
    for k, v in SPECIAL_QUOTES_REPLACEMENTS.items():
        text = text.replace(k, v)
    return text

def basic_clean(text: str):
    text = normalize_quotes(text)                           # Standardize quotes
    text = URL.sub(' ', text)                               # Remove URLs
    text = MENTION.sub(' ', text)                           # Remove mentions
    text = HASHTAG.sub(' ', text)                           # Remove hashtags
    text = EMOJI.sub(' ', text)                             # Remove emojis
    text = re.sub(r"[^0-9A-Za-z'\.,!\?\s]", ' ', text)      # Remove unwanted symbols
    text = re.sub(r'\s+', ' ', text).strip()                # Normalize spaces
    return text

print(basic_clean("Check this: https:/try.com @user #hashtag 👍🏻 “quote” — symbols")) # Check the function correctness


Check this https try.com quote symbols


In [9]:

# == Lemmatization with spaCy ==

USE_SPACY = True

# Load spaCy model and verify if it's available
try:
    import spacy
    try:
        nlp = spacy.load('en_core_web_sm', disable=['ner'])  # Load lightweight English model
    except Exception:
        nlp = None
        print("Warning: spaCy model 'en_core_web_sm' not installed. Install it and re-run.")
except Exception:
    USE_SPACY = False
    nlp = None
    print('spaCy not available; skipping lemmatization or use another library.')

def lemmatize_en(texts):
    if nlp is None:
        return texts    # Skip if model not available
    docs = nlp.pipe(texts, batch_size=512) # Process texts in batches for efficiency
    out = []
    for doc in docs:
        lemmas = [t.lemma_.lower() for t in doc if not t.is_space] # Creating lemmas list
        out.append(' '.join(lemmas))
    return out

def apply_clean_and_lemma(df, text_col='tweet'):
    df = df.copy()
    df['clean'] = df[text_col].astype(str).apply(basic_clean)    # Apply regex-based cleaning
    df['clean_lemma'] = lemmatize_en(df['clean'].tolist())  # Lemmatize cleaned text
    return df

# Apply cleaning and lemmatization to all splits
df_train = apply_clean_and_lemma(df_train, 'tweet')
df_val   = apply_clean_and_lemma(df_val, 'tweet')
df_test  = apply_clean_and_lemma(df_test, 'tweet')

df_train[['tweet','clean','clean_lemma']].head(3)   # Inspect transformation results


Unnamed: 0,tweet,clean,clean_lemma
200001,FFS! How about laying the blame on the bastard...,FFS! How about laying the blame on the bastard...,ffs ! how about lay the blame on the bastard w...
200002,Writing a uni essay in my local pub with a cof...,Writing a uni essay in my local pub with a cof...,write a uni essay in my local pub with a coffe...
200003,@UniversalORL it is 2021 not 1921. I dont appr...,it is 2021 not 1921. I dont appreciate that on...,it be 2021 not 1921 . i do not appreciate that...



# Task 3 — Text Encoding



In [10]:
TOKEN_RE = re.compile(r"\w+(?:'\w+)?") # Basic regex tokenizer (keeps apostrophes within words)

def simple_tokenize(text):
    return TOKEN_RE.findall(str(text).lower())  # Tokenize and lowercase text

def build_vocab_from_train(texts, min_freq=1):
    from collections import Counter
    c = Counter()
    for t in texts:
        for tok in simple_tokenize(t):
            c[tok] += 1 # Count token frequency
    vocab = {tok for tok, f in c.items() if f >= min_freq}  # Keep tokens above frequency threshold
    return vocab, c

# Build vocabulary from training texts
train_texts = df_train['clean_lemma'] if 'clean_lemma' in df_train.columns else df_train['clean']
vocab_set, freq = build_vocab_from_train(train_texts.tolist(), min_freq=1)

print('Vocab size (train):', len(vocab_set))

# Display top 20 most frequent tokens
from collections import Counter
tokens = [tok for t in train_texts for tok in simple_tokenize(t)]
Counter(tokens).most_common(20)

Vocab size (train): 9074


[('be', 3793),
 ('the', 2648),
 ('to', 1998),
 ('i', 1992),
 ('a', 1915),
 ('and', 1657),
 ('of', 1393),
 ('not', 1313),
 ('you', 1115),
 ('it', 984),
 ('that', 917),
 ('do', 906),
 ('in', 894),
 ('have', 780),
 ('woman', 755),
 ('for', 728),
 ('they', 558),
 ('this', 546),
 ('on', 535),
 ('like', 515)]

In [11]:

# == Download and load GloVe (Twitter 27B) ==

import os, pathlib, zipfile, urllib.request

EMB_DIM = 100
GLOVE_DIR = pathlib.Path("glove")
GLOVE_DIR.mkdir(parents=True, exist_ok=True)
GLOVE_FILE = GLOVE_DIR / f"glove.twitter.27B.{EMB_DIM}d.txt"

# Download and extract GloVe embeddings if not already present
if not GLOVE_FILE.exists():
    url = "https://nlp.stanford.edu/data/glove.twitter.27B.zip"
    zip_path = GLOVE_DIR / "glove.twitter.27B.zip"
    print("Downloading:", url)
    urllib.request.urlretrieve(url, zip_path)
    with zipfile.ZipFile(zip_path) as zf:
        zf.extract(GLOVE_FILE.name, GLOVE_DIR)
    os.remove(zip_path)
print("GloVe ready at:", GLOVE_FILE)


GLOVE_FILE = Path('glove') / f'glove.twitter.27B.{EMB_DIM}d.txt'

def load_glove(path):
    emb = {}
    if not path.exists():
        print(f'WARNING: GloVe file not found: {path}. Will initialize OOV randomly.')
        return emb
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            w = parts[0]
            vec = np.asarray(parts[1:], dtype=np.float32)   # Convert embedding values to float array
            emb[w] = vec
    print('Loaded GloVe vectors:', len(emb))
    return emb


# Load pre-trained embeddings into dictionary
glove = load_glove(GLOVE_FILE)


GloVe ready at: glove/glove.twitter.27B.100d.txt
Loaded GloVe vectors: 1193515


In [12]:

# == Build embedding matrix and save artifacts ==

SPECIAL_TOKENS = ['<PAD>', '<UNK>']
token_list = sorted(vocab_set)

itos = SPECIAL_TOKENS + token_list  # Index → token mapping
stoi = {tok:i for i, tok in enumerate(itos)}    # Token → index mapping

def rand_vec(d):
    return np.random.normal(0, 0.1, size=(d,)).astype(np.float32)   # Random vector for OOV tokens

# Initialize embedding matrix
embedding_matrix = np.zeros((len(itos), EMB_DIM), dtype=np.float32)
embedding_matrix[stoi['<PAD>']] = np.zeros(EMB_DIM, dtype=np.float32)
embedding_matrix[stoi['<UNK>']] = rand_vec(EMB_DIM)


# Fill embedding matrix using GloVe, random for OOV tokens
oov_count = 0
for tok in token_list:
    idx = stoi[tok]
    if tok in glove:
        embedding_matrix[idx] = glove[tok]
    else:
        embedding_matrix[idx] = rand_vec(EMB_DIM)
        oov_count += 1

print('Total vocab:', len(itos), '| OOV (train vs GloVe):', oov_count)

# Save embeddings and vocabulary files for reuse
np.save(Path('artifacts') / 'embedding_matrix.npy', embedding_matrix)
import pandas as pd
pd.Series(itos).to_csv(Path('artifacts') / 'itos.csv', index=False)
pd.Series(stoi).to_csv(Path('artifacts') / 'stoi.csv')
print('Saved embedding_matrix.npy, itos.csv, stoi.csv to artifacts/')

Total vocab: 9076 | OOV (train vs GloVe): 1067
Saved embedding_matrix.npy, itos.csv, stoi.csv to artifacts/


In [13]:

# == Encode text into token IDs ==

MAX_LEN = 64
PAD_ID = stoi['<PAD>']
UNK_ID = stoi['<UNK>']

def encode_text(text, max_len=MAX_LEN):
    toks = simple_tokenize(text)
    ids = [stoi.get(t, UNK_ID) for t in toks]    # Convert tokens to IDs, use UNK_ID for unseen tokens
    if len(ids) < max_len:
        ids = ids + [PAD_ID] * (max_len - len(ids)) # Pad shorter sequences
    else:
        ids = ids[:max_len] # Truncate longer sequences
    return ids

def encode_dataframe(df, text_col='clean_lemma'):
    X = np.vstack([encode_text(t) for t in df[text_col].tolist()])  # Encode all texts
    y = df['label_id'].values.astype(int)
    return X, y

# Encode datasets into numeric form
X_train, y_train = encode_dataframe(df_train)
X_val,   y_val   = encode_dataframe(df_val)
X_test,  y_test  = encode_dataframe(df_test)

X_train.shape, X_val.shape, X_test.shape  # Check encoded matrix dimensions


((2873, 64), (150, 64), (280, 64))


# Task 4 — Model Definition (BiLSTM)

**Required**  
- **Baseline:** Bidirectional LSTM + final Dense.  
- **Stacked:** add a second BiLSTM on top.  
- Keras example below.


In [14]:

import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np

NUM_CLASSES = 4
EMBED_TRAINABLE = False

def build_baseline_bilstm(vocab_size, emb_dim, embedding_matrix, max_len=64):
    inp = layers.Input(shape=(max_len,), name='input_ids')
    emb = layers.Embedding(input_dim=vocab_size,
                           output_dim=emb_dim,
                           weights=[embedding_matrix],
                           trainable=EMBED_TRAINABLE,
                           mask_zero=True,
                           name='encoder_embedding')(inp)
    x = layers.Bidirectional(layers.LSTM(128))(emb)
    x = layers.Dropout(0.2)(x)
    out = layers.Dense(NUM_CLASSES, activation='softmax')(x)
    model = models.Model(inp, out, name='bilstm_baseline')
    return model

def build_stacked_bilstm(vocab_size, emb_dim, embedding_matrix, max_len=64):
    inp = layers.Input(shape=(max_len,), name='input_ids')
    emb = layers.Embedding(input_dim=vocab_size,
                           output_dim=emb_dim,
                           weights=[embedding_matrix],
                           trainable=EMBED_TRAINABLE,
                           mask_zero=True,
                           name='encoder_embedding')(inp)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(emb)
    x = layers.Bidirectional(layers.LSTM(64))(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(NUM_CLASSES, activation='softmax')(x)
    model = models.Model(inp, out, name='bilstm_stacked')
    return model

VOCAB_SIZE = embedding_matrix.shape[0]
EMB_DIM = embedding_matrix.shape[1]

baseline = build_baseline_bilstm(VOCAB_SIZE, EMB_DIM, embedding_matrix, MAX_LEN)
stacked  = build_stacked_bilstm(VOCAB_SIZE, EMB_DIM, embedding_matrix, MAX_LEN)

baseline.summary()


ModuleNotFoundError: No module named 'tensorflow'


# Task 5 — Training & Evaluation

Train with ≥ 3 seeds, evaluate on validation (macro F1/Precision/Recall), and report mean ± std.


In [None]:

from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix
import pandas as pd

def train_and_eval(model_fn, X_tr, y_tr, X_va, y_va, seeds=[1337, 2025, 42], epochs=5, batch_size=64):
    histories = []
    scores = []
    for s in seeds:
        tf.keras.utils.set_random_seed(s)
        model = model_fn(VOCAB_SIZE, EMB_DIM, embedding_matrix, MAX_LEN)
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        h = model.fit(X_tr, y_tr, validation_data=(X_va, y_va),
                      epochs=epochs, batch_size=batch_size, verbose=1)
        histories.append(h.history)
        y_pred = np.argmax(model.predict(X_va), axis=1)
        prec, rec, f1, _ = precision_recall_fscore_support(y_va, y_pred, average='macro', zero_division=0)
        scores.append({'seed': s, 'precision': prec, 'recall': rec, 'f1': f1})
    return histories, pd.DataFrame(scores)

# Example (commented):
# hist_base, df_scores_base = train_and_eval(build_baseline_bilstm, X_train, y_train, X_val, y_val)
# df_scores_base, df_scores_base.mean(), df_scores_base.std()


In [None]:

def evaluate_predictions(y_true, y_pred, labels_map=id2label):
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    df_rep = pd.DataFrame(report).T
    cm = confusion_matrix(y_true, y_pred, labels=sorted(labels_map.keys()))
    return df_rep, cm

def plot_confusion_matrix(cm, labels):
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(5,5))
    im = ax.imshow(cm, interpolation='nearest')
    ax.set_xticks(range(len(labels)))
    ax.set_yticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_yticklabels(labels)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, cm[i, j], ha='center', va='center')
    plt.tight_layout()
    plt.show()



# Task 6 — Transformers (Twitter-roBERTa-base-hate)

Model: **cardiffnlp/twitter-roberta-base-hate**  
- Tokenize with HF tokenizer, prepare `Dataset`, use `Trainer` with macro F1, evaluate on test.


In [None]:

# Skeleton for HF Trainer (commented for offline environments)
# from datasets import Dataset
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# from sklearn.metrics import precision_recall_fscore_support

# MODEL_NAME = "cardiffnlp/twitter-roberta-base-hate"

# def to_hf_dataset(df, text_col='clean_lemma'):
#     return Dataset.from_pandas(df[[text_col, 'label_id']].rename(columns={text_col:'text','label_id':'label'}))

# ds_train = to_hf_dataset(df_train)
# ds_val   = to_hf_dataset(df_val)
# ds_test  = to_hf_dataset(df_test)

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# def tokenize_fn(ex):
#     return tokenizer(ex['text'], truncation=True, padding='max_length', max_length=64)
# ds_train = ds_train.map(tokenize_fn, batched=True)
# ds_val   = ds_val.map(tokenize_fn, batched=True)
# ds_test  = ds_test.map(tokenize_fn, batched=True)

# model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     preds = logits.argmax(axis=-1)
#     prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)
#     return {'macro_f1': f1, 'macro_precision': prec, 'macro_recall': rec}

# args = TrainingArguments(
#     output_dir='hf_outputs',
#     evaluation_strategy='epoch',
#     save_strategy='epoch',
#     learning_rate=2e-5,
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=64,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     metric_for_best_model='macro_f1',
#     logging_steps=50,
# )

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=ds_train,
#     eval_dataset=ds_val,
#     compute_metrics=compute_metrics,
# )

# # trainer.train()
# # eval_results = trainer.evaluate(ds_test)
# # eval_results



# Task 7 — Error Analysis

Suggestions: confusion matrix for the best model, per-class Precision/Recall table, typical misclassified examples, comments on OOV and imbalance.


In [None]:

# Example (fill after training):
# y_true = y_val
# y_pred = y_pred_val
# err_idx = np.where(y_true != y_pred)[0][:20]
# df_errors = df_val.iloc[err_idx][['tweet','clean_lemma','label','label_id']].copy()
# df_errors['pred_label'] = [id2label[i] for i in y_pred[err_idx]]
# df_errors.head(10)
