In [None]:
from typing import Callable, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline
from transformers import BertModel, BertTokenizer
from xgboost import XGBClassifier

In [None]:
# Load the preprocessed training comments from the Kaggle input mount
# and report the frame's (rows, columns).
TRAIN_CSV_PATH = '/kaggle/input/cleaned-toxic-comments/train_preprocessed.csv'
df = pd.read_csv(TRAIN_CSV_PATH)
print(df.shape)

# Debug aid: uncomment to work on the first 5000 rows only.
# df = df.head(5000)

In [None]:
# Pull the raw comment text (model input) and the label column (target)
# out of the frame, then print both shapes as a sanity check.
TEXT_COLUMN = "comment_text"
LABEL_COLUMN = "identity_hate"

sequences = df[TEXT_COLUMN].values
targets = df[LABEL_COLUMN].values
print(sequences.shape, targets.shape, sep="\n")

In [None]:
# Hold out 20% of the comments for evaluation (fixed seed for reproducibility).
# The split is stratified on the label so that train and test keep the same
# positive/negative ratio; the resampling steps configured elsewhere in this
# notebook indicate the positive class is treated as a minority, and an
# unstratified split can leave the test set with very few positives.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

In [None]:
class BertTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns raw strings into BERT embeddings.

    Every input string is tokenized and sent through the model on its own
    (a batch of one); ``embedding_func`` then reduces the model output to a
    single tensor, and the per-string tensors are stacked.

    Parameters
    ----------
    bert_tokenizer
        Tokenizer exposing ``encode_plus(text, ...)`` whose result contains
        an ``"input_ids"`` entry.
    bert_model
        Model callable as ``bert_model(input_ids, attention_mask)``. It is
        switched to eval mode on construction.
    max_length : int, default 60
        Maximum number of token ids per string; longer inputs are truncated.
    embedding_func : callable, optional
        Maps the raw model output to the embedding tensor for one string.
        When ``None``, the first-token vector of the first model output is
        used (``output[0][:, 0, :].squeeze()``).

    Notes
    -----
    Constructor arguments are stored unmodified under their own names, as
    ``BaseEstimator.get_params`` / ``sklearn.base.clone`` / the estimator
    repr require. ``tokenizer`` and ``model`` remain available as read-only
    aliases.
    """

    def __init__(
        self,
        bert_tokenizer,
        bert_model,
        max_length: int = 60,
        embedding_func: Optional[Callable[..., torch.Tensor]] = None,
    ):
        self.bert_tokenizer = bert_tokenizer
        self.bert_model = bert_model
        self.max_length = max_length
        # Kept exactly as passed (possibly None); the default is resolved at
        # call time so the instance stays clonable and picklable.
        self.embedding_func = embedding_func

        # Inference only: disable dropout and other train-time behaviour.
        self.bert_model.eval()

    @property
    def tokenizer(self):
        """Alias of ``bert_tokenizer`` (kept for backward compatibility)."""
        return self.bert_tokenizer

    @property
    def model(self):
        """Alias of ``bert_model`` (kept for backward compatibility)."""
        return self.bert_model

    @staticmethod
    def _default_embedding(model_output) -> torch.Tensor:
        """Return the first-token vector of the first model output."""
        return model_output[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode one string into ``(input_ids, attention_mask)`` tensors.

        Both tensors carry a leading batch dimension of size 1.
        """
        # Truncation is requested explicitly so max_length is always honoured
        # instead of relying on the tokenizer's implicit fallback.
        input_ids = self.bert_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
        )["input_ids"]

        # No padding is added, so every position is a real token.
        attention_mask = [1] * len(input_ids)

        # The model takes a batch, so add a batch dimension to each row.
        return (
            torch.tensor(input_ids).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.Tensor:
        """Embed a single string with the model and ``embedding_func``."""
        input_ids, attention_mask = self._tokenize(text)
        model_output = self.bert_model(input_ids, attention_mask)

        embed = self.embedding_func
        if embed is None:
            embed = self._default_embedding
        return embed(model_output)

    def transform(self, text: List[str]):
        """Embed every string in ``text`` and stack the results.

        A ``pandas.Series`` is converted to a list first; any other iterable
        of strings is consumed as-is. Gradients are disabled for the whole
        pass. Returns a ``torch.Tensor`` with one row per input string.
        """
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

In [None]:
# Identifier passed to `from_pretrained` in bert() for both the tokenizer and
# the model (despite the variable name, it selects pretrained weights, not a dataset).
bert_dataset = "bert-base-uncased"


def tfidf():
    """Build a fresh, unfitted pipeline: token counts followed by TF-IDF weighting."""
    steps = [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
    ]
    return Pipeline(steps)


def bert():
    """Build a BertTransformer from the pretrained weights named by `bert_dataset`.

    The tokenizer is loaded first, then the model, both via `from_pretrained`.
    """
    return BertTransformer(
        BertTokenizer.from_pretrained(bert_dataset),
        BertModel.from_pretrained(bert_dataset),
    )


In [None]:
# Resampling strategies for the imbalanced target.
# `smote` is defined for experimentation but is not used in the pipeline below.
smote = SMOTE(random_state=12)
borderline_smote = BorderlineSMOTE(
    sampling_strategy=0.5, k_neighbors=5, random_state=42, kind="borderline-1"
)
random_undersampler = RandomUnderSampler(sampling_strategy=1, random_state=42)
adasyn = ADASYN(sampling_strategy=0.5, random_state=42)

# TF-IDF -> oversample -> undersample -> Multinomial Naive Bayes baseline.
# Samplers expose `fit_resample` rather than `transform`, so they cannot be
# intermediate steps of sklearn's Pipeline (it rejects them when fitted);
# imblearn's Pipeline supports them and applies resampling during fit only.
# NOTE(review): ADASYN and BorderlineSMOTE both target a 0.5 minority/majority
# ratio, so the second oversampler may have little or nothing left to add --
# confirm that chaining both is intended.
# NOTE(review): a later cell rebinds `model`; this baseline is only used if it
# is fitted before that cell runs.
model = ImbPipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("adasyn", adasyn),
        ("smote", borderline_smote),
        ("under-sampling", random_undersampler),
        ("mnb", MultinomialNB(alpha=0.1)),
    ]
)

In [None]:
# Main model: BERT embeddings and TF-IDF features are computed side by side,
# concatenated by the FeatureUnion, and fed to a default XGBoost classifier.
classifier = XGBClassifier()

feature_union = FeatureUnion(
    transformer_list=[
        ("bert", bert()),
        ("tf_idf", tfidf()),
    ]
)

model = Pipeline(
    [
        ("union", feature_union),
        ("classifier", classifier),
    ]
)

In [None]:
# Fit every pipeline step on the training split. As the cell's last
# expression, the fitted pipeline returned by fit() is also displayed.
model.fit(X_train, y_train)

In [None]:
# Decision threshold applied to the predicted probability of the positive class.
THRESH = 0.5

# predict() returns hard class labels, which would make THRESH meaningless
# (any threshold between 0 and 1 gives the same result). Score with
# predict_proba instead so that changing THRESH actually moves the decision
# boundary. Column 1 is the probability of label 1 for a {0, 1} target.
pred = model.predict_proba(X_test)[:, 1]
y_pred = (pred > THRESH).astype(int)

In [None]:
# Evaluate on the held-out split: build the confusion matrix, print the
# per-class precision/recall/F1 report, then draw the matrix.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])

print(classification_report(y_test, y_pred))

disp.plot()
plt.show()