# Hate Speech Classifier

In this project I've shown how to deal with text classification task.
I cover such important steps as:
- Data pre-processing
- Model setup
- Training and evaluating with hyperparamters search
- Logging the train process and model versions
- Summary

But first create venv and install requirements

## Data pre-processing

Download

In [None]:
!python src/data.py

Then import libs for pre-processing text data and start the process

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import polars as pl
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
import string
from catboost import Pool, CatBoostClassifier, metrics, cv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import optuna
import pickle
from optuna.samplers import TPESampler
SEED = 3322

def clean_text(text):
    table = text.maketrans(
        dict.fromkeys(string.punctuation))

    words = word_tokenize(
        text.lower().strip().translate(table))

    words = [word for word in words if word not in stopwords.words('english')]
    lemmed = [WordNetLemmatizer().lemmatize(word) for word in words]
    return " ".join(lemmed)


PATH = "../data/raw/Ethos_Dataset_Binary.csv"
raw_data = pl.read_csv(PATH,has_header=True,separator=";")

# apply cleaning function
processed_data = raw_data.select(
    pl.col("comment").apply(lambda x: clean_text(x)),
    pl.col("isHate").cast(pl.Int64)
)
# save dataset to data/precessed/processed
processed_data.write_csv(file="../data/processed/processed_data.csv", has_header=True,)
# split X and y
X = processed_data["comment"]
y = processed_data["isHate"]
# encode sentences
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
sentences = []
for obj in processed_data["comment"]:
    sentences.append(model.encode(obj))
X = pl.DataFrame(sentences, orient="row")
# split data into train, val, test by 0.75/0.125/0.125 ratio
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.75)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)


print(f"Number of rows and columns in the train data set: {X_train.shape}")
print(f"Number of rows and columns in the test data set: {X_test.shape}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dzyat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dzyat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dzyat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [116]:
def objective(trial):
    model = CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 30, 100, step=10),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        depth=trial.suggest_int("depth", 4, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        task_type="GPU",
        verbose=5,
        early_stopping_rounds=30
    )
    model.fit(X_train.to_numpy(), y_train.to_numpy())
    y_pred = model.predict(X_test.to_numpy())
    accuracy_scr = accuracy_score(y_test.to_list(), y_pred.flatten().tolist())
    trial.set_user_attr("accuracy_score",accuracy_scr)
    return accuracy_scr

In [118]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = TPESampler(seed=SEED)
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=50, timeout=14400)

0:	learn: 0.6600407	total: 38.5ms	remaining: 3.42s
5:	learn: 0.5217004	total: 225ms	remaining: 3.15s
10:	learn: 0.4267645	total: 388ms	remaining: 2.79s
15:	learn: 0.3480705	total: 555ms	remaining: 2.57s
20:	learn: 0.2884948	total: 719ms	remaining: 2.36s
25:	learn: 0.2442435	total: 875ms	remaining: 2.15s
30:	learn: 0.2084846	total: 1.04s	remaining: 1.97s
35:	learn: 0.1759482	total: 1.2s	remaining: 1.8s
40:	learn: 0.1515925	total: 1.36s	remaining: 1.63s
45:	learn: 0.1309725	total: 1.53s	remaining: 1.46s
50:	learn: 0.1133877	total: 1.69s	remaining: 1.29s
55:	learn: 0.1002239	total: 1.84s	remaining: 1.12s
60:	learn: 0.0890504	total: 2s	remaining: 950ms
65:	learn: 0.0795304	total: 2.15s	remaining: 783ms
70:	learn: 0.0716085	total: 2.31s	remaining: 617ms
75:	learn: 0.0643443	total: 2.46s	remaining: 454ms
80:	learn: 0.0574153	total: 2.63s	remaining: 293ms
85:	learn: 0.0522462	total: 2.79s	remaining: 130ms
89:	learn: 0.0479807	total: 2.92s	remaining: 0us
0:	learn: 0.6347515	total: 16ms	remaini

0:	learn: 0.6820377	total: 25.2ms	remaining: 1.99s
5:	learn: 0.6379108	total: 136ms	remaining: 1.68s
10:	learn: 0.5995148	total: 243ms	remaining: 1.52s
15:	learn: 0.5652987	total: 349ms	remaining: 1.4s
20:	learn: 0.5250878	total: 473ms	remaining: 1.33s
25:	learn: 0.4914604	total: 581ms	remaining: 1.21s
30:	learn: 0.4619793	total: 688ms	remaining: 1.09s
35:	learn: 0.4367543	total: 791ms	remaining: 967ms
40:	learn: 0.4067633	total: 902ms	remaining: 858ms
45:	learn: 0.3813758	total: 1.01s	remaining: 746ms
50:	learn: 0.3578325	total: 1.11s	remaining: 634ms
55:	learn: 0.3352371	total: 1.23s	remaining: 526ms
60:	learn: 0.3149264	total: 1.34s	remaining: 419ms
65:	learn: 0.2978255	total: 1.45s	remaining: 308ms
70:	learn: 0.2796946	total: 1.56s	remaining: 198ms
75:	learn: 0.2638845	total: 1.68s	remaining: 88.2ms
79:	learn: 0.2519899	total: 1.77s	remaining: 0us
0:	learn: 0.6408550	total: 24.9ms	remaining: 723ms
5:	learn: 0.4569807	total: 137ms	remaining: 550ms
10:	learn: 0.3437324	total: 240ms	r

59:	learn: 0.1843094	total: 1.44s	remaining: 0us
0:	learn: 0.6888810	total: 14.4ms	remaining: 705ms
5:	learn: 0.6720641	total: 82.4ms	remaining: 604ms
10:	learn: 0.6540273	total: 145ms	remaining: 516ms
15:	learn: 0.6387393	total: 207ms	remaining: 440ms
20:	learn: 0.6216903	total: 272ms	remaining: 376ms
25:	learn: 0.6068893	total: 331ms	remaining: 306ms
30:	learn: 0.5912259	total: 390ms	remaining: 239ms
35:	learn: 0.5781285	total: 447ms	remaining: 174ms
40:	learn: 0.5660297	total: 504ms	remaining: 111ms
45:	learn: 0.5534994	total: 564ms	remaining: 49ms
49:	learn: 0.5437552	total: 611ms	remaining: 0us
0:	learn: 0.6677785	total: 18.2ms	remaining: 1.26s
5:	learn: 0.5647408	total: 100ms	remaining: 1.07s
10:	learn: 0.4867803	total: 184ms	remaining: 985ms
15:	learn: 0.4284513	total: 265ms	remaining: 893ms
20:	learn: 0.3765081	total: 346ms	remaining: 807ms
25:	learn: 0.3281928	total: 426ms	remaining: 722ms
30:	learn: 0.2852121	total: 514ms	remaining: 646ms
35:	learn: 0.2581758	total: 595ms	rem

90:	learn: 0.5031157	total: 743ms	remaining: 73.5ms
95:	learn: 0.4966040	total: 786ms	remaining: 32.7ms
99:	learn: 0.4914637	total: 817ms	remaining: 0us
0:	learn: 0.6912600	total: 17ms	remaining: 1s
5:	learn: 0.6824325	total: 103ms	remaining: 925ms
10:	learn: 0.6738151	total: 191ms	remaining: 851ms
15:	learn: 0.6660367	total: 278ms	remaining: 765ms
20:	learn: 0.6582221	total: 366ms	remaining: 679ms
25:	learn: 0.6503478	total: 453ms	remaining: 593ms
30:	learn: 0.6424718	total: 536ms	remaining: 501ms
35:	learn: 0.6355369	total: 616ms	remaining: 411ms
40:	learn: 0.6289411	total: 699ms	remaining: 324ms
45:	learn: 0.6213601	total: 785ms	remaining: 239ms
50:	learn: 0.6142002	total: 869ms	remaining: 153ms
55:	learn: 0.6076622	total: 957ms	remaining: 68.3ms
59:	learn: 0.6020638	total: 1.03s	remaining: 0us
0:	learn: 0.5734757	total: 61.8ms	remaining: 3.03s
5:	learn: 0.2451519	total: 381ms	remaining: 2.79s
10:	learn: 0.1214399	total: 690ms	remaining: 2.44s
15:	learn: 0.0677708	total: 1.01s	remai

In [119]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  50
Best trial:
  Value:  0.848
  Params: 
    iterations: 90
    learning_rate: 0.021480452555619173
    depth: 9
    l2_leaf_reg: 0.2688448628536261


In [123]:
with open("../reports/log_training.pkl", "wb") as file:
    pickle.dump(study.get_trials(),file)

NameError: name 'pickle' is not defined