In [1]:
from datasets import load_dataset
dataset = load_dataset('hate_speech_offensive')

Found cached dataset hate_speech_offensive (/root/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
df = dataset['train'].to_pandas()
df.head()

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


#### Cleaning the data

In [3]:
import re
import nltk
from nltk.corpus import stopwords
import string

stopword=set(stopwords.words('english'))
stemmer = nltk.SnowballStemmer("english")

def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [4]:
df['tweet']=df['tweet'].apply(clean_text)
df.head()

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,rt mayasolov woman shouldnt complain clean ho...
1,3,0,3,0,1,rt boy dat coldtyga dwn bad cuffin dat hoe ...
2,3,0,3,0,1,rt urkindofbrand dawg rt ever fuck bitch sta...
3,3,0,2,1,1,rt cganderson vivabas look like tranni
4,6,0,6,0,1,rt shenikarobert shit hear might true might f...


In [5]:
from sklearn.model_selection import train_test_split
seed = 51
test_size = 0.2
X = df['tweet']
y = df['class']

X_train,X_test,y_train,y_test = train_test_split(X, y,test_size=0.2,random_state=seed,stratify=df['class'])
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(19826,) (4957,) (19826,) (4957,)


#### TFIDF ML Approach

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

In [7]:
# trasforming the data
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

In [8]:
X_train_transformed.shape, X_test_transformed.shape

((19826, 5000), (4957, 5000))

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [10]:
def make_pred(clf, X_train, y_train, X_test):
    """ with clf, make pred, return pred"""
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    return preds

LogReg

In [11]:
#instantiate the models with default hyper-parameters
logreg = LogisticRegression()
logreg_predictions = make_pred(logreg, X_train_transformed, y_train, X_test_transformed)
print(classification_report(y_test, logreg_predictions))

              precision    recall  f1-score   support

           0       0.52      0.16      0.25       286
           1       0.91      0.97      0.94      3838
           2       0.87      0.83      0.85       833

    accuracy                           0.90      4957
   macro avg       0.77      0.65      0.68      4957
weighted avg       0.88      0.90      0.88      4957



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
confusion_matrix(y_test, logreg_predictions)

array([[  46,  216,   24],
       [  38, 3717,   83],
       [   4,  140,  689]])

NaiveBayes

In [13]:
nb_model = MultinomialNB()
nb_predictions = make_pred(nb_model, X_train_transformed, y_train, X_test_transformed)
print(classification_report(y_test, nb_predictions))

              precision    recall  f1-score   support

           0       0.50      0.00      0.01       286
           1       0.83      1.00      0.90      3838
           2       0.92      0.37      0.52       833

    accuracy                           0.83      4957
   macro avg       0.75      0.46      0.48      4957
weighted avg       0.82      0.83      0.79      4957



In [14]:
confusion_matrix(y_test, logreg_predictions)

array([[  46,  216,   24],
       [  38, 3717,   83],
       [   4,  140,  689]])

XGB Boost

In [None]:
xgb_model= xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=80,
        use_label_encoder=False,
        eval_metric='auc' )

xgb_predictions = make_pred(xgb_model, X_train_transformed, y_train, X_test_transformed)
print(classification_report(y_test, xgb_predictions))

In [None]:
confusion_matrix(y_test, xgb_predictions)

array([[  50,  199,   37],
       [  33, 3609,  196],
       [   2,   38,  793]])

Finetuning the best model

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'n_estimators' : [40, 60, 80, 100]
}

In [None]:
xgb_model= xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=80,
        use_label_encoder=False,
        eval_metric='auc' )



In [None]:
folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb_model, param_distributions=params, 
                                   n_iter=param_comb, 
                                   scoring='f1_micro', n_jobs=4, 
                                   cv=skf.split(X_train_transformed, y_train), 
                                   verbose=2, random_state=1001 )

random_search.fit(X_train_transformed, y_train)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fitting 3 folds for each of 5 candidates, totalling 15 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already 

In [None]:
print(random_search.cv_results_)

{'mean_fit_time': array([0.48397613, 1.34970284, 0.48469257, 1.69550864, 0.76583099]), 'std_fit_time': array([0.0308516 , 0.1319969 , 0.0665246 , 0.28342226, 0.15450312]), 'mean_score_time': array([0.02062607, 0.02685388, 0.01780701, 0.02830609, 0.01704741]), 'std_score_time': array([0.00669257, 0.00768931, 0.0075051 , 0.01292306, 0.00919365]), 'param_subsample': masked_array(data=[0.8, 1.0, 0.8, 0.8, 1.0],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[40, 60, 40, 100, 40],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[5, 1, 10, 10, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[3, 4, 3, 4, 4],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtyp

In [None]:
best_xgb_predictions = random_search.predict(X_test_transformed)
print(classification_report(y_test, best_xgb_predictions))

              precision    recall  f1-score   support

           0       0.52      0.19      0.28       286
           1       0.94      0.94      0.94      3838
           2       0.77      0.96      0.85       833

    accuracy                           0.90      4957
   macro avg       0.74      0.69      0.69      4957
weighted avg       0.89      0.90      0.89      4957



In [None]:
confusion_matrix(y_test, best_xgb_predictions)

array([[  54,  191,   41],
       [  49, 3590,  199],
       [   0,   37,  796]])

In [None]:
def inference(clf, texts):
    id_to_label = {0: 'hate-speech', 1: 'offensive-language', 2: 'neither'}
    transformed_texts = vectorizer.transform(texts)
    preds = clf.predict(transformed_texts)
    for text, pred in zip(texts, preds):
        print(f'Sent: {text}, Pred: {id_to_label[pred]}')

In [None]:
inference(random_search, ['i fucking hate you, piece of shit'])

Sent: i fucking hate you, piece of shit, Pred: offensive-language


In [None]:
inference(random_search, ['youre so handsome'])

Sent: youre so handsome, Pred: neither


#### Testing on pit2015 data

In [None]:
import pandas as pd

test_path = '/workspace/grasp-data-hometask-semantic-similarity-master/data/test.data'
col_names = ['Topic_Id', 'Topic_Name', 'Sent_1', 'Sent_2', 'Label', 'Sent_1_tag', 'Sent_2_tag']
test_df = pd.read_csv(test_path, sep='\t', lineterminator='\n', names=col_names, header=None)
test_df.head(3)

Unnamed: 0,Topic_Id,Topic_Name,Sent_1,Sent_2,Label,Sent_1_tag,Sent_2_tag
0,51,8 Mile,All the home alones watching 8 mile,8 mile is on thats my movie,3,All/O/DT/B-NP/O the/O/DT/I-NP/O home/O/NN/I-NP...,8/O/NN/B-NP/O mile/O/NN/I-NP/O is/O/VBZ/B-VP/O...
1,51,8 Mile,All the home alones watching 8 mile,The last rap battle in 8 Mile nevr gets old ahah,2,All/O/DT/B-NP/O the/O/DT/I-NP/O home/O/NN/I-NP...,The/O/DT/B-NP/O last/O/JJ/I-NP/O rap/O/NN/I-NP...
2,51,8 Mile,All the home alones watching 8 mile,The rap battle at the end of 8 mile gets me so...,2,All/O/DT/B-NP/O the/O/DT/I-NP/O home/O/NN/I-NP...,The/O/DT/B-NP/O rap/O/NN/I-NP/O battle/O/NN/I-...


In [None]:
sents = test_df['Sent_2'].tolist()[:5]

In [None]:
inference(random_search, sents)

Sent: 8 mile is on thats my movie, Pred: neither
Sent: The last rap battle in 8 Mile nevr gets old ahah, Pred: neither
Sent: The rap battle at the end of 8 mile gets me so hype, Pred: neither
Sent: Rabbit on 8 mile out of place but determined to make it, Pred: neither
Sent: See 8 Mile is always on but it s the tv version so it s gay, Pred: neither


In [None]:
inference(random_search, test_df['Sent_2'].tolist()[50:60])

Sent: Goin to see after earth with the fam, Pred: neither
Sent: will smith s speech in after earth is so relevant, Pred: neither
Sent: Just got done eating chinese with the fam now ganna go see after earth, Pred: neither
Sent: After earth is out and I havent seen it yet, Pred: neither
Sent: wanted to watch After Earth today, Pred: neither
Sent: Finally in the theaters to see after earth, Pred: neither
Sent: the hangover 3 and after earth are both really good, Pred: neither
Sent: I kinda wanna see After Earth as well, Pred: neither
Sent: NOW YOU SEE ME and AFTER EARTH Cant Outpace FAST FURIOUS 6, Pred: neither
Sent: After Earth 039 trumped by 039 Now You See Me 039 as 039 Fast, Pred: neither


In [None]:
def filter(clf, texts, id):
    id_to_label = {0: 'hate-speech', 1: 'offensive-language', 2: 'neither'}
    transformed_texts = vectorizer.transform(texts)
    preds = clf.predict(transformed_texts)
    for text, pred in zip(texts, preds):
        if pred == id:
            print(f'Sent: {text}, Pred: {id_to_label[pred]}')

filter(random_search, test_df['Sent_2'].tolist(), 1)

Sent: Those last 3 battles in 8 Mile are THE shit, Pred: offensive-language
Sent: After Earth is a great ass movie, Pred: offensive-language
Sent: Benitez is alright tho man fuck chelsea fans they suck asshole, Pred: offensive-language
Sent: He can fuck up the Big 12 all he wants, Pred: offensive-language
Sent: So Wiggins Is Settling For Playing In The Garbage Ass Big 12, Pred: offensive-language
Sent: Spo aint in the game chalmers, Pred: offensive-language
Sent: Lucky ass shxt by Chalmers, Pred: offensive-language
Sent: The fuck Chalmers is doing, Pred: offensive-language
Sent: Why is chara playing like a bitch, Pred: offensive-language
Sent: Oh shit I gotta try that new ciroc flavor, Pred: offensive-language
Sent: New Ciroc flavor on the market gotta try that shit, Pred: offensive-language
Sent: Ciroc is shit vodka anyway, Pred: offensive-language
Sent: but yall so damn hype bout the new ciroc, Pred: offensive-language
Sent: I swear Fuck Family Guy for being that funny tonight, Pred:

#### Transformer approach

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

2023-10-16 10:36:41.504314: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-16 10:36:41.734548: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-16 10:36:42.398578: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-10-16 10:36:42.398667: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

Prepare the dataset

In [26]:
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)

In [None]:
test_encodings[200]

Encoding(num_tokens=51, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [27]:
import torch

class HateDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        lab = [0.] * 3
        lab[self.labels[idx]] = 1.
        item['labels'] = torch.tensor(lab)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = HateDataset(train_encodings, y_train.tolist())
test_dataset = HateDataset(test_encodings, y_test.tolist())


In [None]:
train_dataset[100]

{'input_ids': tensor([    0,   279, 28609,  3967,   118,  8446, 18657,  3245,   146, 18940,
         26138,  9013, 32594, 33976,  5384,  1780,  8987, 35468,   405,     2,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([1., 0., 0.])}

In [22]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='./hatespeech_results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,  # load the best model when finished training (default metric is loss)
    evaluation_strategy="steps",  # evaluate each `logging_steps`
    metric_for_best_model="f1",  # select the base metrics
)

In [29]:
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, recall_score


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    predictions = sigmoid(torch.Tensor(predictions))
    predictions = torch.argmax(predictions, dim=1)
    labels = torch.argmax(torch.Tensor(labels), dim=1)
    #print(predictions)
    #print(labels)

    f1_micro_average = f1_score(y_true=labels, y_pred=predictions, average='micro')
    accuracy = accuracy_score(labels, predictions)
    recall = recall_score(labels, predictions, average='micro')
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'accuracy': accuracy,
               'recall': recall}
    return metrics

In [None]:
import mlflow
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,             # evaluation dataset
    compute_metrics=compute_metrics
)

with mlflow.start_run() as mlrun:
    trainer.train()
    mlflow.end_run()

Evaluating

In [49]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("/workspace/grasp-data-hometask-semantic-similarity-master/scripts/hatespeech_results/checkpoint-3500", num_labels=3)

In [50]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='./hatespeech_results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,  # load the best model when finished training (default metric is loss)
    evaluation_strategy="steps",  # evaluate each `logging_steps`
    metric_for_best_model="f1",  # select the base metrics
)

In [51]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,             # evaluation dataset
    compute_metrics=compute_metrics
)

In [52]:
import mlflow
with mlflow.start_run() as mlrun:
    print(trainer.evaluate())
    mlflow.end_run()

{'eval_loss': 0.2431873381137848, 'eval_f1': 0.8539439176921525, 'eval_accuracy': 0.8539439176921525, 'eval_recall': 0.8539439176921525, 'eval_runtime': 5.2193, 'eval_samples_per_second': 949.748, 'eval_steps_per_second': 118.79}
