## Import necessary libraries

In [3]:
import torch

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

import tqdm as notebook_tqdm

In [4]:
import os
os.environ["WANDB_DISABLED"] = "true"

## Set device to GPU (if available)

In [5]:
torch.manual_seed(42)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


## Get dataset

In [None]:
df = pd.read_csv("/kaggle/input/hinglish-hate/hate-dataset-train.csv")
print(df.shape)
df.head()

(26923, 2)


Unnamed: 0,data,label
0,an extremist hindu crying for no reason,hate
1,हमारे मूर्धन्य हमारा खुदा हैं एक hi होता हैं,non-hate
2,इसमें देखो कौन पैसे के लिए दौड़ता he हिन्दू एक...,non-hate
3,वही नारा अब हम लोगों को भी follow करना पड़ेगा,non-hate
4,तुम जैसे कुछ बूंद लोगों की वजह se सबकी pakista...,non-hate


## ML Algorithms testing

- Encode categorical values in `y`
- Tokenize the data in `X` with TF-IDF
- Split the datasets into train and test

In [7]:
X = df['data']
y = df['label']

encoder = LabelEncoder()
encoder.fit(y)
y_encoded = encoder.transform(y)
print(f"Encoder has the following classes: {encoder.classes_}")
print(f"The new data type for y is {type(y_encoded)}")

tfidf = TfidfVectorizer()
X_enc = tfidf.fit_transform(X)
X_enc_dense = X_enc.toarray()

Encoder has the following classes: ['hate' 'non-hate']
The new data type for y is <class 'numpy.ndarray'>


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_enc_dense, y_encoded, test_size=0.2, random_state=42)
len(X_train), len(y_train)

(21538, 21538)

### SVM algorithm training and evaluation

In [6]:
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test) 

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.7011217478889121
Recall: 0.695636025998143
F1-score: 0.6680690921514153


### Random Forest Classifier Training and Evaluation

In [6]:
RF_model = RandomForestClassifier()
RF_model.fit(X_train, y_train)

y_pred =RF_model.predict(X_test)

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.6883612467899686
Recall: 0.6785515320334262
F1-score: 0.6393727341539612


In [10]:
xgb_model =  XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 0.7026033669545919
Recall: 0.6911792014856082
F1-score: 0.6570311125115097
