In [1]:
# load dataset (https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset)

import pandas as pd

# Read the semicolon-separated ETHOS file and rename its columns to the
# names the rest of the notebook refers to ("text" and "sentiment").
df = pd.read_csv("ethos.csv", sep=";").set_axis(["text", "sentiment"], axis=1)

print(df)

                                                  text  sentiment
0            You should know women's sports are a joke        1.0
1      You look like Sloth with deeper Down’s syndrome        1.0
2    You look like Russian and speak like Indian. B...        1.0
3                 Women deserve to be abused, I guess.        1.0
4    Women are made for making babies and cooking d...        1.0
..                                                 ...        ...
993   From the midnight sun where the hot springs blow        0.0
994                        Don't say I'm not your type        0.0
995   And therefore never send to know for whom the...        0.0
996                      And I can't stand another day        0.0
997   All values, unless otherwise stated, are in U...        0.0

[998 rows x 2 columns]


In [2]:
# preprocess data

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the spaCy pipeline once at module level; preprocess() below reuses it for every text.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
  """Normalize raw text into a space-separated string of lemmas.

  The text is lower-cased and run through the module-level spaCy pipeline
  ``nlp``. A token is kept only if it is purely alphabetic and its text is
  not in ``STOP_WORDS``; each kept token contributes its lemma.

  Args:
    text: The raw string to clean. Non-string values (e.g. ``None`` or the
      float NaN that pandas uses for missing cells) are treated as empty text.

  Returns:
    The kept lemmas joined by single spaces, or ``""`` when no token
    survives or the input is not a string.
  """
  # Missing CSV cells reach Series.apply as NaN/None, which have no .lower();
  # map them to an empty document instead of raising AttributeError.
  if not isinstance(text, str):
    return ""

  doc = nlp(text.lower())

  tokens = [token.lemma_ for token in doc if token.is_alpha and token.text not in STOP_WORDS]
  return " ".join(tokens)

# Replace each raw comment with its cleaned, lemmatized form.
df["text"] = df["text"].map(preprocess)
# Collapse the score into a binary label: 1 when the score is at least 0.5, else 0.
df["sentiment"] = df["sentiment"].ge(0.5).astype(int)

print(df["text"], df["sentiment"])

0                                  know woman sport joke
1                          look like sloth deep syndrome
2      look like russian speak like indian disgusting...
3                              woman deserve abuse guess
4                            woman make baby cook dinner
                             ...                        
993                         midnight sun hot spring blow
994                                                 type
995                                  send know bell toll
996                                            stand day
997                                   value state dollar
Name: text, Length: 998, dtype: object 0      1
1      1
2      1
3      1
4      1
      ..
993    0
994    0
995    0
996    0
997    0
Name: sentiment, Length: 998, dtype: int64


In [3]:
# Count the rows per label value to check how balanced the two classes are.
df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
0,565
1,433


In [67]:
# train the hate-speech classifier (TF-IDF n-grams + logistic regression); the label column is named "sentiment"

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified, so the class mix of the small test
# set can drift from that of the full data - consider stratify=df["sentiment"].
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    df["sentiment"],
    test_size=0.1,
    random_state=42,
)

# TF-IDF over unigrams, bigrams and trigrams. The vectorizer is fitted on the
# training split only and then reused, unchanged, to transform the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=100).fit(x_train_vec, y_train)

y_pred = model.predict(x_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.69
              precision    recall  f1-score   support

           0       0.74      0.80      0.77        65
           1       0.57      0.49      0.52        35

    accuracy                           0.69       100
   macro avg       0.65      0.64      0.65       100
weighted avg       0.68      0.69      0.68       100



In [72]:
text = input("> ")

# Clean the input with the same preprocess() used on the training data, then
# vectorize it with the already-fitted vectorizer (transform only, no refit).
vec = vectorizer.transform([preprocess(text)])

# Hard label for the single input row.
pred = model.predict(vec)[0]
# Probability in column 1 of predict_proba, i.e. for the second entry of model.classes_.
prob = model.predict_proba(vec)[0][1]

print(f"{text} {prob:.5f}")

if pred:
  print("Hate speech detected!")

> hawk tuah
hawk tuah 0.42091
