In [36]:
# load dataset (https://github.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset)

import pandas as pd

df = pd.read_csv("ethos.csv", sep=";")

df.columns = ["text", "sentiment"]

df["sentiment"] = (df["sentiment"] >= 0.5).astype(int)

print(df)

                                                  text  sentiment
0            You should know women's sports are a joke          1
1      You look like Sloth with deeper Down’s syndrome          1
2    You look like Russian and speak like Indian. B...          1
3                 Women deserve to be abused, I guess.          1
4    Women are made for making babies and cooking d...          1
..                                                 ...        ...
993   From the midnight sun where the hot springs blow          0
994                        Don't say I'm not your type          0
995   And therefore never send to know for whom the...          0
996                      And I can't stand another day          0
997   All values, unless otherwise stated, are in U...          0

[998 rows x 2 columns]


In [37]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [38]:
# preprocess data

from spacy.lang.en.stop_words import STOP_WORDS
import string

def preprocess(sentence):
  doc = nlp(sentence)

  tokens = [token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc]

  return [token for token in tokens if token not in [*STOP_WORDS] and token not in string.punctuation]

In [39]:
# train sentiment analysis model

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

tfidf=TfidfVectorizer(tokenizer=preprocess)
classifier=LinearSVC()

x_train, x_test, y_train, y_test=train_test_split(df["text"], df["sentiment"], test_size=0.2, random_state=0)

In [71]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.71      0.75      0.73       121
           1       0.58      0.53      0.56        79

    accuracy                           0.67       200
   macro avg       0.65      0.64      0.64       200
weighted avg       0.66      0.67      0.66       200



In [76]:
text = input("> ")

pred = clf.decision_function([text])[0]

if pred > 0.4:
  print(f"Hate speech detected!\nConfidence: {pred}")
else:
  print(f'we gucci')

> fuck you cunt
Hate speech detected!
Confidence: 0.8493392564790354
