<a href="https://colab.research.google.com/github/lhcee3/Online-Harms-Detection/blob/main/Online_harms_detection_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Dependencies

In [None]:

!pip install kagglehub nltk scikit-learn pandas

import kagglehub
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Download & Load Datasets

In [None]:
import kagglehub
import pandas as pd
import os

# Download Hate Speech Dataset
path1 = kagglehub.dataset_download("mrmorj/hate-speech-and-offensive-language-dataset")
df1 = pd.read_csv(f"{path1}/labeled_data.csv")

# Download LDNOOBW Dataset
path2 = kagglehub.dataset_download("tushifire/ldnoobw")


df2_files = os.listdir(path2)
df2_files = [file for file in df2_files if file not in ["LICENSE", "README.md", "USERS.md"]]


bad_words = set()
for file in df2_files:
    file_path = os.path.join(path2, file)
    with open(file_path, "r", encoding="utf-8") as f:
        bad_words.update([line.strip().lower() for line in f])

print("Total bad words collected:", len(bad_words))


Downloading from https://www.kaggle.com/api/v1/datasets/download/mrmorj/hate-speech-and-offensive-language-dataset?dataset_version_number=1...


100%|██████████| 1.01M/1.01M [00:00<00:00, 53.3MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/tushifire/ldnoobw?dataset_version_number=2...


100%|██████████| 23.6k/23.6k [00:00<00:00, 23.6MB/s]

Extracting files...
Total bad words collected: 2612





Preprocess Hate Speech Dataset

In [None]:
import spacy
import pandas as pd
import re
from spacy.lang.en.stop_words import STOP_WORDS


nlp = spacy.load("en_core_web_sm")

# Text preprocessing function using spaCy
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()


    doc = nlp(text)


    tokens = [token.lemma_ for token in doc if token.text not in STOP_WORDS and not token.is_punct]

    return " ".join(tokens)


df1['processed_text'] = df1['tweet'].apply(preprocess_text)
df1['label'] = df1['class']


X = df1['processed_text']
y = df1['label']


SVM Training

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Improved TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=5
)
X_tfidf = vectorizer.fit_transform(X)


smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)


X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {"C": [0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LinearSVC(class_weight="balanced"), param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)


best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Model Accuracy: {accuracy:.4f}")
print("\nOptimized Classification Report:\n", classification_report(y_test, y_pred))

# Saving the model
import joblib
joblib.dump(best_svm, "optimized_svm_model.pkl")
joblib.dump(vectorizer, "optimized_tfidf_vectorizer.pkl")


Optimized Model Accuracy: 0.9305

Optimized Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93      3849
           1       0.98      0.84      0.91      3794
           2       0.92      0.99      0.95      3871

    accuracy                           0.93     11514
   macro avg       0.93      0.93      0.93     11514
weighted avg       0.93      0.93      0.93     11514



['optimized_tfidf_vectorizer.pkl']

In [None]:
import joblib


joblib.dump(best_svm, "optimized_svm_model.pkl")


joblib.dump(vectorizer, "optimized_tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [None]:
import os
print(os.listdir())  # Should show "['.config', 'optimized_tfidf_vectorizer.pkl', 'optimized_svm_model.pkl', 'sample_data']"


['.config', 'optimized_svm_model.pkl', 'optimized_tfidf_vectorizer.pkl', 'sample_data']


To Try The Model


In [None]:
import joblib
import spacy
import re


svm_model = joblib.load("optimized_svm_model.pkl")
tfidf_vectorizer = joblib.load("optimized_tfidf_vectorizer.pkl")


nlp = spacy.load("en_core_web_sm")


def is_question(text):
    return text.strip().endswith("?")


def preprocess_text(text):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if token.is_alpha]
    return " ".join(tokens)


def classify_text(text):
    if is_question(text):
        return "Neutral/Acceptable (Detected as a Question)"

    processed_text = preprocess_text(text)
    text_vectorized = tfidf_vectorizer.transform([processed_text])
    prediction = svm_model.predict(text_vectorized)[0]

    print(f"DEBUG: Raw Prediction Output: {prediction}")


    label_mapping = {
        0: "Hate Speech",
        1: "Offensive Language",
        2: "Neutral/Acceptable"
    }

    return label_mapping.get(prediction, "Unknown")

# Interactive Input in Google Colab
def interactive_test():
    while True:
        user_input = input("\nEnter a statement (or type 'exit' to stop): ")
        if user_input.lower() == "exit":
            print("Exiting...")
            break
        result = classify_text(user_input)
        print(f"Classification: {result}")


interactive_test()



Enter a statement (or type 'exit' to stop): he is bad guy
DEBUG: Raw Prediction Output: 1
Classification: Offensive Language

Enter a statement (or type 'exit' to stop): exit
Exiting...
