In [2]:
# ===============================
# Imports
# ===============================
import pandas as pd
import re
import nltk
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score
)


In [9]:

# ===============================
# NLTK setup
# ===============================
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

# ===============================
# Load Dataset
# ===============================
df = pd.read_csv(
    r"C:\Users\Jannatul Mawya\OneDrive\Pictures\Cyberbullying Thesis\impermium_verification_labels.csv"
)


#text_col =  None
#label_col = None
text_col =  df['Comment']
label_col = df['Insult']

'''
for col in df.columns:
    if df[col].dtype == object and text_col is None:
      text_col = col
    elif pd.api.types.is_numeric_dtype(df[col]):
        label_col = col
'''
if text_col is None or label_col is None:
    raise ValueError("Could not detect text/label columns")

print(f"Detected TEXT column : {text_col}")
print(f"Detected LABEL column: {label_col}")



df.dropna(inplace=True)
df['Insult'] = df['Insult'].astype(int)
print("DataFrame created:\n")
# ===============================
# Text Cleaning (LESS aggressive)
# ===============================
def clean_text(text):
    text = re.sub(r"http\S+|<.*?>", " ", str(text).lower())
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]
    pos_tags = nltk.pos_tag(tokens)
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tags
    ]
    return " ".join(lemmas)

df['clean_comment'] = df['Comment'].apply(clean_text)

# ===============================
# TF-IDF Vectorization (STRONG)
# ===============================
vectorizer = TfidfVectorizer(
     ngram_range=(1, 2),
     max_features=50000,
     min_df=3,
     max_df=0.9,
     sublinear_tf=True
)

X = vectorizer.fit_transform(df['clean_comment'])
y = df['Insult']

# ===============================
# Train-Test Split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# ===============================
# Models (TEXT-OPTIMIZED)
# ===============================
lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='liblinear'
)

svm = LinearSVC(
    class_weight='balanced'
)

nb = MultinomialNB()

# ===============================
# Train & Evaluate
# ===============================
models = {
    "Logistic Regression": lr,
    "Linear SVM": svm,
    "Naive Bayes": nb
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    print(f"\n===============================")
    print(name)
    print("===============================")
    print("Accuracy:", accuracy_score(y_test, preds))
    print("\nClassification Report:")
    print(classification_report(y_test, preds))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, preds))

    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_test)[:, 1]
        print("ROC-AUC:", roc_auc_score(y_test, probs))

Detected TEXT column : 0                      "like this if you are a tribe fan"
1                   "you're idiot......................."
2       "I am a woman Babs, and the only "war on women...
3       "WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...
4       "haha green me red you now loser whos winning ...
                              ...                        
2230    "FUCKIN LAME COME ON WTF STOP FUCKING OVER MY ...
2231    "YOU SHUT YOUR IGNORANT PIE HOLE YOU LITTLE IN...
2232    "sweetie pie is looking very much like her cou...
2233    "ball4real where are you with your miami g-ayn...
2234    "Man....if you are a 3 point shooter, you must...
Name: Comment, Length: 2235, dtype: object
Detected LABEL column: 0       0
1       1
2       1
3       1
4       1
       ..
2230    0
2231    1
2232    0
2233    1
2234    0
Name: Insult, Length: 2235, dtype: int64
DataFrame created:


Logistic Regression
Accuracy: 0.7024608501118568

Classification Report:
              precision    r