In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.corpus import stopwords

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
data = pd.read_csv("/content/labeled_data.csv")

In [7]:

data.head()


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [8]:
data = data[['class', 'tweet']]
data = data.rename(columns={'class': 'label', 'tweet': 'text'})

# optional: balance dataset (optional step)
data = data.sample(frac=1).reset_index(drop=True)
data.head()


Unnamed: 0,label,text
0,1,@_tonydennis it's trash ik just wanted to fuck...
1,1,@ThatBoyACE71 Straight pussy
2,1,What's a Queen without a King?\n\n...A lonely ...
3,1,RT @hoes: Scarlett Johansson http://t.co/CBaHf...
4,1,Creeping Death still stuck in my head bitch


In [9]:
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9_]+','',text)  # Remove @mentions
    text = re.sub(r'#','',text)               # Remove hashtags
    text = re.sub(r'RT[\s]+','',text)         # Remove RT
    text = re.sub(r'https?:\/\/\S+','',text)  # Remove links
    text = re.sub(r'[^A-Za-z\s]','',text)     # Keep letters only
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

data['clean_text'] = data['text'].apply(clean_text)
data.head()


Unnamed: 0,label,text,clean_text
0,1,@_tonydennis it's trash ik just wanted to fuck...,trash ik wanted fuck dorsey
1,1,@ThatBoyACE71 Straight pussy,straight pussy
2,1,What's a Queen without a King?\n\n...A lonely ...,whats queen without king lonely ass bitch
3,1,RT @hoes: Scarlett Johansson http://t.co/CBaHf...,scarlett johansson
4,1,Creeping Death still stuck in my head bitch,creeping death still stuck head bitch


In [10]:
X = data['clean_text']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [12]:
model = LogisticRegression(max_iter=300)
model.fit(X_train_vec, y_train)


In [13]:
y_pred = model.predict(X_test_vec)

print("‚úÖ Model Evaluation Results:\n")
print("Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


‚úÖ Model Evaluation Results:

Accuracy: 89.71 %

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.18      0.28       302
           1       0.91      0.97      0.94      3877
           2       0.85      0.83      0.84       778

    accuracy                           0.90      4957
   macro avg       0.79      0.66      0.69      4957
weighted avg       0.88      0.90      0.88      4957


Confusion Matrix:
 [[  55  225   22]
 [  35 3747   95]
 [   2  131  645]]


In [14]:
def predict_toxicity(text):
    cleaned = clean_text(text)
    vector = vectorizer.transform([cleaned])
    prediction = model.predict(vector)[0]

    if prediction == 0:
        label = "üö´ Hate Speech"
    elif prediction == 1:
        label = "‚ö†Ô∏è Offensive / Bullying"
    else:
        label = "‚úÖ Clean / Neutral"
    return label

# Test examples
samples = [
    "I hate those people, they should disappear!",
    "You're amazing and I respect you!",
    "You idiot, go away!",
    "This is such a beautiful day.",
    "That‚Äôs the dumbest thing I‚Äôve heard."
]

for s in samples:
    print(f"Text: {s}\nPrediction: {predict_toxicity(s)}\n")


Text: I hate those people, they should disappear!
Prediction: üö´ Hate Speech

Text: You're amazing and I respect you!
Prediction: ‚ö†Ô∏è Offensive / Bullying

Text: You idiot, go away!
Prediction: ‚úÖ Clean / Neutral

Text: This is such a beautiful day.
Prediction: ‚úÖ Clean / Neutral

Text: That‚Äôs the dumbest thing I‚Äôve heard.
Prediction: ‚úÖ Clean / Neutral



In [15]:
import joblib
joblib.dump(model, 'cyberbully_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("‚úÖ Model and Vectorizer saved successfully!")


‚úÖ Model and Vectorizer saved successfully!
