In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load datasets
df = pd.read_csv('dataset.csv')

# Cleaning Text Data
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)  
    # Remove special characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)  
    # Convert to lowercase
    text = text.lower()  
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

df['cleaned_bullying_words'] = df['bullying_words'].apply(clean_text)
df.tail(20)

Unnamed: 0,bullying_words,type_bully,cleaned_bullying_words
21632,moshemane,bully,moshemane
21633,monna kaofela,non-bully,monna kaofela
21634,mosali kaofela,non-bully,mosali kaofela
21635,morapeli oa sebele,non-bully,morapeli oa sebele
21636,motlotsuoa oa morena,non-bully,motlotsuoa oa morena
21637,mora molimo,non-bully,mora molimo
21638,matjekela,bully,matjekela
21639,nyanya mpe,bully,nyanya mpe
21640,o mobe,bully,mobe
21641,seleee tooe,bully,seleee tooe


In [14]:
# Encoding Labels
df['label'] = df['type_bully'].apply(lambda x: 1 if x == 'bully' else 0)

# Split Data
X = df['cleaned_bullying_words']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
df.sample(5)

Unnamed: 0,bullying_words,type_bully,cleaned_bullying_words,label
13222,I like these ditties. Can we have them? I'll r...,non-bully,like ditty reverse type hate white shirt,0
5205,@nightmaremyles fuck yes to both of you. &lt;3,bully,nightmaremyles fuck yes lt,1
10559,during the week I'm more blk polo/cargo pants/...,non-bully,week blk polo cargo pant white runner kinda gu...,0
15778,kk,non-bully,kk,0
4475,lol! hey stop being modest. you OFTEN are a ...,bully,lol hey stop modest often hot bitch,1


In [8]:
# Vectorize Text Data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)

In [9]:
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

In [11]:
# Evaluate Random Forest model
rf_accuracy = rf_model.score(X_test_tfidf, y_test)
print(f'Random Forest Accuracy: {rf_accuracy}')

# Save the vectorizer and the model
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

Random Forest Accuracy: 0.896559685984761


['tfidf_vectorizer.joblib']

In [12]:
joblib.dump(rf_model, 'random_forest_model.joblib')

['random_forest_model.joblib']