In [1]:
import pandas as pd

# Create a small sample dataset
data = {
    'text': [
        'Mtu huyu ni mjinga kabisa',  # Hate speech
        'Ninafurahia sana maisha yangu',  # Non-hate speech
        'Ninyi wote hamna akili',  # Hate speech
        'Tuna amani na umoja',  # Non-hate speech
        'Watu wa jamii hiyo hawana maana',  # Hate speech
        'Mimi napenda kujifunza Kiswahili'  # Non-hate speech
    ],
    'label': [1, 0, 1, 0, 1, 0]  # 1: Hate Speech, 0: Non-hate speech
}

# Convert to DataFrame
df = pd.DataFrame(data)
df.head()


Unnamed: 0,text,label
0,Mtu huyu ni mjinga kabisa,1
1,Ninafurahia sana maisha yangu,0
2,Ninyi wote hamna akili,1
3,Tuna amani na umoja,0
4,Watu wa jamii hiyo hawana maana,1



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [2]:
import re
import string

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # Remove digits
    text = re.sub(r"\d+", "", text)
    return text

df['clean_text'] = df['text'].apply(preprocess_text)
df.head()


Unnamed: 0,text,label,clean_text
0,Mtu huyu ni mjinga kabisa,1,mtu huyu ni mjinga kabisa
1,Ninafurahia sana maisha yangu,0,ninafurahia sana maisha yangu
2,Ninyi wote hamna akili,1,ninyi wote hamna akili
3,Tuna amani na umoja,0,tuna amani na umoja
4,Watu wa jamii hiyo hawana maana,1,watu wa jamii hiyo hawana maana


In [3]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training size: {len(X_train)}")
print(f"Testing size: {len(X_test)}")


Training size: 4
Testing size: 2


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.5
Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
import joblib

# Save model and vectorizer
joblib.dump(model, 'kiswahili_hate_speech_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# To load the model and vectorizer later:
# model = joblib.load('kiswahili_hate_speech_model.pkl')
# vectorizer = joblib.load('tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [7]:
# New text
new_text = ['Watu hawa hawana akili kabisa']

# Preprocess the text
new_text_cleaned = [preprocess_text(t) for t in new_text]

# Vectorize the text
new_text_tfidf = vectorizer.transform(new_text_cleaned)

# Predict
prediction = model.predict(new_text_tfidf)
print(f"Prediction (1=Hate Speech, 0=Non-Hate Speech): {prediction[0]}")


Prediction (1=Hate Speech, 0=Non-Hate Speech): 1
