<a href="https://colab.research.google.com/github/harry934/MACHINE-LEARNING-PROJECTS/blob/main/Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic Modeling on HateSpeech-Kenya Dataset
Dataset: https://www.kaggle.com/datasets/edwardombui/hatespeech-kenya

### Workflow
<ol><li>Load the dataset (hatespeech-kenya).
<li>Preprocess the data (cleaning, tokenization, etc.).
<li>Build a dictionary and document-term matrix (use at most 5000 features).
<li> Train an LDA model using gensim.
<li> Visualize topics using pyLDAvis.
<li> Evaluate topics for interpretability.</ol>

In [None]:
# Step 0: Install required packages (run once per fresh runtime).
# The leading "!" hands the line to the shell; "-q" asks pip for quiet output.
!pip install gensim pyLDAvis pandas numpy matplotlib seaborn scikit-learn wordcloud nltk -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim import corpora
from gensim.models import LdaMulticore
from gensim.models.ldamodel import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import warnings
warnings.filterwarnings("ignore")

# Fetch every NLTK resource requested by this setup cell, in the same order
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

print("Libraries imported and NLTK data downloaded.")

## Step 1: Load the dataset

Download `HateSpeech_Kenya.csv` from the Kaggle dataset page linked at the top of this notebook and upload it to the Colab session storage (`/content/`) before running the next cell.

In [None]:
# The Kaggle file "HateSpeech_Kenya.csv" must already be uploaded to /content
df = pd.read_csv('/content/HateSpeech_Kenya.csv')

# Quick structural overview: dimensions first, then the column names
print("Dataset shape:", df.shape)
print("\nColumns:", list(df.columns))

# Leave the preview as the last expression so the notebook renders it as a table
print("\nFirst few rows:")
df.head(5)

In [None]:
# Explore class distribution (only meaningful when the dataset is labeled)
LABEL_COL = 'Class'  # adjust if the label column has a different name

if LABEL_COL in df.columns:
    print(df[LABEL_COL].value_counts())
else:
    # Report the problem instead of failing with a KeyError on unlabeled data
    print(f"No '{LABEL_COL}' column found. Available columns: {df.columns.tolist()}")

## Step 2: Text Preprocessing

In [None]:
# word_tokenize (used by preprocess_text) looks up the 'punkt_tab' resource
# in newer NLTK releases, so make sure it is available before tokenizing.
nltk.download('punkt_tab')

# Identify the text column: take the first candidate name present in the data
text_col = next((col for col in ['Tweet'] if col in df.columns), None)

if text_col is None:
    # If not found, print columns and pick manually
    print("Available columns:", df.columns.tolist())
    text_col = input("Enter the name of the text column: ").strip()

print(f"Using text column: {text_col}")

# Extract texts. Missing values are dropped *before* the string cast:
# astype(str) would otherwise turn NaN into the literal text "nan", which
# passes every cleaning step and pollutes the vocabulary as a token.
# dropna() keeps the original index labels, so rows can still be matched to df.
texts = df[text_col].dropna().astype(str)

# Stop words: NLTK's English list ...
stop_words = set(stopwords.words('english'))
# ... plus common Swahili / Sheng / Kenyan online slang stopwords (optional but helpful)
extra_stopwords = {'ni', 'wa', 'na', 'ya', 'kwa', 'lakini', 'nawe', 'mimi', 'wewe', 'yeye',
                   'sisi', 'nyinyi', 'hao', 'hii', 'hizi', 'hiyo', 'hizo', 'hapa', 'pale', 'humu',
                   'kule', 'ndio', 'hapana', 'bila', 'kila', 'baada', 'kabla', 'hadi', 'zaidi',
                   'si', 'je', 'kwani', 'kwamba', 'ili', 'ambao', 'ambayo', 'ambazo', 'watu',
                   'msee', 'bro', 'sis', 'dem', 'aki', 'ama', 'sasa', 'tu', 'nioe', 'wasee'}
stop_words.update(extra_stopwords)

# Stemmer shared by every preprocess_text call
stemmer = PorterStemmer()

def preprocess_text(text):
    """Clean one raw tweet and return its list of stemmed tokens.

    Steps, in order: decode HTML entities, lowercase, strip URLs, strip
    @mentions and #hashtags, drop every non-letter character, tokenize,
    remove stop words and tokens of two characters or fewer, then stem.

    Relies on the notebook-level ``stop_words`` set and ``stemmer`` object.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        A list of stemmed token strings; empty if nothing survives cleaning.
    """
    import html  # local import keeps this cell self-contained

    # Decode entities first: otherwise "&amp;" loses its punctuation below
    # and leaks the meaningless token "amp" into the vocabulary.
    text = html.unescape(text)
    # Lowercase
    text = text.lower()
    # Remove URLs: everything up to the next whitespace, so characters such as
    # "~" cannot leave URL fragments behind; bare "www." links are covered too.
    text = re.sub(r'(?:https?://|www\.)\S+', ' ', text)
    # Remove mentions (@user) and hashtags
    text = re.sub(r'@\w+|#\w+', ' ', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Drop stop words and short words, then stem what remains.
    # The stop-word test runs on the unstemmed word, as the lists hold full words.
    return [stemmer.stem(word) for word in tokens
            if word not in stop_words and len(word) > 2]

print("Preprocessing texts...")
tokenized_texts = texts.apply(preprocess_text)

# Keep only the documents that still hold at least one token after cleaning
has_tokens = tokenized_texts.apply(len) > 0
processed_texts = tokenized_texts[has_tokens]

print(f"Number of documents after cleaning: {len(processed_texts)}")
print("Sample processed text:", processed_texts.iloc[0])

## Step 3: Create Dictionary and Document-Term Matrix (with max 5000 features)

In [None]:
# Map every token seen in the cleaned documents to an integer id
dictionary = corpora.Dictionary(processed_texts)

# Prune the vocabulary: drop tokens found in fewer than 5 documents or in more
# than 70% of them, then keep at most 5000 of the remaining tokens
dictionary.filter_extremes(no_below=5, no_above=0.7, keep_n=5000)
print(f"Dictionary size after filtering: {len(dictionary)} tokens")

# Bag-of-words representation: one doc2bow result per document, in order
corpus = list(map(dictionary.doc2bow, processed_texts))
print(f"Corpus created with {len(corpus)} documents.")

## Step 4: Train LDA Model using Gensim

In [None]:
# Number of topics to fit. 8–12 is a reasonable starting range for hate speech
# data (ethnicity, politics, gender, etc.); 10 is used here.
NUM_TOPICS = 10

# Training hyperparameters, gathered in one place so they are easy to tweak
lda_params = dict(
    num_topics=NUM_TOPICS,
    random_state=42,   # fixed seed
    chunksize=2000,
    passes=10,
    alpha='symmetric',
    eta='auto',
    workers=4,         # adjust based on your CPU
    eval_every=1,
    per_word_topics=True,
)

lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, **lda_params)

print("LDA Model Training Completed!")

In [None]:
# List the ten highest-weighted words of every topic (topics shown 1-based)
print("Top words per topic:\n")
for idx in range(NUM_TOPICS):
    topic_summary = lda_model.print_topic(idx, topn=10)
    # Header line, the weighted-word string, then one blank separator line
    print(f"Topic #{idx + 1}:", topic_summary, "", sep="\n")

## Step 5: Visualize with pyLDAvis

In [None]:
# Build the pyLDAvis data from the trained model, the corpus and the dictionary
vis = gensimvis.prepare(
    lda_model,
    corpus,
    dictionary,
    mds='mmds',
    sort_topics=False,  # keep the model's own topic order
)

# Render the interactive view as this cell's output
pyLDAvis.display(vis)

## Step 6: Evaluate Topics for Interpretability


In [None]:
# Compute Coherence Score (higher is better)
from gensim.models.coherencemodel import CoherenceModel

coherence_model = CoherenceModel(
    model=lda_model,
    texts=list(processed_texts),  # hand gensim a plain list of token lists
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print(f"C_V Coherence Score: {coherence_score:.4f}")

# Manual interpretation (example labels based on common Kenyan hate speech themes).
# NOTE: these are placeholders keyed by 0-based topic id; they are not derived
# from the model, so revise them after inspecting the top words printed below.
topic_labels = {
    0: "Ethnic Attacks (Luo vs Kikuyu)",
    1: "Political Incitement & Election Violence",
    2: "Gender-Based Hate & Misogyny",
    3: "Religious Intolerance",
    4: "Raila/Ruto Political Tribalism",
    5: "General Insults & Profanity",
    6: "Anti-Kikuyu Sentiment",
    7: "Calls for Violence/Genocide Rhetoric",
    8: "Anti-Kalenjin Hate",
    9: "Police Brutality & State Criticism"
}

print("\nSuggested Topic Labels (based on inspection):")
# Iterate over the topics the model actually has rather than over the label
# dict: with fewer than 10 topics the dict would request a topic id that does
# not exist, and with more than 10 the extra topics would never be shown.
for i in range(lda_model.num_topics):
    label = topic_labels.get(i, "Unlabeled")
    # show_topic yields (word, probability) pairs; only the words are printed
    top_words = [word for word, _ in lda_model.show_topic(i, 10)]
    print(f"Topic {i+1}: {label}")
    print("   →", ", ".join(top_words))
    print()

In [None]:
# Find dominant topic for each document
def get_dominant_topic(ldamodel, corpus):
    """Return the most probable topic of every document in ``corpus``.

    Args:
        ldamodel: A trained model exposing
            ``get_document_topics(doc, minimum_probability=...)`` that returns
            ``(topic_id, probability)`` pairs.
        corpus: An iterable of documents in the form the model expects.

    Returns:
        A list with one ``(topic_id, probability)`` tuple per document, in
        corpus order. Topic ids are exactly as the model reports them.
    """
    topics = []
    for doc in corpus:
        # Ask for the full distribution: with the model's default threshold,
        # a document whose topics all fall below it yields an empty list and
        # max() would raise ValueError.
        topic_probs = ldamodel.get_document_topics(doc, minimum_probability=0.0)
        topic_id, prob = max(topic_probs, key=lambda pair: pair[1])
        topics.append((topic_id, prob))
    return topics

# One (topic_id, probability) pair per document, in corpus order
dominant_topics = get_dominant_topic(lda_model, corpus)

# Restrict the original frame to the rows that survived preprocessing
df_clean = df.loc[processed_texts.index].copy()
# Topic ids are shifted by one so they match the 1-based numbering printed above
df_clean['dominant_topic'] = [topic_id + 1 for topic_id, _ in dominant_topics]
df_clean['topic_prob'] = [prob for _, prob in dominant_topics]

print("Dominant topic distribution:")
print(df_clean['dominant_topic'].value_counts().sort_index())

### Next Steps You Can Add

- Hyperparameter tuning (grid search over the number of topics using coherence).
- Compare with BERTopic (a modern alternative).
- Classify hate vs. non-hate speech using topic proportions as features.