In [29]:
# Importing libraries
import os
import glob
import re
import pandas as pd

In [30]:
# Mounting Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# ------------------------------
# TASK 1: Readinging Files from Google Drive & Data Preprocessing
# ------------------------------
# Defining the folder containing the .txt files on Google Drive
text_data = '/content/text_data/'

# Function to extract label from filename.
# For example, if file names start with "positive" or "negative"
def extract_label(filename):
 base = os.path.basename(filename)
 if base.startswith('positive'):
    return 'positive'
 elif base.startswith('negative'):
    return 'negative'
 else:
    return 'unknown'

# Function to clean and normalize text
def clean_text(text):
 """
 Clean and normalize text.
 Modify this function if your local language requires special handling
 (e.g., accented letters, diacritics, or specific punctuation).
 This example lowercases the text and removes non-alphabetic characters.
 """
 text = text.lower()
 text = re.sub(r'[^a-z\s]', '', text)
 text = re.sub(r'\s+', ' ', text).strip()
 return text

# Read all .txt files from the specified folder using glob
file_paths = glob.glob(os.path.join(text_data, '*.txt'))

print(f"Found {len(file_paths)} text files.")

# Lists to store document texts and corresponding labels
documents = []
labels = []
for file_path in file_paths:
 with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()
 text = clean_text(text)
 documents.append(text)
 labels.append(extract_label(file_path))

# Create a DataFrame from the documents and labels
data = pd.DataFrame({'text': documents, 'label': labels})
print("Data preview:")
print(data.head())

Found 6 text files.
Data preview:
                                                text     label
0  bulabe yakwaatibwa naayisibwa bubi obuzibu ku ...  negative
1  obuto bwe nokusoma kwe jennifer nansubuga maku...   unknown
2  yazaalibwa era yakulira kampala omukozi wa bba...  positive
3  yazaalibwa era yakulira kampala omukozi wa bba...  positive
4  bulabe yakwaatibwa naayisibwa bubi obuzibu ku ...  negative


In [32]:
#-------------------------------
# TASK 2: Feature Extraction
# ------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer to extract unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
features = vectorizer.fit_transform(data['text'])
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")


Vocabulary size: 1402


In [36]:
# ------------------------------
# TASK 3: Model Implementation with Multinomial Naive Bayes
# ------------------------------
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Split the data into training and test sets (optionally, you can create a development set)
X = data['text']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(
 X, y, test_size=0.4, random_state=42, stratify=y)

# Build a pipeline that combines TF-IDF vectorization with the multinomial naive Bayes classifier.
# MultinomialNB applies Laplace (add-one) smoothing by default (alpha=1.0).

pipeline = Pipeline([
 ('tfidf', TfidfVectorizer(ngram_range=(1, 2), lowercase=True)),
 ('nb', MultinomialNB(alpha=1.0))
])

# Train the naive Bayes classifier
pipeline.fit(X_train, y_train)
print("Naive Bayes model training completed.")



Naive Bayes model training completed.


In [37]:
# ------------------------------
# TASK 4: Model Evaluation and Analysis
# ------------------------------
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Predict labels on the test set
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy on Test Set:", accuracy)
print("\nClassification Report:")
print(report)
print("\nConfusion Matrix:")
print(cm)



Accuracy on Test Set: 1.0

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
    positive       1.00      1.00      1.00         1
     unknown       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3


Confusion Matrix:
[[1 0 0]
 [0 1 0]
 [0 0 1]]


In [38]:
# Optional: Examine top discriminative features for each class
# Retrieve the fitted vectorizer and classifier from the pipeline
vectorizer = pipeline.named_steps['tfidf']
classifier = pipeline.named_steps['nb']
feature_names = vectorizer.get_feature_names_out()
def print_top_features(class_index, n=10):
 # For multinomial naive Bayes, the log probabilities for features can be found in feature_log_prob_
 coef = classifier.feature_log_prob_[class_index]
 # Sort features by their log probability contribution
 topn = sorted(zip(coef, feature_names), reverse=True)[:n]
 print(f"\nTop features for class '{classifier.classes_[class_index]}':")
 for log_prob, feature in topn:
  print(f"{feature}: {log_prob:.4f}")
for idx in range(len(classifier.classes_)):
  print_top_features(idx, n=10)



Top features for class 'negative':
sitaani gwegwasinga: -6.9228
gwegwasinga ebyokugwa: -6.9228
ebyokugwa lwebbula: -6.9228
sitaani: -6.9907
lwebbula: -6.9907
gwegwasinga: -6.9907
ebyokugwa: -6.9907
lwebbula sitaani: -7.0186
bwongo baayawukana: -7.1245
bulabe yakwaatibwa: -7.1245

Top features for class 'positive':
yagisomera yakola: -6.9996
okuttibwa yagisomera: -6.9996
diguli okusomesa: -6.9996
yataasibwa okuttibwa: -7.0530
yataasibwa: -7.0530
yakola diguli: -7.0530
yakola: -7.0530
yagisomera: -7.0530
okuttibwa: -7.0530
okusomesa: -7.0530

Top features for class 'unknown':
mu: -6.7491
ku: -7.0095
nga: -7.0666
makumbi: -7.1078
era: -7.1086
nti: -7.1399
nnyo: -7.1508
the: -7.1619
afirika: -7.1619
okuva: -7.1731


In [41]:
# ------------------------------
# (Optional Bonus Task) Predict New Documents
# ------------------------------
# Predict labels for new sample documents (adjust the sample texts to your local language)
new_documents = [
 "kasule alina obugaga bungi",
 "kasule yegatta ku gavumenti",
 "obuzibu we bwava kuyomba"
]
new_predictions = pipeline.predict(new_documents)
print("\nPredictions for new documents:")
for doc, pred in zip(new_documents, new_predictions):
 print(f"Text: {doc}\nPredicted Label: {pred}\n")


Predictions for new documents:
Text: kasule alina obugaga bungi
Predicted Label: positive

Text: kasule yegatta ku gavumenti
Predicted Label: unknown

Text: obuzibu we bwava kuyomba
Predicted Label: negative

