In [27]:
# Import necessary libraries
import os
import glob
import re
import pandas as pd

In [28]:
# For Google Drive mounting in Colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
# TASK 1: Reading Files from Google Drive & Data Preprocessing
# ------------------------------
# Defining the folder containing your .txt files on Google Drive
data_text = '/content/text_data'

In [30]:
# Function to extract label from filename.
# For example, files starting with "positive" or "negative" are assigned accordingly.
def extract_label(filename):
 base = os.path.basename(filename)
 if base.startswith('positive'):
  return 'positive'
 elif base.startswith('negative'):
  return 'negative'
 else:
  return 'unknown'

# Function to clean and normalize text
def clean_text(text):
 text = text.lower()
 # Remove non-alphabetic characters (adjust regex to keep language-specific symbols if needed)
 text = re.sub(r'[^a-z\s]', '', text)
 text = re.sub(r'\s+', ' ', text).strip()
 return text
# Read all .txt files from the folder using glob
file_paths = glob.glob(os.path.join(data_text, '*.txt'))
print(f"Found {len(file_paths)} text files.")

Found 12 text files.


In [31]:
# Lists to store the document text and corresponding labels
documents = []
labels = []

for file_path in file_paths:
 with open(file_path, 'r', encoding='utf-8') as file:
  text = file.read()
 text = clean_text(text)
 documents.append(text)
 labels.append(extract_label(file_path))
# Create a DataFrame from the documents and labels
data = pd.DataFrame({'text': documents, 'label': labels})
print("Data preview:")
print(data.head())

Data preview:
                                                text     label
0  lewic odwoko piny akemo okwerogu okworgi mac a...  negative
1  rwo yakobo kadi acel kabedo juda karacel teko ...   unknown
2  lewic odwoko piny akemo okwerogu okworgi mac a...  negative
3  rwo yakobo kadi acel kabedo juda karacel teko ...   unknown
4  teno tyene kica atek meicel adwo meicel teko k...  positive


In [32]:
# ------------------------------
# TASK 2: Feature Extraction
# ------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
# Initialize a TF-IDF vectorizer that extracts unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
features = vectorizer.fit_transform(data['text'])
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

Vocabulary size: 236


In [33]:
# TASK 3: Model Implementation with Logistic Regression
# ------------------------------
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Split the dataset into training and test sets (using stratification to preserve class distribution)
X = data['text']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(
 X, y, test_size=0.4, random_state=42, stratify=y)

# Build a pipeline combining TF-IDF vectorization and logistic regression
pipeline = Pipeline([
 ('tfidf', TfidfVectorizer(ngram_range=(1, 2), lowercase=True)),
 ('logreg', LogisticRegression(max_iter=1000, solver='liblinear'))
])

# Optional: Hyperparameter tuning using GridSearchCV for regularization strength
param_grid = {
 'logreg__C': [0.1, 1, 10],
 'logreg__penalty': ['l2']
}

grid = GridSearchCV(pipeline, param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

#grid = GridSearchCV(pipeline, param_grid, cv=2, scoring='accuracy', n_jobs=-1)
#grid.fit(X_train, y_train)
print("Best hyperparameters found:", grid.best_params_)

# Use the best estimator from grid search
best_model = grid.best_estimator_

0     negative
1      unknown
2     negative
3      unknown
4     positive
5     positive
6     negative
7      unknown
8      unknown
9     negative
10    positive
11    positive
Name: label, dtype: object
Best hyperparameters found: {'logreg__C': 1, 'logreg__penalty': 'l2'}


In [35]:
# ------------------------------
# TASK 4: Model Evaluation and Analysis
# ------------------------------
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Predict on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print("Accuracy on Test Set:", accuracy)
print("\nClassification Report:")
print(report)
print("\nConfusion Matrix:")
print(cm)

Accuracy on Test Set: 1.0

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
    positive       1.00      1.00      1.00         2
     unknown       1.00      1.00      1.00         2

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5


Confusion Matrix:
[[1 0 0]
 [0 2 0]
 [0 0 2]]


In [36]:
# Optional: Examine top features for each class
vectorizer = best_model.named_steps['tfidf']
classifier = best_model.named_steps['logreg']
feature_names = vectorizer.get_feature_names_out()
def print_top_features(class_index, n=10):
 coef = classifier.coef_[class_index]
 # Sort features by coefficient weight (largest positive weights)
 topn = sorted(zip(coef, feature_names), reverse=True)[:n]
 print(f"\nTop features for class '{classifier.classes_[class_index]}':")
 for weight, feature in topn:
  print(f"{feature}: {weight:.4f}")
for idx in range(len(classifier.classes_)):
 print_top_features(idx, n=10)
# ------------------------------
# (Optional Bonus Task) Predict New Documents
# ------------------------------
# Example predictions for new sample documents in your local language
new_documents = [
 "mi obanga kwogo",
 "timo kica ber",
 "Rwot doŋ omio piny ocido oko icuc i Cion",
 "Oneko jo luŋ a yam miowa owakere i yomcuny",
 "myero ijale odoco iyom cuny me dwoko kwo ni bote",
 "Rwot onwoŋo doŋ omoko tammere oko"
]
new_predictions = best_model.predict(new_documents)
print("\nPredictions for new documents:")
for doc, pred in zip(new_documents, new_predictions):
 print(f"Text: {doc}\nPredicted Label: {pred}\n")


Top features for class 'negative':
akemo: 0.3184
goba: 0.2504
okwero: 0.1878
mac: 0.1846
rwenyowu arwenya: 0.1252
rwenyowu: 0.1252
owille: 0.1252
okwero goba: 0.1252
goba rwenyowu: 0.1252
arwenya okwero: 0.1252

Top features for class 'positive':
adwo: 0.3075
tyene: 0.2153
teno tyene: 0.2153
teno: 0.2153
yomcuny teno: 0.1845
yomcuny: 0.1845
tyene adwo: 0.1845
teko konyogi: 0.1845
owakere yomcuny: 0.1845
owakere: 0.1845

Top features for class 'unknown':
icrael: 0.2515
yakobo: 0.1692
teko icrael: 0.1692
karacel teko: 0.1692
karacel: 0.1692
kabedo juda: 0.1692
kabedo: 0.1692
juda karacel: 0.1692
juda: 0.1692
acel kabedo: 0.1692

Predictions for new documents:
Text: mi obanga kwogo
Predicted Label: negative

Text: timo kica ber
Predicted Label: negative

Text: Rwot doŋ omio piny ocido oko icuc i Cion
Predicted Label: negative

Text: Oneko jo luŋ a yam miowa owakere i yomcuny
Predicted Label: positive

Text: myero ijale odoco iyom cuny me dwoko kwo ni bote
Predicted Label: negative

Text: