# Dependencies


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from utils import (
    build_freqs,
    train_naive_bayes,
    predict_naive_bayes,
    cross_validation,
    plot_confusion_matrix,
)

# Single LabelEncoder instance shared by the whole notebook: it converts the
# string `label` column of each split into the integer `sentiment` column and
# is also used to encode the model's predictions before scoring.
encoder = LabelEncoder()

# Preprocessing


In [2]:
# Read the preprocessed training split and derive an integer `sentiment`
# column from the string `label` column; this call also fits `encoder`.
# NOTE(review): the displayed frame contains an `Unnamed: 0` column, which
# suggests the CSV was written with its index -- confirm whether reading it
# with `index_col=0` is wanted.
df = pd.read_csv(
    "../data/preprocessed/masakhane_afrisenti_twi_train_preprocessed.csv"
).assign(sentiment=lambda frame: encoder.fit_transform(frame["label"]))
df.head()

Unnamed: 0,tweet,label,sentiment
0,kako be shark but wo ti ewu,negative,0
1,br ne bayie nti na me supporti man city,negative,0
2,s3 woofis3 mada wafutuo tantan no 3y3wo s3mafa...,negative,0
3,wabɔdam anaa wo trumu yɛ nkate nkwan aseɛ,negative,0
4,enfa bi da bra 🤣🤣,negative,0


# Model Training


In [3]:
# Features are the tweet texts; targets are the integer-encoded sentiments.
train_x, train_y = df["tweet"], df["sentiment"]

In [4]:
# Build the frequency structure consumed by `train_naive_bayes` below.
# NOTE(review): presumably (word, class) -> count over the training tweets;
# confirm against utils.build_freqs.
freqs = build_freqs(train_x, train_y)

In [5]:
# Fit the initial Naive Bayes model on the training split only.
# No `alpha` is passed here, so the helper's default value is used; the tuned
# value is only applied when the model is retrained later in the notebook.
logprior, loglikelihood, vocab, classes = train_naive_bayes(freqs, train_x, train_y)

# Model Prediction


In [6]:
# Sanity check: classify one hand-written example with the model trained on
# the training split and print the predicted label.
text = "3kom"

pred = predict_naive_bayes(text, logprior, loglikelihood, vocab, classes)
print(pred)

negative


# Model Validation


In [7]:
# Load the preprocessed validation split and encode its labels with the
# encoder that was already fitted on the training labels.
# `transform` is used instead of `fit_transform` on purpose: refitting on a
# different split could silently produce a different label -> id mapping
# (e.g. if a class were missing), whereas `transform` keeps the training
# mapping and raises a ValueError if the file contains an unseen label.
df2 = pd.read_csv(
    "../data/preprocessed/masakhane_afrisenti_twi_validation_preprocessed.csv"
)
df2["sentiment"] = encoder.transform(df2["label"])
# Features are the tweet texts; targets are the integer-encoded sentiments.
val_x = df2["tweet"]
val_y = df2["sentiment"]

In [8]:
# Candidate `alpha` values to evaluate (the list is not sorted).
# NOTE(review): alpha is presumably the additive-smoothing strength of the
# Naive Bayes model -- confirm against utils.train_naive_bayes.
alphas = [0.1, 0.01, 0.2, 0.02, 0.3, 0.03, 0.4, 0.5, 0.05]
# Select the best alpha using the train and validation splits. `sc` is
# presumably the score(s) associated with the search -- TODO confirm against
# utils.cross_validation. Only `best_alpha` is used later in this notebook.
best_alpha, sc = cross_validation(train_x, train_y, val_x, val_y, alphas)

# Retraining on the Combined Train + Validation Sets


In [9]:
# Stack the training and validation splits end to end so the final model can
# be fitted on both. The results are NumPy arrays, not pandas Series.
full_train_x = np.concatenate((train_x, val_x), axis=0)
full_train_y = np.concatenate((train_y, val_y), axis=0)

In [10]:
# Rebuild the frequency structure over the combined train + validation data.
freqs_full = build_freqs(full_train_x, full_train_y)

In [11]:
# Retrain the final model on train + validation data, this time passing the
# alpha chosen by `cross_validation`. The `_f` suffix marks the artefacts of
# this full-data model, which is the one evaluated on the test split.
logprior_f, loglikelihood_f, vocab_f, classes_f = train_naive_bayes(
    freqs_full, full_train_x, full_train_y, alpha=best_alpha
)

# Test and Evaluation


In [12]:
# Load the preprocessed test split and encode its labels with the encoder
# fitted earlier in the notebook.
# `transform` is used instead of `fit_transform` on purpose: the test labels
# must never redefine the label -> id mapping, and `transform` raises a
# ValueError if the file contains a label the encoder has not seen.
df3 = pd.read_csv("../data/preprocessed/masakhane_afrisenti_twi_test_preprocessed.csv")
df3["sentiment"] = encoder.transform(df3["label"])
# Features are the tweet texts; targets are the integer-encoded sentiments.
test_x = df3["tweet"]
test_y = df3["sentiment"]

In [13]:
# Classify every test tweet with the model retrained on train + validation.
test_preds = [
    predict_naive_bayes(tweet, logprior_f, loglikelihood_f, vocab_f, classes_f)
    for tweet in test_x
]
# Encode the predictions with the shared encoder so they are on the same
# integer scale as `test_y`, then print per-class precision/recall/F1.
test_preds_enc = encoder.transform(test_preds)
print(
    classification_report(
        y_true=test_y,
        y_pred=test_preds_enc,
        target_names=encoder.classes_,
    )
)


              precision    recall  f1-score   support

    negative       0.73      0.78      0.75       353
    positive       0.82      0.77      0.79       450

    accuracy                           0.77       803
   macro avg       0.77      0.77      0.77       803
weighted avg       0.78      0.77      0.77       803



In [14]:
# Draw the confusion matrix of the test predictions, labelling the axes with
# the encoder's class names.
# NOTE(review): `filename` is presumably where the figure is saved -- confirm
# against utils.plot_confusion_matrix.
plot_confusion_matrix(
    y_true=test_y,
    y_pred=test_preds_enc,
    classes=encoder.classes_,
    filename="confusion matrix nb.png",
)