# Dependencies


In [1]:
import sys

sys.path.append("../")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score,
    recall_score,
)
from src.utils import build_tfidf, tune_params, plot_confusion_matrix

# Single LabelEncoder instance shared by the cells below to turn each split's
# `label` column into the integer `sentiment` column.
encoder = LabelEncoder()

# TF-IDF


In [2]:
# Build the model using build_tfidf's default arguments; the bare name on the
# last line makes the notebook display the resulting object.
# NOTE(review): presumably a TF-IDF + naive Bayes pipeline (the search grid
# further down uses `tfidfvectorizer__` / `nb__` prefixes) — confirm in src.utils.
tf_idf = build_tfidf()
tf_idf

# Training


In [3]:
# Load the preprocessed training split and learn the label -> integer mapping
# from it. This is the only place in this cell where the encoder is fitted.
df = pd.read_csv("../data/preprocessed/masakhane_afrisenti_twi_train_preprocessed.csv")
df["sentiment"] = encoder.fit_transform(df["label"])

# Load the validation split and encode it with the mapping learned on the
# training labels. Re-fitting here (fit_transform) could silently assign
# different integer codes if the validation label set differs from train;
# `transform` instead raises ValueError on a label it has not seen.
df2 = pd.read_csv(
    "../data/preprocessed/masakhane_afrisenti_twi_validation_preprocessed.csv"
)
df2["sentiment"] = encoder.transform(df2["label"])


In [4]:
# Features are the tweet text; targets are the encoded sentiment codes.
train_x, train_y = df["tweet"], df["sentiment"]
val_x, val_y = df2["tweet"], df2["sentiment"]

# Training and Tuning Params


In [5]:
# Candidate hyperparameter values handed to tune_params. Keys follow the
# `<prefix>__<parameter>` naming; the order of keys and values is kept exactly
# as authored.
params_ = dict(
    tfidfvectorizer__ngram_range=[(1, 1), (1, 2)],
    tfidfvectorizer__max_df=[0.75, 0.85, 0.95],
    tfidfvectorizer__min_df=[1, 2, 5],
    nb__alpha=[0.1, 0.01, 0.2, 0.02, 0.3, 0.03, 0.4, 0.5, 0.05],
)


In [6]:
# Run the search over `params_`, passing both the training and the validation
# split. The third returned value is not needed here and is discarded.
best_params, best_score, _ = tune_params(
    train_x,
    train_y,
    val_x,
    val_y,
    params=params_,
)

Manual parameter search...

Params tried: {'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__max_df': 0.75, 'tfidfvectorizer__min_df': 1, 'nb__alpha': 0.1}
Accuracy: 0.7515 | F1: 0.7502 | Recall: 0.7525

Params tried: {'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__max_df': 0.75, 'tfidfvectorizer__min_df': 1, 'nb__alpha': 0.01}
Accuracy: 0.7485 | F1: 0.7473 | Recall: 0.7498

Params tried: {'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__max_df': 0.75, 'tfidfvectorizer__min_df': 1, 'nb__alpha': 0.2}
Accuracy: 0.7636 | F1: 0.7619 | Recall: 0.7635

Params tried: {'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__max_df': 0.75, 'tfidfvectorizer__min_df': 1, 'nb__alpha': 0.02}
Accuracy: 0.7485 | F1: 0.7473 | Recall: 0.7498

Params tried: {'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__max_df': 0.75, 'tfidfvectorizer__min_df': 1, 'nb__alpha': 0.3}
Accuracy: 0.7636 | F1: 0.7617 | Recall: 0.7628

Params tried: {'tfidfvectorizer__ngram_range': (1,

In [7]:
# Merge the training and validation splits into one set for the final fit.
full_train_x = np.concatenate((train_x, val_x))
full_train_y = np.concatenate((train_y, val_y))

# NOTE(review): these hyperparameters are hard-coded; presumably they were
# copied from the `best_params` reported by the search above — confirm, or
# derive them from `best_params` so the two cannot drift apart.
model = build_tfidf(
    alpha=0.4,
    ngram_range=(1, 2),
    max_df=0.75,
    min_df=1,
)

In [8]:
# Fit the final model on the combined train + validation data.
model.fit(full_train_x, full_train_y)

# Testing


In [9]:
# Load the held-out test split.
df3 = pd.read_csv("../data/preprocessed/masakhane_afrisenti_twi_test_preprocessed.csv")
# Encode with the already-fitted encoder instead of re-fitting on the test
# labels: re-fitting could silently assign integer codes that differ from the
# ones the model was trained on if the test label set differs. `transform`
# raises ValueError on a label the encoder has not seen.
df3["sentiment"] = encoder.transform(df3["label"])
test_x = df3["tweet"]
test_y = df3["sentiment"]


In [10]:
# Predict encoded sentiment labels for the test tweets with the fitted model.
preds = model.predict(test_x)

# Evaluation


In [11]:
# Per-class precision / recall / F1 on the test split.
# NOTE(review): the hard-coded target_names assume code 0 is "negative" and
# code 1 is "positive" — confirm against the encoder's learned classes.
print("Classification Report:\n")
print(
    classification_report(
        y_true=test_y,
        y_pred=preds,
        target_names=["negative", "positive"],
    )
)

# Headline metrics; F1 and recall are macro-averaged over the classes.
accuracy = accuracy_score(y_true=test_y, y_pred=preds)
f1 = f1_score(y_true=test_y, y_pred=preds, average="macro")
recall = recall_score(y_true=test_y, y_pred=preds, average="macro")

print(
    f"Accuracy: {accuracy:.4f}",
    f"Macro F1 Score: {f1:.4f}",
    f"Macro Recall: {recall:.4f}",
    sep="\n",
)


Classification Report:

              precision    recall  f1-score   support

    negative       0.79      0.74      0.77       353
    positive       0.81      0.85      0.83       450

    accuracy                           0.80       803
   macro avg       0.80      0.79      0.80       803
weighted avg       0.80      0.80      0.80       803

Accuracy: 0.8007
Macro F1 Score: 0.7963
Macro Recall: 0.7944


In [12]:
# Draw the test-set confusion matrix; `filename` is forwarded to the helper
# (presumably the path the figure is saved to — confirm in src.utils).
plot_confusion_matrix(
    y_true=test_y,
    y_pred=preds,
    classes=["negative", "positive"],
    filename="confusion matrix tfidf.png",
)