# Dependencies


In [58]:
import sys

# Add the parent directory to the module search path so that the project-local
# `src` package imported further down (`from src.utils import ...`) can be found.
# NOTE(review): assumes the notebook is run from a direct subfolder of the project root — confirm.
sys.path.append("../")

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from src.utils import (
    build_freqs,
    train_naive_bayes,
    predict_naive_bayes,
    cross_validation,
)

# Preprocessing


In [42]:
# Read the training split (per the file name: AfriSenti, Twi) and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="../data/raw/masakhane_afrisenti_twi_train.csv",
)
df.head(n=5)

Unnamed: 0,tweet,label
0,kako be shark but wo ti ewu,negative
1,br ne bayie nti na me supporti man city,negative
2,s3 woofis3 mada wafutuo tantan no 3y3wo s3mafa...,negative
3,wabɔdam anaa wo trumu yɛ nkate nkwan aseɛ,negative
4,enfa bi da bra 🤣🤣,negative


In [43]:
# Fit the label encoder on the training labels, then store the integer ids in a
# new `sentiment` column. The fitted `encoder` is reused by later cells.
encoder = LabelEncoder().fit(df["label"])
df["sentiment"] = encoder.transform(df["label"])
# Show the class order; the position of each class is its integer id.
print(encoder.classes_)

['negative' 'neutral' 'positive']


In [44]:
# Preview the frame again to check the `sentiment` column next to `label`.
df.head(n=5)

Unnamed: 0,tweet,label,sentiment
0,kako be shark but wo ti ewu,negative,0
1,br ne bayie nti na me supporti man city,negative,0
2,s3 woofis3 mada wafutuo tantan no 3y3wo s3mafa...,negative,0
3,wabɔdam anaa wo trumu yɛ nkate nkwan aseɛ,negative,0
4,enfa bi da bra 🤣🤣,negative,0


# Model Training


In [45]:
# Inputs are the tweet texts; targets are the integer-encoded sentiment labels.
train_x, train_y = df["tweet"], df["sentiment"]

In [46]:
# Build the frequency structure that train_naive_bayes takes as its first argument.
# NOTE(review): presumably per-class token counts — confirm in src.utils.build_freqs.
freqs = build_freqs(train_x, train_y)

In [47]:
# Train on the training split only. The four returned objects are exactly what
# predict_naive_bayes is called with (after the text) in the prediction cell.
# NOTE(review): no `alpha` is passed here, so the helper's default applies — confirm its value in src.utils.
logprior, loglikelihood, vocab, classes = train_naive_bayes(freqs, train_x, train_y)

# Model Prediction


In [48]:
# Quick sanity check: classify one hand-picked input with the model trained above.
text = "3kom"

pred = predict_naive_bayes(text, logprior, loglikelihood, vocab, classes)
print(pred)

negative


# Model Validation


In [49]:
# Load the validation split and encode its labels with the encoder that was
# already fitted on the training labels.
# `transform` (not `fit_transform`) is used on purpose: re-fitting here would
# rebuild the label -> id mapping from the validation labels alone, which can
# silently differ from the training mapping if a class is absent. With
# `transform`, an unseen label raises a ValueError instead of being remapped.
df2 = pd.read_csv("../data/raw/masakhane_afrisenti_twi_validation.csv")
df2["sentiment"] = encoder.transform(df2["label"])
val_x = df2["tweet"]
val_y = df2["sentiment"]

In [None]:
# Candidate `alpha` values to compare using the training and validation splits.
# NOTE(review): alpha is presumably the Naive Bayes smoothing strength, and `sc`
# the score obtained with `best_alpha` — confirm in src.utils.cross_validation.
alphas = [0.1, 0.01, 0.2, 0.02, 0.3, 0.03, 0.4, 0.04, 0.5, 0.05]
best_alpha, sc = cross_validation(train_x, train_y, val_x, val_y, alphas)

# Retraining on the Combined Train + Validation Sets


In [None]:
# Stack the training and validation splits end to end (as NumPy arrays) so the
# final model can be trained on all of these examples.
full_train_x = np.concatenate((train_x, val_x), axis=0)
full_train_y = np.concatenate((train_y, val_y), axis=0)

In [None]:
# Rebuild the frequency structure from the combined train + validation data.
# NOTE(review): inputs are NumPy arrays here rather than pandas Series — confirm build_freqs accepts both.
freqs_full = build_freqs(full_train_x, full_train_y)

In [None]:
# Retrain on the combined train + validation data using the alpha selected by
# the cross_validation cell. Passing `best_alpha` instead of a hard-coded copy
# keeps the final model in sync with the search result whenever the data or the
# candidate list changes.
logprior_f, loglikelihood_f, vocab_f, classes_f = train_naive_bayes(
    freqs_full, full_train_x, full_train_y, alpha=best_alpha
)

# Test and Evaluation

In [57]:
# Load the held-out test split and encode its labels with the encoder fitted on
# the training labels.
# `transform` (not `fit_transform`) is required: the evaluation cell encodes
# the model's predictions with this same `encoder`, so re-fitting it on the
# test labels could shift the label -> id mapping and corrupt the score. An
# unseen test label now raises a ValueError instead of being silently remapped.
df3 = pd.read_csv("../data/raw/masakhane_afrisenti_twi_test.csv")
df3["sentiment"] = encoder.transform(df3["label"])
test_x = df3["tweet"]
test_y = df3["sentiment"]

In [61]:
# Predict a label for every test tweet with the retrained model, convert the
# predictions to integer ids with the shared encoder, and compute the
# macro-averaged F1 against the encoded test labels.
test_preds = [
    predict_naive_bayes(tweet, logprior_f, loglikelihood_f, vocab_f, classes_f)
    for tweet in test_x
]
test_preds_enc = encoder.transform(test_preds)
f1_f = f1_score(y_true=test_y, y_pred=test_preds_enc, average='macro')
f1_f

0.5257571836352044