In [None]:
import pandas as pd
import string
import nltk
import urllib.request
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

### FEATURE CLEANING AND PREPROCESSING

In [None]:
# Load the training and test CSV files into DataFrames.
# df_output keeps an untouched copy of the test set (including its "id" column).

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
# Copy the frame that was just loaded instead of parsing test.csv a second time.
df_output = df_test.copy()
df_train.head()

In [None]:
# Drop the "id" column from both the training and the test frame

df_train = df_train.drop(columns="id")
df_test = df_test.drop(columns="id")

In [None]:
# Lowercase every document in the "NewsText" column of both frames

for frame in (df_train, df_test):
    frame["NewsText"] = frame["NewsText"].map(lambda text: text.lower())

In [None]:
# Remove punctuation marks: every character listed in string.punctuation is
# deleted from the text; all other characters are left untouched.

# Build the translation table once rather than once per row.
punctuation_table = str.maketrans("", "", string.punctuation)

df_train["NewsText"] = df_train["NewsText"].apply(lambda x: x.translate(punctuation_table))
df_test["NewsText"] = df_test["NewsText"].apply(lambda x: x.translate(punctuation_table))

In [None]:
# Split each document into a list of tokens with NLTK's word tokenizer
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# installed on this machine; nothing in this notebook downloads it -- confirm.
df_train["NewsText"] = df_train["NewsText"].apply(nltk.word_tokenize)
df_test["NewsText"] = df_test["NewsText"].apply(nltk.word_tokenize)

In [None]:
# Download the Hausa stopword list from the Stopwords ISO project.
# Iterating the response yields one line of bytes at a time, so each line is
# stripped of surrounding whitespace and decoded to a string.

url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ha/master/stopwords-ha.txt"
# Use the response as a context manager so the connection is closed even if
# reading or decoding fails.
with urllib.request.urlopen(url) as response:
    decoded_lines = (line.strip().decode("utf-8") for line in response)
    # Skip blank lines so the empty string never ends up in the stop-word list.
    stop_words = [word for word in decoded_lines if word]

In [None]:
# Remove stop words from the token lists of both frames

# Membership tests against a set are constant-time; testing against the
# original list would rescan it for every single token.
stop_word_set = set(stop_words)

df_train["NewsText"] = df_train["NewsText"].apply(lambda x: [word for word in x if word not in stop_word_set])
df_test["NewsText"] = df_test["NewsText"].apply(lambda x: [word for word in x if word not in stop_word_set])

print(df_train.head())
print(df_test.head())

In [None]:
# Get training data info and check for null values

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df_train.info()
print(df_train.isna().sum())

In [None]:
# Get testing data info and check for null values

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df_test.info()
print(df_test.isna().sum())

In [None]:
# Create an instance of the TfidfVectorizer class
vectorizer = TfidfVectorizer()

# Each "NewsText" entry is a list of tokens at this point. Join the tokens back
# into one space-separated string for the vectorizer; calling str() on the list
# would instead hand it the list's repr (brackets, quotes, escape sequences).
train_text = df_train["NewsText"].apply(lambda tokens: " ".join(tokens))
test_text = df_test["NewsText"].apply(lambda tokens: " ".join(tokens))

# Fit the vectorizer on the training text and transform it into a TF-IDF matrix
X_train = vectorizer.fit_transform(train_text)

# Transform the testing text with the already-fitted vectorizer (no refit)
X_test = vectorizer.transform(test_text)

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Take the "label" column of the training frame as the target variable
y_train = df_train.loc[:, "label"]

In [13]:
# Materialise the sparse train/test feature matrices as dense arrays

X_train_dense, X_test_dense = X_train.toarray(), X_test.toarray()

In [None]:
# Split the data into training and testing sets (disabled; kept for reference)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [None]:
# Instantiate a Gaussian Naive Bayes classifier and train it on the dense
# training features and their labels

lscv = GaussianNB()
lscv.fit(X=X_train_dense, y=y_train)

In [None]:
# Predict the labels of the test data (y_pred) and of the training data
# (y_train_pred)

y_pred = lscv.predict(X_test_dense)
y_train_pred = lscv.predict(X_train_dense)

# The score below compares training labels with predictions on the same
# training data, so it is the training accuracy, not a held-out estimate.
accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training accuracy: {accuracy:.5f}")

In [None]:
# Build the result frame: the "id" column followed by the predicted "label" column

df_output = df_output[["id"]].assign(label=y_pred)

In [None]:
# Write the result frame to output.csv, leaving out the index column

df_output.to_csv(path_or_buf="output.csv", index=False)