In [1]:
import pandas as pd
import string
import nltk
import urllib.request
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

### FEATURE CLEANING AND PREPROCESSING

In [2]:
# Load the training and test sets into DataFrames

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
# Keep an untouched copy of the test set (its "id" column is needed later for
# the output file) instead of parsing test.csv a second time.
df_output = df_test.copy()
df_train.head()

Unnamed: 0,id,NewsText,label
0,1000,Nan ne muka kawo karshen labarai da rahotannin...,0
1,1001,Sai ku kasance tare da mu a gobe Litinin idan ...,0
2,1002,Rahotanni sun ce tawagar bincike ta Saudiyya a...,1
3,1003,Jaridar Saudiyya da ake bugawa a London Al-Sha...,1
4,1004,Yanzu ana jiran tabbatar da ingancin rigakafin...,1


In [3]:
# The "id" column carries no signal for the classifier, so drop it from the
# working frames.

df_train = df_train.drop(columns="id")
df_test = df_test.drop(columns="id")

In [4]:
# Convert all the text to lowercase
# The vectorized .str accessor replaces a per-row Python lambda and passes
# missing values through instead of raising AttributeError on them.

df_train["NewsText"] = df_train["NewsText"].str.lower()
df_test["NewsText"] = df_test["NewsText"].str.lower()

In [5]:
# Remove punctuation marks and special characters
# Build the translation table once rather than once per row. It deletes every
# character in string.punctuation (ASCII punctuation only).
punct_table = str.maketrans("", "", string.punctuation)

df_train["NewsText"] = df_train["NewsText"].str.translate(punct_table)
df_test["NewsText"] = df_test["NewsText"].str.translate(punct_table)

In [6]:
# Fetch the "punkt" tokenizer resource used by nltk.word_tokenize
nltk.download("punkt")

# Replace each article string with its list of word tokens, in both frames
for frame in (df_train, df_test):
    frame["NewsText"] = frame["NewsText"].apply(nltk.word_tokenize)

[nltk_data] Downloading package punkt to C:\Users\Adeniyi
[nltk_data]     Babalola\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Download the Hausa stopword list from the Stopwords ISO project
# Each line of the response is bytes: strip the surrounding whitespace/newline
# and decode it to a str.

url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ha/master/stopwords-ha.txt"
# The context manager closes the HTTP response once the list has been read
with urllib.request.urlopen(url) as response:
    stop_words = [word.strip().decode("utf-8") for word in response]

In [8]:
# Remove stop words
# Build a set once: set membership is a hash lookup, whereas testing against
# the stop-word list scans the whole list for every token.
stop_word_set = set(stop_words)

df_train["NewsText"] = df_train["NewsText"].apply(
    lambda tokens: [word for word in tokens if word not in stop_word_set]
)
df_test["NewsText"] = df_test["NewsText"].apply(
    lambda tokens: [word for word in tokens if word not in stop_word_set]
)

print(df_train.head())
print(df_test.head())

                                            NewsText  label
0  [nan, muka, kawo, karshen, labarai, rahotannin...      0
1  [ku, kasance, tare, mu, gobe, litinin, idan, r...      0
2  [rahotanni, tawagar, bincike, saudiyya, jami, ...      1
3  [jaridar, saudiyya, ake, bugawa, london, alsha...      1
4  [yanzu, ana, jiran, tabbatar, ingancin, rigaka...      1
                                            NewsText
0  [babu, dai, cikakken, bayani, kan, musabbabin,...
1  [shugaba, ci, lashe, zaɓen, inda, samu, ƙuria,...
2  [tun, 186, museveni, ke, mulki, uganda, wanda,...
3  [kakakin, kawancen, turki, almaliki, harbo, ji...
4  [manchester, united, saman, tebur, tazarar, ma...


In [9]:
# Get training data info and check for null values
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).

df_train.info()
print(df_train.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   NewsText  256 non-null    object
 1   label     256 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.1+ KB
None
NewsText    0
label       0
dtype: int64


In [10]:
# Get testing data info and check for null values
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).

df_test.info()
print(df_test.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   NewsText  63 non-null     object
dtypes: object(1)
memory usage: 632.0+ bytes
None
NewsText    0
dtype: int64


In [11]:
# Create an instance of the TfidfVectorizer class (TF-IDF weighted term features)
vectorizer = TfidfVectorizer()

# NewsText holds lists of tokens at this point. Join each list back into a
# whitespace-separated string so the vectorizer sees the words themselves
# rather than the Python repr of a list (brackets, quotes, escape sequences).
train_docs = df_train["NewsText"].apply(" ".join)
test_docs = df_test["NewsText"].apply(" ".join)

# Fit the vectorizer on the training text and transform it into a TF-IDF matrix
X_train = vectorizer.fit_transform(train_docs)

# Transform (without re-fitting) the testing text using the fitted vectorizer
X_test = vectorizer.transform(test_docs)

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

In [12]:
# Target vector: the "label" column of the training frame
y_train = df_train.loc[:, "label"]

In [13]:
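# Split the data into training and validation sets (currently disabled: the
# model is fit on all of train.csv, and test.csv has no label column).
# NOTE: X and y are not defined in this notebook; define them before re-enabling.

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)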

In [14]:
# Instantiate a logistic-regression classifier with default settings and
# train it on the vectorized training text and its labels

clf = LogisticRegression()
clf.fit(X=X_train, y=y_train)

In [15]:
# Predict the labels of the test data with the fitted classifier

y_pred = clf.predict(X=X_test)

# Accuracy check (disabled: no y_test is defined in this notebook)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy:.5f}")

In [16]:
# Build the result frame: the test-set "id" column paired with the predicted label

df_output = df_output[["id"]].assign(label=y_pred)

In [17]:
# Write the id/label pairs to output.csv, leaving out the DataFrame index

df_output.to_csv(path_or_buf="output.csv", index=False)