In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from googletrans import Translator

### FEATURE CLEANING AND PREPROCESSING

In [2]:
# Load the data into a DataFrame

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_output = pd.read_csv("test.csv")
df_train.head()

Unnamed: 0,id,NewsText,label
0,1000,Nan ne muka kawo karshen labarai da rahotannin...,0
1,1001,Sai ku kasance tare da mu a gobe Litinin idan ...,0
2,1002,Rahotanni sun ce tawagar bincike ta Saudiyya a...,1
3,1003,Jaridar Saudiyya da ake bugawa a London Al-Sha...,1
4,1004,Yanzu ana jiran tabbatar da ingancin rigakafin...,1


In [3]:
# Initialise the translator

translator = Translator()

In [4]:
# Define a function to translate text from Hausa to English
def translate_text(text):
    # Use the translator to translate the text
    translated = translator.translate(text, src="ha", dest="en")
    translated = translated.__dict__["text"]
    # Return the translated text
    return translated

In [5]:
# Apply the translation function to the "NewsText" column
df_train["NewsText"] = df_train["NewsText"].apply(translate_text)
df_test["NewsText"] = df_test["NewsText"].apply(translate_text)

In [6]:
# Remove the id column

df_train = df_train.drop("id", axis=1)
df_test = df_test.drop("id", axis=1)

In [7]:
# Convert all the text to lowercase

df_train["NewsText"] = df_train["NewsText"].apply(lambda x: x.lower())
df_test["NewsText"] = df_test["NewsText"].apply(lambda x: x.lower())

In [8]:
# Remove punctuation marks and special characters

df_train["NewsText"] = df_train["NewsText"].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)))
df_test["NewsText"] = df_test["NewsText"].apply(lambda x: x.translate(str.maketrans("", "", string.punctuation)))

In [9]:
# Tokenize the text
df_train["NewsText"] = df_train["NewsText"].apply(lambda x: nltk.word_tokenize(x))
df_test["NewsText"] = df_test["NewsText"].apply(lambda x: nltk.word_tokenize(x))

In [10]:
# Remove stopwords
stop_words = stopwords.words("english")

df_train["NewsText"] = df_train["NewsText"].apply(lambda x: [word for word in x if word not in stop_words])
df_test["NewsText"] = df_test["NewsText"].apply(lambda x: [word for word in x if word not in stop_words])

In [11]:
# Define the lemmatizer object
lemmatizer = WordNetLemmatizer()

# Define a function to lemmatize a single word
def lemmatize_word(word, pos_tag):
    """
    pos_tag: part of speech tag of the word
    """
    # Map the pos_tag to the WordNet POS tag format
    pos_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV, "J": wordnet.ADJ}
    wordnet_pos = pos_map.get(pos_tag[0], wordnet.NOUN)
    
    # Lemmatize the word
    return lemmatizer.lemmatize(word, pos=wordnet_pos)

# Define a function to lemmatize a list of words
def lemmatize_words(words):
    # Use NLTK's pos_tag function to get the part of speech tag of each word
    pos_tags = nltk.pos_tag(words)
    # Lemmatize each word using the lemmatize_word function
    lemmatized_words = [lemmatize_word(word, pos_tag) for word, pos_tag in pos_tags]
    return lemmatized_words

In [12]:
# Lemmatize the text

df_train["NewsText"] = df_train["NewsText"].apply(lemmatize_words)
df_test["NewsText"] = df_test["NewsText"].apply(lemmatize_words)

In [13]:
# Get training data info and check for null values

print(df_train.info())
print(df_train.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   NewsText  256 non-null    object
 1   label     256 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.1+ KB
None
NewsText    0
label       0
dtype: int64


In [14]:
# Get testing data info and check for null values

print(df_test.info())
print(df_test.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   NewsText  63 non-null     object
dtypes: object(1)
memory usage: 632.0+ bytes
None
NewsText    0
dtype: int64


In [15]:
# Create an instance of the CountVectorizer class
vectorizer = TfidfVectorizer()

# Fit and transform the training text data into a bag-of-words representation
X_train = vectorizer.fit_transform(df_train["NewsText"].apply(str))

# Fit and transform the testing text data into a bag-of-words representation
X_test = vectorizer.transform(df_test["NewsText"].apply(str))

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

In [16]:
# Assign the target variable
y_train = df_train["label"]

In [17]:
# Define the parameter grid to search
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto']
}

In [18]:
# Create a Naive Bayes object and fit it to the training data

lscv = SVC(random_state=5)

In [19]:
# Create the GridSearchCV object
grid_search = GridSearchCV(lscv, param_grid, cv=5, n_jobs=-1, scoring="accuracy")

In [20]:
grid_search.fit(X_train, y_train)

In [21]:
# Print the best parameters and score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters:  {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
Best score:  0.44524886877828057


In [22]:
# Predit the labels of the test data

y_pred = grid_search.predict(X_test)
y_train_pred = grid_search.predict(X_train)

accuracy = accuracy_score(y_train, y_train_pred)
print(f"Accuracy: {accuracy:.5f}")

Accuracy: 1.00000


In [23]:
# Create a dataframe with the id and label colum

df_output = pd.DataFrame({"id": df_output["id"], "label": y_pred})

In [24]:
# Output the DataFrame to a CSV file

df_output.to_csv("output.csv", index=False)