In [33]:

import pandas as pd

df = pd.read_csv("../data/processed/uganda_fake_news_preprocessed_v1.csv")

df.head()


Unnamed: 0,id,text,label,source,platform_type,language,date_collected,clean_text,label_encoded
0,UG_TRUE_001,The Ministry of Health confirms no outbreak of...,TRUE,Daily Monitor,News Website,English,2025-01-10,ministry health confirms outbreak ebola kampala,1.0
1,UG_FAKE_001,Drinking hot water every 15 minutes kills COVI...,FAKE,AfricaCheck,Fact-Check,English,2025-01-10,drinking hot water every minute kill covid vir...,0.0
2,UG_FAKE_002,Government has approved free electricity for a...,FAKE,PesaCheck,Fact-Check,English,2025-01-11,government approved free electricity ugandan s...,0.0
3,UG_TRUE_002,Parliament passes new amendment to the Nationa...,TRUE,New Vision,News Website,English,2025-01-11,parliament pass new amendment national id regi...,1.0
4,UG_TRUE_003,Museveni alisema hakuna lockdown tena nchini U...,TRUE,BBC Africa,News Website,Mixed,2025-01-12,museveni alisema hakuna lockdown tena nchini u...,1.0


In [34]:

# Check class distribution
df.shape
df['label_encoded'].value_counts()


label_encoded
0.0    201
1.0    200
Name: count, dtype: int64

In [35]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [36]:
# Initialize TfidfVectorizer with specific parameters
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9
)


In [37]:
# Fit and transform the clean_text column to create feature matrix X and target vector y
X = tfidf.fit_transform(df['clean_text'])
y = df['label_encoded']


In [38]:

# Get the shape of the feature matrix   
X.shape


(402, 1105)

In [39]:
# Get the feature names
feature_names = tfidf.get_feature_names_out()

feature_names[:20]


array(['abuse', 'access', 'access clean', 'accident', 'account',
       'account fake', 'account impersonating', 'account promising',
       'action', 'activist', 'ad', 'adaptation', 'addressing', 'adopt',
       'adopt new', 'africa', 'afya', 'afya atangaza', 'agricultural',
       'agricultural sector'], dtype=object)

In [40]:
# Mean TF-IDF score per term
import numpy as np

# Mean TF-IDF score per term
mean_tfidf = np.asarray(X.mean(axis=0)).flatten()

top_indices = mean_tfidf.argsort()[-20:]
top_terms = [(feature_names[i], mean_tfidf[i]) for i in top_indices]

top_terms


[('enhance', np.float64(0.012494426461877909)),
 ('president museveni', np.float64(0.013095161154203066)),
 ('support', np.float64(0.013205405453021148)),
 ('didnt', np.float64(0.013291832898235623)),
 ('sector', np.float64(0.014375221933814932)),
 ('police', np.float64(0.015096128913247856)),
 ('climate', np.float64(0.016713294234780768)),
 ('bobi wine', np.float64(0.018125797774371198)),
 ('bobi', np.float64(0.018125797774371198)),
 ('wine', np.float64(0.018125797774371198)),
 ('president', np.float64(0.021591870854072236)),
 ('false', np.float64(0.021951447399508883)),
 ('uganda government', np.float64(0.02276616061086408)),
 ('fake', np.float64(0.02359768491787956)),
 ('museveni', np.float64(0.02494252790840258)),
 ('government', np.float64(0.02664674095846216)),
 ('ugandan', np.float64(0.029441071188310852)),
 ('new', np.float64(0.029988740221874862)),
 ('video', np.float64(0.030884912643153146)),
 ('uganda', np.float64(0.05325846695270154))]

In [41]:
# Save the TF-IDF vectorizer and the feature matrix
import joblib

joblib.dump(tfidf, "../models/tfidf_vectorizer_v1.pkl")
joblib.dump(X, "../models/X_tfidf_v1.pkl")
joblib.dump(y, "../models/y_labels_v1.pkl")


['../models/y_labels_v1.pkl']