In [58]:
import pandas as pd

df = pd.read_csv("../data/processed/uganda_fake_news_v1_cleaned.csv")

df.head()


Unnamed: 0,id,text,label,source,platform_type,language,date_collected
0,UG_TRUE_001,The Ministry of Health confirms no outbreak of...,TRUE,Daily Monitor,News Website,English,2025-01-10
1,UG_FAKE_001,Drinking hot water every 15 minutes kills COVI...,FAKE,AfricaCheck,Fact-Check,English,2025-01-10
2,UG_FAKE_002,Government has approved free electricity for a...,FAKE,PesaCheck,Fact-Check,English,2025-01-11
3,UG_TRUE_002,Parliament passes new amendment to the Nationa...,TRUE,New Vision,News Website,English,2025-01-11
4,UG_TRUE_003,Museveni alisema hakuna lockdown tena nchini U...,TRUE,BBC Africa,News Website,Mixed,2025-01-12


In [59]:
# Check class distribution
df.shape
df['label'].value_counts()


label
FAKE    201
TRUE    200
Name: count, dtype: int64

In [60]:
# Sample some texts
df['text'].sample(5).tolist()


['No, Ugandan police didn’t tweet that citizens must report anyone not wearing masks.',
 'FALSE: Ugandan opposition leader Bobi Wine has not fled to the US. There is no evidence to support this claim, and Bobi Wine has not made any public statements indicating that he has left Uganda.',
 'Beware of fake Ugandan hotel, resorts and lodges ‘gold list’ ranking.',
 'No evidence that 5G technology spreads COVID-19 virus.',
 'This video of US President Trump commenting on electoral violence in Uganda is doctored. The original video is of Trump’s speech during his campaign in Wisconsin in 2016.']

In [61]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
# Preprocessing components
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [63]:
# Text preprocessing function

def preprocess_text(text):
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # 3. Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # 4. Tokenize
    tokens = word_tokenize(text)
    
    # 5. Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # 6. Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return " ".join(tokens)


In [64]:
# Apply preprocessing to the text column
df['clean_text'] = df['text'].apply(preprocess_text)


In [65]:
# Display original and cleaned text samples
df[['text', 'clean_text']].head(5)


Unnamed: 0,text,clean_text
0,The Ministry of Health confirms no outbreak of...,ministry health confirms outbreak ebola kampala
1,Drinking hot water every 15 minutes kills COVI...,drinking hot water every minute kill covid vir...
2,Government has approved free electricity for a...,government approved free electricity ugandan s...
3,Parliament passes new amendment to the Nationa...,parliament pass new amendment national id regi...
4,Museveni alisema hakuna lockdown tena nchini U...,museveni alisema hakuna lockdown tena nchini u...


In [66]:
# Encode labels
df['label_encoded'] = df['label'].map({'FAKE': 0, 'TRUE': 1})


In [67]:

# Display encoded labels
df[['label', 'label_encoded']].head()


Unnamed: 0,label,label_encoded
0,TRUE,1.0
1,FAKE,0.0
2,FAKE,0.0
3,TRUE,1.0
4,TRUE,1.0


In [68]:
# Save the preprocessed dataset
df.to_csv("../data/processed/uganda_fake_news_preprocessed_v1.csv", index=False)
