# 2. Data Understanding

In [1]:
import pandas as pd
import numpy as np
import re

import warnings
# Installs a blanket "ignore" filter: no message, category or module is given,
# so warnings of every category raised after this point are suppressed
# (deprecation notices included) unless a later filter overrides it.
warnings.filterwarnings("ignore")

In [2]:
# Read the tweet CSV from the working directory into `df`, then preview
# its first ten rows.
df = pd.read_csv(filepath_or_buffer="Kenyan_politicians_hatespeech.csv")
df.head(n=10)

Unnamed: 0,Tweet ID,Likes,Retweets,Total Replies,Texts,Created At
0,1.912286e+18,2.0,0.0,1.0,Rigathi Gachagua ni snitches Ile ya ukweli. La...,2025-04-15 23:23:00+00:00
1,1.912285e+18,0.0,0.0,0.0,@NoCountryHere it’s different for Gachagua man...,2025-04-15 23:21:43+00:00
2,1.912278e+18,0.0,0.0,0.0,@gitaus794 @Mithika_Denno @bonifacemwangi Wher...,2025-04-15 22:52:56+00:00
3,1.912275e+18,0.0,0.0,0.0,@hermexinvesting @MwangiHub The common enemy o...,2025-04-15 22:40:45+00:00
4,1.912272e+18,0.0,0.0,0.0,SHOCK as Gachagua addresses MILLIONS of Mlolon...,2025-04-15 22:30:19+00:00
5,1.91227e+18,0.0,0.0,0.0,@Br1anKE @EtalePhilip Gachagua doesn't feed me...,2025-04-15 22:19:35+00:00
6,1.91227e+18,0.0,0.0,0.0,@RobertAlai @NdindiNyoro Let these assholes se...,2025-04-15 22:19:21+00:00
7,1.912269e+18,0.0,0.0,0.0,"@rigathi Gachagua tiga ubaby mani, kiria utahi...",2025-04-15 22:18:41+00:00
8,1.912266e+18,0.0,0.0,0.0,@NelsonHavi Imagine their are dimwits who even...,2025-04-15 22:04:29+00:00
9,1.912264e+18,0.0,0.0,0.0,@rigathi 😂😂Akamba nation thinking Gachagua wil...,2025-04-15 21:56:23+00:00


In [3]:
# Bind the last ten rows to df_tail and print them as plain text.
df_tail = df.tail(n=10)
print(df_tail)

           Tweet ID  Likes  Retweets  Total Replies  \
11307  1.567917e+18   12.0       0.0            2.0   
11308  1.808858e+18    0.0       0.0            0.0   
11309  9.767161e+17    1.0       0.0            0.0   
11310  1.911297e+18    2.0       1.0            0.0   
11311  1.910260e+18    3.0       1.0            2.0   
11312  1.910036e+18    9.0       3.0            1.0   
11313  1.729588e+18  190.0      13.0           17.0   
11314  1.912614e+18  543.0      52.0           57.0   
11315  1.910270e+18  516.0     136.0           17.0   
11316  1.567604e+18    0.0       1.0            0.0   

                                                   Texts  \
11307  @Aaroncheruiyot @mwafrika_254 @Wetangulam Why ...   
11308  @khantiger528 @citizentvkenya I totally agree ...   
11309  @trueasembo @MusaliaMudavadi @Wetangulam @KBon...   
11310  Pretty obvious that Musalia, Wetangula, Raila ...   
11311  National Assembly Speaker Moses Wetangula susp...   
11312  Female MPs Fight:\n \nEALA 

In [4]:
# Structural overview of the frame: index range, per-column dtype, non-null
# counts and memory usage.
# DataFrame.info() writes this report to stdout itself and returns None, so it
# is called purely for its side effect. Assigning the result and printing it
# only produced a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11317 entries, 0 to 11316
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Tweet ID       11317 non-null  float64
 1   Likes          11317 non-null  float64
 2   Retweets       11317 non-null  float64
 3   Total Replies  11317 non-null  float64
 4   Texts          11317 non-null  object 
 5   Created At     11317 non-null  object 
dtypes: float64(4), object(2)
memory usage: 530.6+ KB
None


In [5]:
# Descriptive statistics restricted to float/int columns, transposed so that
# each source column becomes one row of the summary.
numeric_stats = df.describe(include=[float, int])
numeric_summary = numeric_stats.T
print("🔢 Numeric Summary:\n", numeric_summary)

🔢 Numeric Summary:
                  count          mean           std           min  \
Tweet ID       11317.0  1.827889e+18  1.624154e+17  1.262900e+17   
Likes          11317.0  9.191644e+02  1.448601e+04  0.000000e+00   
Retweets       11317.0  1.956497e+02  2.283746e+03  0.000000e+00   
Total Replies  11317.0  6.696625e+01  1.080506e+03  0.000000e+00   

                        25%           50%           75%           max  
Tweet ID       1.815727e+18  1.891572e+18  1.912258e+18  1.914318e+18  
Likes          1.000000e+00  1.300000e+01  3.020000e+02  1.060202e+06  
Retweets       0.000000e+00  4.000000e+00  5.300000e+01  1.504460e+05  
Total Replies  0.000000e+00  1.000000e+00  1.900000e+01  8.787900e+04  


In [6]:
# Descriptive statistics restricted to object-dtype columns (count, unique,
# top, freq), transposed so that each source column becomes one row.
object_stats = df.describe(include=['object'])
categorical_summary = object_stats.T
print("\n🔤 Categorical Summary:\n", categorical_summary)


🔤 Categorical Summary:
             count unique                        top freq
Texts       11317  11241    @BernardKavuli Matiangi    5
Created At  11317  11228  2025-04-03 11:25:57+00:00    3


In [7]:
# Bind the frame's column labels (a pandas Index) to df_columns and print them.
df_columns = df.columns
print(df_columns)

Index(['Tweet ID', 'Likes', 'Retweets', 'Total Replies', 'Texts',
       'Created At'],
      dtype='object')


In [8]:
# Bind the (number of rows, number of columns) tuple to df_shape and print it.
df_shape = df.shape
print(df_shape)

(11317, 6)


In [9]:
# Flag every row that repeats an earlier row across all columns, then count
# the flagged rows.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [10]:
# Build a boolean mask of missing cells and total it per column.
null_mask = df.isnull()
null_mask.sum()

Tweet ID         0
Likes            0
Retweets         0
Total Replies    0
Texts            0
Created At       0
dtype: int64

# 3. Data Cleaning

In [11]:
# Remove the 'Tweet ID' column.
# `df` is rebound by this cell, so running it a second time would look for a
# column that is already gone and raise KeyError; errors='ignore' turns that
# second run into a no-op and keeps the cell re-runnable.
df = df.drop(columns=['Tweet ID'], errors='ignore')

In [12]:
# Re-bind df_columns to the frame's current column labels and print them.
df_columns = df.columns
print(df_columns)

Index(['Likes', 'Retweets', 'Total Replies', 'Texts', 'Created At'], dtype='object')


In [13]:
# Remove rows that are exact duplicates across all remaining columns, keeping
# the first occurrence of each.
# Rebinding the result (instead of inplace=True) follows the same
# `df = df.<method>(...)` pattern as the column drop and avoids mutating a
# frame object that earlier cells have already displayed.
df = df.drop_duplicates()

In [14]:
# Coerce the three engagement counters to a numeric dtype; any value that
# cannot be parsed becomes NaN instead of raising.
for counter_col in ('Likes', 'Retweets', 'Total Replies'):
    df[counter_col] = pd.to_numeric(df[counter_col], errors='coerce')

# Parse the timestamp column into datetimes; unparseable values become NaT.
df['Created At'] = pd.to_datetime(df['Created At'], errors='coerce')

In [15]:
# Step 1: keep only the first row for each distinct value of 'Texts', and take
# an independent copy so columns added to df_cleaned do not touch `df`.
df_cleaned = df.drop_duplicates(subset=['Texts'], keep='first').copy()

In [16]:
# Step 2: Clean text function
def clean_text(text):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: lowercase; strip URLs, @mentions and #hashtags (the whole
    tag, word included); delete every character that is not an ASCII letter
    or whitespace; collapse whitespace runs to one space and trim the ends.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned string; ``''`` when nothing survives the cleaning or the
        input is not a string.
    """
    if not isinstance(text, str):
        # Missing cells reach .apply() as NaN/None; return an empty result
        # instead of failing on .lower().
        return ''
    text = text.lower()
    # The dot after "www" is escaped so only a literal "www." prefix counts as
    # a URL; unescaped, it matched any character and ate runs like "awwwww".
    text = re.sub(r'http\S+|www\.\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)  # remove mentions
    text = re.sub(r'#\w+', '', text)  # remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # normalize whitespace
    return text

# Run clean_text over every raw tweet and store the result in a new
# 'Cleaned_Text' column.
raw_texts = df_cleaned['Texts']
df_cleaned['Cleaned_Text'] = raw_texts.apply(clean_text)

In [17]:
# Add a 'Label' column, initialised to the empty string, immediately after
# 'Cleaned_Text'.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard lets this cell be executed again without failing.
col_index = df_cleaned.columns.get_loc('Cleaned_Text') + 1  # position right after 'Cleaned_Text'
if 'Label' not in df_cleaned.columns:
    df_cleaned.insert(loc=col_index, column='Label', value='')

In [18]:
# Re-type the cleaned-text and label columns as pandas 'category' dtype.
for categorical_col in ('Cleaned_Text', 'Label'):
    df_cleaned[categorical_col] = df_cleaned[categorical_col].astype('category')

In [19]:
# Normalise column labels: trim surrounding whitespace, lowercase, and turn
# spaces into underscores (e.g. 'Total Replies' -> 'total_replies').
df_cleaned.columns = [
    label.strip().lower().replace(' ', '_') for label in df_cleaned.columns
]

In [20]:
# Order the rows by the 'cleaned_text' column, ascending.
df_cleaned = df_cleaned.sort_values('cleaned_text', ascending=True)

In [21]:
# Drop the raw 'texts' column, then show the remaining column labels.
# df_cleaned is rebound here, so a second run would look for a column that is
# already gone and raise KeyError; errors='ignore' makes the re-run a no-op.
df_cleaned = df_cleaned.drop(columns=['texts'], errors='ignore')
df_cleaned.columns

Index(['likes', 'retweets', 'total_replies', 'created_at', 'cleaned_text',
       'label'],
      dtype='object')

In [22]:
# Write the cleaned frame to a CSV file in the working directory, leaving the
# row index out of the file.
df_cleaned.to_csv(path_or_buf='kenyan_politics_cleaned_text.csv', index=False)

In [24]:
# Disabled cell: everything below is wrapped in a triple-quoted string, so none
# of it executes. The cell only evaluates to that string, which the notebook
# then echoes as its output. Nothing is sliced and no file is written.
"""
# Select rows 9004 to 11254 (index 9003 to 11253)
subset_df = df_cleaned.iloc[9003:11254]

# Step 3: Save to a new CSV
subset_df.to_csv("kenya_tweets_subset_9004_11254.csv", index=False)

print("Subset saved as kenya_tweets_subset_9004_11254.csv")
"""

'\n# Select rows 9004 to 11254 (index 9003 to 11253)\nsubset_df = df_cleaned.iloc[9003:11254]\n\n# Step 3: Save to a new CSV\nsubset_df.to_csv("kenya_tweets_subset_9004_11254.csv", index=False)\n\nprint("Subset saved as kenya_tweets_subset_9004_11254.csv")\n'