<a href="https://colab.research.google.com/github/kibali-cell/ML-Projects/blob/main/ExpensesCategorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dependencies

In [155]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


Data Collection & Preprocessing

In [156]:
# Transactions Data Loading
data = pd.read_csv('mpesa_transactions.csv')

In [157]:
print(X.dtypes)

Transaction Type      int64
Recipient             int64
Amount              float64
Is Expense          float64
dtype: object


In [158]:
# Filtering out 'Uncategorized' rows to ensure we train on labeled data
data = data[data['Category'] != 'Uncategorized']

In [159]:
data.head()

Unnamed: 0,Date,Time,Transaction Type,Recipient,Amount,Is Expense,Category
0,2025-04-26,14:38:11,Merchant Payment,6497271 - Jeremaih kobia,-120.0,True,Food
1,2025-04-26,14:30:32,Customer Payment to Small Business,2547******059 Samwel Kamau,-100.0,True,Food
2,2025-04-26,03:08:58,Customer Bundle Purchase,4093441SAFARICOM DATA BUNDLES,-10.0,True,Utilities
3,2025-04-25,19:56:57,Merchant Payment,5047120 - KENNEDY MAINA 2,-100.0,True,Transport
4,2025-04-25,19:49:20,Customer Payment to Small Business,2547******216 ANNE NJINO,-70.0,True,Food


In [160]:
data['Is Expense'] = data['Is Expense'].map({True: 1, False: 0})

In [161]:
# Convert 'Amount' to numeric (handle strings like "-120.00")
data['Amount'] = pd.to_numeric(data['Amount'], errors='coerce')

# Any NaN values in 'Amount' after conversion
if data['Amount'].isna().any():
    print("Warning: Some 'Amount' values could not be converted to numeric. Dropping these rows.")
    data = data.dropna(subset=['Amount'])



In [162]:
# Label Encoding (Features to numeric)
# Encode categorical variables
le_transaction_type = LabelEncoder()
data['Transaction Type'] = le_transaction_type.fit_transform(data['Transaction Type'])

le_recipient = LabelEncoder()
data['Recipient'] = le_recipient.fit_transform(data['Recipient'])

le_category = LabelEncoder()
data['Category'] = le_category.fit_transform(data['Category'])


In [163]:
# Verify data types to ensure all are numeric
X = data[['Transaction Type', 'Recipient', 'Amount', 'Is Expense']]
print("Data types of features:")
print(X.dtypes)

Data types of features:
Transaction Type      int64
Recipient             int64
Amount              float64
Is Expense          float64
dtype: object


In [164]:
data.head()

Unnamed: 0,Date,Time,Transaction Type,Recipient,Amount,Is Expense,Category
0,2025-04-26,14:38:11,4,41,-120.0,,2
1,2025-04-26,14:30:32,1,15,-100.0,,2
2,2025-04-26,03:08:58,0,34,-10.0,,7
3,2025-04-25,19:56:57,4,35,-100.0,,6
4,2025-04-25,19:49:20,1,19,-70.0,,2


In [165]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141 entries, 0 to 150
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date              141 non-null    object 
 1   Time              141 non-null    object 
 2   Transaction Type  141 non-null    int64  
 3   Recipient         141 non-null    int64  
 4   Amount            141 non-null    float64
 5   Is Expense        0 non-null      float64
 6   Category          141 non-null    int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 12.9+ KB


In [166]:
# Checking for any data with null values
data.isnull().sum()

Unnamed: 0,0
Date,0
Time,0
Transaction Type,0
Recipient,0
Amount,0
Is Expense,141
Category,0


In [167]:
data.describe()

Unnamed: 0,Transaction Type,Recipient,Amount,Is Expense,Category
count,141.0,141.0,141.0,0.0,141.0
mean,2.624113,25.602837,-107.858156,,2.801418
std,1.891859,15.266153,344.376716,,1.968393
min,0.0,0.0,-4000.0,,0.0
25%,1.0,10.0,-100.0,,2.0
50%,2.0,31.0,-60.0,,2.0
75%,4.0,39.0,-25.0,,4.0
max,7.0,50.0,60.0,,7.0


In [168]:
print(X.dtypes)

Transaction Type      int64
Recipient             int64
Amount              float64
Is Expense          float64
dtype: object


Data Visualization


Separating Features and Target

In [169]:
X = data[['Transaction Type', 'Recipient', 'Amount', 'Is Expense']]
y = data['Category']


In [170]:
# Spliting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(112, 4)
(29, 4)
(112,)
(29,)


Model Training

In [171]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

MAking Predictions

In [172]:
y_pred = model.predict(X_test)

In [173]:
# Model Evaluation

print("\nModel Perfomance Report:")
print(classification_report(y_test, y_pred, target_names=le_category.classes_))


Model Perfomance Report:
              precision    recall  f1-score   support

     Alcohol       0.25      1.00      0.40         1
       Bills       1.00      1.00      1.00         4
        Food       0.90      0.90      0.90        10
   Groceries       1.00      0.50      0.67         2
      Income       1.00      1.00      1.00         2
   Transfers       1.00      0.83      0.91         6
   Transport       0.00      0.00      0.00         1
   Utilities       1.00      1.00      1.00         3

    accuracy                           0.86        29
   macro avg       0.77      0.78      0.73        29
weighted avg       0.91      0.86      0.87        29



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [175]:
#Sample Prediction

sample_transaction = pd.DataFrame({
    'Transaction Type': [le_transaction_type.transform(['Merchant Payment'])[0]],
    'Recipient': [le_recipient.transform(['6497271 - Jeremaih kobia'])[0]],
    'Amount': [-120.00],
    'Is Expense': [1]
})

predicted_category = le_category.inverse_transform(model.predict(sample_transaction))[0]
print(f"\nSample Prediction for '6497271 - Jeremaih kobia': {predicted_category}")


Sample Prediction for '6497271 - Jeremaih kobia': Food
