Author: Keijaoh Campbell <br/>
Website: keijaoh.com <br/>
Title: ML Journey Part 5: Predicting Expense Categories

In [None]:
#dataset can be found at: https://www.kaggle.com/datasets/prasad22/daily-transactions-dataset/
# use Subcategory as the expense_name

In [None]:
'''
Github: https://github.com/keijaoh/software-to-ml-365-challenge-day

Kaggle Dataset: https://www.kaggle.com/datasets/prasad22/daily-transactions-dataset/

'''

**Importing Data from Google Drive**

In [None]:
#Library used to import data from google drive
!pip install -U -q PyDrive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


# Authenticate and create the PyDrive client.
# NOTE(review): presumably the Colab sign-in has to complete before the
# application-default credentials are requested below - confirm.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive's GoogleAuth object
gauth.credentials = GoogleCredentials.get_application_default()
# drive is the client used later to download the CSV
drive = GoogleDrive(gauth)

In [None]:
#initial libraries required
import pandas as pd
from pandas import json_normalize


In [None]:
#colab specific to access environmental variables
from google.colab import userdata

In [None]:
#location of the CSV and the name of the file
# Both values are read from the Colab userdata store
file_link = userdata.get('file_url')
file_name = userdata.get('file_name')

# The file id is taken as the second-to-last "/"-separated segment of the link.
# It is stored as file_id so the builtin id() is not shadowed in later cells.
file_id = file_link.split("/")[-2]

# Download the Drive file to a local file called file_name
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(file_name)

# Load the downloaded CSV into the DataFrame used by the rest of the notebook
expenses_csv = pd.read_csv(file_name)

In [None]:
#Preview what is inside the file
expenses_csv

Unnamed: 0,Date,Mode,Category,Subcategory,Note,Amount,Income/Expense,Currency
0,20/09/2018 12:04:08,Cash,Transportation,Train,2 Place 5 to Place 0,30.0,Expense,INR
1,20/09/2018 12:03:15,Cash,Food,snacks,Idli medu Vada mix 2 plates,60.0,Expense,INR
2,19/09/2018,Saving Bank account 1,subscription,Netflix,1 month subscription,199.0,Expense,INR
3,17/09/2018 23:41:17,Saving Bank account 1,subscription,Mobile Service Provider,Data booster pack,19.0,Expense,INR
4,16/09/2018 17:15:08,Cash,Festivals,Ganesh Pujan,Ganesh idol,251.0,Expense,INR
...,...,...,...,...,...,...,...,...
2456,1/1/2015,Cash,Transportation,,share jeep - Place T base to top,20.0,Expense,INR
2457,1/1/2015,Cash,Transportation,,share auto - Place H to Place T base,20.0,Expense,INR
2458,1/1/2015,Cash,Transportation,,bus - brc to Place H,30.0,Expense,INR
2459,1/1/2015,Cash,Food,,tea,10.0,Expense,INR


In [None]:
def num_of_nan_sub_category(df=None, column='Subcategory'):
  """Print how many values in one column are NaN and how many are not.

  Args:
    df: DataFrame to inspect. When omitted, the notebook-level
      ``expenses_csv`` is looked up at call time, so a call with no
      arguments behaves as before.
    column: name of the column to count; defaults to 'Subcategory'.

  Returns:
    None. The counts are only printed.
  """
  if df is None:
    # Fall back to the notebook's current frame (resolved when called)
    df = expenses_csv

  values = df[column]

  # Count the number of NaN values and of non-NaN values in the column
  nan_count = values.isna().sum()
  not_nan_count = values.notna().sum()

  # Print both counts on one line
  print(f"Number of NaN values in the '{column}' column: {nan_count} vs Not NaN values {not_nan_count}")

num_of_nan_sub_category()

Number of NaN values in the 'Subcategory' column: 0 vs Not NaN values 2461


In [None]:
#Note: the following approach could have been handled various other ways
#I am choosing to do the following instead:
'''
1. If the "Subcategory" column is NaN, it will look at the "Note" column's value.
2. If the "Note" column is also NaN (or empty), it will set the value to "Miscellaneous".
'''

#fills the SubCategory if its empty
def fill_subcategory(row):
    """Return the Subcategory value to use for one row.

    A non-missing 'Subcategory' is returned unchanged. When it is missing,
    the row's 'Note' is used instead, unless the note is missing or an
    empty string, in which case 'Miscellaneous' is returned.

    Args:
        row: mapping-like object with 'Subcategory' and 'Note' keys
            (a DataFrame row when used with ``apply(axis=1)``).
    """
    subcategory = row['Subcategory']
    if not pd.isna(subcategory):
        return subcategory

    # Subcategory is missing: fall back to the note, then to the default label
    note = row['Note']
    if pd.isna(note) or note == '':
        return 'Miscellaneous'
    return note

# Apply the function to each row (axis=1 passes one row at a time)
# and overwrite the 'Subcategory' column with the result
expenses_csv['Subcategory'] = expenses_csv.apply(fill_subcategory, axis=1)

# Display the DataFrame to see the changes
expenses_csv


Unnamed: 0,Date,Mode,Category,Subcategory,Note,Amount,Income/Expense,Currency
0,20/09/2018 12:04:08,Cash,Transportation,Train,2 Place 5 to Place 0,30.0,Expense,INR
1,20/09/2018 12:03:15,Cash,Food,snacks,Idli medu Vada mix 2 plates,60.0,Expense,INR
2,19/09/2018,Saving Bank account 1,subscription,Netflix,1 month subscription,199.0,Expense,INR
3,17/09/2018 23:41:17,Saving Bank account 1,subscription,Mobile Service Provider,Data booster pack,19.0,Expense,INR
4,16/09/2018 17:15:08,Cash,Festivals,Ganesh Pujan,Ganesh idol,251.0,Expense,INR
...,...,...,...,...,...,...,...,...
2456,1/1/2015,Cash,Transportation,share jeep - Place T base to top,share jeep - Place T base to top,20.0,Expense,INR
2457,1/1/2015,Cash,Transportation,share auto - Place H to Place T base,share auto - Place H to Place T base,20.0,Expense,INR
2458,1/1/2015,Cash,Transportation,bus - brc to Place H,bus - brc to Place H,30.0,Expense,INR
2459,1/1/2015,Cash,Food,tea,tea,10.0,Expense,INR


In [None]:
#checking to make sure there is no Nan
num_of_nan_sub_category()


Number of NaN values in the 'Subcategory' column: 0 vs Not NaN values 2461


In [None]:
#we dont need the Date, Mode, Amount or Currency

columns_to_exclude = ['Date','Mode','Amount','Currency']

# Drop the columns from the DataFrame.
# errors='ignore' keeps this cell re-runnable: expenses_csv is rebound to the
# result, so on a second run the columns are already gone and a strict drop
# would raise KeyError.
expenses_csv = expenses_csv.drop(columns=columns_to_exclude, errors='ignore')

#check the dataframe to see what is left
expenses_csv

Unnamed: 0,Category,Subcategory,Note,Income/Expense
0,Transportation,Train,2 Place 5 to Place 0,Expense
1,Food,snacks,Idli medu Vada mix 2 plates,Expense
2,subscription,Netflix,1 month subscription,Expense
3,subscription,Mobile Service Provider,Data booster pack,Expense
4,Festivals,Ganesh Pujan,Ganesh idol,Expense
...,...,...,...,...
2456,Transportation,share jeep - Place T base to top,share jeep - Place T base to top,Expense
2457,Transportation,share auto - Place H to Place T base,share auto - Place H to Place T base,Expense
2458,Transportation,bus - brc to Place H,bus - brc to Place H,Expense
2459,Food,tea,tea,Expense


# TensorFlow and Training

In [None]:
# all the other libraries outside of pd
# TensorFlow and Keras imports
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#checking the tensorflow version
print(tf.__version__)

2.14.0


Pre-Processing the Data

In [None]:
#set the values to lowercase to standardize the results
#(pre_process_input lowercases user input the same way at prediction time)

#Subcategory in this dataset is like the expense name
expenses_csv['Subcategory'] = expenses_csv['Subcategory'].str.lower()
expenses_csv['Category'] = expenses_csv['Category'].str.lower()

#Creating the features and labels
# X: the expense-name text the model reads (cast to str before tokenizing)
X = expenses_csv['Subcategory'].astype(str).values
# Y: the category label the model learns to predict
Y = expenses_csv['Category'].values


In [None]:
#Tokenizing the text
# The tokenizer is fitted on the expense names only; the same fitted object
# is reused later by pre_process_input for user input.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# Convert each expense name to a sequence of integer token ids
X_seq = tokenizer.texts_to_sequences(X)
# maxlen=100 is repeated as input_length=100 in the Embedding layer and as the
# default maxlen of pre_process_input - change all three together.
X_pad = pad_sequences(X_seq, maxlen=100) # adjust the maxlen based on your data

In [None]:
#Encoding the labels
# sparse_output=False asks for a dense array rather than a sparse matrix
onehot_encoder = OneHotEncoder(sparse_output= False)

# reshape(-1,1) turns the 1-D label array into a single column before encoding.
# The fitted encoder is reused later to map predicted indices back to names
# (via onehot_encoder.categories_).
y_encoded = onehot_encoder.fit_transform(Y.reshape(-1,1))

In [None]:
#Train-Test split
# test_size=0.2 holds out 20% of the rows; random_state=42 fixes the shuffle
# so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X_pad, y_encoded, test_size= 0.2, random_state=42)

In [None]:
#Building the model
model = Sequential()
# input_dim is the tokenizer vocabulary size plus one
# NOTE(review): presumably the +1 accounts for index 0 being used for padding - confirm.
# input_length=100 matches the maxlen used when padding X.
model.add(Embedding(input_dim= len(tokenizer.word_index) + 1, output_dim=128, input_length=100))

#LSTM (Long Short-Term Memory) is a recurrent neural network (RNN)
#architecture widely used in Deep Learning. It excels at capturing long-term dependencies, making it ideal for sequence prediction tasks.
model.add(LSTM(64))
# One output unit per one-hot label column, with softmax across them
model.add(Dense(y_encoded.shape[1], activation='softmax'))

In [None]:
#Compile the model
#Notes:
#Adam optimizer, or Adaptive Moment Estimation, is a gradient descent-based algorithm that minimizes the loss function during neural network training.
#categorical_crossentropy is used with the one-hot encoded labels (y_encoded) and the softmax output layer.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
#Training the model
#Notes: an epoch is one full pass over the training dataset; epochs=10 requests ten such passes
#NOTE(review): validation_data is the held-out test split, so the validation
#metrics shown during training are computed on the same rows as X_test/Y_test.
model.fit(X_train,Y_train, epochs=10, validation_data=(X_test,Y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f94fdbfbfa0>

# Testing the Trained Model

In [None]:
'''
must preprocess the data the same way the training was done to be consistent
so if the training data is lowercased, padded and tokenized the user input data must be the same
'''

def pre_process_input(expense_name, tokenizer,maxlen=100):
  """Turn one expense name into a padded token sequence for the model.

  Applies the same steps as the training data: lowercase, tokenize, pad.

  Args:
    expense_name: text to convert; must support ``.lower()``.
    tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
    maxlen: length passed to ``pad_sequences`` (default 100).

  Returns:
    The result of ``pad_sequences`` for the single input text.
  """
  normalized_name = expense_name.lower()

  # texts_to_sequences takes a list of texts, so wrap the single name
  token_ids = tokenizer.texts_to_sequences([normalized_name])

  return pad_sequences(token_ids, maxlen= maxlen)



In [None]:
#Predict the Category and return the top 3 predictions

#need numpy
import numpy as np

#call predict top 3 categories
def display_top_three_predictions(input_text):
  """Predict categories for one expense name and rank the three most likely.

  Relies on the notebook-level ``tokenizer``, ``model`` and
  ``onehot_encoder`` objects.

  Args:
    input_text: expense name to classify.

  Returns:
    A tuple ``(top_three_indices, top_three_categories, probabilities)``:
    the class indices ordered from most to least probable, the matching
    category names from the encoder, and the raw output of ``model.predict``.
  """
  # Same lowercase / tokenize / pad steps as the training data
  model_input = pre_process_input(input_text, tokenizer)

  # Prediction probabilities as returned by the model
  probabilities = model.predict(model_input)

  # argsort is ascending, so reverse each row and keep the first three
  ranked_indices = np.argsort(probabilities, axis=1)[:, ::-1]
  top_three_indices = ranked_indices[:, :3]

  # Map the class indices back to category names
  category_labels = onehot_encoder.categories_[0]
  top_three_categories = [category_labels[row_indices] for row_indices in top_three_indices]

  return top_three_indices, top_three_categories, probabilities


input_text = "hospital"

top_three_indicies, top_three_categories, probabilities = display_top_three_predictions(input_text)

# Pair every prediction row with its own probability row instead of always
# reading row 0, so the printed percentages stay correct if more than one
# input row is ever predicted. For a single input the output is unchanged.
for indices, preds, row_probabilities in zip(top_three_indicies, top_three_categories, probabilities):
    print(f"Predictions for '{input_text}':")
    for index, pred in zip(indices, preds):
        probability = row_probabilities[index]
        print(f"  {pred}: {probability * 100:.2f}%")



Predictions for 'hospital':
  health: 89.11%
  festivals: 1.79%
  food: 1.61%
