In [1]:
# Import libraries
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate
from tensorflow.keras.layers import BatchNormalization

In [2]:
# Read the labelled training transactions from CSV
file_path = "4000_train.csv"
df_raw = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check of the parse
df_raw.head(5)

Unnamed: 0.1,Unnamed: 0,Transaction Number,Transaction Date,Transaction Type,Transaction Description,Balance,Category,Location City,Location Country,Amount
0,2499,2500,19/06/2020,FPO,ALICJA ALEXANDER,1220.15,Services/Home Improvement,Nottingham,UK,-25.0
1,5252,5253,22/02/2017,BP,SAVE THE CHANGE,16851.42,Savings,Swansea,UK,-0.38
2,4462,4463,01/03/2018,SO,ESAVINGS ACCOUNT,21443.89,Savings,Swansea,UK,-300.0
3,2789,2790,03/02/2020,SO,ESAVINGS ACCOUNT,11951.41,Savings,Nottingham,UK,-300.0
4,1918,1919,22/01/2021,BP,SAVE THE CHANGE,629.61,Savings,Nottingham,UK,-1.18


In [3]:
# Drop unnecessary columns and clean the data
# The cleaning is done on a new frame instead of mutating df_raw in place, so
# df_raw stays exactly as loaded and this cell can be re-run safely (an in-place
# drop raises KeyError the second time because the columns are already gone).
df_clean = (
    df_raw
    .drop(columns=["Unnamed: 0", "Transaction Number"])
    .dropna(subset=["Category"])  # rows without a label cannot be used for training
    .copy()  # independent frame, so the column assignments below do not warn
)

# Parse the day/month/year date strings and derive the weekday name from them
df_clean["Transaction Date"] = pd.to_datetime(df_clean["Transaction Date"], format="%d/%m/%Y")
df_clean["weekday"] = df_clean["Transaction Date"].dt.day_name()

# Normalise descriptions: missing values become "" (the TF-IDF vectorizer rejects
# NaN documents), then lowercase and keep only a-z, 0-9 and whitespace.
df_clean["Transaction Description"] = (
    df_clean["Transaction Description"]
    .fillna("")
    .str.lower()
    .str.replace(r"[^a-z0-9\s]", "", regex=True)
)

# Create a new DataFrame holding only the model inputs and the target column
df = df_clean[["Transaction Description", "weekday", "Amount", "Category"]].copy()
df.head(10)

Unnamed: 0,Transaction Description,weekday,Amount,Category
0,alicja alexander,Friday,-25.0,Services/Home Improvement
1,save the change,Wednesday,-0.38,Savings
2,esavings account,Thursday,-300.0,Savings
3,esavings account,Monday,-300.0,Savings
4,save the change,Friday,-1.18,Savings
5,amazon uk marketpl,Thursday,-12.99,Amazon
6,save the change,Monday,-2.99,Savings
7,amznmktplace,Thursday,-13.99,Amazon
8,amazoncoukvm9gg,Monday,-14.4,Amazon
9,trading212uk,Thursday,-200.0,Investment


In [4]:
# Data preprocessing
# The TF-IDF vocabulary, one-hot categories and scaler statistics are learned from
# the training rows only and then applied to every row, so no information about the
# held-out rows leaks into the features.
# NOTE: without stratification, train_test_split chooses rows from the sample count
# and random_state alone, so this index split selects the same rows as the
# train/test split cell as long as both use test_size=0.2 and random_state=42.
# Keep the two calls in sync.
fit_rows, _ = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)
df_fit = df.iloc[fit_rows]

## Text data (TF-IDF)
# Unigrams and bigrams, capped at the 5000 most frequent terms
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
vectorizer.fit(df_fit["Transaction Description"])
X_text = vectorizer.transform(df["Transaction Description"]).toarray()

## Categorical data (One-hot encoding)
# handle_unknown="ignore": a weekday that never occurs in the fitting rows is
# encoded as all zeros instead of raising at transform time
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
ohe.fit(df_fit[["weekday"]])
X_weekday = ohe.transform(df[["weekday"]])

## Num data (Scaler)
scaler = StandardScaler()
scaler.fit(df_fit[["Amount"]])
X_balance = scaler.transform(df[["Amount"]])  # name kept for compatibility; holds the scaled Amount

# Concatenate all features (column order: text, weekday, amount)
X = np.hstack((X_text, X_weekday, X_balance))

# Label encoding
# Fitted on every row so the label space covers all categories present in the data.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df["Category"].str.split(','))

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Build the model
# Seed the Python, NumPy and backend random generators so weight initialisation,
# dropout and batch shuffling are repeatable on a Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Two hidden blocks (Dense -> BatchNormalization -> Dropout) followed by one
# sigmoid unit per label column of y.
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # one input per feature column
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(y.shape[1], activation='sigmoid')  # Multi-label → sigmoid
])

In [13]:
# Configure the optimiser, the loss and the metric reported during training
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture
model.summary()

# Train for 10 epochs in batches of 32; the held-out split is scored after every epoch
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
)

Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.4039 - loss: 0.7632 - val_accuracy: 0.5872 - val_loss: 0.3798
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.7920 - loss: 0.2815 - val_accuracy: 0.6524 - val_loss: 0.1162
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8874 - loss: 0.0723 - val_accuracy: 0.7315 - val_loss: 0.0900
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.9387 - loss: 0.0361 - val_accuracy: 0.8595 - val_loss: 0.0677
Epoch 5/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.9723 - loss: 0.0218 - val_accuracy: 0.9097 - val_loss: 0.0431
Epoch 6/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9840 - loss: 0.0146 - val_accuracy: 0.9134 - val_loss: 0.0265
Epoch 7/10
[1m100/100

<keras.src.callbacks.history.History at 0x179a90820>

In [15]:
# Score the trained model on the held-out split and report accuracy as a percentage
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9351 - loss: 0.0173 
Test Accuracy: 92.22%


In [17]:
# Load final test dataset
new_file_path = "Rest_Test.csv"
df_new = pd.read_csv(new_file_path)

# Apply the same text cleaning as for the training data: lowercase and keep only
# a-z, 0-9 and whitespace. Missing descriptions become "" first, because the
# vectorizer rejects NaN documents.
df_new["Transaction Description"] = (
    df_new["Transaction Description"]
    .fillna("")
    .str.lower()
    .str.replace(r"[^a-z0-9\s]", "", regex=True)
)
# Weekday name derived from the day/month/year transaction date
df_new["weekday"] = pd.to_datetime(df_new["Transaction Date"], format="%d/%m/%Y").dt.day_name()

# Transform new data with the already-fitted encoders (transform only, no refitting)
X_new_text = vectorizer.transform(df_new["Transaction Description"]).toarray()
X_new_weekday = ohe.transform(df_new[["weekday"]])
X_new_balance = scaler.transform(df_new[["Amount"]])  # scaled Amount, despite the name

# Combine new data in the same column order as the training matrix
X_new = np.hstack((X_new_text, X_new_weekday, X_new_balance))

In [19]:
# Final predictions and probabilities
# One row of per-class sigmoid scores for every transaction in the new data
predictions_new = model.predict(X_new)

# Highest score per row expressed as a percentage, and the column index it came from
probabilities_new = predictions_new.max(axis=1) * 100
y_pred_labels_new = predictions_new.argmax(axis=1)

[1m81/81[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step   


In [21]:
# Final dataframe with predictions and probabilities
# Map each predicted column index back to its category name and attach both
# result columns to the new-data frame.
prediction_columns = {
    "Predicted Category": mlb.classes_[y_pred_labels_new],
    "Prediction Probability (%)": probabilities_new,
}
for column_name, column_values in prediction_columns.items():
    df_new[column_name] = column_values

df_new.head(20)

Unnamed: 0.1,Unnamed: 0,Transaction Number,Transaction Date,Transaction Type,Transaction Description,Balance,Category,Location City,Location Country,Amount,weekday,Predicted Category,Prediction Probability (%)
0,5831,5832,11/05/2016,BP,save the change,12346.9,Savings,Swansea,UK,-1.42,Wednesday,Savings,99.927055
1,867,868,10/12/2021,DEB,antigoni abrahms,1205.42,Entertainment,London,UK,-6.0,Friday,Entertainment,99.476891
2,2974,2975,04/11/2019,DEB,iz ezy vegan ltd,4941.3,Dine Out,Nottingham,UK,-9.5,Monday,Dine Out,73.26693
3,1140,1141,24/09/2021,DEB,audible uk,1086.19,Entertainment,Nottingham,UK,-7.99,Friday,Entertainment,99.535454
4,997,998,01/11/2021,DD,severn trent water,2579.88,Bills,Nottingham,UK,-35.32,Monday,Bills,99.945541
5,1337,1338,19/07/2021,DEB,sainsburys smkts,975.85,Groceries,,,-21.55,Monday,Groceries,99.76387
6,541,542,28/02/2022,FPO,miss d wu,3053.04,Services,Nottingham,UK,-11.0,Monday,Services,99.888451
7,1099,1100,04/10/2021,DEB,amznmktplace,1666.98,Amazon,Nottingham,UK,-22.84,Monday,Amazon,99.365898
8,1132,1133,27/09/2021,DEB,oriental mart hydr,961.58,Groceries,Nottingham,UK,-13.04,Monday,Groceries,99.941216
9,6359,6360,15/10/2015,DEB,amazon uk marketpl,8717.21,Amazon,Swansea,UK,-24.95,Thursday,Amazon,99.865707


In [25]:
# Persist the scored transactions; index=False leaves the row index out of the file
df_new.to_csv(path_or_buf='predictions.csv', index=False)