In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
# Import data, pick relevant columns and concatenate them
# Replace all non-alphanumerical chars (except German mutation and slashes for booking texts) with a space
data = pd.read_csv("transaction_data.csv", sep = ";", index_col= 0)
X = data[["Buchungstext", "Verwendungszweck", "Beguenstigter_Zahlungspflichtiger", "Betrag"]].agg(" ".join, axis=1)
y = data.Label

In [3]:
# Split data into training & test set (X: independent variable(s), e.g. "Verwendungszweck" / y: dependent variable (Label))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [4]:
# Vectorize data set into vocab of single words ("tokens") via TF-IDF (assessing term relevance in document collection)
# Remove German stopwords (e.g. "alle", "bis") in the process
tokens = TfidfVectorizer(stop_words = get_stop_words("german"), ngram_range = (1,3))
tokens.fit(X_train)

X_train_count = tokens.transform(X_train).toarray()
X_test_count = tokens.transform(X_test).toarray()

In [5]:
# Define model and train it
model = MultinomialNB()
model.fit(X_train_count, y_train)

MultinomialNB()

In [6]:
# Test predictive quality of multinomial Naive Bayes classifier on test data and display quality indicators
y_predict = model.predict(X_test_count)
print(accuracy_score(y_test, y_predict))

# Perform cross-validation for more generalisable results
cross_val = cross_val_score(model, X_test_count, y_test, cv = 5)
print(cross_val)

0.9206349206349206
[0.69230769 0.76923077 0.76923077 0.66666667 0.58333333]
