In [None]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from stop_words import get_stop_words

In [None]:
# Import data, pick the relevant columns and concatenate them into a single string per row
TEXT_COLUMNS = ["Buchungstext", "Verwendungszweck", "Beguenstigter_Zahlungspflichtiger", "Betrag"]


def join_text_fields(row):
    """Join the non-missing fields of one row into a single space-separated string.

    Every value is passed through ``str`` so that a non-string cell (e.g. if
    "Betrag" is parsed as a number) does not make ``str.join`` raise a
    ``TypeError``. Missing cells (NaN/None) are skipped.
    """
    return " ".join(str(value) for value in row if pd.notna(value))


data = pd.read_csv("transaction_data.csv", sep = ";", index_col = 0)
X = data[TEXT_COLUMNS].apply(join_text_fields, axis=1)
# Select the label as a 1-D Series rather than a one-column DataFrame:
# scikit-learn estimators expect 1-D targets and warn about column vectors.
y = data["Label"]

In [36]:
# Split data into training & test set (X: features, e.g. "Verwendungszweck" / y: label).
# A fixed random_state makes the shuffle, and therefore every score computed from
# this split, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 42)

In [35]:
# Text-classification pipeline: word-token counts -> TF-IDF weighting -> multinomial Naive Bayes.
# TODO: complete the pipeline, fit it directly on the training data and then predict with it
# (see the Medium article).
german_stop_words = get_stop_words("german")
count_step = CountVectorizer(stop_words = german_stop_words, lowercase = True)
tfidf_step = TfidfTransformer(use_idf = True, smooth_idf = True)
classifier_step = MultinomialNB()

pipe = Pipeline(steps = [
    ("vectorizer", count_step),
    ("tfidf", tfidf_step),
    ("model", classifier_step),
])

In [None]:
# Cross-validate the whole pipeline with 5 folds; cross_val_score fits a fresh copy
# of the pipeline (vectorizer, IDF weights and classifier) on each training fold.
# np.ravel flattens the labels to 1-D, so a one-column DataFrame does not trigger
# scikit-learn's column-vector warning on every fold.
cross_val = cross_val_score(pipe, X, np.ravel(y), cv = 5)
print(cross_val)
# Summarise the per-fold scores so the result can be read at a glance.
print(f"mean score: {cross_val.mean():.3f} (+/- {cross_val.std():.3f})")

In [None]:
# Vectorize the texts into TF-IDF features over word n-grams of length 1 to 3
# (TF-IDF weights a term by its relevance within the document collection).
# German stop words (e.g. "alle", "bis") are removed in the process.
tokens = TfidfVectorizer(stop_words = get_stop_words("german"), ngram_range = (1,3))

# Learn vocabulary and IDF weights from the training texts only, then apply the
# fitted vectorizer to the test texts.
# The matrices are kept sparse: MultinomialNB accepts sparse input, and a dense
# copy of a 1-3-gram vocabulary can exhaust memory.
X_train_count = tokens.fit_transform(X_train)
X_test_count = tokens.transform(X_test)

In [None]:
# Implement the multinomial Naive Bayes model and train it on the training features.
# np.ravel flattens the labels to 1-D, so a one-column DataFrame target does not
# trigger scikit-learn's column-vector warning.
model = MultinomialNB()
model.fit(X_train_count, np.ravel(y_train))

In [None]:
# Test predictive quality of the multinomial Naive Bayes classifier on the held-out
# test data and display the accuracy.
y_predict = model.predict(X_test_count)
print(accuracy_score(y_test, y_predict))

# Perform cross-validation for more generalisable results.
# The vectorizer and classifier are wrapped in one pipeline and cross-validated on
# the full data set, so vocabulary and IDF weights are re-learned inside every
# training fold. Cross-validating pre-computed test features would use only the
# 30 % test split and a vectorizer fitted outside the folds.
cv_pipeline = Pipeline(steps = [
    # Unfitted copies configured exactly like `tokens` and `model`.
    ("tfidf", TfidfVectorizer(**tokens.get_params())),
    ("model", MultinomialNB(**model.get_params())),
])
cross_val = cross_val_score(cv_pipeline, X, np.ravel(y), cv = 5)
print(cross_val)