In [116]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### GridSearchCV for parameter tuning

In [117]:
# Import data, pick relevant columns and concatenate them to a single string
data = pd.read_csv("transaction_data.csv", sep = ";", index_col = 0)
X = data[["Buchungstext", "Verwendungszweck", "Beguenstigter_Zahlungspflichtiger", "Betrag"]].agg(" ".join, axis=1)
y = data[["Label"]]

In [118]:
# Split data into training & test set (X: features, e.g. "Verwendungszweck" / y: label to be predicted)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 2021)

In [119]:
# Apply vectorization and TF IDF transformation
vect = TfidfVectorizer(stop_words = get_stop_words("german"), ngram_range = (1,3))
vect.fit(X_train)
X_train_count = vect.transform(X_train)
X_test_count = vect.transform(X_test)

In [120]:
# Implement model and train it
model = MultinomialNB()
model.fit(X_train_count, y_train.values.ravel())

MultinomialNB()

In [121]:
# Test predictive quality of multinomial Naive Bayes classifier on test data and display quality indicator accuracy
#y_predict = model.predict(X_test_count)
#accuracy_rating = accuracy_score(y_test, y_predict)
#print("Accuracy score: " + str(accuracy_rating))

In [122]:
# Implement pipeline to prevent data leakage and cross-validate parameters
pipe = Pipeline([
    ("vect", TfidfVectorizer(stop_words = get_stop_words("german"), ngram_range = (1,3))),
    ("model", MultinomialNB())
])

In [123]:
# Test predictive quality of multinomial Naive Bayes classifier on test data and display quality indicators
print(classification_report(y_test, y_predict))

                  precision    recall  f1-score   support

         finance       1.00      1.00      1.00         8
          income       1.00      1.00      1.00         5
         leisure       0.90      1.00      0.95        26
          living       1.00      0.40      0.57         5
         private       1.00      0.80      0.89         5
standardOfLiving       0.93      1.00      0.97        14

        accuracy                           0.94        63
       macro avg       0.97      0.87      0.90        63
    weighted avg       0.94      0.94      0.93        63



In [124]:
# Perform cross-validation for more generalisable results
cross_val = cross_val_score(pipe, X, y.values.ravel(), cv = 10).mean()
print("Cross-validated accuracy score: " + str(cross_val))

Cross-validated accuracy score: 0.9


In [163]:
# Print confusion matrix
cf_matrix = confusion_matrix(y_test, y_predict)
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot = True, fmt = ".2%", cmap = sns.light_palette("seagreen", as_cmap = True))

AttributeError: 'numpy.ndarray' object has no attribute 'pivot'