- For dealing with text related tasks, we will be using nltk. 
- For machine learning related tasks, we will be using scikit-learn library.

![title](misc/workflow.png)


In [2]:
from os import listdir
from os.path import isfile, join

import sys
import ast
import json
import pandas as pd
import numpy as np
from loguru import logger
from shutil import copy

import random
import warnings
from joblib import dump, load
from operator import itemgetter

import re
import yaml
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from src.helpers import preprocess_single_text, load_mapping

# from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, precision_recall_fscore_support 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

import matplotlib.pyplot as plt
%matplotlib inline

nltk.download('stopwords')
nltk.download('punkt')

pd.options.display.max_columns = 100
pd.options.display.max_rows = 300
pd.options.display.max_colwidth = 100
np.set_printoptions(threshold=2000)
warnings.filterwarnings('ignore')

logger.add("logs/{time}.log")
logger.add(sys.stdout, colorize=True, format="<green>{time}</green> <level>{message}</level>")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikhilkumarjha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nikhilkumarjha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikhilkumarjha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nikhilkumarjha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2

### Load data

In [3]:
def load_data(data_folder="dataset/"):
    filepaths = [join(data_folder, f) for f in listdir(data_folder) if (isfile(join(data_folder, f))) and ("json" in f)]

    data = []
    target = []

    for ind, f in enumerate(filepaths):
        class_name = f.split(".")[0].split("_")[-1]
        json_data = pd.read_json(f).values.tolist()
        logger.info(f'File: {f} - {len(json_data)} rows')
        data.extend([item for sublist in json_data for item in sublist])
        target.extend([class_name] * len(json_data))
        
    return data, target

data, target = load_data(data_folder="dataset/")

2021-12-23 17:11:26.026 | INFO     | __main__:load_data:10 - File: dataset/dataset_1.json - 303 rows


[32m2021-12-23T17:11:26.026581+0100[0m [1mFile: dataset/dataset_1.json - 303 rows[0m


2021-12-23 17:11:26.039 | INFO     | __main__:load_data:10 - File: dataset/dataset_5.json - 237 rows


[32m2021-12-23T17:11:26.039232+0100[0m [1mFile: dataset/dataset_5.json - 237 rows[0m


2021-12-23 17:11:26.060 | INFO     | __main__:load_data:10 - File: dataset/dataset_4.json - 333 rows


[32m2021-12-23T17:11:26.060940+0100[0m [1mFile: dataset/dataset_4.json - 333 rows[0m


2021-12-23 17:11:26.072 | INFO     | __main__:load_data:10 - File: dataset/dataset_3.json - 37 rows


[32m2021-12-23T17:11:26.072553+0100[0m [1mFile: dataset/dataset_3.json - 37 rows[0m


2021-12-23 17:11:26.082 | INFO     | __main__:load_data:10 - File: dataset/dataset_2.json - 985 rows


[32m2021-12-23T17:11:26.082450+0100[0m [1mFile: dataset/dataset_2.json - 985 rows[0m


In [4]:
old_data, old_target = load_data(data_folder="dataset/old")

2021-12-23 17:11:26.129 | INFO     | __main__:load_data:10 - File: dataset/old/dataset_1.json - 353 rows


[32m2021-12-23T17:11:26.129968+0100[0m [1mFile: dataset/old/dataset_1.json - 353 rows[0m


2021-12-23 17:11:26.153 | INFO     | __main__:load_data:10 - File: dataset/old/dataset_5.json - 139 rows


[32m2021-12-23T17:11:26.153557+0100[0m [1mFile: dataset/old/dataset_5.json - 139 rows[0m


2021-12-23 17:11:26.163 | INFO     | __main__:load_data:10 - File: dataset/old/dataset_4.json - 392 rows


[32m2021-12-23T17:11:26.163713+0100[0m [1mFile: dataset/old/dataset_4.json - 392 rows[0m


2021-12-23 17:11:26.181 | INFO     | __main__:load_data:10 - File: dataset/old/dataset_3.json - 28 rows


[32m2021-12-23T17:11:26.181741+0100[0m [1mFile: dataset/old/dataset_3.json - 28 rows[0m


2021-12-23 17:11:26.190 | INFO     | __main__:load_data:10 - File: dataset/old/dataset_2.json - 210 rows


[32m2021-12-23T17:11:26.190097+0100[0m [1mFile: dataset/old/dataset_2.json - 210 rows[0m


In [5]:
@logger.catch
def load_mapping(mapping_file='dataset/mapping.yaml'):
    with open(mapping_file, 'r') as f:
        mapping = yaml.safe_load(f)

    mapping_dict = {}
    _ = [mapping_dict.update({f"{el[0]}":f"{el[1]}"}) for el in 
         [(el.split('=')[0].split('.')[0].split('_')[1], el.split('=')[1].strip()) for el in mapping]]
    
    return mapping_dict

mapping_dict = load_mapping(mapping_file='dataset/mapping.yaml')

### Preprocess data
- Remove unwanted chars and symbols
- Tokenize
- Stemming / Lemmatize
- Remove stop words

In [6]:
@logger.catch
def stopword_removal(text, stop_words, curated_stop_words=None):
    if curated_stop_words is not None and isinstance(curated_stop_words, list):
        stop_words.update(curated_stop_words)

    text = text.lower()
    token = word_tokenize(text)
    
    return ' '.join([w for w in token if not w in stop_words])

In [23]:
@logger.catch
def text_cleaner(text):
    rules = [
        {r'\n': u' '}, # remove new line character
        {r'\t': u' '}, # remove tab character
        {r'ß': u'ss'}, # replace ä with ae
        {r'ä': u'ae'}, # replace ä with ae
        {r'ü': u'ue'}, # replace ü with ue
        {r'ö': u'oe'}, # replace ö with oe
        {r'oe': u'o'}, # replace oe with o
        {r'ue': u'u'}, # replace ue with u
        {r'ae': u'a'}, # replace ae with a
        {r'xxx': u' '}, # remove xxx word
        {r'[^A-Za-zÀ-ž ]': u' '},  # keep only ASCII + European Chars and whitespace, no digits
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''},  # remove spaces at the beginning
        {r'\b[A-Za-zÀ-ž]\b': u''} # remove single character words
    ]
    
    for rule in rules:
        for (k, v) in rule.items():
            regex = re.compile(k)
            text = regex.sub(v, text)
        text = text.rstrip()
        
    return text.lower()

In [24]:
@logger.catch
def do_stemming(text, stemmer):
    stemmed_words = [stemmer.stem(word) for word in text]
    
    return ''.join(stemmed_words)

In [25]:
@logger.catch
def preprocess_text(text, stop_words=None, curated_stop_words=None, stemming=False, stemmer=None):
    text = text_cleaner(text)
    
    if stop_words is not None:
        text = stopword_removal(text, stop_words, curated_stop_words)
    
    if stemming:
        text = do_stemming(text, stemmer)
        
    return text

In [26]:
stemmer = SnowballStemmer('german')

stop_words = set(stopwords.words('german'))
with open('dataset/stopwords.yaml', 'r') as f:
    curated_stop_words = yaml.safe_load(f)
    
processed_data = [preprocess_text(text, 
                                  stop_words=stop_words, 
                                  curated_stop_words=curated_stop_words, 
                                  stemming=True, 
                                  stemmer=stemmer) for text in data]

text = "Hiermit möchte ich meine Abtrittserklärung für meine Versicherung Nr.178718252 auf dauer mitteilen.Rechnungen die von meinem Tierarzt Dr.H.D.Bertelsmann , kleintierpraxis Möhnestr.106, 59755Arnsberg kommen sollen alle in Zukunft mit dem Arzt abgerechnet werden.Hier die  Kontodaten vom Tierarzt."
print(preprocess_text(text, stop_words=stop_words, curated_stop_words=curated_stop_words, stemming=True, stemmer=stemmer))

hiermit mochte abtrittserklarung versicherung nr daur mitteilen rechnungen tierarzt dr bertelsmann kleintierpraxis mohnestr arnsberg kommen sollen zukunft arzt abgerechnet kontodaten tierarzt


### EDA

In [27]:
before_after_preprocessing = pd.DataFrame([data, processed_data]).T
before_after_preprocessing.columns = ['before', 'after']
before_after_preprocessing.sample(5)

Unnamed: 0,before,after
1176,bezugnehmend auf ihre eMail vom xxx d.M. übersende ich Ihnen\ndie Tierarztrechnung(siehe Anhang).,bezugnehmend email ubersende tierarztrechnung siehe anhang
1615,unser gemeinsamer Kunde Frau xxx hat eine Rechnung zur Kostenerstattung für seine Hundekrankenve...,gemeinsamer kunde rechnung kostenerstattung hundekrankenversicherung eingereicht bearbeiten anfr...
292,eine weitere Rechnung zu o.g. Versicherung mit der Bitte um Bearbeitung.,weitere rechnung versicherung bearbeitung
1486,Bitte um Rückerstattung folgender Rechnung für Hund xxx Versicherung xxx,ruckerstattung folgender rechnung hund versicherung
1328,anbei reiche ich Ihnen die Rechnung der Kundin mit der Ritte um Erstattung ein.\nDas Fäden ziehe...,anbei reiche rechnung kundin ritte erstattung faden ziehen rechnung durchgefuhrten sterilisation...


In [28]:
word_count = pd.Series(" ".join(before_after_preprocessing["after"]).split()).value_counts()
word_count[:10]

rechnung            1116
anbei                736
kostenerstattung     705
erstattung           424
kunden               392
konto                359
kontaktieren         338
bearbeiten           338
eingereicht          337
kunde                332
dtype: int64

### Train-test split

In [29]:
@logger.catch
def random_train_test_split(processed_data, target):
    test_inds = random.sample(range(len(data)), len(data)//5)
    train_inds = list(set(range(len(data))) - set(test_inds))

    train_x = list(itemgetter(*train_inds)(processed_data))
    test_x = list(itemgetter(*test_inds)(processed_data))
    train_y = list(itemgetter(*train_inds)(target))
    test_y = list(itemgetter(*test_inds)(target))
    
    return train_x, test_x, train_y, test_y

train_x, test_x, train_y, test_y = random_train_test_split(processed_data, target)
metrics_list = []

### Modeling

In [30]:
vectorizer = TfidfVectorizer(lowercase=True, max_features=2000)
vectorizer.fit(processed_data)

train_x_vec = vectorizer.transform(train_x)
test_x_vec = vectorizer.transform(test_x)
logger.info(f'Each word is represented by a vector of length {train_x_vec.shape[1]}')

# Compare original text with its numeric vector representation
logger.info(f"Original sentence:\n{train_x[0]}\n")

# Feature Matrix
features = pd.DataFrame(train_x_vec[0].toarray(), columns=vectorizer.get_feature_names())

nonempty_features = features.loc[:, (features != 0).any(axis=0)]
logger.info(f"Vector representation of sentence:\n {nonempty_features}")

2021-12-23 17:26:17.210 | INFO     | __main__:<module>:6 - Each word is represented by a vector of length 2000


[32m2021-12-23T17:26:17.210273+0100[0m [1mEach word is represented by a vector of length 2000[0m


2021-12-23 17:26:17.213 | INFO     | __main__:<module>:9 - Original sentence:
tierverischerung vertragsnummer



[32m2021-12-23T17:26:17.213575+0100[0m [1mOriginal sentence:
tierverischerung vertragsnummer
[0m


2021-12-23 17:26:17.229 | INFO     | __main__:<module>:15 - Vector representation of sentence:
    tierverischerung  vertragsnummer
0          0.892814        0.450426


[32m2021-12-23T17:26:17.229374+0100[0m [1mVector representation of sentence:
    tierverischerung  vertragsnummer
0          0.892814        0.450426[0m


In [31]:
# models to test
classifiers = [
    LogisticRegression(solver="saga", random_state=42, n_jobs=-1, penalty='l1'),
    RandomForestClassifier(random_state=42, n_estimators=500, n_jobs=-1, criterion='gini', max_features='sqrt'),
    GradientBoostingClassifier(n_estimators=500, random_state=42),
    DecisionTreeClassifier(random_state=42, criterion='entropy', max_features='sqrt'), 
    KNeighborsClassifier(n_jobs=-1, n_neighbors=7),
    SGDClassifier(n_jobs=-1)
]

# get names of the objects in list 
names = [re.match(r"[^\(]+", name.__str__())[0] for name in classifiers]
logger.info(f"Classifiers to test: {names}")

# test all classifiers and save pred. results on test data
results = {}
for name, clf in zip(names, classifiers):
    logger.info(f"Training classifier: {name}")
    clf.fit(train_x_vec, train_y)
    prediction = clf.predict(test_x_vec)
    report = classification_report(test_y, prediction)
    results[name] = report

    logger.info(f"Results for {name}:")
    logger.info(f"{report}\n")

2021-12-23 17:26:17.765 | INFO     | __main__:<module>:13 - Classifiers to test: ['LogisticRegression', 'RandomForestClassifier', 'GradientBoostingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'SGDClassifier']


[32m2021-12-23T17:26:17.765994+0100[0m [1mClassifiers to test: ['LogisticRegression', 'RandomForestClassifier', 'GradientBoostingClassifier', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'SGDClassifier'][0m


2021-12-23 17:26:17.770 | INFO     | __main__:<module>:18 - Training classifier: LogisticRegression


[32m2021-12-23T17:26:17.770416+0100[0m [1mTraining classifier: LogisticRegression[0m


2021-12-23 17:26:18.120 | INFO     | __main__:<module>:24 - Results for LogisticRegression:


[32m2021-12-23T17:26:18.120694+0100[0m [1mResults for LogisticRegression:[0m


2021-12-23 17:26:18.122 | INFO     | __main__:<module>:25 -               precision    recall  f1-score   support

           1       0.32      0.30      0.31        40
           2       0.79      0.89      0.84       211
           3       0.00      0.00      0.00         5
           4       0.85      0.72      0.78        71
           5       1.00      0.87      0.93        52

    accuracy                           0.78       379
   macro avg       0.59      0.55      0.57       379
weighted avg       0.77      0.78      0.77       379




[32m2021-12-23T17:26:18.122584+0100[0m [1m              precision    recall  f1-score   support

           1       0.32      0.30      0.31        40
           2       0.79      0.89      0.84       211
           3       0.00      0.00      0.00         5
           4       0.85      0.72      0.78        71
           5       1.00      0.87      0.93        52

    accuracy                           0.78       379
   macro avg       0.59      0.55      0.57       379
weighted avg       0.77      0.78      0.77       379

[0m


2021-12-23 17:26:18.124 | INFO     | __main__:<module>:18 - Training classifier: RandomForestClassifier


[32m2021-12-23T17:26:18.124649+0100[0m [1mTraining classifier: RandomForestClassifier[0m


2021-12-23 17:26:21.412 | INFO     | __main__:<module>:24 - Results for RandomForestClassifier:


[32m2021-12-23T17:26:21.412204+0100[0m [1mResults for RandomForestClassifier:[0m


2021-12-23 17:26:21.414 | INFO     | __main__:<module>:25 -               precision    recall  f1-score   support

           1       0.36      0.38      0.37        40
           2       0.80      0.87      0.83       211
           3       0.00      0.00      0.00         5
           4       0.86      0.72      0.78        71
           5       0.98      0.88      0.93        52

    accuracy                           0.78       379
   macro avg       0.60      0.57      0.58       379
weighted avg       0.78      0.78      0.78       379




[32m2021-12-23T17:26:21.414136+0100[0m [1m              precision    recall  f1-score   support

           1       0.36      0.38      0.37        40
           2       0.80      0.87      0.83       211
           3       0.00      0.00      0.00         5
           4       0.86      0.72      0.78        71
           5       0.98      0.88      0.93        52

    accuracy                           0.78       379
   macro avg       0.60      0.57      0.58       379
weighted avg       0.78      0.78      0.78       379

[0m


2021-12-23 17:26:21.416 | INFO     | __main__:<module>:18 - Training classifier: GradientBoostingClassifier


[32m2021-12-23T17:26:21.416366+0100[0m [1mTraining classifier: GradientBoostingClassifier[0m


2021-12-23 17:26:41.349 | INFO     | __main__:<module>:24 - Results for GradientBoostingClassifier:


[32m2021-12-23T17:26:41.349725+0100[0m [1mResults for GradientBoostingClassifier:[0m


2021-12-23 17:26:41.354 | INFO     | __main__:<module>:25 -               precision    recall  f1-score   support

           1       0.33      0.38      0.35        40
           2       0.80      0.84      0.82       211
           3       0.25      0.20      0.22         5
           4       0.80      0.72      0.76        71
           5       1.00      0.85      0.92        52

    accuracy                           0.76       379
   macro avg       0.64      0.60      0.61       379
weighted avg       0.77      0.76      0.77       379




[32m2021-12-23T17:26:41.354307+0100[0m [1m              precision    recall  f1-score   support

           1       0.33      0.38      0.35        40
           2       0.80      0.84      0.82       211
           3       0.25      0.20      0.22         5
           4       0.80      0.72      0.76        71
           5       1.00      0.85      0.92        52

    accuracy                           0.76       379
   macro avg       0.64      0.60      0.61       379
weighted avg       0.77      0.76      0.77       379

[0m


2021-12-23 17:26:41.358 | INFO     | __main__:<module>:18 - Training classifier: DecisionTreeClassifier


[32m2021-12-23T17:26:41.358296+0100[0m [1mTraining classifier: DecisionTreeClassifier[0m


2021-12-23 17:26:41.422 | INFO     | __main__:<module>:24 - Results for DecisionTreeClassifier:


[32m2021-12-23T17:26:41.422718+0100[0m [1mResults for DecisionTreeClassifier:[0m


2021-12-23 17:26:41.426 | INFO     | __main__:<module>:25 -               precision    recall  f1-score   support

           1       0.23      0.38      0.29        40
           2       0.78      0.69      0.74       211
           3       0.12      0.20      0.15         5
           4       0.54      0.55      0.55        71
           5       0.82      0.77      0.79        52

    accuracy                           0.64       379
   macro avg       0.50      0.52      0.50       379
weighted avg       0.68      0.64      0.65       379




[32m2021-12-23T17:26:41.426490+0100[0m [1m              precision    recall  f1-score   support

           1       0.23      0.38      0.29        40
           2       0.78      0.69      0.74       211
           3       0.12      0.20      0.15         5
           4       0.54      0.55      0.55        71
           5       0.82      0.77      0.79        52

    accuracy                           0.64       379
   macro avg       0.50      0.52      0.50       379
weighted avg       0.68      0.64      0.65       379

[0m


2021-12-23 17:26:41.428 | INFO     | __main__:<module>:18 - Training classifier: KNeighborsClassifier


[32m2021-12-23T17:26:41.428613+0100[0m [1mTraining classifier: KNeighborsClassifier[0m


2021-12-23 17:26:41.595 | INFO     | __main__:<module>:24 - Results for KNeighborsClassifier:


[32m2021-12-23T17:26:41.595849+0100[0m [1mResults for KNeighborsClassifier:[0m


2021-12-23 17:26:41.598 | INFO     | __main__:<module>:25 -               precision    recall  f1-score   support

           1       0.28      0.57      0.38        40
           2       0.71      0.81      0.76       211
           3       0.00      0.00      0.00         5
           4       0.90      0.39      0.55        71
           5       0.97      0.54      0.69        52

    accuracy                           0.66       379
   macro avg       0.57      0.46      0.48       379
weighted avg       0.73      0.66      0.66       379




[32m2021-12-23T17:26:41.598319+0100[0m [1m              precision    recall  f1-score   support

           1       0.28      0.57      0.38        40
           2       0.71      0.81      0.76       211
           3       0.00      0.00      0.00         5
           4       0.90      0.39      0.55        71
           5       0.97      0.54      0.69        52

    accuracy                           0.66       379
   macro avg       0.57      0.46      0.48       379
weighted avg       0.73      0.66      0.66       379

[0m


2021-12-23 17:26:41.601 | INFO     | __main__:<module>:18 - Training classifier: SGDClassifier


[32m2021-12-23T17:26:41.601329+0100[0m [1mTraining classifier: SGDClassifier[0m


2021-12-23 17:26:41.728 | INFO     | __main__:<module>:24 - Results for SGDClassifier:


[32m2021-12-23T17:26:41.728992+0100[0m [1mResults for SGDClassifier:[0m


2021-12-23 17:26:41.730 | INFO     | __main__:<module>:25 -               precision    recall  f1-score   support

           1       0.37      0.50      0.43        40
           2       0.83      0.82      0.82       211
           3       0.50      0.40      0.44         5
           4       0.78      0.73      0.75        71
           5       0.98      0.90      0.94        52

    accuracy                           0.77       379
   macro avg       0.69      0.67      0.68       379
weighted avg       0.79      0.77      0.78       379




[32m2021-12-23T17:26:41.730921+0100[0m [1m              precision    recall  f1-score   support

           1       0.37      0.50      0.43        40
           2       0.83      0.82      0.82       211
           3       0.50      0.40      0.44         5
           4       0.78      0.73      0.75        71
           5       0.98      0.90      0.94        52

    accuracy                           0.77       379
   macro avg       0.69      0.67      0.68       379
weighted avg       0.79      0.77      0.78       379

[0m


### Gridsearch

In [32]:
# pipe = Pipeline([("tfidf", TfidfVectorizer()), ("svc", LinearSVC())])

# params = {
#     "tfidf__ngram_range": [(1, 2)],
#     "tfidf__max_df": [0.1, 0.3, 0.5, 0.7],
#     "tfidf__min_df": [10, 30, 50, 70],
#     "svc__C": np.arange(0.2, 1, 0.2),
#     "svc__penalty": ['l2'],
#     "svc__max_iter": [5000],
# }

# pipe_clf = GridSearchCV(pipe, params, n_jobs=-1, scoring="f1_macro", verbose=2)
# pipe_clf.fit(processed_data, target)
# best_params = pipe_clf.best_params_
# print(best_params)

# # run pipe with optimized parameters
# pipe.set_params(**best_params).fit(train_x, train_y)
# pipe_pred = pipe.predict(test_x)
# report = classification_report(test_y, pipe_pred)
# print(report)

# with open('configs/svc.json', 'w') as file:
#      file.write(json.dumps(best_params))
        
# pipe.set_params(**best_params).fit(processed_data, target)
# dump(pipe, filename="trained_models/model_svc.sav")

In [33]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("logreg", LogisticRegression(n_jobs=-1))])

with open("gridsearch_space/model_logreg.json", "r") as gs_file:
    contents = gs_file.read()
    params = ast.literal_eval(contents)
    logger.info(params)
    
pipe_clf = GridSearchCV(pipe, params, n_jobs=-1, scoring="f1_macro", verbose=2)
pipe_clf.fit(processed_data, target)
best_params = pipe_clf.best_params_
logger.info(best_params)

# run pipe with optimized parameters
pipe.set_params(**best_params).fit(train_x, train_y)
pipe_pred = pipe.predict(test_x)
report = classification_report(test_y, pipe_pred)
logger.info(report)

precision, recall, fscore, support = precision_recall_fscore_support(test_y, pipe_pred, average='macro')
metrics_list.append(['model_logreg', precision, recall, fscore])

with open('configs/logreg.json', 'w') as file:
     file.write(json.dumps(best_params))
        
pipe.set_params(**best_params).fit(processed_data, target)
dump(pipe, filename="trained_models/model_logreg.sav")

2021-12-23 17:26:41.767 | INFO     | __main__:<module>:6 - {'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'logreg__solver': ['newton-cg', 'liblinear', 'sag'], 'logreg__penalty': ['l1', 'l2', 'elasticnet'], 'logreg__max_iter': [2000]}


[32m2021-12-23T17:26:41.767288+0100[0m [1m{'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'logreg__solver': ['newton-cg', 'liblinear', 'sag'], 'logreg__penalty': ['l1', 'l2', 'elasticnet'], 'logreg__max_iter': [2000]}[0m
Fitting 5 folds for each of 144 candidates, totalling 720 fits


2021-12-23 17:27:16.199 | INFO     | __main__:<module>:11 - {'logreg__max_iter': 2000, 'logreg__penalty': 'l2', 'logreg__solver': 'newton-cg', 'tfidf__max_df': 0.7, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}


[32m2021-12-23T17:27:16.199748+0100[0m [1m{'logreg__max_iter': 2000, 'logreg__penalty': 'l2', 'logreg__solver': 'newton-cg', 'tfidf__max_df': 0.7, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}[0m


2021-12-23 17:27:16.380 | INFO     | __main__:<module>:17 -               precision    recall  f1-score   support

           1       0.39      0.40      0.40        40
           2       0.81      0.88      0.84       211
           3       0.00      0.00      0.00         5
           4       0.81      0.72      0.76        71
           5       1.00      0.87      0.93        52

    accuracy                           0.79       379
   macro avg       0.60      0.57      0.59       379
weighted avg       0.78      0.79      0.78       379



[32m2021-12-23T17:27:16.380654+0100[0m [1m              precision    recall  f1-score   support

           1       0.39      0.40      0.40        40
           2       0.81      0.88      0.84       211
           3       0.00      0.00      0.00         5
           4       0.81      0.72      0.76        71
           5       1.00      0.87      0.93        52

    accuracy                           0.79       379
   macro avg       0.60      0.57      0.59       379
weighted avg       0.78      0.79      0.78       379
[0m


['trained_models/model_logreg.sav']

In [34]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("dtree", DecisionTreeClassifier())])

with open("gridsearch_space/model_dectree.json", "r") as gs_file:
    contents = gs_file.read()
    params = ast.literal_eval(contents)
    logger.info(params)

pipe_clf = GridSearchCV(pipe, params, n_jobs=-1, scoring="f1_macro", verbose=2)
pipe_clf.fit(processed_data, target)
best_params = pipe_clf.best_params_
logger.info(best_params)

# run pipe with optimized parameters
pipe.set_params(**best_params).fit(train_x, train_y)
pipe_pred = pipe.predict(test_x)
report = classification_report(test_y, pipe_pred)
logger.info(report)

precision, recall, fscore, support = precision_recall_fscore_support(test_y, pipe_pred, average='macro')
metrics_list.append(['model_dectree', precision, recall, fscore])

with open('configs/dectree.json', 'w') as file:
     file.write(json.dumps(best_params))

pipe.set_params(**best_params).fit(processed_data, target)
dump(pipe, filename="trained_models/model_dectree.sav")

2021-12-23 17:29:10.706 | INFO     | __main__:<module>:6 - {'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'dtree__criterion': ['gini', 'entropy'], 'dtree__max_features': ['sqrt', 'log2']}


[32m2021-12-23T17:29:10.706654+0100[0m [1m{'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'dtree__criterion': ['gini', 'entropy'], 'dtree__max_features': ['sqrt', 'log2']}[0m
Fitting 5 folds for each of 64 candidates, totalling 320 fits


2021-12-23 17:29:23.855 | INFO     | __main__:<module>:11 - {'dtree__criterion': 'gini', 'dtree__max_features': 'sqrt', 'tfidf__max_df': 0.7, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}


[32m2021-12-23T17:29:23.855349+0100[0m [1m{'dtree__criterion': 'gini', 'dtree__max_features': 'sqrt', 'tfidf__max_df': 0.7, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}[0m


2021-12-23 17:29:23.956 | INFO     | __main__:<module>:17 -               precision    recall  f1-score   support

           1       0.34      0.55      0.42        40
           2       0.83      0.73      0.77       211
           3       0.00      0.00      0.00         5
           4       0.70      0.69      0.70        71
           5       0.80      0.83      0.81        52

    accuracy                           0.70       379
   macro avg       0.53      0.56      0.54       379
weighted avg       0.74      0.70      0.72       379



[32m2021-12-23T17:29:23.956173+0100[0m [1m              precision    recall  f1-score   support

           1       0.34      0.55      0.42        40
           2       0.83      0.73      0.77       211
           3       0.00      0.00      0.00         5
           4       0.70      0.69      0.70        71
           5       0.80      0.83      0.81        52

    accuracy                           0.70       379
   macro avg       0.53      0.56      0.54       379
weighted avg       0.74      0.70      0.72       379
[0m


['trained_models/model_dectree.sav']

In [35]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("knc", KNeighborsClassifier(n_jobs=-1))])

with open("gridsearch_space/model_kneg.json", "r") as gs_file:
    contents = gs_file.read()
    params = ast.literal_eval(contents)
    logger.info(params)

pipe_clf = GridSearchCV(pipe, params, n_jobs=-1, scoring="f1_macro", verbose=2)
pipe_clf.fit(processed_data, target)
best_params = pipe_clf.best_params_
logger.info(best_params)

# run pipe with optimized parameters
pipe.set_params(**best_params).fit(train_x, train_y)
pipe_pred = pipe.predict(test_x)
report = classification_report(test_y, pipe_pred)
logger.info(report)

precision, recall, fscore, support = precision_recall_fscore_support(test_y, pipe_pred, average='macro')
metrics_list.append(['model_kneg', precision, recall, fscore])

with open('configs/kneg.json', 'w') as file:
     file.write(json.dumps(best_params))
        
pipe.set_params(**best_params).fit(processed_data, target)
dump(pipe, filename="trained_models/model_kneg.sav")

2021-12-23 17:30:10.166 | INFO     | __main__:<module>:6 - {'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'knc__algorithm': ['ball_tree', 'kd_tree'], 'knc__n_neighbors': [3, 5, 7]}


[32m2021-12-23T17:30:10.166759+0100[0m [1m{'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'knc__algorithm': ['ball_tree', 'kd_tree'], 'knc__n_neighbors': [3, 5, 7]}[0m
Fitting 5 folds for each of 96 candidates, totalling 480 fits


2021-12-23 17:30:37.697 | INFO     | __main__:<module>:11 - {'knc__algorithm': 'ball_tree', 'knc__n_neighbors': 7, 'tfidf__max_df': 0.5, 'tfidf__min_df': 50, 'tfidf__ngram_range': (1, 2)}


[32m2021-12-23T17:30:37.697264+0100[0m [1m{'knc__algorithm': 'ball_tree', 'knc__n_neighbors': 7, 'tfidf__max_df': 0.5, 'tfidf__min_df': 50, 'tfidf__ngram_range': (1, 2)}[0m


2021-12-23 17:30:37.940 | INFO     | __main__:<module>:17 -               precision    recall  f1-score   support

           1       0.25      0.50      0.33        40
           2       0.75      0.76      0.76       211
           3       0.00      0.00      0.00         5
           4       0.70      0.45      0.55        71
           5       0.92      0.65      0.76        52

    accuracy                           0.65       379
   macro avg       0.52      0.47      0.48       379
weighted avg       0.70      0.65      0.66       379



[32m2021-12-23T17:30:37.940412+0100[0m [1m              precision    recall  f1-score   support

           1       0.25      0.50      0.33        40
           2       0.75      0.76      0.76       211
           3       0.00      0.00      0.00         5
           4       0.70      0.45      0.55        71
           5       0.92      0.65      0.76        52

    accuracy                           0.65       379
   macro avg       0.52      0.47      0.48       379
weighted avg       0.70      0.65      0.66       379
[0m


['trained_models/model_kneg.sav']

In [36]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("rfc", RandomForestClassifier(n_jobs=-1))])

with open("gridsearch_space/model_ranfor.json", "r") as gs_file:
    contents = gs_file.read()
    params = ast.literal_eval(contents)
    logger.info(params)

pipe_clf = GridSearchCV(pipe, params, n_jobs=-1, scoring="f1_macro", verbose=2)
pipe_clf.fit(processed_data, target)
best_params = pipe_clf.best_params_
logger.info(best_params)

# run pipe with optimized parameters
pipe.set_params(**best_params).fit(train_x, train_y)
pipe_pred = pipe.predict(test_x)
report = classification_report(test_y, pipe_pred)
logger.info(report)

precision, recall, fscore, support = precision_recall_fscore_support(test_y, pipe_pred, average='macro')
metrics_list.append(['model_ranfor', precision, recall, fscore])

with open('configs/ranfor.json', 'w') as file:
     file.write(json.dumps(best_params))
        
pipe.set_params(**best_params).fit(processed_data, target)
dump(pipe, filename="trained_models/model_ranfor.sav")

2021-12-23 17:31:29.480 | INFO     | __main__:<module>:6 - {'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'rfc__criterion': ['gini', 'entropy'], 'rfc__n_estimators': [1000], 'rfc__max_features': ['sqrt', 'log2']}


[32m2021-12-23T17:31:29.480293+0100[0m [1m{'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'rfc__criterion': ['gini', 'entropy'], 'rfc__n_estimators': [1000], 'rfc__max_features': ['sqrt', 'log2']}[0m
Fitting 5 folds for each of 64 candidates, totalling 320 fits


2021-12-23 17:44:43.190 | INFO     | __main__:<module>:11 - {'rfc__criterion': 'entropy', 'rfc__max_features': 'sqrt', 'rfc__n_estimators': 1000, 'tfidf__max_df': 0.5, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}


[32m2021-12-23T17:44:43.190466+0100[0m [1m{'rfc__criterion': 'entropy', 'rfc__max_features': 'sqrt', 'rfc__n_estimators': 1000, 'tfidf__max_df': 0.5, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}[0m


2021-12-23 17:44:47.489 | INFO     | __main__:<module>:17 -               precision    recall  f1-score   support

           1       0.40      0.53      0.45        40
           2       0.84      0.85      0.85       211
           3       0.00      0.00      0.00         5
           4       0.83      0.76      0.79        71
           5       1.00      0.88      0.94        52

    accuracy                           0.79       379
   macro avg       0.61      0.60      0.61       379
weighted avg       0.80      0.79      0.80       379



[32m2021-12-23T17:44:47.489483+0100[0m [1m              precision    recall  f1-score   support

           1       0.40      0.53      0.45        40
           2       0.84      0.85      0.85       211
           3       0.00      0.00      0.00         5
           4       0.83      0.76      0.79        71
           5       1.00      0.88      0.94        52

    accuracy                           0.79       379
   macro avg       0.61      0.60      0.61       379
weighted avg       0.80      0.79      0.80       379
[0m


['trained_models/model_ranfor.sav']

In [50]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("gbc", GradientBoostingClassifier())])

with open("gridsearch_space/model_gradboost.json", "r") as gs_file:
    contents = gs_file.read()
    params = ast.literal_eval(contents)
    logger.info(params)

pipe_clf = GridSearchCV(pipe, params, n_jobs=-1, scoring="f1_macro", verbose=2)
pipe_clf.fit(processed_data, target)
best_params = pipe_clf.best_params_
logger.info(best_params)

# run pipe with optimized parameters
pipe.set_params(**best_params).fit(train_x, train_y)
pipe_pred = pipe.predict(test_x)
report = classification_report(test_y, pipe_pred)
logger.info(report)

precision, recall, fscore, support = precision_recall_fscore_support(test_y, pipe_pred, average='macro')
metrics_list.append(['model_gradboost', precision, recall, fscore])

with open('configs/gradboost.json', 'w') as file:
     file.write(json.dumps(best_params))

pipe.set_params(**best_params).fit(processed_data, target)
dump(pipe_clf, filename="trained_models/model_gradboost.sav")

2021-12-23 17:51:29.310 | INFO     | __main__:<module>:6 - {'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'gbc__learning_rate': [0.001, 0.01], 'gbc__max_features': ['sqrt', 'log2'], 'gbc__n_estimators': [1000]}


[32m2021-12-23T17:51:29.310169+0100[0m [1m{'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'gbc__learning_rate': [0.001, 0.01], 'gbc__max_features': ['sqrt', 'log2'], 'gbc__n_estimators': [1000]}[0m
Fitting 5 folds for each of 64 candidates, totalling 320 fits


2021-12-23 18:13:56.275 | INFO     | __main__:<module>:11 - {'gbc__learning_rate': 0.01, 'gbc__max_features': 'sqrt', 'gbc__n_estimators': 1000, 'tfidf__max_df': 0.3, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}


[32m2021-12-23T18:13:56.275885+0100[0m [1m{'gbc__learning_rate': 0.01, 'gbc__max_features': 'sqrt', 'gbc__n_estimators': 1000, 'tfidf__max_df': 0.3, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}[0m


2021-12-23 18:14:03.897 | INFO     | __main__:<module>:17 -               precision    recall  f1-score   support

           1       0.42      0.38      0.39        40
           2       0.80      0.89      0.84       211
           3       0.33      0.20      0.25         5
           4       0.86      0.72      0.78        71
           5       0.96      0.85      0.90        52

    accuracy                           0.79       379
   macro avg       0.67      0.61      0.63       379
weighted avg       0.79      0.79      0.78       379



[32m2021-12-23T18:14:03.897929+0100[0m [1m              precision    recall  f1-score   support

           1       0.42      0.38      0.39        40
           2       0.80      0.89      0.84       211
           3       0.33      0.20      0.25         5
           4       0.86      0.72      0.78        71
           5       0.96      0.85      0.90        52

    accuracy                           0.79       379
   macro avg       0.67      0.61      0.63       379
weighted avg       0.79      0.79      0.78       379
[0m


['trained_models/model_gradboost.sav']

In [37]:
pipe = Pipeline([("tfidf", TfidfVectorizer()), ("sgd", SGDClassifier(n_jobs=-1))])

with open("gridsearch_space/model_sgd.json", "r") as gs_file:
    contents = gs_file.read()
    params = ast.literal_eval(contents)
    logger.info(params)
    
pipe_clf = GridSearchCV(pipe, params, n_jobs=-1, scoring="f1_macro", verbose=2)
pipe_clf.fit(processed_data, target)
best_params = pipe_clf.best_params_
logger.info(best_params)

# run pipe with optimized parameters
pipe.set_params(**best_params).fit(train_x, train_y)
pipe_pred = pipe.predict(test_x)
report = classification_report(test_y, pipe_pred)
logger.info(report)

precision, recall, fscore, support = precision_recall_fscore_support(test_y, pipe_pred, average='macro')
metrics_list.append(['model_sgd', precision, recall, fscore])

with open('configs/sgd.json', 'w') as file:
     file.write(json.dumps(best_params))

pipe.set_params(**best_params).fit(processed_data, target)
dump(pipe, filename="trained_models/model_sgd.sav")

2021-12-23 17:46:30.933 | INFO     | __main__:<module>:6 - {'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'sgd__alpha': [0.0001, 0.001], 'sgd__max_iter': [2000], 'sgd__tol': [0.0001, 0.001], 'sgd__loss': ['modified_huber']}


[32m2021-12-23T17:46:30.933251+0100[0m [1m{'tfidf__ngram_range': [(1, 2)], 'tfidf__max_df': [0.1, 0.3, 0.5, 0.7], 'tfidf__min_df': [10, 30, 50, 70], 'sgd__alpha': [0.0001, 0.001], 'sgd__max_iter': [2000], 'sgd__tol': [0.0001, 0.001], 'sgd__loss': ['modified_huber']}[0m
Fitting 5 folds for each of 64 candidates, totalling 320 fits


2021-12-23 17:46:48.341 | INFO     | __main__:<module>:11 - {'sgd__alpha': 0.0001, 'sgd__loss': 'modified_huber', 'sgd__max_iter': 2000, 'sgd__tol': 0.0001, 'tfidf__max_df': 0.3, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}


[32m2021-12-23T17:46:48.341700+0100[0m [1m{'sgd__alpha': 0.0001, 'sgd__loss': 'modified_huber', 'sgd__max_iter': 2000, 'sgd__tol': 0.0001, 'tfidf__max_df': 0.3, 'tfidf__min_df': 10, 'tfidf__ngram_range': (1, 2)}[0m


2021-12-23 17:46:48.554 | INFO     | __main__:<module>:17 -               precision    recall  f1-score   support

           1       0.40      0.60      0.48        40
           2       0.84      0.77      0.80       211
           3       0.17      0.20      0.18         5
           4       0.75      0.76      0.76        71
           5       0.96      0.87      0.91        52

    accuracy                           0.76       379
   macro avg       0.62      0.64      0.63       379
weighted avg       0.78      0.76      0.77       379



[32m2021-12-23T17:46:48.554019+0100[0m [1m              precision    recall  f1-score   support

           1       0.40      0.60      0.48        40
           2       0.84      0.77      0.80       211
           3       0.17      0.20      0.18         5
           4       0.75      0.76      0.76        71
           5       0.96      0.87      0.91        52

    accuracy                           0.76       379
   macro avg       0.62      0.64      0.63       379
weighted avg       0.78      0.76      0.77       379
[0m


['trained_models/model_sgd.sav']

### Best trained model

In [38]:
metrics_df = pd.DataFrame(metrics_list, columns=['model', 'precision', 'recall', 'fscore'])

compare_model_metric = 'fscore'
metrics_df = metrics_df.sort_values(compare_model_metric, ascending=False)

best_trained_model_filename = 'trained_models/' + metrics_df.loc[0]['model'] + '.sav'

copy(best_trained_model_filename, 'trained_models/best_model.sav')
logger.info(f'Copied {best_trained_model_filename} as best_model.sav')

2021-12-23 17:47:11.403 | INFO     | __main__:<module>:9 - Copied trained_models/model_logreg.sav as best_model.sav


[32m2021-12-23T17:47:11.403434+0100[0m [1mCopied trained_models/model_logreg.sav as best_model.sav[0m


### Test

In [39]:
text = "kundin frau vertragsnummer tierarztrechnungzugesandtrechnung anhang beigefugt"
mapping_dict = load_mapping(mapping_file='dataset/mapping.yaml')
pipeline = load('trained_models/best_model.sav')

stopwords_locale = 'german'
stemmer = SnowballStemmer(stopwords_locale)

with open('dataset/stopwords.yaml', 'r') as f:
    curated_stop_words = yaml.safe_load(f)

stop_words = set(stopwords.words(stopwords_locale))

text = preprocess_single_text(text, 
                              stop_words=stop_words, 
                              curated_stop_words=curated_stop_words,
                              stemming=True, 
                              stemmer=stemmer)

result = {}
for cls, prob in zip(pipeline.classes_.tolist(), pipeline.predict_proba([text]).tolist().pop()):
    result[mapping_dict[cls]] = prob

result

{'unklar': 0.2409734196506198,
 'unklar-an-vn': 0.5556520310353268,
 'an-ta': 0.03953945454337567,
 'an-vn-bekanntes-konto': 0.0936289121749357,
 'benanntes-konto-im-text': 0.0702061825957421}

In [48]:
texts = [
    "Subject  Vertragsnummer: 1 750 649 32 From:  giuseppe.infantolino@barmenia.de  To:  schaden@barmenia.de Sent Tue, 12 Oct 2021 11:55;59 +0200 CamScanner 10-12-2021 11.40.pdf Guten Tag, die Kundin Frau Melike Toplugedik mit der Vertragsnummer 1 750 649 32, hat mir die Tierarztrechnung  zugesandt. Rechnung ist im Anhang beigefügt. Mit freundlichen Grüßen Giuseppe Infantolino Viele Grüße aus dem schönen Saarland Barmenia Versicherungen Bous Saar Giuseppe Infantolino  Versicherungsfachmann IHK  66359 Bous Saar Mobil: (0160) 1540589 Mail : giuseppe . inf antolino@barmenia . de Homepage : https : / / agentur . barmenia .de/qiuseppe_infantolino Bezirksdirektion: www . barmenia—saarbruecken . de Termine nur nach Vereinbarung #MachenWirGern Meine Beratungsgrundlage Ich bin für die Barmenia Krankenversicherung a.G. als gebundene Versicherungsvertreter tätig und berate Sie ausschließlich über die Produkte der Barmenia Krankenversicherung AG Barmenia Lebensversicherung a.G Barmenia Allgemeine Versicherungs-AG Roland Rechtschutz-Versicherung-AG Roland Schutzbrief-Versicherung a.G Schlichtungssteilen H’ij r Lebens— und Sachversicherungen: Verein Versicherungsombudsmann e.V.  Postfach 080632  10006 Berlin Für private Krankenve n : Ombudsmann für private Kranken- und Pflegeversicherung Postfach 060222 10052 Berlin",
    "From:  Bartolomeo Cellammare <bartolomeo.cellammare@web.de>  To:  tierarztrechnung@barmenia.de Subject  Leistungsfall Katze Elis, Vers. Nr.: 302083555 Thu, 21 Oct 2021 14:10:18+0200 Sent 1 .jpeg  2.jpeg  3.jpeg  4,jpeg 54 ♦In! 6.jpeg Sehr geehrte Damen und Herren, leider ist unsere Katze erkrankt und reiche ihnen für die Leistungsbearbeitung die Rechnungen vom Tierarzt ein,  mit der Bitte um Begleichung. Vielen Dank im Voraus für die Mühe. Beste Grüße B. Cellammare",
    "From:  Info-KB <info-kb@barmenia.de> To:  Schaden <schaden@barmenia.de> Subject:  WG: Abtrittserklärung auf dauer!!! Sent: Tue, 12 Oct 2021 08:32:58 +0000 Von: Info <info@barmenia.de> Gesendet: Dienstag, 12. Oktober 2021 09:58 An: Info-KB <info-kb@barmenia.de> Betreff: FW: Abtrittserklärung auf dauer!!! From: Heiko Giese <heiko.giese.voss@gmail.com<mailto:heiko.giese.voss@gmail.com» Sent: Dienstag, 12. Oktober 2021 07:39:19 To: Info Subject: Abtrittserklärung auf dauer! ! ! Sehr geehrte Damen u. Herren Hiermit möchte ich meine Abtrittserklärung für meine Versicherung Nr.178718252 auf dauer mitteilen.Rechnungen die von meinem Tierarzt Dr.H.D.Bertelsmann , kleintierpraxis Möhnestr.106, 59755 Arnsberg kommen sollen alle in Zukunft mit dem Arzt abgerechnet werden.Hier die  Kontodaten vom Tierarzt. Deutsche Bank PBK Iban: DE 64466700240500700000 Bic: DEUTDEDB961 MfgSusanne Giese-Wiemann Mobil: 016092136774",
    "  Bettina Redöhl, Kugelfangtrift 196, 30657 Hannover  Tel.: 0176/61 666 192 Bettina RedOhl, Kuoefengtrift 196, 30657 Ht IIllIsh‘A-1i Einschreiben/Einwurf Barmenia Allgemeine Versicherungs-AG Hauptverwaltung gAR wesa-) Barmenis-Allee1 202) 42119 Wuppertal 2% VW vt t, ginsC Hannover, 19.10.2021 Vertragsnummer 179479473  Rechnungen Tierklinik Sehr geehrte Damen und Herren, als Anlage übersende ich Ihnen die Rechnungen der Tierärzte am Lohner Weg, mit Bitte um Erstattung  der Rechnungsbeträge auf das Ihnen bekannte Konto. Bei weiteren Fragen stehen die o.g. Tierärzte bzw. die Tierärztin Frau Dr. Bensch, Sutelstr. 14, 30659  Hannover, Tel.: 0511/6463639 zur Verfügung. Vielen Dank. Bei weiteren Fragen stehe ich Ihnen gern zur Verfügung. Mit freundlichen GrüBen  g /2g/ Bettina Redöhl Anlagen",
    "From:  Marijke Holtkamp <m.etzrodtgweb.de> To:  tierarztrechnung@barmenia.de Subject  Tierarztrechungen Sent Thu, 21 Oct 2021 14:28:46+0200 IMG 2798.JPG IMG_2799.JPG Sehr geehrte Damen und Herren,  anbei sende ich Ihnen die Tierarztrechnung unserer Hündin Clara Tari mit der bitte um Erstattung: KreisSparkasse Köln DE 74 3705 0299 1152 0271 47 BIC COKSDE33xxX Vielen Dank.! Mit freundlichem Gruß  Marijke Holtkamp"
]
text_labels = ['unklar', 'unklar-an-vn', 'an-ta', 'an-vn-bekanntes-konto', 'benanntes-konto-im-text']
mapping_dict = load_mapping(mapping_file='dataset/mapping.yaml')

for ind, text in enumerate(texts):
    print(f'\ntext_{ind+1}')
    for model in ['model_sgd', 'model_gradboost', 'model_ranfor', 'model_kneg', 'model_dectree', 'model_logreg']:
        pipeline = load(f'trained_models/{model}.sav')

        stopwords_locale = 'german'
        stemmer = SnowballStemmer(stopwords_locale)

        with open('dataset/stopwords.yaml', 'r') as f:
            curated_stop_words = yaml.safe_load(f)

        stop_words = set(stopwords.words(stopwords_locale))

        text = preprocess_single_text(text, 
                                      stop_words=stop_words, 
                                      curated_stop_words=curated_stop_words,
                                      stemming=True, 
                                      stemmer=stemmer)

        result = {}
        for cls, prob in zip(pipeline.classes_.tolist(), pipeline.predict_proba([text]).tolist().pop()):
            print(cls, prob)
            result[mapping_dict[cls]] = prob
            predicted_class = pipeline.predict([text]).tolist().pop()

        print(model,'\t :: \t', mapping_dict[predicted_class], '\t :: \t', text_labels[ind], '\n', result, '\n')


text_1
1 0.0
2 1.0
3 0.0
4 0.0
5 0.0
model_sgd 	 :: 	 unklar-an-vn 	 :: 	 unklar 
 {'unklar': 0.0, 'unklar-an-vn': 1.0, 'an-ta': 0.0, 'an-vn-bekanntes-konto': 0.0, 'benanntes-konto-im-text': 0.0} 

0 0.21472550453649922


KeyError: 0

In [49]:
mapping_dict

{'1': 'unklar',
 '2': 'unklar-an-vn',
 '3': 'an-ta',
 '4': 'an-vn-bekanntes-konto',
 '5': 'benanntes-konto-im-text'}

In [None]:
hiermit mochte abtrittserklarung versicherung nr daur mitteilenrechnungen tierarzt drhdbertelsmann 
kleintierpraxis mohnestr arnsberg kommen sollen zukunft arzt abgerechnet werdenhier kontodaten tierarzt

Hiermit möchte ich meine Abtrittserklärung für meine Versicherung Nr.178718252 auf dauer mitteilen.Rechnungen die von meinem Tierarzt Dr.H.D.Bertelsmann , kleintierpraxis Möhnestr.106, 59755Arnsberg kommen sollen alle in Zukunft mit dem Arzt abgerechnet werden.Hier die  Kontodaten vom Tierarzt.

In [153]:
pipeline = load(f'trained_models/best_model.sav')
pipeline

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.7, min_df=10, ngram_range=(1, 2))),
                ('logreg',
                 LogisticRegression(max_iter=2000, n_jobs=-1,
                                    solver='newton-cg'))])

### Flask

In [94]:
import requests

url = 'http://localhost:5000/'
r = requests.post(url,
                  json={
                      'text':"kundin frau vertragsnummer tierarztrechnungzugesandtrechnung anhang beigefugt"
                  }
                 )

print(r.json())

ConnectionError: HTTPConnectionPool(host='localhost', port=5000): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc1f2499860>: Failed to establish a new connection: [Errno 61] Connection refused',))