In [None]:
# Set to True to fetch the pretrained preprocessing pipeline and model from
# Google Drive before running the rest of the notebook.
DOWNLOAD_PIPELINE_AND_MODEL = False

if DOWNLOAD_PIPELINE_AND_MODEL:
    import os

    import gdown

    # (label shown to the user, Google Drive file id, local destination)
    artifacts = (
        (
            "preprocessing pipeline",
            "1J0NiJA61QBaj80EswXuqDtEPzF6yM0uW",
            "pretrained_models/random_forest/prep_pipeline.pkl",
        ),
        (
            "model",
            "18JbYn29dK5E4AO_dMsqY0lIkz3yPHC-X",
            "pretrained_models/random_forest/best_estimator.pkl",
        ),
    )
    for label, file_id, output_path in artifacts:
        print(f"Downloading {label}")
        # Make sure the destination folder exists before gdown writes into it.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        # `file_id` (not `id`) so the builtin id() is not shadowed in the notebook namespace.
        gdown.download(id=file_id, output=output_path, quiet=False)

In [None]:
import re

import pandas as pd
import spacy
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from transformers import BertModel, BertTokenizer

from lib.bert_pytorch.helper_functions import get_device

class SelectFeatures(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows the input down to a single column.

    Parameters
    ----------
    features_list : list
        Column labels used to index ``X``; only the first resulting column
        is passed on.
    """

    def __init__(self, features_list):
        self.features_list = features_list

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Index ``X`` by ``features_list`` and return its first column.

        Assumes ``X`` is a pandas DataFrame, so the result is a Series.
        """
        selected = X[self.features_list]
        return selected.iloc[:, 0]

class CleanText(BaseEstimator, TransformerMixin):
    """Pipeline step that applies a fixed list of regex clean-ups to each text."""

    # (pattern, replacement) pairs; they are applied strictly in this order.
    _SUBSTITUTIONS = (
        # Hyphens become spaces, runs of digits are dropped.
        (r'-', ' '),
        (r'\d+', ''),
        # Abbreviations expanded by hand (noted in the original as not supported by spaCy).
        (r'Abs\.', 'Absatz'),
        (r'e\.V\.', 'eingetragener Verein'),
        (r'co\.', 'Kompanie'),
        (r'Co\.', 'Kompanie'),
        (r'gem\.', 'gemäß'),
        # Strip the 's suffix.
        (r"'s", ''),
    )

    def cleaner(self, text):
        """Return ``text`` after running every substitution over it in order."""
        for pattern, replacement in self._SUBSTITUTIONS:
            text = re.sub(pattern, replacement, text)
        return text

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Apply :meth:`cleaner` to every element of ``X`` via ``X.apply``."""
        return X.apply(self.cleaner)

class SpacyLemmatizer(BaseEstimator, TransformerMixin):
    """Pipeline step that lemmatizes text with the spaCy ``de_core_news_lg`` model."""

    def __init__(self):
        nlp = spacy.load("de_core_news_lg")
        # These components are removed from the loaded pipeline before use.
        for component in ("ner", "parser", "attribute_ruler"):
            nlp.remove_pipe(component)
        self.nlp = nlp

    def normalize(self, text):
        """Return the space-joined lemmas of ``text``.

        Punctuation, stop-word and whitespace tokens are left out.
        """
        lemmas = [
            token.lemma_
            for token in self.nlp(text)
            if not (token.is_punct or token.is_stop or token.is_space)
        ]
        return " ".join(lemmas)

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Apply :meth:`normalize` to every element of ``X`` via ``X.apply``."""
        return X.apply(self.normalize)

class Lowercase(BaseEstimator, TransformerMixin):
    """Pipeline step that lower-cases every text element."""

    @staticmethod
    def _lower(text):
        """Return ``text.lower()``."""
        return text.lower()

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Lower-case each element of ``X`` via ``X.apply``."""
        return X.apply(self._lower)

class TfIdfVectorizer(BaseEstimator, TransformerMixin):
    """Pipeline step wrapping scikit-learn's ``TfidfVectorizer``.

    Unlike the wrapped vectorizer, ``transform`` hands back a dense pandas
    DataFrame instead of a sparse matrix.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def fit(self, X, y=None):
        """Fit the wrapped vectorizer on ``X`` and return ``self``."""
        self.vectorizer.fit(X)
        return self

    def transform(self, X, y=None):
        """Encode ``X`` and return the result as a dense DataFrame.

        The frame is built from a plain array, so it gets default integer
        row and column labels.
        """
        dense_matrix = self.vectorizer.transform(X).toarray()
        return pd.DataFrame(dense_matrix)

class BertSentenceEmbeddigs(BaseEstimator, TransformerMixin):
    """Pipeline step that turns each text into one BERT-based embedding vector.

    Uses the pretrained ``bert-base-german-cased`` tokenizer and model, with
    the model placed on the device returned by ``get_device()`` and switched
    to evaluation mode.
    """

    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')
        self.device = get_device()
        self.model = BertModel.from_pretrained(
            'bert-base-german-cased',
            output_hidden_states=True,
        ).to(self.device)
        self.model.eval()

    def embed_sentence(self, sentence: str):
        """Return a 1-D numpy embedding for ``sentence``.

        The hidden states of the last four layers are concatenated along the
        first axis and then averaged twice along that axis, i.e. over the
        layers and then over the tokens.
        """
        # Truncate to the model's position-embedding limit; without this a
        # text that tokenizes to more positions than BERT supports fails
        # inside the model instead of producing an embedding.
        ids_tensor = self.tokenizer.encode(
            sentence,
            return_tensors='pt',
            truncation=True,
            max_length=self.model.config.max_position_embeddings,
        )
        ids_tensor = ids_tensor.to(self.device)
        # Inference only: no gradients needed.
        with torch.no_grad():
            out = self.model(input_ids=ids_tensor)
        hidden_states = out.hidden_states
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        # A single sentence is encoded per call, so each layer presumably has a
        # leading dimension of 1 and the concatenation stacks the four layers.
        sentence_embedding = torch.cat(tuple(last_four_layers), dim=0)
        sentence_embedding = torch.mean(sentence_embedding, dim=0)  # over layers
        sentence_embedding = torch.mean(sentence_embedding, dim=0)  # over tokens
        return sentence_embedding.cpu().numpy()

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Embed every element of ``X`` and return one DataFrame row per text."""
        embeddings = X.apply(self.embed_sentence)
        return pd.DataFrame(embeddings.values.tolist())

# Preprocessing chain: pick the text column, clean it, lemmatize, lower-case,
# then encode with TF-IDF.
prep_pipeline = Pipeline(
    steps=[
        ("select_features", SelectFeatures(features_list=["Zweck"])),
        ("clean_text", CleanText()),
        ("spacy_lemmatizer", SpacyLemmatizer()),
        ("text_lowercase", Lowercase()),
        ("tfidf_vectorizer", TfIdfVectorizer()),
        # Disabled alternative encoder:
        # ("bert_sentence_embeddings", BertSentenceEmbeddigs()),
    ]
)

In [None]:
import pandas as pd
import pickle


# Place test_data.csv in the data/ folder (relative to the working directory)
# before running this cell.
test_data_raw = pd.read_csv('data/test_data.csv')

# Replace missing values with empty strings without mutating the raw frame.
test_data_filled = test_data_raw.fillna('')

# NOTE(security): pickle.load can execute arbitrary code; only load pipeline
# files obtained from a trusted source.
# The context manager closes the file handle once the pipeline is loaded.
with open("pretrained_models/random_forest/prep_pipeline.pkl", "rb") as pipeline_file:
    prep_pipeline = pickle.load(pipeline_file)

# X holds the transformed features used by the prediction cell below.
X = prep_pipeline.transform(test_data_filled)
X.head(3)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score

def custom_scorer_macro_f1(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    return f1_score(y_true=y_true, y_pred=y_pred, average='macro')

# Scorer object wrapping the macro-F1 function (higher is better).
# NOTE(review): presumably this must exist before unpickling because the saved
# estimator references it — confirm against the training notebook.
scorer_macro_f1 = make_scorer(custom_scorer_macro_f1, greater_is_better=True)

# NOTE(security): pickle.load can execute arbitrary code; only load model files
# obtained from a trusted source.
# The context manager closes the file handle once the estimator is loaded.
with open("pretrained_models/random_forest/best_estimator.pkl", "rb") as model_file:
    best_estimator = pickle.load(model_file)

In [None]:
# Drop any 'prediction' column left over from a previous run of this cell so
# the estimator always receives the same feature columns; this keeps the cell
# safe to re-execute.
feature_frame = X.drop(columns=['prediction'], errors='ignore')
X['prediction'] = best_estimator.predict(feature_frame)
X.head(5)