### Load dataset

To do: use augmented dataset.
- Pros: classes balanced
- Cons: larger vocabulary (if using Tf-idf lots of RAM is needed)

In [1]:
import pandas as pd

# Read the selected records from disk into a DataFrame.
df = pd.read_csv("data/selected_data.csv")

# Lift pandas' display limits on column width and on the number of rows shown.
for display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)

# Preview the first five rows.
df.head(5)

Unnamed: 0,Politikbereich,Zweck
0,Verkehr,"Nord-Süd-Tangente; Linie 26/27, 2.2. Teil-BPU,A"
1,Bildung,Gedenken zu 30 Jahre Mauerfall
2,Stadtentwicklung,Lernen Na Logo - Bildungsnetzwerk Hellersdorfer Promenade
3,Jugend,Kinder- und Jugendambulanz
4,Jugend,Therapiebad


### Split dataset

In [2]:
# Boolean mask over the columns: True only for the target column.
is_target_column = df.columns == 'Politikbereich'

# X keeps every non-target column; y is a one-column DataFrame holding the target.
X = df.loc[:, ~is_target_column]
y = df.loc[:, is_target_column]

### Encode target labels

In [3]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Fixed list of the policy areas the encoder should know about. Fitting on this
# list (rather than on the labels present in the data) makes the encoding
# independent of which classes happen to occur in the loaded file.
classes = ['Verkehr',
            'Bildung',
            'Stadtentwicklung',
            'Jugend',
            'Integration',
            'Wirtschaft',
            'Familie',
            'Kultur',
            'Arbeit',
            'Denkmalschutz',
            'Soziales',
            'Gesundheit',
            'Gleichstellung',
            'Wissenschaft',
            'Sport',
            'Bürgerschaftliches Engagement, Bürgerbeteiligung',
            'Kirchen, Religions-, Weltanschauungsgemeinschaften',
            'Umwelt',
            'Antidiskriminierung',
            'Frauen',
            'Forschung',
            'Verbraucherschutz',
            'Pflege',
            'Sicherheit, Ordnung']

le.fit(classes)

# Encode the whole target column in one vectorized call instead of calling
# `le.transform` once per row. Reading from `df` (not from `y`) keeps this cell
# idempotent: re-running it no longer fails because `y` was already replaced
# by the encoded array.
y = le.transform(df["Politikbereich"])
y

array([21,  2, 18, ..., 11, 11, 18])

### Preprocessing pipeline

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import spacy
import torch
from transformers import BertModel, BertTokenizer
from lib.bert_pytorch.helper_functions import get_device

class SelectFeatures(BaseEstimator, TransformerMixin):
    """Transformer that selects the configured columns and returns the first of them.

    Parameters
    ----------
    features_list : column selector used to index the input (e.g. a list of column names).
    """

    def __init__(self, features_list):
        self.features_list = features_list

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Index X with `features_list` and return the first selected column."""
        selected_columns = X[self.features_list]
        # Only the first selected column is passed on to the following steps.
        return selected_columns.iloc[:, 0]

class CleanText(BaseEstimator, TransformerMixin):
    """Transformer that normalizes raw text with a fixed sequence of regex substitutions.

    Hyphens become spaces, digit runs are removed, a handful of German
    abbreviations are expanded to their long form, and "'s" is dropped.
    """

    # (pattern, replacement) pairs, applied in this order by `cleaner`.
    # The abbreviation patterns are anchored with `\b` so they only match a
    # standalone abbreviation and never the tail of a longer word (unanchored,
    # `gem\.` would rewrite "allgem." and `co\.` would rewrite "Disco.").
    _SUBSTITUTIONS = (
        # Replace hyphens with spaces and strip digits
        (re.compile(r'-'), ' '),
        (re.compile(r'\d+'), ''),
        # Custom ones not supported by spacy
        (re.compile(r'\bAbs\.'), 'Absatz'),
        (re.compile(r'\be\.V\.'), 'eingetragener Verein'),
        (re.compile(r'\b[cC]o\.'), 'Kompanie'),
        (re.compile(r'\bgem\.'), 'gemäß'),
        (re.compile(r"'s"), ''),
    )

    def cleaner(self, text):
        """Return `text` after applying every substitution in order."""
        for pattern, replacement in self._SUBSTITUTIONS:
            text = pattern.sub(replacement, text)
        return text

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Apply `cleaner` to every element of X and return the result."""
        X = X.apply(self.cleaner)
        return X

class SpacyLemmatizer(BaseEstimator, TransformerMixin):
    """Transformer that replaces each text by the space-joined lemmas of its kept tokens.

    Punctuation, stop words and whitespace tokens are dropped.
    """

    def __init__(self):
        # Load the German spaCy pipeline, then drop the components listed below.
        self.nlp = spacy.load("de_core_news_lg")
        for component_name in ("ner", "parser", "attribute_ruler"):
            self.nlp.remove_pipe(component_name)

    def normalize(self, text):
        """Run `text` through spaCy and return the lemmas of the kept tokens, joined by spaces."""
        lemmas = [
            token.lemma_
            for token in self.nlp(text)
            if not (token.is_punct or token.is_stop or token.is_space)
        ]
        return " ".join(lemmas)

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Apply `normalize` to every element of X."""
        return X.apply(self.normalize)

class Lowercase(BaseEstimator, TransformerMixin):
    """Transformer that lower-cases every text element."""

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X with `.lower()` applied to each element."""
        def to_lower(text):
            return text.lower()

        return X.apply(to_lower)

class TfIdfVectorizer(BaseEstimator, TransformerMixin):
    """Wrapper around scikit-learn's TfidfVectorizer that returns a dense DataFrame."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def fit(self, X, y=None):
        """Fit the wrapped vectorizer on X and return the transformer itself."""
        self.vectorizer.fit(X)
        return self

    def transform(self, X, y=None):
        """Vectorize X and return the encodings as a DataFrame with integer column labels."""
        encoded = self.vectorizer.transform(X)
        # `toarray()` materializes the full matrix in memory before wrapping it.
        dense_matrix = encoded.toarray()
        return pd.DataFrame(dense_matrix)

class BertSentenceEmbeddigs(BaseEstimator, TransformerMixin):
    """Transformer that maps each sentence to one vector built from BERT hidden states.

    Uses the 'bert-base-german-cased' tokenizer and model, with the model put
    in eval mode on the device returned by `get_device()`.
    """

    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')
        self.device = get_device()
        bert = BertModel.from_pretrained('bert-base-german-cased',
                                         output_hidden_states=True)
        self.model = bert.to(self.device)
        self.model.eval()

    def embed_sentence(self, sentence: str):
        """Return a NumPy vector for `sentence`, averaged over the last four hidden-state layers.

        NOTE(review): `encode` is called without truncation arguments, so very
        long inputs are passed to the model as-is — confirm inputs stay short.
        """
        token_ids = self.tokenizer.encode(sentence, return_tensors='pt').to(self.device)
        with torch.no_grad():
            model_output = self.model(input_ids=token_ids)
        layers = model_output.hidden_states
        # Concatenate the last four layers along the leading axis.
        # assumes each layer is (1, seq_len, hidden) for a single sentence — TODO confirm
        stacked = torch.cat((layers[-1], layers[-2], layers[-3], layers[-4]), dim=0)
        # Average over the leading axis twice (presumably layers first, then tokens).
        pooled = stacked.mean(dim=0).mean(dim=0)
        return pooled.cpu().numpy()

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Embed every element of X and return the vectors as rows of a DataFrame."""
        embeddings = X.apply(self.embed_sentence)
        return pd.DataFrame(embeddings.values.tolist())

# Ordered preprocessing steps: pick the "Zweck" text column, clean it,
# lemmatize, lowercase, and finally turn it into TF-IDF features.
prep_steps = [
    ("select_features", SelectFeatures(features_list=["Zweck"])),
    ("clean_text", CleanText()),
    ("spacy_lemmatizer", SpacyLemmatizer()),
    ("text_lowercase", Lowercase()),
    ("tfidf_vectorizer", TfIdfVectorizer()),
    # ("bert_sentence_embeddings", BertSentenceEmbeddigs()),
]
prep_pipeline = Pipeline(prep_steps)

# Example: fit the pipeline on the features and preview the first transformed rows.
prep_pipeline.fit(X, y).transform(X).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1910,1911,1912,1913,1914,1915,1916,1917,1918,1919
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save preprocessing pipeline

In [5]:
import pickle

# Serialize the fitted preprocessing pipeline. The `with` block guarantees the
# file handle is flushed and closed even if pickling raises.
with open("pretrained_models/random_forest/prep_pipeline.pkl", "wb") as pipeline_file:
    pickle.dump(prep_pipeline, pipeline_file)

### Parameters hypertuning and model selection with GridSearchCV

In [6]:
import os
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer


def custom_scorer_macro_f1(y_true,y_pred):
    """Return the macro-averaged F1 score of `y_pred` against `y_true`."""
    return f1_score(y_true, y_pred, average='macro')

# Scorer object for the macro F1 metric; higher values are treated as better.
scorer_macro_f1 = make_scorer(
    custom_scorer_macro_f1,
    greater_is_better=True,
)

def custom_scorer_weighted_f1(y_true,y_pred):
    """Return the weighted-average F1 score of `y_pred` against `y_true`."""
    return f1_score(y_true, y_pred, average='weighted')

# Scorer object for the weighted F1 metric; higher values are treated as better.
scorer_weighted_f1 = make_scorer(
    custom_scorer_weighted_f1,
    greater_is_better=True,
)


def _default_search_space():
    """Build the default parameter grid, with a fresh estimator instance per call."""
    return [
        {"estimator": [RandomForestClassifier(random_state=42, verbose=1)],
         "estimator__n_estimators": [10,50,100],
         # "estimator__max_depth": [2, 6],
         "estimator__class_weight": ['balanced',None]
         }]


def execute_pipeline(features,labels, search_space=None,
                    cv = 5,
                    verbose = 1,
                    n_jobs = max(1, (os.cpu_count() or 1) - 2),
                    scoring = scorer_macro_f1):
                    # scoring = {"macro_f1": scorer_macro_f1,"weighted_f1": scorer_weighted_f1}):
    """Grid-search a preprocessing + estimator pipeline and return the fitted search.

    Parameters
    ----------
    features : input data passed to `GridSearchCV.fit`.
    labels : target values passed to `GridSearchCV.fit`.
    search_space : parameter grid for GridSearchCV. When None, the default
        random-forest grid from `_default_search_space()` is used. A None
        sentinel replaces the former list default so that no estimator
        instance is shared between calls.
    cv : cross-validation setting forwarded to GridSearchCV.
    verbose : verbosity forwarded to GridSearchCV.
    n_jobs : number of parallel jobs. The default leaves two CPUs free but is
        clamped to at least 1, so it stays valid on machines with one or two
        CPUs or when `os.cpu_count()` returns None.
    scoring : scorer forwarded to GridSearchCV.

    Returns
    -------
    The fitted GridSearchCV object (refit on the given data with the best
    parameters). Its best parameters and best score are printed.
    """
    if search_space is None:
        search_space = _default_search_space()

    # The "estimator" step is a placeholder; the grid replaces it per candidate.
    pipe = Pipeline([("preprocessing", prep_pipeline),
                    ("estimator", RandomForestClassifier())])

    gridsearch = GridSearchCV(pipe, search_space, scoring=scoring, cv=cv,
                              verbose=verbose, n_jobs=n_jobs, refit=True)
    gridsearch.fit(features, labels)
    print(gridsearch.best_params_)
    print(gridsearch.best_score_)
    return gridsearch

# Run the grid search on the full feature/label set.
# NOTE: despite its name, `best_estimator` holds the object returned by
# `execute_pipeline` (the result of `gridsearch.fit`), not a bare estimator.
best_estimator = execute_pipeline(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


{'estimator': RandomForestClassifier(class_weight='balanced', n_estimators=50,
                       random_state=42, verbose=1), 'estimator__class_weight': 'balanced', 'estimator__n_estimators': 50}
0.302764736429617


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.6s finished


### Train best model on whole data

In [7]:
# Take the winning classifier from the refitted pipeline. `best_estimator_` is
# built with all best parameters applied, whereas `best_params_["estimator"]`
# is the object taken from the parameter grid; whether that object carries the
# winning `estimator__*` values depends on the scikit-learn version.
best_model = best_estimator.best_estimator_.named_steps["estimator"]

# Re-fit on features from the already-fitted `prep_pipeline`, so the model
# matches the preprocessing pipeline that this notebook pickles.
best_model.fit(prep_pipeline.transform(X), y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.6s finished


RandomForestClassifier(class_weight='balanced', n_estimators=50,
                       random_state=42, verbose=1)

### Save best model for inference

In [8]:
import pickle

# Serialize the trained classifier for inference. The `with` block guarantees
# the file handle is flushed and closed even if pickling raises.
with open("pretrained_models/random_forest/best_estimator.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)