### Load preprocessed dataset

In [None]:
import pandas as pd

# Read the preprocessed data and replace every missing value with an empty string.
df = pd.read_csv("data/preprocessed_data.csv").fillna('')

# Features are all columns except the target; the target is 'Politikbereich'.
X_train = df.loc[:, df.columns != 'Politikbereich']
y_train = df['Politikbereich']

### Encode class labels

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# y_train is a pandas Series (a single column of df), not a DataFrame, so it
# must be encoded directly: indexing it with "Politikbereich" would be a label
# lookup on the Series index and raise KeyError.
# fit_transform learns the sorted set of classes and encodes all rows in one
# vectorized call instead of one transform() call per row. The original index
# and name are preserved so the labels stay aligned with X_train.
y_train = pd.Series(le.fit_transform(y_train), index=y_train.index, name=y_train.name)

# `le` keeps the mapping; use le.inverse_transform(...) to recover the class names.
y_train.head(5)

### Train a classifier with grid search and cross-validation

In [None]:
import os
import pickle

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer


def my_custom_loss_func(y_true, y_pred):
    """Print the macro and weighted F1 scores and return the macro F1.

    Despite the name, the returned value is a score (higher is better),
    not a loss.
    """
    f1_by_average = {}
    # Order matters for the printed output: macro first, then weighted.
    for averaging in ('macro', 'weighted'):
        f1_by_average[averaging] = f1_score(y_true, y_pred, average=averaging)
        print(f1_by_average[averaging])
    return f1_by_average['macro']

# Wrap the metric function as a scikit-learn scorer object so it can be passed
# as `scoring=`; greater_is_better=True marks higher values as better.
macro_f1 = make_scorer(my_custom_loss_func, greater_is_better=True)

def execute_pipeline(features, labels, search_space=None,
                    cv=3,
                    verbose=1,
                    # Leave two cores free, but never drop below one worker:
                    # os.cpu_count() may return None, and on small machines
                    # "cpu_count - 2" would be 0 (rejected by joblib) or
                    # negative (which joblib interprets differently).
                    n_jobs=max(1, (os.cpu_count() or 1) - 2),
                    scoring=macro_f1):
    """Run a cross-validated grid search over a one-step estimator pipeline.

    Parameters
    ----------
    features : array-like or DataFrame
        Training features, passed to ``GridSearchCV.fit``.
    labels : array-like or Series
        Training targets, passed to ``GridSearchCV.fit``.
    search_space : list of dict, optional
        Parameter grid(s) for ``GridSearchCV``. When omitted, a small random
        forest grid (n_estimators in {10, 25}, max_depth in {2, 6}) is used.
    cv : int or CV splitter
        Cross-validation strategy forwarded to ``GridSearchCV``.
    verbose : int
        Verbosity forwarded to ``GridSearchCV``.
    n_jobs : int
        Number of parallel grid-search workers.
    scoring : scorer
        Scorer used to rank candidates (macro F1 by default).

    Returns
    -------
    The fitted ``GridSearchCV`` object (``fit`` returns the search itself);
    the refitted winner is available as ``.best_estimator_``.
    """
    if search_space is None:
        # Built per call so no estimator instance or list is shared between
        # invocations (a mutable default would be created once at def time).
        search_space = [{
            "estimator": [RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1)],
            "estimator__n_estimators": [10, 25],
            "estimator__max_depth": [2, 6],
        }]

    # The placeholder estimator is replaced by the candidates in search_space.
    pipe = Pipeline([("estimator", RandomForestClassifier())])

    gridsearch = GridSearchCV(pipe, search_space, scoring=scoring, cv=cv,
                              verbose=verbose, n_jobs=n_jobs)
    best_model = gridsearch.fit(features, labels)
    print(best_model.best_estimator_)
    print(best_model.best_score_)
    return best_model

# Despite the name, this is the fitted GridSearchCV object returned by
# execute_pipeline; the winning pipeline is best_estimator.best_estimator_.
best_estimator = execute_pipeline(X_train, y_train)

# Persist the fitted search. The output directory is created if missing so a
# long training run is not lost to a FileNotFoundError, and the file is opened
# in a context manager so the handle is flushed and closed deterministically.
# NOTE(review): the pickled object references the custom scorer function, so
# that function presumably must be importable/defined when unpickling — confirm.
model_path = "pretrained_models/random_forest/best_estimator.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as model_file:
    pickle.dump(best_estimator, model_file)