### Load preprocessed dataset

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Load the preprocessed records and replace missing values with empty strings.
# fillna() is chained (no inplace=True) so re-running the cell always starts
# from the file contents.
df = pd.read_csv("data/preprocessed_data.csv").fillna('')

# Features: every column except the target. drop() returns a new frame and
# raises KeyError if the target column is missing, instead of silently
# producing an empty label frame.
X = df.drop(columns='Politikbereich')

# Target: kept as a one-column DataFrame because later cells address it as
# y["Politikbereich"]. The explicit copy makes y independent of df, so
# assigning into it later does not trigger pandas' SettingWithCopyWarning.
y = df[['Politikbereich']].copy()

### Encode class labels

In [3]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels in y["Politikbereich"] as integer class ids.
le = LabelEncoder()

# fit_transform() learns the label set and encodes the whole column in one
# vectorized call, instead of calling le.transform() once per row; the
# resulting ids are the same. assign() builds a new frame rather than writing
# into y in place.
y = y.assign(Politikbereich=le.fit_transform(y["Politikbereich"]))

y.head(5)

Unnamed: 0,Politikbereich
0,21
1,2
2,18
3,11
4,11


### TF-IDF vectorizer

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the "Zweck" text column and encode
# it in a single pass. The column is read from df (the frame X was split from)
# so this cell can be re-run after X has been replaced by the TF-IDF matrix.
vectorizer = TfidfVectorizer()
tfidf_encodings = vectorizer.fit_transform(df["Zweck"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense feature frame with one column per vocabulary term.
X = pd.DataFrame(tfidf_encodings.toarray(), columns=feature_names)
X.head(5)

Unnamed: 0,aad,aaron,abb,abenteuerspielplatz,abgefahren,ablauforganisatorische,abqueer,absatz,absent,absichtserkennung,...,zylinderbohrungen,ältere,öffentlichkeitsarbeit,öffnung,ögb,öpnv,übertragung,überwindung,übungs,übungsleitern
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Sanity check: the vocabulary size and the number of feature columns in the
# first row are printed on separate lines and should match.
vocabulary_size = len(vectorizer.vocabulary_)
first_row_width = X.iloc[0].size
print(vocabulary_size, first_row_width, sep="\n")

1920
1920


### Train a default RandomForestClassifier

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score, make_scorer

def custom_scorer_macro_f1(y_true,y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    return f1_score(y_true, y_pred, average='macro')

# Scorer object for cross-validation; higher scores are treated as better.
scorer_macro_f1 = make_scorer(
    custom_scorer_macro_f1,
    greater_is_better=True,
)

def custom_scorer_weighted_f1(y_true,y_pred):
    """Return the weighted-average F1 score of ``y_pred`` against ``y_true``."""
    return f1_score(y_true, y_pred, average='weighted')

# Scorer object for cross-validation; higher scores are treated as better.
scorer_weighted_f1 = make_scorer(
    custom_scorer_weighted_f1,
    greater_is_better=True,
)


# Baseline model: a random forest with default hyperparameters and a fixed seed.
rf = RandomForestClassifier(random_state=42)

# Report both F1 variants for each of the 3 cross-validation folds.
f1_scorers = {"macro_f1": scorer_macro_f1, "weighted_f1": scorer_weighted_f1}
encoded_labels = y["Politikbereich"].values

scores = cross_validate(
    rf,
    X,
    encoded_labels,
    cv=3,
    scoring=f1_scorers,
    return_train_score=False,
    verbose=1,
    n_jobs=10,
)

scores

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:    1.3s finished


{'fit_time': array([0.67100072, 0.62700319, 0.66500306]),
 'score_time': array([0.0459981 , 0.04399872, 0.04099965]),
 'test_macro_f1': array([0.19360245, 0.31623307, 0.31377238]),
 'test_weighted_f1': array([0.41980581, 0.49310218, 0.54892923])}

### Hyperparameter tuning and model selection with GridSearchCV

In [None]:
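# NOTE(review): this entire cell is commented out and has no recorded output, so the
# grid search below does not run. If it is re-enabled, consider passing the labels as
# y["Politikbereich"].values (as the cross_validate call above does) rather than the
# one-column DataFrame y.
# import os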
# import pickle

# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import f1_score, make_scorer


# def custom_scorer(y_true,y_pred):
#     macro_f1 = f1_score(y_true, y_pred, average='macro')
#     print(macro_f1)
#     weighted_f1 = f1_score(y_true, y_pred, average='weighted')
#     print(weighted_f1)
#     return macro_f1

# scorer = make_scorer(custom_scorer, greater_is_better=True)

# def execute_pipeline(features,labels, search_space=[
#                     {"estimator": [RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1)],
#                     "estimator__n_estimators": [10, 25],
#                     "estimator__max_depth": [2, 6]
#                     }], 
#                     cv=3,
#                     verbose=1,
#                     n_jobs=os.cpu_count() - 2,
#                     scoring= scorer):
    
#     pipe = Pipeline([("estimator", RandomForestClassifier())])
    
#     gridsearch = GridSearchCV(pipe, search_space, scoring=scoring, cv=cv, verbose=verbose,n_jobs=n_jobs)
#     best_model = gridsearch.fit(features, labels)
#     print(best_model.best_estimator_)
#     print(best_model.best_score_)
#     return best_model

# best_estimator = execute_pipeline(X,y)

# pickle.dump(best_estimator,open( "pretrained_models/random_forest/best_estimator.pkl", "wb" ))