### Load preprocessed dataset

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("data/preprocessed_data.csv")
df.fillna('',inplace=True)

X, y= df.loc[:, df.columns != 'Politikbereich'], df.loc[:,df.columns == 'Politikbereich']

### Encode class labels

In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

le.fit(y["Politikbereich"].unique().tolist())

y["Politikbereich"] = y["Politikbereich"].apply(lambda s: le.transform([s])[0])

y.head(5)

Unnamed: 0,Politikbereich
0,21
1,2
2,18
3,11
4,11


### Default Random Forest with TD-IDF vectorizer

Remark: the vectorizer shouldnt be fitted on both training and validation data, only training. I will correct this later.

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_tfidf = X.copy()

vectorizer = TfidfVectorizer()
vectorizer.fit(X_tfidf["Zweck"])

tfidf_encodings = vectorizer.transform(X_tfidf["Zweck"])

X_tfidf = pd.DataFrame(tfidf_encodings.toarray())
X_tfidf.columns = vectorizer.get_feature_names()

print(len(vectorizer.vocabulary_))
print(len(X_tfidf.iloc[0]))

X_tfidf.head(5)

1920
1920


Unnamed: 0,aad,aaron,abb,abenteuerspielplatz,abgefahren,ablauforganisatorische,abqueer,absatz,absent,absichtserkennung,...,zylinderbohrungen,ältere,öffentlichkeitsarbeit,öffnung,ögb,öpnv,übertragung,überwindung,übungs,übungsleitern
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score, make_scorer

def custom_scorer_macro_f1(y_true,y_pred):
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

scorer_macro_f1 = make_scorer(custom_scorer_macro_f1, greater_is_better=True)

def custom_scorer_weighted_f1(y_true,y_pred):
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return weighted_f1

scorer_weighted_f1 = make_scorer(custom_scorer_weighted_f1, greater_is_better=True)


rf = RandomForestClassifier(random_state=42)

scores = cross_validate(rf,
                        X_tfidf,
                        y["Politikbereich"].values,
                        cv=10,
                        scoring = {"macro_f1": scorer_macro_f1,"weighted_f1": scorer_weighted_f1},
                        return_train_score = False,
                        verbose=1,
                        n_jobs=10)
                                    
scores = pd.DataFrame(scores)

avg_macro_f1 = scores["test_macro_f1"].mean()
avg_weighted_f1 = scores["test_weighted_f1"].mean()

scores

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    4.5s remaining:   18.4s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    4.7s finished


Unnamed: 0,fit_time,score_time,test_macro_f1,test_weighted_f1
0,3.165005,0.041998,0.182503,0.447842
1,3.055007,0.059996,0.316578,0.496362
2,3.190001,0.023999,0.308986,0.539625
3,3.012001,0.066005,0.403031,0.615975
4,2.955001,0.063,0.466115,0.546395
5,3.202001,0.027001,0.607859,0.68835
6,3.141001,0.039002,0.503231,0.599808
7,2.965008,0.064992,0.396887,0.499028
8,3.175004,0.022997,0.481196,0.717489
9,3.188,0.023,0.324742,0.613242


In [5]:
import pandas as pd

tmp = pd.read_csv("results/results.csv")
tmp = tmp.append(   {
                    "model_name": "Random Forest",
                    "parameters": "default",
                    "dataset": "preprocessed (no augmentation), tfidf",
                    "macro_f1": avg_macro_f1,
                    "weighted_f1": avg_weighted_f1
                    },
                    ignore_index=True)

tmp.to_csv("results/results.csv", index=False)
tmp.head(10)

Unnamed: 0,model_name,parameters,dataset,macro_f1,weighted_f1
0,BERT base baseline,"default, 12 epochs",raw without duplicates,0.402406,0.714564
1,Random Forest,default,"preprocessed (no augmentation), tfidf",0.399113,0.576411


### Default Random Forest with BERT sentence embeddings

Reference: https://github.com/BramVanroy/bert-for-inference/blob/master/introduction-to-bert.ipynb

In [6]:
import torch
from transformers import BertModel, BertTokenizer
from lib.bert_pytorch.helper_functions import get_device

class BERT_sentences_embeddings:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')
        self.device = get_device()
        self.model = BertModel.from_pretrained('bert-base-german-cased', 
                                                output_hidden_states=True)\
                                                    .to(self.device)
        self.model.eval()
    
    def embed_sentence(self, sentence: str):

        ids_tensor = self.tokenizer.encode(sentence, return_tensors='pt')

        ids_tensor = ids_tensor.to(self.device)

        with torch.no_grad():
            out = self.model(input_ids=ids_tensor)

        hidden_states = out.hidden_states

        # # Last hidden layer
        # # print(hidden_states[-1].size()) # torch.Size([1, n_words, 768])
        # sentence_embedding = torch.mean(hidden_states[-1], dim=1).squeeze()
        # # print(sentence_embedding.size()) # torch.Size([768])

        # # Concat last 4 hidden layers (see reference)
        # last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        # cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)
        # cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()

        # Sum last 4 hidden layers
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        sentence_embedding = torch.cat(tuple(last_four_layers), dim=0)
        # print(cat_hidden_states.size()) # torch.Size([4, n_words, 768])
        sentence_embedding = torch.mean(sentence_embedding, dim=0)
        # print(cat_sentence_embedding.size()) # torch.Size([n_words, 768])
        sentence_embedding = torch.mean(sentence_embedding, dim=0)
        # print(cat_sentence_embedding.size()) # torch.Size([768])
        
        return sentence_embedding.cpu().numpy()

In [7]:
sentences_encoder = BERT_sentences_embeddings()

X_bert_sentences_embeddings = X.copy()

X_bert_sentences_embeddings = X_bert_sentences_embeddings["Zweck"].apply(lambda s: sentences_encoder.embed_sentence(s))

X_bert_sentences_embeddings = pd.DataFrame(X_bert_sentences_embeddings.values.tolist())

X_bert_sentences_embeddings.head(5)

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2070 with Max-Q Design


Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.321901,-0.172469,0.164475,0.090086,-0.620622,0.396059,-0.243543,0.143387,-0.269414,0.46035,...,-0.505437,-0.021107,-0.197963,-0.090727,-0.629851,0.024782,-0.517913,0.63502,-0.026206,0.067707
1,-0.380983,0.107464,0.550635,0.463361,-0.0915,0.342285,0.030745,-0.16056,0.061748,0.040033,...,-0.889845,0.006009,0.832944,-0.184656,-0.191374,-0.032211,0.089964,-0.028981,-0.436644,0.309008
2,-0.014324,-0.254108,0.207883,0.526265,-0.495684,-0.270026,-0.108872,0.26652,-0.361461,0.144416,...,-0.683479,-0.281656,0.535357,-0.361051,-0.399404,0.091353,-0.240713,0.110006,-0.098588,-0.307411
3,0.691038,0.376838,0.289249,0.238886,-0.051425,-0.265966,-0.228752,0.695132,0.159335,-0.254782,...,-0.079896,0.448843,0.331829,-0.230279,-0.283521,-0.025756,0.083556,-0.596719,0.272429,0.569072
4,-0.281747,0.571193,-0.15027,0.253571,-0.243861,0.186255,0.091825,-0.001978,-0.095922,0.039364,...,0.043577,0.178394,0.231799,-0.655587,-0.779325,-0.244315,0.582203,-0.185745,0.348684,-0.036223


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score, make_scorer

def custom_scorer_macro_f1(y_true,y_pred):
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

scorer_macro_f1 = make_scorer(custom_scorer_macro_f1, greater_is_better=True)

def custom_scorer_weighted_f1(y_true,y_pred):
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    return weighted_f1

scorer_weighted_f1 = make_scorer(custom_scorer_weighted_f1, greater_is_better=True)


rf = RandomForestClassifier(random_state=42)

scores = cross_validate(    rf, 
                            X_bert_sentences_embeddings, 
                            y["Politikbereich"].values,
                            cv=10,
                            scoring = {"macro_f1": scorer_macro_f1,"weighted_f1": scorer_weighted_f1},
                            return_train_score = False,
                            verbose=1,
                            n_jobs=10
                        )
                                    
scores = pd.DataFrame(scores)

avg_macro_f1 = scores["test_macro_f1"].mean()
avg_weighted_f1 = scores["test_weighted_f1"].mean()

scores

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    1.9s remaining:    8.1s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    2.1s finished


Unnamed: 0,fit_time,score_time,test_macro_f1,test_weighted_f1
0,2.015987,0.020002,0.145224,0.425327
1,2.016987,0.022999,0.140488,0.462081
2,2.109027,0.022007,0.168477,0.474726
3,2.008998,0.022002,0.251913,0.496268
4,2.100026,0.023,0.251456,0.518174
5,2.043998,0.018998,0.383842,0.579672
6,2.000996,0.020001,0.273865,0.525934
7,2.010997,0.024001,0.202736,0.4825
8,2.145999,0.022026,0.139992,0.42913
9,2.124033,0.013996,0.1696,0.451497


In [9]:
import pandas as pd

tmp = pd.read_csv("results/results.csv")
tmp = tmp.append(   {
                    "model_name": "Random Forest",
                    "parameters": "default",
                    "dataset": "preprocessed (no augmentation), bert sentence embeddings",
                    "macro_f1": avg_macro_f1,
                    "weighted_f1": avg_weighted_f1
                    },
                    ignore_index=True)

tmp.to_csv("results/results.csv", index=False)
tmp.head(10)

Unnamed: 0,model_name,parameters,dataset,macro_f1,weighted_f1
0,BERT base baseline,"default, 12 epochs",raw without duplicates,0.402406,0.714564
1,Random Forest,default,"preprocessed (no augmentation), tfidf",0.399113,0.576411
2,Random Forest,default,"preprocessed (no augmentation), bert sentence ...",0.212759,0.484531


### Parameters hypertuning and model selection with GridSearchCV

In [10]:
# import os
# import pickle

# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import f1_score, make_scorer


# def custom_scorer(y_true,y_pred):
#     macro_f1 = f1_score(y_true, y_pred, average='macro')
#     print(macro_f1)
#     weighted_f1 = f1_score(y_true, y_pred, average='weighted')
#     print(weighted_f1)
#     return macro_f1

# scorer = make_scorer(custom_scorer, greater_is_better=True)

# def execute_pipeline(features,labels, search_space=[
#                     {"estimator": [RandomForestClassifier(random_state=42, verbose=1, n_jobs=-1)],
#                     "estimator__n_estimators": [10, 25],
#                     "estimator__max_depth": [2, 6]
#                     }], 
#                     cv=3,
#                     verbose=1,
#                     n_jobs=os.cpu_count() - 2,
#                     scoring= scorer):
    
#     pipe = Pipeline([("estimator", RandomForestClassifier())])
    
#     gridsearch = GridSearchCV(pipe, search_space, scoring=scoring, cv=cv, verbose=verbose,n_jobs=n_jobs)
#     best_model = gridsearch.fit(features, labels)
#     print(best_model.best_estimator_)
#     print(best_model.best_score_)
#     return best_model

# best_estimator = execute_pipeline(X,y)

# pickle.dump(best_estimator,open( "pretrained_models/random_forest/best_estimator.pkl", "wb" ))