In [2]:
import pandas as pd

# Load the pre-selected dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("data/selected_data.csv")

# Disable truncation when DataFrames are displayed: show full cell contents and every row.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

def replace_class_label(label):
    """Return a short display name for the two verbose class labels.

    Any label that is not one of the two long names listed below is returned
    unchanged.
    """
    # (verbose label, short label) pairs, checked in order.
    renames = (
        ("Bürgerschaftliches Engagement, Bürgerbeteiligung", "Bürgerbeteiligung"),
        ("Kirchen, Religions-, Weltanschauungsgemeinschaften", "Religion"),
    )
    for verbose_name, short_name in renames:
        if label == verbose_name:
            return short_name
    return label
# Shorten the two verbose class names in the label column; all other labels pass through unchanged.
df["Politikbereich"] = df["Politikbereich"].apply(replace_class_label)
                     
# df.head(15)

In [3]:
# Per-class counts of non-null values in every column, largest "Zweck" count first.
tmp_df = (
    df.groupby(["Politikbereich"])
    .count()
    .sort_values(by="Zweck", ascending=False)
    .reset_index()
)
# display(tmp_df)

# Classes with fewer than two non-null "Zweck" entries are treated as minority classes.
minority_classes = tmp_df.loc[tmp_df["Zweck"] < 2, "Politikbereich"].values
# display(df[df["Politikbereich"].isin(minority_classes)].sort_values(by=["Politikbereich"]))

# Fold every minority class into the catch-all label "Sonstiges".
# Vectorised isin/mask replaces a per-row Python lambda that scanned the array for each row.
df["Politikbereich"] = df["Politikbereich"].mask(
    df["Politikbereich"].isin(minority_classes), "Sonstiges"
)
# display(df.groupby(["Politikbereich"]).count().sort_values(by="Zweck", ascending = False).reset_index())

In [4]:
from lib.utils.helper_functions import custom_train_test_split

# NOTE(review): custom_train_test_split is a project-local helper whose body is not visible here;
# presumably `cv` is the number of folds and `index` selects which fold is held out — confirm.
X_train_dfs, X_test_dfs, y_train_dfs, y_test_dfs = custom_train_test_split(df, index = 0, cv = 10)

In [10]:
import re
import spacy
from lib.charsplit.splitter import Splitter

class CleanText():
    """Regex-based cleanup of the free-text "Zweck" column.

    Two independent steps are available: stripping hyphens and digits, and
    expanding a small set of abbreviations. Calling the instance applies the
    selected steps to a DataFrame.
    """

    # (compiled pattern, replacement) pairs, applied in this order.
    # Each abbreviation is anchored with a leading \b so it only matches at the
    # start of a word; without it "co." would also fire on the tail of a word
    # that ends a sentence ("Disco." -> "DisKompanie").
    _ABBREVIATIONS = (
        (re.compile(r'\bAbs\.'), 'Absatz'),
        (re.compile(r'\be\.V\.'), 'eingetragener Verein'),
        (re.compile(r'\b[cC]o\.'), 'Kompanie'),
        (re.compile(r'\bgem\.'), 'gemäß'),
        # Possessive 's only: it must directly follow a word character and end
        # the word, so an opening quote before a word starting with "s" is kept.
        (re.compile(r"(?<=\w)'s\b"), ''),
    )

    def remove_digits(self, text):
        """Replace every hyphen with a space, then delete all runs of digits.

        Args:
            text: the string to clean.

        Returns:
            The cleaned string. Surrounding whitespace is not collapsed, so
            removed digits may leave consecutive or trailing spaces behind.
        """
        text = re.sub(r'-', ' ', text)
        text = re.sub(r'\d+', '', text)
        return text

    def replace_abbreviations(self, text):
        """Expand the custom abbreviations and drop possessive 's.

        Args:
            text: the string to clean.

        Returns:
            The string with "Abs.", "e.V.", "co."/"Co." and "gem." expanded to
            their long forms and a word-final possessive 's removed.
        """
        for pattern, replacement in self._ABBREVIATIONS:
            text = pattern.sub(replacement, text)
        return text

    def __call__(self, X, remove_digits = True, replace_abbreviations = True):
        """Apply the selected cleaning steps to the "Zweck" column of X.

        Args:
            X: DataFrame with a "Zweck" column of strings.
            remove_digits: run the hyphen/digit removal step.
            replace_abbreviations: run the abbreviation expansion step.

        Returns:
            If at least one step is enabled, a new DataFrame holding only the
            cleaned "Zweck" column with a fresh 0..n-1 index (other columns and
            the original index are not carried over). If both steps are
            disabled, X itself is returned untouched.
        """
        # The keyword arguments shadow the method names locally, so the methods
        # are always reached through self.
        steps = []
        if remove_digits:
            steps.append(self.remove_digits)
        if replace_abbreviations:
            steps.append(self.replace_abbreviations)
        if not steps:
            return X

        texts = X["Zweck"]
        for step in steps:
            texts = texts.apply(step)
        return pd.DataFrame(texts.values.tolist(), columns = ["Zweck"])

class SpacyPreprocessor():
    """Token-level normalisation of the "Zweck" column with a spaCy pipeline.

    Each text is tokenised by the pipeline; tokens can be filtered
    (punctuation, stop words, whitespace), lemmatised, and noun compounds can
    be split with the project's compound splitter. Surviving tokens are joined
    with single spaces.
    """

    # A proposed compound split is accepted only if its score exceeds this value.
    COMPOUND_SPLIT_THRESHOLD = 0.7

    def __init__(self):
        """Load the German pipeline without the components this class never uses."""
        # Excluding at load time avoids loading components only to remove them.
        self.nlp = spacy.load("de_core_news_lg",
                              exclude=["ner", "parser", "attribute_ruler"])
        self.splitter = Splitter()

    def split_compound_word(self, word):
        """Split a compound into two space-separated parts if the splitter is confident.

        Only the first candidate returned by the splitter is considered
        (presumably the best-scoring one — confirm against the splitter), and
        the parts are not split any further.

        Args:
            word: the word to split.

        Returns:
            "part1 part2" when the first candidate's score exceeds
            COMPOUND_SPLIT_THRESHOLD, otherwise the word unchanged. The word is
            also returned unchanged when the splitter yields no candidates.
        """
        candidates = self.splitter.split_compound(word)
        if not candidates:
            return word
        score, first_part, second_part = candidates[0]
        if score > self.COMPOUND_SPLIT_THRESHOLD:
            return " ".join([first_part, second_part])
        return word

    def normalize(self, text, remove_punctuation, remove_stopwords,
                            remove_spaces, split_compound_words,
                            lemmatize):
        """Normalise one text and return it as a space-joined token string.

        Args:
            text: the raw string.
            remove_punctuation: drop tokens flagged as punctuation.
            remove_stopwords: drop tokens flagged as stop words.
            remove_spaces: drop whitespace tokens.
            split_compound_words: split tokens tagged NOUN via split_compound_word.
            lemmatize: use each token's lemma instead of its surface text.

        Returns:
            The kept tokens joined by single spaces.
        """
        output = []
        for token in self.nlp(text):
            if remove_punctuation and token.is_punct:
                continue
            if remove_stopwords and token.is_stop:
                continue
            if remove_spaces and token.is_space:
                continue

            # Lemmatisation and compound splitting compose: a noun is lemmatised
            # first (if requested) and the result is then split, so the
            # lemmatize flag is no longer ignored for nouns.
            word = token.lemma_ if lemmatize else token.text
            if split_compound_words and token.pos_ == 'NOUN':
                word = self.split_compound_word(word)

            output.append(word)

        return " ".join(output)

    def __call__(self, X, remove_punctuation = True, remove_stopwords = False,
                            remove_spaces = True, split_compound_words = True,
                            lemmatize = False):
        """Normalise every entry of the "Zweck" column of X.

        Args:
            X: DataFrame with a "Zweck" column of strings.
            remove_punctuation, remove_stopwords, remove_spaces,
            split_compound_words, lemmatize: forwarded to normalize.

        Returns:
            A new DataFrame holding only the normalised "Zweck" column with a
            fresh 0..n-1 index (other columns and the original index are not
            carried over).
        """
        def normalize_one(text):
            return self.normalize(text, remove_punctuation, remove_stopwords,
                                  remove_spaces, split_compound_words, lemmatize)

        normalized = X["Zweck"].apply(normalize_one)
        return pd.DataFrame(normalized.values.tolist(), columns = ["Zweck"])

# Build both preprocessing stages; constructing SpacyPreprocessor loads the spaCy model.
text_cleaner_preprocessor = CleanText()
spacy_preprocessor = SpacyPreprocessor()

# Stage 1: regex cleanup on a copy of the training frame (abbreviations only, digits kept).
X = text_cleaner_preprocessor(
    X_train_dfs.copy(),
    remove_digits = False,
    replace_abbreviations = True,
)

# Stage 2: spaCy pass that only drops whitespace tokens; everything else is left as-is.
X = spacy_preprocessor(
    X,
    remove_punctuation = False,
    remove_stopwords = False,
    remove_spaces = True,
    split_compound_words = False,
    lemmatize = False,
)

# Preview the processed texts side by side with their labels.
pd.concat([X, y_train_dfs], axis=1).head(30)

Unnamed: 0,Zweck,Politikbereich
0,Kauf eines Segelflugzeuges,Sport
1,In Afrika nichts Neues ? Das Verhältnis zum Westen aus afrikanischen Perspektiven,Wirtschaft
2,Fachkraft für Sicherheitsdienstleistungen ( ABT-Kurs SFK-16.2 ),Arbeit
3,Integration und Partnerschaft im Garten der Begegnung,Jugend
4,Entwicklung eines digitalen Workshop-Konzept für Enduser im Bereich Produktion inkl. zugehörigen mobiler Applikation,Wirtschaft
5,FAV - Erlebniswelt Tier und Natur im Jungfernheidepark ( FAV ),Arbeit
6,Solaranlage Goldbeckweg ( Masterplan Integration und Sicherheit ),Jugend
7,Deli Social,Soziales
8,Demokratie leben Frauenreise,Jugend
9,BVBO 2.0 - 2016 / 2017 - Virchow 10,Arbeit
