# Bereinigung

In [1]:
import re, warnings
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
import pickle
warnings.filterwarnings('ignore')  # Silence every warning category for the rest of the session

# Load the small English spaCy model once at module level; every cleaning
# function below reaches for this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

## Bereinigungsfunktionen

In [2]:
def tolower(doc):
    """Return a new Doc built from the lower-cased tokens of ``doc``.

    The lower-cased form of every token (``token.lower_``) is joined with
    single spaces and the resulting string is handed to ``nlp.make_doc``.

    Args:
        doc: Iterable of spaCy tokens (normally a Doc).

    Returns:
        The object produced by ``nlp.make_doc`` for the lower-cased text.
    """
    lowered_text = ' '.join(token.lower_ for token in doc)
    return nlp.make_doc(lowered_text)

In [3]:
def lemmatizer(doc):
    """Return a new Doc built from the lemmas of ``doc``, minus pronoun placeholders.

    Every token is replaced by its lemma (``token.lemma_``); tokens whose
    lemma is the pronoun placeholder are dropped. The remaining lemmas are
    joined with single spaces and handed to ``nlp.make_doc``.

    Args:
        doc: Iterable of spaCy tokens (normally a Doc).

    Returns:
        The object produced by ``nlp.make_doc`` for the lemmatized text.
    """
    # spaCy 2.x uses '-PRON-' as the lemma of every pronoun, so comparing
    # against 'PRON' alone never filtered anything. The bare 'PRON' spelling
    # is still skipped to stay compatible with the previous behaviour.
    pronoun_lemmas = ('-PRON-', 'PRON')
    lemmas = [token.lemma_ for token in doc if token.lemma_ not in pronoun_lemmas]
    return nlp.make_doc(u' '.join(lemmas))

In [4]:
def notNUM(doc):
    """Return a new Doc without the tokens tagged as numerals.

    A token is dropped when its coarse part-of-speech tag (``token.pos_``)
    equals ``'NUM'``. The surviving token texts are joined with single
    spaces and handed to ``nlp.make_doc``.

    Args:
        doc: Iterable of spaCy tokens (normally a Doc).

    Returns:
        The object produced by ``nlp.make_doc`` for the filtered text.
    """
    kept = []
    for token in doc:
        if token.pos_ == 'NUM':
            continue
        kept.append(token.text)
    return nlp.make_doc(u' '.join(kept))

In [5]:
def noSYM(doc):
    """Return a new Doc without symbol tokens, with the rest lower-cased.

    Tokens whose ``pos_`` is ``'SYM'`` are dropped. Unlike the other
    filters in this file, the kept token texts are also lower-cased before
    they are joined with single spaces and handed to ``nlp.make_doc``.

    Args:
        doc: Iterable of spaCy tokens (normally a Doc).

    Returns:
        The object produced by ``nlp.make_doc`` for the filtered text.
    """
    non_symbols = (token for token in doc if token.pos_ != 'SYM')
    filtered_text = u' '.join(token.text.lower() for token in non_symbols)
    return nlp.make_doc(filtered_text)

In [6]:
def single_char(doc):
    """Return a new Doc that keeps only tokens of three or more characters.

    Despite the name, tokens of one *or two* characters are dropped: a
    token survives only if ``len(token.text) > 2``. The surviving token
    texts are joined with single spaces and handed to ``nlp.make_doc``.

    Args:
        doc: Iterable of spaCy tokens (normally a Doc).

    Returns:
        The object produced by ``nlp.make_doc`` for the filtered text.
    """
    min_length = 3
    long_tokens = []
    for token in doc:
        word = token.text
        if len(word) >= min_length:
            long_tokens.append(word)
    return nlp.make_doc(u' '.join(long_tokens))

In [7]:
def rm_punct(doc):
    """Return a new Doc without punctuation tokens.

    Tokens flagged by ``token.is_punct`` are dropped; the remaining token
    texts are joined with single spaces and handed to ``nlp.make_doc``.

    Args:
        doc: Iterable of spaCy tokens (normally a Doc).

    Returns:
        The object produced by ``nlp.make_doc`` for the filtered text.
    """
    words = []
    for token in doc:
        if token.is_punct:
            continue
        words.append(token.text)
    return nlp.make_doc(u' '.join(words))

In [8]:
def rm_adj(doc):
    """Return a new Doc without the tokens tagged as adjectives.

    A token is dropped when ``token.pos_`` equals ``'ADJ'``. The surviving
    token texts are joined with single spaces and handed to
    ``nlp.make_doc``.

    Args:
        doc: Iterable of spaCy tokens (normally a Doc).

    Returns:
        The object produced by ``nlp.make_doc`` for the filtered text.
    """
    non_adjectives = (token.text for token in doc if not token.pos_ == 'ADJ')
    return nlp.make_doc(u' '.join(non_adjectives))

In [9]:
def rm_other(doc):
    """Return a new Doc without tokens that could not be classified.

    A token is dropped when ``token.pos_`` equals ``'X'`` (the "other"
    part-of-speech category). The surviving token texts are joined with
    single spaces and handed to ``nlp.make_doc``.

    Args:
        doc: Iterable of spaCy tokens (normally a Doc).

    Returns:
        The object produced by ``nlp.make_doc`` for the filtered text.
    """
    identified = []
    for token in doc:
        if token.pos_ != 'X':
            identified.append(token.text)
    filtered_text = u' '.join(identified)
    return nlp.make_doc(filtered_text)

In [10]:
def rm_verb(doc):
    """Return a new Doc without the tokens tagged as verbs.

    A token is dropped when ``token.pos_`` equals ``'VERB'``. The surviving
    token texts are joined with single spaces and handed to
    ``nlp.make_doc``.

    Args:
        doc: Iterable of spaCy tokens (normally a Doc).

    Returns:
        The object produced by ``nlp.make_doc`` for the filtered text.
    """
    remaining = [token for token in doc if token.pos_ != 'VERB']
    filtered_text = u' '.join(token.text for token in remaining)
    return nlp.make_doc(filtered_text)

In [11]:
# Characters stripped from cleaned text: ã, â, ë and ¢ (presumably debris
# from mis-decoded input -- confirm), line breaks, and every digit.
# Compiled once as a raw string so that `\d` is a regex escape rather than an
# invalid Python string escape.
_UNWANTED_CHARS = re.compile(r'[ãâë¢\d\n]+')


def remove_additional_patterns(doc):
    """Strip unwanted characters from a plain string.

    Removes every occurrence of ``ã``, ``â``, ``ë``, ``¢``, newline and any
    digit. Nothing is inserted in their place, so text on either side of a
    removed character is glued together.

    Args:
        doc: The text to clean, as a ``str`` (not a spaCy Doc).

    Returns:
        The cleaned string; an empty input yields an empty string.
    """
    # One pass over a character class gives the same result as the former
    # sequence of per-character `X*` substitutions, because removing single
    # characters is independent of the order in which they are removed.
    return _UNWANTED_CHARS.sub('', doc)

### Stop Words

In [12]:
def add_own_stopwords(stop_list=None):
    """Add custom stop words to the shared ``nlp`` pipeline.

    The words are merged into ``nlp.Defaults.stop_words`` and afterwards
    the ``is_stop`` flag is set on the vocabulary entry of every known stop
    word, so that ``token.is_stop`` reflects the extended list.

    Args:
        stop_list: Iterable of additional stop word strings. ``None`` (the
            default) adds nothing and only refreshes the flags.
    """
    # A None default avoids sharing one mutable list object across calls.
    if stop_list is not None:
        nlp.Defaults.stop_words.update(stop_list)

    # Re-flag every stop word. The imported STOP_WORDS and the pipeline's
    # own set are normally the same object; taking the union keeps the
    # custom words covered even if the two have diverged.
    for word in set(STOP_WORDS).union(nlp.Defaults.stop_words):
        nlp.vocab[word].is_stop = True


def remove_stopwords(doc):
    """Return the texts of all tokens that carry content.

    Stop words, punctuation and whitespace tokens are skipped. The result
    is a plain list of strings (``token.text``) rather than a Doc, which is
    the form needed later for Gensim.

    Args:
        doc: Iterable of tokens exposing ``is_stop``, ``is_punct``,
            ``is_space`` and ``text``.

    Returns:
        List of the remaining token texts, in their original order.
    """
    content_words = []
    for token in doc:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        content_words.append(token.text)
    return content_words

### Pipeline

In [13]:
def to_pipline(my_stop_words):
    """Register the cleaning components on the shared ``nlp`` pipeline.

    First the custom stop words are added, then ``tolower`` is registered
    as the first component, four filters are registered after ``'ner'``,
    and ``remove_stopwords`` is registered as the last component.

    A ``ValueError`` raised during setup is printed instead of propagated,
    and any registrations that would have followed it are skipped
    (presumably this covers re-running the cell when the component names
    already exist -- confirm).

    Args:
        my_stop_words: Iterable of extra stop words, forwarded to
            ``add_own_stopwords``.

    Returns:
        The module-level ``nlp`` object, whether or not setup completed.
    """
    # Filters registered with after='ner', listed in registration order.
    # NOTE(review): each call inserts relative to 'ner', so the order in
    # which they actually run is presumably the reverse of this listing --
    # confirm against the spaCy add_pipe documentation.
    # Deliberately not registered: remove_additional_patterns, rm_adj and
    # rm_verb; noSYM and rm_punct were noted as yielding no results.
    after_ner = (
        (lemmatizer, 'lemmatizer'),
        (rm_other, 'rm_other'),
        (notNUM, 'notNUM'),
        (single_char, 'single_char'),
    )
    try:
        add_own_stopwords(my_stop_words)
        nlp.add_pipe(tolower, name='tolower', first=True)
        for component, label in after_ner:
            nlp.add_pipe(component, name=label, after='ner')
        nlp.add_pipe(remove_stopwords, name="stopwords", last=True)
    except ValueError as err:
        print(err)
    return nlp

# Extra stop words handed to the pipeline setup; '\ufeff' is the Unicode
# byte-order mark (U+FEFF).
my_stop_words = ['\ufeff1', '¢‚¬*', 'ë*']
# Configure the shared pipeline; to_pipline returns the module-level nlp object.
nlp = to_pipline(my_stop_words)

### Anwendung auf eine Spalte

In [14]:
from tqdm import tqdm_notebook as tqdm

def clean_column(column, filename):
    """Clean every text of a column in place and pickle the result.

    Each value is run through the module-level ``nlp`` pipeline, the
    returned tokens are joined with single spaces, and the string is passed
    through ``remove_additional_patterns``. The cleaned text replaces the
    original value in ``column``.

    Args:
        column: pandas Series of texts; it is modified in place. Assumes
            ``nlp(text)`` yields an iterable of strings, as produced when
            ``remove_stopwords`` is the last pipeline component.
        filename: Base name (without extension) of the pickle file written
            into the ``data/`` directory.

    Returns:
        The same Series object, now holding the cleaned texts.
    """
    # Snapshot the values first so the loop never iterates over the Series
    # it is modifying.
    for position, text in enumerate(column.tolist()):
        tokens = nlp(text)
        cleaned = remove_additional_patterns(u' '.join(tokens))
        # Write back by position: a label-based `.loc` assignment would
        # overwrite every row that shares a duplicated index label.
        column.iloc[position] = cleaned
    column.to_pickle("data/" + filename + ".pkl")
    return column

In [15]:
pd.set_option('display.max_colwidth', 100)

# ****************************************************************************
# Define own stop words
my_stop_words = ['\ufeff1', '¢‚¬*', 'ë*']


# Load the corpus. The context manager closes the file handle as soon as the
# data has been read, instead of leaving it open for the rest of the session.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load knowledge bases from a trusted source.
with open("data/knowledgebase.pkl", "rb") as file:
    kb = pickle.load(file)

# Run the cleaning on every text column. Each cleaned column is also pickled
# separately by clean_column as data/clean_<column name>.pkl.
# 'Enviroment' is spelled exactly as the column is named in the data.
for column_name in ('Topic', 'Question', 'Reference', 'Enviroment', 'Resolution'):
    kb[column_name] = clean_column(kb[column_name], 'clean_' + column_name)

# Persist the cleaned knowledge base as a semicolon-separated CSV and as a pickle.
kb.to_csv("r/kb.csv", sep=";")
kb.to_pickle("data/clean_kb.pkl")