## File Preprocessing

### Text Preprocessing
- remove digits
- remove punctuation
- tokenize sentences
- remove stop words
- stem tokens
- lemmatize words

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
import warnings
# Install a filter that ignores every DeprecationWarning raised from here on.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

### Preprocessing using NLTK packages

In [2]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

def nltkPreprocessing(text):
    """Clean German text with NLTK and return its non-stop-word tokens.

    The text has digits, a few extra symbols and ASCII punctuation deleted,
    is lower-cased and stripped, tokenised with ``word_tokenize`` (called
    with its default language setting) and filtered against NLTK's German
    stop-word list.

    Stemming is deliberately not applied: ``SnowballStemmer('german')`` was
    tried and dropped because the results were not good.

    Args:
        text: Raw input string.

    Returns:
        The remaining lower-cased tokens, in their original order.
    """
    # Every character to delete goes into one table, so the text is scanned
    # once: digits, assorted symbols/typographic quotes, ASCII punctuation.
    removal_table = str.maketrans(
        '', '', '0123456789-/€®–„“|' + string.punctuation)
    text = text.translate(removal_table).lower().strip()

    word_tokens = word_tokenize(text)

    # A frozenset gives O(1) membership tests; the list returned by
    # stopwords.words() would be scanned linearly for every token.
    stop_words = frozenset(stopwords.words('german'))

    lowered_tokens = (w.lower() for w in word_tokens)
    return [w for w in lowered_tokens if w not in stop_words]



### Preprocessing using Spacy packages

In [3]:
import spacy
# Load the spaCy pipeline registered under the name 'de'; spacyPreprocessing
# uses it for tokenisation and lemmatisation.
# NOTE(review): the 'de' shortcut name is presumably only available in
# spaCy v2 installations — confirm before upgrading spaCy.
nlp = spacy.load('de')

def spacyPreprocessing(text):
    """Return the lemmas of the meaningful tokens in ``text`` using spaCy.

    Digits and a few symbols are deleted first, the remainder is
    lower-cased and run through the module-level ``nlp`` pipeline.  Tokens
    flagged as stop word, punctuation, whitespace, currency symbol or
    number-like are discarded; the lemma of every other token is returned
    in document order.
    """
    unwanted_chars = str.maketrans('', '', '0123456789-/€®–|')
    doc = nlp(text.translate(unwanted_chars).lower())

    def is_noise(token):
        # Any one of these flags is enough to drop the token.
        return (
            token.is_stop
            or token.is_punct
            or token.is_space
            or token.is_currency
            or token.like_num
        )

    return [token.lemma_ for token in doc if not is_noise(token)]

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


### Updated Preprocessing using Spacy packages

In [11]:
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re

# Largest number of characters handed to spaCy in a single call by
# preprocessing_spacy.
# NOTE(review): presumably chosen to stay below spaCy's default
# nlp.max_length — confirm.
MAXLEN = 999999 # max len processed by spaCy
# Coarse part-of-speech tags whose tokens preprocessing_spacy keeps.
pos    = ['ADJ', 'ADV', 'NOUN', 'NUM', 'PROPN', 'VERB']
# spaCy pipeline loaded under the name 'de' with the parser and NER
# components disabled.
german = spacy.load('de', disable=['parser', 'ner'])
# Flag "%" as punctuation so the is_punct filter in preprocessing_spacy
# drops it.
german.vocab["%"].is_punct = True

def preprocessing_spacy(text):
    """Lemmatise German text with spaCy and return its content words.

    Hyphens and en dashes are deleted, the text is run through the
    module-level ``german`` pipeline in slices of at most ``MAXLEN``
    characters, and only tokens whose part of speech is listed in ``pos``
    and that are not punctuation, stop words, currency symbols or
    number-like are kept.  The lemmas are then passed through a
    ``CountVectorizer`` that lower-cases them, keeps only words of at least
    three characters and removes German stop words.

    Args:
        text: Raw input string.

    Returns:
        A list of lower-cased words in the vectorizer's feature order, each
        repeated as many times as it occurs in the whole text.  The list is
        empty when nothing survives the filtering.
    """
    hyphen_table = str.maketrans('', '', '-–')
    text = text.translate(hyphen_table)

    # Walk the text in consecutive MAXLEN-sized slices so every part of it
    # is processed exactly once.  A slice boundary may fall inside a word.
    document_contents = []
    for start in range(0, len(text), MAXLEN):
        doc_model = german(text[start:start + MAXLEN])
        lemmata = [
            token.lemma_ for token in doc_model
            if token.pos_ in pos
            and not (token.is_punct or token.is_stop
                     or token.is_currency or token.like_num)
        ]
        document_contents.append(' '.join(lemmata))

    # Nothing to count: empty input or every token was filtered out.
    if not any(document_contents):
        return []

    german_word = '[a-zA-ZäöüÄÖÜß][a-zA-ZäöüÄÖÜß0-9]{2,}' # At least three characters long
    my_stop_words  = ['einer', 'eine', 'eines', 'einen', 'oder', 'aber', 'dass',  'teur', 'euro', 'eur', 'jahr', 'million', 'tausend', 'mio']
    stop_words = stopwords.words('german')
    stop_words.extend(my_stop_words)

    counter = CountVectorizer(analyzer='word', lowercase=True,
                              token_pattern=german_word, stop_words=stop_words)

    try:
        word_counts = counter.fit_transform(document_contents)
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary is empty,
        # e.g. when only stop words or too-short words remain.
        return []

    # get_feature_names() was removed in newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(counter, 'get_feature_names_out'):
        words = list(counter.get_feature_names_out())
    else:
        words = counter.get_feature_names()

    # Add up the counts of all slices (one matrix row per slice) in a single
    # pass instead of densifying the matrix once per word.
    totals = np.asarray(word_counts.sum(axis=0)).ravel()

    rebuilt_doc = []
    for word, count in zip(words, totals):
        rebuilt_doc.extend([word] * int(count))
    return rebuilt_doc