In [1]:
# Normalization Function: 

# Performs tokenization and lemmatization, keeps only tokens that contain letters,
# removes stopwords, converts to lower case, and handles umlauts - all optional

# Input: Tweets as array and the options listed above
# Output: Normalized tweets as array

import pandas as pd
import numpy as np

import re
import nltk
nltk.download('punkt') 
import string
from nltk.stem import WordNetLemmatizer

from pattern.de import tag
from nltk.corpus import wordnet as wn

from germalemma import GermaLemma
lemmatizer = GermaLemma()


# Load NLTK's German stopword list. Only 'punkt' is downloaded above, and the
# 'stopwords' corpus is a separate NLTK download, so fetch it on demand: the
# download is attempted only when the corpus lookup fails.
try:
    stopword_list = nltk.corpus.stopwords.words('german')
except LookupError:
    nltk.download('stopwords')
    stopword_list = nltk.corpus.stopwords.words('german')

# Extend the NLTK list with additional hand-picked stopwords.
# NOTE(review): this list mixes German and English tokens ('go', 'get', 'tell',
# 'listen', 'two', 'three') - confirm whether German equivalents were intended.
stopword_list = stopword_list + ['frau', 'herr', 'komme', 'go', 'get',
                                 'tell', 'listen', 'ein', 'two', 'three',
                                 'vier', 'fünf', 'sechs', 'sieben', 'acht',
                                 'neun', 'null', 'dass']

def tokenize_text(text):
    """Split ``text`` into word tokens with NLTK's German tokenizer.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens, each with leading/trailing whitespace stripped.
    """
    return [piece.strip()
            for piece in nltk.word_tokenize(text, language='german')]

def expand_contractions(text, contraction_mapping):
    """Expand contractions in ``text`` and strip the remaining apostrophes.

    Args:
        text: Input string.
        contraction_mapping: Dict mapping a contraction to its expansion.
            Keys are matched literally and case-insensitively.

    Returns:
        The text with every matched contraction replaced by its expansion.
        The first character of the matched text is kept in place of the
        expansion's first character, so the capitalisation found in ``text``
        survives. All ``'`` characters are removed afterwards.
    """
    # An empty key would make the pattern match the empty string at every
    # position (and indexing match[0] would then fail), so skip such keys.
    keys = [key for key in contraction_mapping if key]
    if not keys:
        return re.sub("'", "", text)

    # Fallback table for matches whose casing differs from the mapping keys.
    lowered_mapping = {}
    for key in keys:
        lowered_mapping.setdefault(key.lower(), contraction_mapping[key])

    # Escape the keys so regex metacharacters in them are matched literally,
    # and try longer keys first so a short key cannot shadow a longer one.
    alternatives = sorted(keys, key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in alternatives)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = contraction_mapping.get(match)
        if not expanded_contraction:
            expanded_contraction = lowered_mapping.get(match.lower())
        if expanded_contraction is None:
            # No expansion could be looked up: leave the matched text as is.
            return match
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    return re.sub("'", "", expanded_text)


# Annotate text tokens with POS tags
def pos_tag_text(text):
    """Tag ``text`` and return lower-cased tokens with a coarse POS class.

    Args:
        text: Raw text; it is passed to ``tag`` (imported from ``pattern.de``).

    Returns:
        A list of ``(token_lower, pos)`` tuples where ``pos`` is ``'N'``,
        ``'V'``, ``'ADJ'``, ``'ADV'``, or ``None`` for any other tag. A
        ``None`` tells ``lemmatize_text`` to keep the token unchanged.

    NOTE(review): the four POS strings are chosen to match what
    ``GermaLemma.find_lemma`` documents as accepted values, and the prefixes
    cover both Penn Treebank tags (JJ*, RB*, NN*, VB*) and STTS tags (ADJ*,
    ADV*, N*, V*) - confirm against the installed pattern/germalemma versions.
    """

    def to_lemmatizer_pos(pos_tag):
        # Match on the bare tag prefix; a prefix with a trailing '...' never
        # equals the start of a real tag and would map everything to None.
        if pos_tag.startswith(('ADJ', 'JJ')):
            return 'ADJ'
        if pos_tag.startswith(('ADV', 'RB')):
            return 'ADV'
        if pos_tag.startswith('V'):
            return 'V'
        if pos_tag.startswith('N'):
            return 'N'
        return None

    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(), to_lemmatizer_pos(pos_tag))
                         for word, pos_tag in tagged_text]
    return tagged_lower_text
    
# lemmatize text based on POS tags    
    
# NOTE(review): GermaLemma is already imported and instantiated near the top of
# this cell; these two lines import it again and rebind `lemmatizer` to a second,
# freshly constructed instance. Presumably one of the two can be dropped - confirm.
from germalemma import GermaLemma
lemmatizer = GermaLemma()

def lemmatize_text(text):
    """Lemmatize ``text`` token by token, guided by POS tags.

    The text is tagged with ``pos_tag_text``. Tokens with a truthy POS tag are
    replaced by ``lemmatizer.find_lemma(token, tag)``; tokens without one are
    kept as returned by the tagger.

    Returns:
        The resulting tokens joined by single spaces.
    """
    lemmas = []
    for token, pos in pos_tag_text(text):
        if pos:
            lemmas.append(lemmatizer.find_lemma(token, pos))
        else:
            lemmas.append(token)
    return ' '.join(lemmas)

def remove_special_characters(text):
    """Remove ASCII punctuation from ``text``.

    The text is tokenized with ``tokenize_text``; inside each token every
    character of ``string.punctuation`` is replaced by a space, so a token
    such as ``foo-bar`` yields the two words ``foo`` and ``bar``.

    Returns:
        The remaining words joined by single spaces. Tokens that consisted
        only of punctuation contribute nothing to the result.
    """
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = []
    for token in tokens:
        # Because punctuation becomes ' ' (not ''), a punctuation-only token
        # turns into whitespace rather than an empty string; split() discards
        # that whitespace so no empty fragments reach the join below.
        filtered_tokens.extend(pattern.sub(' ', token).split())
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
    
    
def remove_stopwords(text):
    """Drop every token of ``text`` that is a stopword, ignoring case.

    The text is tokenized with ``tokenize_text`` and each token is compared,
    lower-cased, against the lower-cased entries of the module-level
    ``stopword_list``.

    Returns:
        The surviving tokens (in their original casing) joined by single spaces.
    """
    # Lower-casing both sides lets capitalised tokens such as 'Frau' or 'Herr'
    # match entries like 'frau' and 'herr'; the set gives O(1) lookups per token
    # instead of scanning the whole list.
    stopwords_lower = {word.lower() for word in stopword_list}
    filtered_tokens = [token for token in tokenize_text(text)
                       if token.lower() not in stopwords_lower]
    return ' '.join(filtered_tokens)

def keep_text_characters(text):
    """Keep only the tokens that contain at least one ASCII letter.

    The text is tokenized with ``tokenize_text``; a token survives when the
    pattern ``[a-zA-Z]`` is found anywhere inside it.

    Returns:
        The surviving tokens joined by single spaces.
    """
    return ' '.join(
        candidate
        for candidate in tokenize_text(text)
        if re.search('[a-zA-Z]', candidate) is not None
    )

def remove_repeatition(text):
    """Shorten runs of three or more identical characters to exactly two.

    Runs of one or two identical characters are left untouched. The ``.`` in
    the pattern does not match a newline, so repeated newlines are not
    collapsed.
    """
    collapsed, _replacements = re.subn(r"(.)\1{2,}", r"\1\1", text)
    return collapsed

# Replace umlauts for a given text
def umlauts(text):
    """Transliterate German umlauts and sharp s into ASCII letter pairs.

    ``ä/ö/ü`` become ``ae/oe/ue``, ``Ä/Ö/Ü`` become ``Ae/Oe/Ue`` and ``ß``
    becomes ``ss``; every other character is passed through unchanged.
    """
    transliteration = str.maketrans({
        'ä': 'ae',
        'ö': 'oe',
        'ü': 'ue',
        'Ä': 'Ae',
        'Ö': 'Oe',
        'Ü': 'Ue',
        'ß': 'ss',
    })
    return text.translate(transliteration)


def normalize_tweet(tweet, lemmatize=True,
                    only_text_chars=True,
                    tokenize=False,
                    stopwords_removal=True,
                    lower_case=False,
                    umlauts_removal=True):
    """Normalize every text in ``tweet`` and return the results as a list.

    Steps are applied to each text in this fixed order; all but the
    punctuation removal can be switched off:

    1. ``lemmatize``         -> ``lemmatize_text``
    2. ``lower_case``        -> ``str.lower``
    3. (always)              -> ``remove_special_characters``
    4. ``stopwords_removal`` -> ``remove_stopwords``
    5. ``umlauts_removal``   -> ``umlauts``
    6. ``only_text_chars``   -> ``keep_text_characters``
    7. ``tokenize``          -> ``tokenize_text`` (entry becomes a token list)

    Args:
        tweet: Iterable of text strings.

    Returns:
        A list with one entry per input text: a string, or a list of tokens
        when ``tokenize`` is true.
    """

    def _normalize_single(text):
        if lemmatize:
            text = lemmatize_text(text)
        if lower_case:
            text = text.lower()
        text = remove_special_characters(text)
        if stopwords_removal:
            text = remove_stopwords(text)
        if umlauts_removal:
            text = umlauts(text)
        if only_text_chars:
            text = keep_text_characters(text)
        return tokenize_text(text) if tokenize else text

    return [_normalize_single(text) for text in tweet]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NALMPI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Feature Extraction Function

# Works basically for BoW, Tf-IDF and different n-grams (or n-gram-ranges)

# Input: Tweets as arrays, type of feature extraction, ngram_range, 
# max_df / min_df (ignore terms that have a document frequency strictly higher
# /lower than the given threshold)

# Output: Vectorizer and feature matrix

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    """Fit a vectorizer on ``documents`` and return it with the feature matrix.

    Args:
        documents: Iterable of text documents passed to ``fit_transform``.
        feature_type: ``'binary'``, ``'frequency'`` or ``'tfidf'``; the value
            is lower-cased and stripped before it is compared.
        ngram_range: Forwarded to the vectorizer.
        min_df: Forwarded to the vectorizer.
        max_df: Forwarded to the vectorizer.

    Returns:
        A ``(vectorizer, feature_matrix)`` tuple; the matrix is the result of
        ``fit_transform`` cast to ``float``.

    Raises:
        ValueError: If ``feature_type`` is not one of the three supported values.
    """
    feature_type = feature_type.lower().strip()

    # Options shared by all three vectorizers. The token pattern matches runs
    # of one or more word characters, so one-character tokens are kept too.
    shared_options = dict(min_df=min_df, max_df=max_df,
                          ngram_range=ngram_range,
                          token_pattern=u"(?u)\\b\\w+\\b")

    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, **shared_options)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, **shared_options)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(**shared_options)
    else:
        # ValueError is a subclass of Exception, so existing `except Exception`
        # handlers keep working.
        raise ValueError("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix
    

In [3]:
# Prepares arrays for both relevant variables (tweets and labels) for later 
# training and easier handling

def prepare_arrays(dataset):
    """Pull the tweet texts and binary labels out of ``dataset`` as arrays.

    Args:
        dataset: Object indexable by the keys ``"full_text"`` and
            ``"label_binary"``.

    Returns:
        A ``(tweets, labels)`` tuple of NumPy arrays built from those two
        entries, in that order.
    """
    tweets, labels = (np.array(dataset[column])
                      for column in ("full_text", "label_binary"))
    return tweets, labels

In [4]:
# Build a term-by-document table (rows = feature names, columns = documents) from the extracted feature matrix

def get_dfm(features, names):
    """Return the feature matrix as a transposed, labelled DataFrame.

    Args:
        features: 2-D feature data with one row per document and one column
            per feature. Objects exposing ``toarray()`` (e.g. SciPy sparse
            matrices) are converted to a dense array first.
        names: Column labels, one per feature.

    Returns:
        A DataFrame whose rows are labelled with ``names`` and whose columns
        are the documents (the transpose of the input layout).
    """
    # A sparse matrix is not expanded element-wise by the DataFrame
    # constructor, so densify it before building the frame.
    # NOTE(review): this materialises the full matrix in memory.
    if hasattr(features, "toarray"):
        features = features.toarray()
    df = pd.DataFrame(data=features, columns=names)
    return df.T