In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

import pickle, gensim, numpy as np

from utilities import get_train_data, get_test_data, Tokenizer, find_subtoken

# Prefix for every pickled feature matrix written by this notebook. It is
# combined with file names by plain string concatenation, so it has to be a
# string ending in a path separator. The `...` is a placeholder to fill in.
PICKLE_FOLDER_PATH = ...

# Inputs passed to get_train_data / get_test_data (placeholders to fill in).
TRAIN_FILENAME = ...
TEST_FILENAME  = ...

#------------------------------
# Pre-trained word2vec embeddings loaded below with gensim.
# Source:
# http://www.cl.uni-heidelberg.de/english/research/downloads/resource_pages/GermanTwitterEmbeddings/GermanTwitterEmbeddings_data.shtml
MODEL_FILENAME  = "twitter-de_d100_w5_min10.bin" # 821,8 MB
# Length of the zero vectors the feature code creates with np.zeros; the
# vectors of MODEL_FILENAME are added to them, so the sizes must agree.
MODEL_DIMENSION = 100

In [None]:
%%time
# Load the pre-trained embeddings from a binary word2vec-format file.
word2vec_model  = gensim.models.KeyedVectors.load_word2vec_format(MODEL_FILENAME, binary=True)

# Training tweets plus two label collections: y_train_t1 is compared against
# 'OTHER'/'OFFENSE' and y_train_t2 against 'PROFANITY'/'ABUSE'/'INSULT' further
# down. The test split comes without labels.
X_train, y_train_t1, y_train_t2 = get_train_data(TRAIN_FILENAME)
X_test                          = get_test_data(TEST_FILENAME)

### NGRAM FEATURES

In [None]:
# TF-IDF over character 3- to 7-grams. The project Tokenizer is plugged in as
# the preprocessor (join=True presumably returns the tokens re-joined into one
# string -- confirm in utilities).
char_vect  = TfidfVectorizer(analyzer="char", ngram_range=(3, 7), max_df=0.01, min_df=0.0002,
                             preprocessor=Tokenizer(preserve_case=False, join=True).tokenize)

# TF-IDF over token uni- to tri-grams, tokenised (and stemmed) by the project Tokenizer.
token_vect = TfidfVectorizer(analyzer="word", ngram_range=(1, 3), max_df=0.01, min_df=0.0002,
                             tokenizer=Tokenizer(preserve_case=False, use_stemmer=True).tokenize)

# Vocabulary and IDF weights are fitted on the training tweets only; the test
# tweets are projected onto that fitted vocabulary.
X_CNGR_train = char_vect.fit_transform(X_train)
X_CNGR_test  = char_vect.transform(X_test)

X_TNGR_train = token_vect.fit_transform(X_train)
X_TNGR_test  = token_vect.transform(X_test)

In [None]:
# Persist the n-gram feature matrices. Each file is opened in a `with` block so
# the handle is flushed and closed as soon as its dump has finished.
with open(PICKLE_FOLDER_PATH + "X_CNGR_train.p", "wb") as out_file:
    pickle.dump(X_CNGR_train, out_file)
with open(PICKLE_FOLDER_PATH + "X_CNGR_test.p", "wb") as out_file:
    pickle.dump(X_CNGR_test, out_file)

with open(PICKLE_FOLDER_PATH + "X_TNGR_train.p", "wb") as out_file:
    pickle.dump(X_TNGR_train, out_file)
with open(PICKLE_FOLDER_PATH + "X_TNGR_test.p", "wb") as out_file:
    pickle.dump(X_TNGR_test, out_file)

### EMB FEATURES

In [None]:
def get_EMB_feats(tweets):
    """Build one averaged, normalised embedding vector per tweet.

    Each tweet is tokenised with ``Tokenizer(preserve_case=True)``. The
    vectors of its tokens are summed and divided by the number of tokens.
    A token missing from ``word2vec_model`` is replaced by the vectors of
    the sub-tokens that ``find_subtoken`` returns for modes ``'initial'``
    and ``'final'``. When both exist they count as two terms in the
    average; when neither exists the token contributes nothing to the sum
    but still counts in the denominator.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.

    Returns
    -------
    The result of ``sklearn.preprocessing.normalize`` (default settings)
    applied to the per-tweet averages, one row per tweet with
    ``MODEL_DIMENSION`` columns. A tweet that yields no tokens gets an
    all-zero row.
    """
    tknzr = Tokenizer(preserve_case=True)
    tokenized_tweets = [tknzr.tokenize(tweet) for tweet in tweets]

    X_EMB = []

    for tweet in tokenized_tweets:
        emb = np.zeros(MODEL_DIMENSION)
        extra_tokens = 0

        for token in tweet:
            try:
                emb += word2vec_model[token]
            except KeyError:
                # Out-of-vocabulary token: fall back to known sub-tokens.
                prefix = find_subtoken(token, word2vec_model, mode='initial')
                suffix = find_subtoken(token, word2vec_model, mode='final')

                if prefix is not None and suffix is not None:
                    emb += word2vec_model[prefix] + word2vec_model[suffix]
                    # Two vectors were added for a single token.
                    extra_tokens += 1
                elif prefix is not None:
                    emb += word2vec_model[prefix]
                elif suffix is not None:
                    emb += word2vec_model[suffix]

        n_terms = len(tweet) + extra_tokens
        # A tweet without tokens would give 0/0 = NaN, which normalize()
        # rejects; keep the zero vector instead.
        if n_terms:
            emb /= n_terms
        X_EMB.append(emb)

    return normalize(X_EMB)

In [None]:
%%time
# Averaged-embedding features for both splits, produced by the same function.
X_EMB_train = get_EMB_feats(X_train)
X_EMB_test  = get_EMB_feats(X_test)

In [None]:
# Persist the embedding features. `with` guarantees each file is flushed and
# closed once its dump has finished.
with open(PICKLE_FOLDER_PATH + "X_EMB_train.p", "wb") as out_file:
    pickle.dump(X_EMB_train, out_file)
with open(PICKLE_FOLDER_PATH + "X_EMB_test.p", "wb") as out_file:
    pickle.dump(X_EMB_test, out_file)

### TIMP FEATURES

In [None]:
def k_most_imp_tokenlvl(k, category, max_df=0.01, min_df=0.0002):
    """Return the ``k`` tokens with the highest mean TF-IDF in one category.

    A case-preserving unigram TF-IDF model is fitted on the global
    ``X_train`` on every call. The TF-IDF rows of the training tweets
    labelled ``category`` are averaged and the vocabulary is ranked by
    that mean, highest first.

    Parameters
    ----------
    k : int
        Number of tokens to return (fewer if the vocabulary is smaller).
    category : str
        ``'OTHER'`` or ``'OFFENSE'`` (looked up in ``y_train_t1``), or
        ``'PROFANITY'``, ``'ABUSE'`` or ``'INSULT'`` (looked up in
        ``y_train_t2``).
    max_df, min_df : float
        Document-frequency bounds forwarded to ``TfidfVectorizer``.

    Returns
    -------
    list of str

    Raises
    ------
    ValueError
        If ``category`` is not one of the labels listed above.
    """
    # Resolve the label array first so a bad category fails before the
    # expensive fit (the original hit an unbound local afterwards).
    if category in ('OTHER', 'OFFENSE'):
        labels = y_train_t1
    elif category in ('PROFANITY', 'ABUSE', 'INSULT'):
        labels = y_train_t2
    else:
        raise ValueError("unknown category: {!r}".format(category))

    token_vect = TfidfVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False,
                                 max_df=max_df, min_df=min_df,
                                 tokenizer=Tokenizer(preserve_case=True).tokenize)

    tfidf = token_vect.fit_transform(X_train)

    inv_vocab = {index: word for word, index in token_vect.vocabulary_.items()}

    # Assumes the labels support element-wise comparison (e.g. a numpy array).
    cat_ids = np.where(labels == category)[0]

    # Column indices ordered by descending mean TF-IDF within the category.
    most_imp_ids = np.argsort(np.asarray(np.mean(tfidf[cat_ids], axis=0)).flatten())[::-1]

    # Only the first k indices need to be mapped back to tokens.
    return [inv_vocab[index] for index in most_imp_ids[:k]]

def get_TIMP_feats(tweets, k, category, max_df=0.01, min_df=0.0002):
    """Similarity of each tweet to the most important tokens of a category.

    The ``k`` most important tokens come from ``k_most_imp_tokenlvl``.
    For every tweet, the cosine similarity between each of its token
    vectors and each important-token vector is computed; the features are
    the per-important-token maximum followed by the per-important-token
    minimum over the tweet's tokens.

    Tokens missing from ``word2vec_model`` are replaced by the vectors of
    the sub-tokens found by ``find_subtoken`` (modes ``'initial'`` and
    ``'final'``), or by a zero vector when none is found. Important tokens
    missing from the model are represented by a zero vector as well.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.
    k : int
        Number of important tokens requested.
    category : str
        Category label forwarded to ``k_most_imp_tokenlvl``.
    max_df, min_df : float
        Forwarded to ``k_most_imp_tokenlvl``.

    Returns
    -------
    numpy.ndarray
        One row per tweet; the maxima fill the first half of the columns
        and the minima the second half. A tweet without tokens gets an
        all-zero row.
    """
    feats_max = []
    feats_min = []

    imp_tokens_vectors = []
    for imp_token in k_most_imp_tokenlvl(k, category, max_df=max_df, min_df=min_df):
        try:
            imp_tokens_vectors.append(word2vec_model[imp_token])
        except KeyError:
            imp_tokens_vectors.append(np.zeros(MODEL_DIMENSION))
    # Loop-invariant: build the comparison matrix once, not once per tweet.
    imp_matrix = np.asarray(imp_tokens_vectors)

    tknzr = Tokenizer(preserve_case=True)
    tokenized_tweets = [tknzr.tokenize(tweet) for tweet in tweets]

    for tweet in tokenized_tweets:
        tweet_vectors = []
        for token in tweet:
            try:
                tweet_vectors.append(word2vec_model[token])
            except KeyError:
                # Out-of-vocabulary token: use whichever sub-tokens are known.
                prefix = find_subtoken(token, word2vec_model, mode='initial')
                suffix = find_subtoken(token, word2vec_model, mode='final')
                known = [sub for sub in (prefix, suffix) if sub is not None]

                if known:
                    tweet_vectors.extend(word2vec_model[sub] for sub in known)
                else:
                    tweet_vectors.append(np.zeros(MODEL_DIMENSION))

        if tweet_vectors:
            similarity = cosine_similarity(np.asarray(tweet_vectors), imp_matrix)
            feats_max.append(np.amax(similarity, axis=0))
            feats_min.append(np.amin(similarity, axis=0))
        else:
            # cosine_similarity cannot handle an empty tweet; emit zeros, the
            # same value an all-unknown tweet produces.
            no_similarity = np.zeros(len(imp_tokens_vectors))
            feats_max.append(no_similarity)
            feats_min.append(no_similarity)

    return np.concatenate((feats_max, feats_min), axis=1)

In [None]:
%%time
# Number of important tokens requested per category for each task.
N_TIMP_TASK1 = 1250
N_TIMP_TASK2 = 170

# Task 1: per-category TIMP blocks ('OTHER', then 'OFFENSE') joined column-wise.
X_TIMP_task1_train = np.concatenate(
    [get_TIMP_feats(X_train, N_TIMP_TASK1, category)
     for category in ('OTHER', 'OFFENSE')],
    axis=1)

X_TIMP_task1_test = np.concatenate(
    [get_TIMP_feats(X_test, N_TIMP_TASK1, category)
     for category in ('OTHER', 'OFFENSE')],
    axis=1)

# Task 2: same construction over the four task-2 categories, in this order.
X_TIMP_task2_train = np.concatenate(
    [get_TIMP_feats(X_train, N_TIMP_TASK2, category)
     for category in ('OTHER', 'ABUSE', 'INSULT', 'PROFANITY')],
    axis=1)

X_TIMP_task2_test = np.concatenate(
    [get_TIMP_feats(X_test, N_TIMP_TASK2, category)
     for category in ('OTHER', 'ABUSE', 'INSULT', 'PROFANITY')],
    axis=1)

In [None]:
# Persist the TIMP features. `with` guarantees each file is flushed and closed
# once its dump has finished.
with open(PICKLE_FOLDER_PATH + "X_TIMP_task1_train.p", "wb") as out_file:
    pickle.dump(X_TIMP_task1_train, out_file)
with open(PICKLE_FOLDER_PATH + "X_TIMP_task1_test.p", "wb") as out_file:
    pickle.dump(X_TIMP_task1_test, out_file)
with open(PICKLE_FOLDER_PATH + "X_TIMP_task2_train.p", "wb") as out_file:
    pickle.dump(X_TIMP_task2_train, out_file)
with open(PICKLE_FOLDER_PATH + "X_TIMP_task2_test.p", "wb") as out_file:
    pickle.dump(X_TIMP_task2_test, out_file)

### CIMP FEATURES

In [None]:
def k_most_imp_charlvl(k, category, max_df=0.01, min_df=0.0002):
    """Return the ``k`` character n-grams with the highest mean TF-IDF in one category.

    A case-preserving TF-IDF model over character 3- to 7-grams is fitted
    on the global ``X_train`` on every call, with the project ``Tokenizer``
    (``join=True``) as preprocessor. The TF-IDF rows of the training tweets
    labelled ``category`` are averaged and the vocabulary is ranked by that
    mean, highest first.

    Parameters
    ----------
    k : int
        Number of n-grams to return (fewer if the vocabulary is smaller).
    category : str
        ``'OTHER'`` or ``'OFFENSE'`` (looked up in ``y_train_t1``), or
        ``'PROFANITY'``, ``'ABUSE'`` or ``'INSULT'`` (looked up in
        ``y_train_t2``).
    max_df, min_df : float
        Document-frequency bounds forwarded to ``TfidfVectorizer``.

    Returns
    -------
    list of str

    Raises
    ------
    ValueError
        If ``category`` is not one of the labels listed above.
    """
    # Resolve the label array first so a bad category fails before the
    # expensive fit (the original hit an unbound local afterwards).
    if category in ('OTHER', 'OFFENSE'):
        labels = y_train_t1
    elif category in ('PROFANITY', 'ABUSE', 'INSULT'):
        labels = y_train_t2
    else:
        raise ValueError("unknown category: {!r}".format(category))

    char_vect  = TfidfVectorizer(analyzer="char", ngram_range=(3, 7), lowercase=False,
                                 max_df=max_df, min_df=min_df,
                                 preprocessor=Tokenizer(preserve_case=True, join=True).tokenize)

    tfidf = char_vect.fit_transform(X_train)

    inv_vocab = {index: word for word, index in char_vect.vocabulary_.items()}

    # Assumes the labels support element-wise comparison (e.g. a numpy array).
    cat_ids = np.where(labels == category)[0]

    # Column indices ordered by descending mean TF-IDF within the category.
    most_imp_ids = np.argsort(np.asarray(np.mean(tfidf[cat_ids], axis=0)).flatten())[::-1]

    # Only the first k indices need to be mapped back to n-grams.
    return [inv_vocab[index] for index in most_imp_ids[:k]]

def get_CIMP_feats(tweets, k, category, max_df=0.01, min_df=0.0002):
    """Binary presence features for the most important character n-grams.

    Returns a ``(len(tweets), k)`` float array in which entry ``[i, j]`` is
    1 when the j-th n-gram returned by ``k_most_imp_charlvl`` occurs as a
    substring of ``tweets[i]`` and 0 otherwise. Columns beyond the number
    of returned n-grams stay 0.

    NOTE(review): the n-grams are extracted from text that went through the
    Tokenizer preprocessor, but they are matched against the raw tweets
    here -- confirm this is intended.
    """
    feats = np.zeros((len(tweets), k))
    important_ngrams = k_most_imp_charlvl(k, category, max_df=max_df, min_df=min_df)

    for row, tweet in enumerate(tweets):
        for col, ngram in enumerate(important_ngrams):
            if ngram in tweet:
                feats[row, col] = 1

    return feats

In [None]:
%%time
# Number of important character n-grams requested per category for each task.
N_CIMP_TASK1 = 3200
N_CIMP_TASK2 = 370

# Task 1: per-category CIMP blocks ('OTHER', then 'OFFENSE') joined column-wise.
X_CIMP_task1_train = np.concatenate(
    [get_CIMP_feats(X_train, N_CIMP_TASK1, category)
     for category in ('OTHER', 'OFFENSE')],
    axis=1)

X_CIMP_task1_test = np.concatenate(
    [get_CIMP_feats(X_test, N_CIMP_TASK1, category)
     for category in ('OTHER', 'OFFENSE')],
    axis=1)

# Task 2: same construction over the four task-2 categories, in this order.
X_CIMP_task2_train = np.concatenate(
    [get_CIMP_feats(X_train, N_CIMP_TASK2, category)
     for category in ('OTHER', 'ABUSE', 'INSULT', 'PROFANITY')],
    axis=1)

X_CIMP_task2_test = np.concatenate(
    [get_CIMP_feats(X_test, N_CIMP_TASK2, category)
     for category in ('OTHER', 'ABUSE', 'INSULT', 'PROFANITY')],
    axis=1)

In [None]:
# Persist the CIMP features. `with` guarantees each file is flushed and closed
# once its dump has finished.
with open(PICKLE_FOLDER_PATH + "X_CIMP_task1_train.p", "wb") as out_file:
    pickle.dump(X_CIMP_task1_train, out_file)
with open(PICKLE_FOLDER_PATH + "X_CIMP_task1_test.p", "wb") as out_file:
    pickle.dump(X_CIMP_task1_test, out_file)
with open(PICKLE_FOLDER_PATH + "X_CIMP_task2_train.p", "wb") as out_file:
    pickle.dump(X_CIMP_task2_train, out_file)
with open(PICKLE_FOLDER_PATH + "X_CIMP_task2_test.p", "wb") as out_file:
    pickle.dump(X_CIMP_task2_test, out_file)