In [None]:
import pandas as pd
from OpenDutchWordnet import Wn_grid_parser
import argparse
import re
import nltk
import xgboost
import re
import numpy as np
import stanza
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import scale
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import json
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
def clean_text(text):
    """Normalise a raw message to lowercase ASCII words separated by single spaces.

    Steps, in order: strip emoji, strip URLs (anything starting with ``http``
    up to the next whitespace), drop every character that is not an ASCII
    letter or whitespace, collapse whitespace runs, lowercase.

    Args:
        text: The raw message. Non-string values (for example the float NaN
            pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string; ``''`` when nothing is left or ``text`` is not a
        string.
    """
    if not isinstance(text, str):
        # The regex calls below raise TypeError on non-strings. An empty
        # result lets the caller's "drop empty clean_text" filter remove the row.
        return ''

    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    no_emoji = emoji_pattern.sub(r'', text)

    # Remove urls
    no_urls = re.sub(r"http\S+", "", no_emoji)

    # Remove punctuation, numbers and symbols.
    # NOTE(review): the ASCII-only class also deletes accented letters
    # (e.g. "é", "ë") instead of transliterating them.
    no_punct_symbols_nrs = re.sub(r'[^A-Za-z\s]+', '', no_urls)

    # Collapse whitespace runs and strip leading/trailing whitespace
    no_trailing_ws = " ".join(no_punct_symbols_nrs.split())

    # Lowercase
    return no_trailing_ws.lower()

def lemmatize(nlp, text):
    """Run ``nlp`` on ``text`` and collect the lemma of every word.

    Args:
        nlp: Callable returning a document with ``.sentences``; each sentence
            exposes ``.words`` whose items carry a ``.lemma`` attribute.
        text: The text to analyse.

    Returns:
        A flat list of lemmas in document order.
    """
    lemmas = []
    for sentence in nlp(text).sentences:
        for token in sentence.words:
            lemmas.append(token.lemma)
    return lemmas

In [None]:
def process_data(nlp, df_train, df_test):
    """Clean and lemmatize the train and test frames.

    The train text is read from column ``'text'`` and the test text from
    column ``'message'``. Each frame gets a ``'clean_text'`` column, rows
    whose cleaned text is empty are dropped, and a ``'lemmatized_clean_text'``
    column is added. The test ``'labels'`` values ``"y"``/``"n"`` are mapped
    to 1/0.

    The input frames are not modified; new frames are returned. This avoids
    writing into filtered slices (SettingWithCopyWarning) and chained
    ``inplace`` replacement, which is unreliable under pandas copy-on-write.

    Args:
        nlp: Pipeline object forwarded to ``lemmatize``.
        df_train: Frame with a ``'text'`` column.
        df_test: Frame with ``'message'`` and ``'labels'`` columns.

    Returns:
        Tuple ``(df_train, df_test)`` of processed copies.
    """
    # Clean text (assign() builds a new frame, leaving the caller's untouched)
    train = df_train.assign(clean_text=df_train['text'].apply(clean_text))

    # Remove empty values; copy so the column assignment below targets a real frame
    train = train[train['clean_text'] != ''].copy()

    # Lemmatize
    train['lemmatized_clean_text'] = train['clean_text'].apply(lambda x: lemmatize(nlp, x))

    test = df_test.assign(clean_text=df_test['message'].apply(clean_text))
    test = test[test['clean_text'] != ''].copy()
    test['lemmatized_clean_text'] = test['clean_text'].apply(lambda x: lemmatize(nlp, x))

    # Binary labels
    test['labels'] = test['labels'].replace({"y": 1, "n": 0})
    return train, test

In [None]:
def get_hypernyms (instance, synset_id, hypers):
    """Collect the transitive ``has_hyperonym`` targets of a synset.

    Walks depth-first from ``synset_id``. Every relation target not yet in
    ``hypers`` is appended and then explored in turn.

    Args:
        instance: Object providing ``synsets_find_synset(synset_id)``.
        synset_id: Identifier of the synset to start from.
        hypers: List that is extended in place with the targets found; it
            also serves as the visited set.

    Returns:
        None. Results are delivered through ``hypers``.
    """
    synset = instance.synsets_find_synset(synset_id)
    if not synset:
        return

    relations = synset.get_relations("has_hyperonym")
    if not relations:
        return

    for relation in relations:
        if not relation:
            continue
        target = relation.get_target()
        if target in hypers:
            continue
        hypers.append(target)
        get_hypernyms(instance, target, hypers)


def get_hypernyms_lemmas():
    """Return the lemmas of all hypernym synsets of ``voelen-v-2``, plus ``'voelen'``.

    Loads Open Dutch WordNet from ``odwn_orbn_gwg-LMF_1.3.xml.gz``, looks up
    the lexical entry ``voelen-v-2``, collects every transitive hypernym
    synset with ``get_hypernyms``, and gathers the lemma of each lexical
    entry in those synsets.

    ``get_hypernyms`` already recurses through the full hypernym chain, so a
    single traversal covers every synset. A second pass over each hypernym
    could only rediscover synsets already collected.

    Returns:
        A list of unique lemmas (order unspecified) that always contains
        ``'voelen'``.
    """
    # Lemma list recorded by the original author (presumably the expected
    # result for this WordNet version — verify):
    # ['ontmoeten',
    #  'voelen',
    #  'meemaken',
    #  'ondervinden',
    #  'ondergaan',
    #  'gevoelen',
    #  'zich omkleden',
    #  'ervaren',
    #  'gewaarworden',
    #  'kenteren',
    #  'doorleven',
    #  'veranderen',
    #  'keren',
    #  'beleven']
    instance = Wn_grid_parser(path_wn_grid_lmf='odwn_orbn_gwg-LMF_1.3.xml.gz')
    le_el = instance.les_find_le("voelen-v-2")
    synset_el = instance.synsets_find_synset(le_el.get_synset_id())

    hypers = []
    get_hypernyms(instance, synset_el.get_id(), hypers)

    # A set de-duplicates lemmas shared between synsets; 'voelen' is always included.
    lemmas = {'voelen'}
    for hyper in hypers:
        for le in instance.les_all_les_of_one_synset(hyper):
            lemmas.add(le.get_lemma())

    return list(lemmas)

In [None]:
def check_match(match, clean_text):
    """Return 1 if the regex ``match`` is found anywhere in ``clean_text``, else 0."""
    return 1 if re.search(match, clean_text) else 0
    
def lemma_replace(word_list, clean_text, lemmatized_clean_text):
    """Replace tokens of ``clean_text`` by their lemma when the lemma is in ``word_list``.

    Tokens and lemmas are aligned by position: the i-th whitespace token of
    ``clean_text`` is replaced by the i-th lemma when that lemma is one of
    ``word_list``. Lemmas beyond the last token are ignored instead of
    raising ``IndexError``.

    Args:
        word_list: Iterable of lemmas that should appear in lemma form.
        clean_text: The cleaned, whitespace-separated text.
        lemmatized_clean_text: The lemmas of ``clean_text`` as one of:
            a list/tuple of lemmas (what ``lemmatize`` returns); the string
            repr of such a list (what a list column becomes after a CSV
            round-trip, e.g. ``"['ik', 'voelen']"``); or a plain
            space-separated string. Any other value (e.g. NaN) means
            "no lemmas".

    Returns:
        ``clean_text`` with the matching tokens replaced, joined by single spaces.
    """
    import ast  # local import: only needed to decode list reprs read back from CSV

    if isinstance(lemmatized_clean_text, (list, tuple)):
        lemmas = list(lemmatized_clean_text)
    elif isinstance(lemmatized_clean_text, str):
        lemmas = None
        stripped = lemmatized_clean_text.strip()
        if stripped.startswith('['):
            try:
                parsed = ast.literal_eval(stripped)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, (list, tuple)):
                lemmas = list(parsed)
        if lemmas is None:
            lemmas = lemmatized_clean_text.split()
    else:
        lemmas = []

    targets = set(word_list)
    tokens = clean_text.split()
    # NOTE(review): positional alignment assumes the lemmatizer tokenizes
    # exactly like str.split(); confirm for the pipeline in use.
    for position, lemma in enumerate(lemmas[:len(tokens)]):
        if lemma in targets:
            tokens[position] = lemma
    return " ".join(tokens)

def heuristics_labelling(df_train, df_test, WN_synsets, filters, remove):
    """Label both frames with regex heuristics; the frames are modified in place.

    Heuristics selected through ``filters``:
        0: "heb/heeft/hebben <word> gehad".
        1: a possessive (mijn, mn, me, ...) followed by a direct family relation.
        2: any lemma in ``WN_synsets``.

    Columns written:
        ``clean_text_lemma`` (both frames): ``clean_text`` with the words
            needed by the active heuristics put in lemma form.
        ``labels`` (``df_train``) and ``predicted`` (``df_test``): 1 when the
            combined pattern matches ``clean_text_lemma``, else 0.
        ``clean_text_removals`` (both frames, only when ``remove`` is True):
            ``clean_text_lemma`` with every pattern match deleted.

    Lemma replacement is done once with the union of the word lists of all
    active heuristics. Applying it per heuristic from ``clean_text`` would
    let heuristic 2 discard the replacements made for heuristic 1 whenever
    both are enabled.

    Args:
        df_train: Frame with ``clean_text`` and ``lemmatized_clean_text`` columns.
        df_test: Frame with the same two columns.
        WN_synsets: List of lemmas used by heuristic 2.
        filters: Collection of heuristic numbers (0, 1, 2) to enable.
        remove: When True, also write ``clean_text_removals``.

    Returns:
        None.
    """
    direct_relation = 'vader|moeder|ouder|schoonvader|schoonmoeder|kind|zoon|dochter|man|vrouw|broer|zus|neef|nicht|tante|oom'

    matches = []
    lemma_words = []  # words that must be in lemma form before matching
    if 0 in filters:
        matches.append("((heb|heeft|hebben) [a-z]* (gehad))")
    if 1 in filters:
        lemma_words.extend(direct_relation.split('|'))
        matches.append("((mijn|mn|me|m n|mij|men) " + '(' + direct_relation + '))')
    if 2 in filters:
        lemma_words.extend(WN_synsets)
        matches.append('('+'|'.join(WN_synsets)+')')

    for frame in (df_train, df_test):
        if lemma_words:
            # A positional list keeps row order and also works for empty frames.
            frame['clean_text_lemma'] = [
                lemma_replace(lemma_words, text, lemmas)
                for text, lemmas in zip(frame['clean_text'], frame['lemmatized_clean_text'])
            ]
        else:
            frame['clean_text_lemma'] = frame['clean_text']

    # Compile once; the same pattern is applied to every row of both frames.
    # NOTE(review): with no active filter the pattern is empty and matches every row.
    match = re.compile('|'.join(matches))
    df_train['labels'] = df_train['clean_text_lemma'].apply(lambda x: check_match(match, x))
    df_test['predicted'] = df_test['clean_text_lemma'].apply(lambda x: check_match(match, x))

    if remove == True:
        df_train['clean_text_removals'] = df_train['clean_text_lemma'].apply(lambda x: match.sub('', x))
        df_test['clean_text_removals'] = df_test['clean_text_lemma'].apply(lambda x: match.sub('', x))

In [None]:
def resample_data(df_train, df_train_labels):
    """Randomly oversample ``df_train`` with ``sampling_strategy=1`` and a fixed seed.

    Args:
        df_train: The frame to resample.
        df_train_labels: The labels that drive the resampling.

    Returns:
        The resampled frame only; the resampled label vector is discarded.
    """
    sampler = RandomOverSampler(sampling_strategy=1, random_state=42)
    resampled_frame, _ = sampler.fit_resample(df_train, df_train_labels)
    return resampled_frame


In [None]:
def create_train_test(df_train, df_test, vectorizer, remove):
    """Vectorize train and test text and return features with labels.

    The vectorizer is fitted on the train text and then applied to the test
    text. The text column is ``clean_text_removals`` when ``remove`` is True
    and ``clean_text`` otherwise.

    Args:
        df_train: Frame with a ``labels`` column and the text column.
        df_test: Frame with a ``labels`` column and the text column.
        vectorizer: Object with ``fit_transform`` and ``transform``.
        remove: Selects which text column is used.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.
    """
    y_train, y_test = df_train['labels'], df_test['labels']
    text_column = 'clean_text_removals' if remove == True else 'clean_text'
    X_train = vectorizer.fit_transform(df_train[text_column].tolist())
    X_test = vectorizer.transform(df_test[text_column])
    return X_train, y_train, X_test, y_test

In [None]:
def create_train_test_unlabelled(df_train, df_test, vectorizer, remove):
    """Vectorize train and test text when the test frame has no labels.

    The vectorizer is fitted on the train text and then applied to the test
    text. The text column is ``clean_text_removals`` when ``remove`` is True
    and ``clean_text`` otherwise.

    Args:
        df_train: Frame with a ``labels`` column and the text column.
        df_test: Frame with the text column.
        vectorizer: Object with ``fit_transform`` and ``transform``.
        remove: Selects which text column is used.

    Returns:
        Tuple ``(X_train, y_train, X_test)``.
    """
    y_train = df_train['labels']
    text_column = 'clean_text_removals' if remove == True else 'clean_text'
    X_train = vectorizer.fit_transform(df_train[text_column].tolist())
    X_test = vectorizer.transform(df_test[text_column])
    return X_train, y_train, X_test

In [None]:
def create_cm(true_labels, predicted_labels):
    """Plot a normalised confusion matrix with classes 'non-exp' and 'exp'.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Predicted labels.

    Returns:
        None. The figure is created and resized to 10 x 10.5 inches.
    """
    figure, _axes = plot_confusion_matrix(
        conf_mat=metrics.confusion_matrix(true_labels, predicted_labels),
        colorbar=False,
        show_absolute=False,
        show_normed=True,
        class_names=['non-exp','exp'],
    )
    figure.set_size_inches(10, 10.5)

    
def class_feature_importance(X, Y, feature_importances, vect):
    """Compute a per-class importance score for every vectorizer feature.

    ``X`` is scaled without centring. For each distinct label in ``Y``, the
    column means of the rows carrying that label are multiplied element-wise
    by ``feature_importances``.

    Args:
        X: 2-D feature array (callers in this file pass a dense array).
        Y: One label per row of ``X``.
        feature_importances: One weight per feature column.
        vect: Fitted vectorizer providing the feature names.

    Returns:
        Dict mapping each label to a dict ``{feature_name: score}``.
    """
    X = scale(X, with_mean=False)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()

    # An ndarray makes the boolean row mask independent of Y's container type.
    labels = np.asarray(Y)

    out = {}
    for c in set(Y):
        class_mean = np.mean(X[labels == c, :], axis=0)
        out[c] = dict(zip(feature_names, class_mean*feature_importances))

    return out

def _report_classifier(model, tag, X_test, y_test):
    """Print the classification report and ROC-AUC of a fitted binary classifier."""
    print(classification_report(y_test, model.predict(X_test)))
    # ROC-AUC is a ranking metric, so it is scored on the positive-class
    # probability. Hard 0/1 predictions collapse the curve to one threshold.
    positive_scores = model.predict_proba(X_test)[:, 1]
    print(tag + ": ROC-AUC:", roc_auc_score(y_test, positive_scores))


def classification_experiments(X_train, y_train, X_test, y_test):
    """Train logistic regression and XGBoost, and print their test metrics.

    Both models are fitted on ``(X_train, y_train)`` with ``random_state=42``
    and evaluated on ``(X_test, y_test)``. For each model the classification
    report and the ROC-AUC are printed.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.

    Returns:
        None.
    """
    print("--------------------------------")
    print("Logistic Regression")

    clf = LogisticRegression(random_state=42).fit(X_train, y_train)
    _report_classifier(clf, "LR", X_test, y_test)

    print("--------------------------------")
    # NOTE(review): XGBClassifier is a gradient-boosting model; the heading
    # is kept unchanged so existing logs stay comparable.
    print("XGBoost Random Forest")

    xgb = XGBClassifier(objective="binary:logistic", random_state=42, eval_metric='logloss').fit(X_train, y_train)
    _report_classifier(xgb, "XGB", X_test, y_test)

In [None]:
def full_data_train_test_split(full_df, sub_df, test_df):
    """Split the full data into a training part and a part reserved for prediction.

    Rows whose ``object_id`` occurs in ``test_df`` or ``sub_df`` are removed
    from ``full_df`` first. The remainder is split with ``test_size=0.42``
    and ``random_state=42``, and ``sub_df`` is appended to the training part.

    Args:
        full_df: Frame with an ``object_id`` column.
        sub_df: Frame with an ``object_id`` column; appended to the training
            part (author's note: about 2 percent of the data).
        test_df: Frame with an ``object_id`` column; its rows are excluded.

    Returns:
        Tuple ``(full_df_train, full_df_test)``; the second frame is the part
        kept for prediction.
    """
    excluded_ids = test_df['object_id'].tolist() + sub_df['object_id'].tolist()
    remaining = full_df[~full_df.object_id.isin(excluded_ids)]

    train_part, prediction_part = train_test_split(remaining, test_size=0.42, random_state=42)
    return pd.concat([train_part, sub_df]), prediction_part

In [None]:
def create_predicted(unlabelled_df, unlabelled_X_test, best_model, name):
    """Store positive-class probabilities on the frame and write them to ``<name>.tsv``.

    Args:
        unlabelled_df: Frame with a ``text`` column, one row per row of
            ``unlabelled_X_test``. A ``best_model_pred`` column is added in place.
        unlabelled_X_test: Feature matrix handed to ``best_model.predict_proba``.
        best_model: Fitted classifier exposing ``predict_proba``.
        name: Output path without the ``.tsv`` extension.

    Returns:
        None. Writes a tab-separated file with the columns ``text`` and
        ``best_model_pred``.
    """
    predictions = np.asarray(best_model.predict_proba(unlabelled_X_test))
    if predictions.ndim == 2:
        # predict_proba returns one column per class; a single frame column
        # can only hold the positive-class probability (column 1).
        predictions = predictions[:, 1]
    unlabelled_df['best_model_pred'] = predictions
    # DataFrame.to_csv takes `sep`; it has no `delimiter` parameter.
    unlabelled_df[['text', 'best_model_pred']].to_csv(name+'.tsv', sep='\t', index=False)

In [None]:
def create_lemmatized_files():
    """Clean and lemmatize the raw data files, and write the results as TSV.

    Reads the full training data, the annotated test sample and the
    high-sentiment subset. Rows without text are dropped, then each training
    set is run through ``process_data`` together with the test sample using
    a Dutch stanza pipeline. Four tab-separated files are written to the
    working directory.

    Returns:
        None.
    """
    full_raw = pd.read_csv('fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.csv', sep='\t')
    test_raw = pd.read_csv('experience_test/Fb_random_sample_500_annotated_discussed.tsv', sep='\t')
    subset_raw = pd.read_csv('high_sent_subjFB_NOS_NU_Telegraaf_NRC_all_endFeb.csv', sep='\t')

    # Keep only the rows that actually contain text.
    full_raw = full_raw[full_raw['text'].notna()]
    test_raw = test_raw[test_raw['message'].notna()]
    subset_raw = subset_raw[subset_raw['text'].notna()]

    nlp = stanza.Pipeline(lang='nl', processors='tokenize,pos,lemma')

    subset_done, test_done_for_subset = process_data(nlp, subset_raw, test_raw)
    print("Finished sentiment")
    full_done, test_done = process_data(nlp, full_raw, test_raw)
    print("Finished train and test")

    outputs = (
        ('lemmatized_test_high_sent_subjFB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', test_done_for_subset),
        ('lemmatized_train_high_sent_subjFB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', subset_done),
        # the training file contains no test rows
        ('lemmatized_train_fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', full_done),
        ('lemmatized_test_fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', test_done),
    )
    for path, frame in outputs:
        frame.to_csv(path, sep='\t')
    

In [None]:
def results_models():
    """Run the full experiment grid and print the results.

    For every combination of vectorizer (TF-IDF, binary counts), ``remove``
    flag and heuristic filter set, the data is labelled with
    ``heuristics_labelling``, the training set is oversampled, and the
    heuristic baseline plus both classifiers are evaluated on the test set.

    Returns:
        None.
    """
    # One-off preparation step that produced the "no_unlabelled" and
    # "unlabelled" files; enable it only when those files must be rebuilt.
#     df_train = pd.read_csv('lemmatized_train_fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', sep='\t')
#     sub_df = pd.read_csv('lemmatized_train_high_sent_subjFB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', sep='\t')
#     df_train, df_unlabelled = full_data_train_test_split(df_train, sub_df, df_test)
#     df_train.to_csv('lemmatized_train_no_unlabelled_fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', sep='\t') 
#     df_unlabelled.to_csv('lemmatized_unlabelled_fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', sep='\t')

    df_train = pd.read_csv('lemmatized_train_no_unlabelled_fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', sep='\t')
    df_test = pd.read_csv('lemmatized_test_fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', sep='\t')

    WN_synsets = get_hypernyms_lemmas()

    filter_sets = [[0], [1], [2], [0, 1], [1, 2], [0, 2], [0, 1, 2]]
    for vect in (TfidfVectorizer(), CountVectorizer(binary=True)):
        print(vect)
        print('-----------------')
        for remove in (False, True):
            print(remove)
            print('-----------------')
            for active_filters in filter_sets:
                print(active_filters)
                print('-------------------')
                # Rewrites the label/prediction columns of both frames in place.
                heuristics_labelling(df_train, df_test, WN_synsets, active_filters, remove)
                print('DATA STATS: ', df_train['labels'].value_counts())

                balanced_train = resample_data(df_train, df_train['labels'])
                X_train, y_train, X_test, y_test = create_train_test(balanced_train, df_test, vect, remove)
                print('-------------------')
                print("BASELINE")
                print(classification_report(df_test['labels'], df_test['predicted']))
                classification_experiments(X_train, y_train, X_test, y_test)

In [None]:
def prediction_labelling(df_train, df_test, df_unlabelled_test, WN_synsets):
    """Score the unlabelled data with the best XGBoost setup and save the scores.

    Uses all three heuristics (filters 0, 1, 2), no match removal and binary
    count features. The classifier is trained inside this function, so the
    function no longer depends on a ``best_xgb`` variable that happened to
    exist in the kernel.

    Args:
        df_train: Training frame; relabelled in place by ``heuristics_labelling``.
        df_test: Unused; kept so existing calls keep working.
        df_unlabelled_test: Frame with ``text`` and ``clean_text`` columns; a
            ``best_model_pred`` column with the positive-class probability is
            added in place.
        WN_synsets: Lemma list for heuristic 2.

    Returns:
        None. Writes ``unlabelled_predictions_by_bestmodel.tsv``.
    """
    # Best XGB
    heuristics_labelling(df_train, df_unlabelled_test, WN_synsets, [0,1,2], False)
    df_train_sampled = resample_data(df_train, df_train['labels'])
    bin_vect = CountVectorizer(binary=True)

    # One fit of the vectorizer yields both the training matrix and the
    # matrix of the unlabelled data.
    X_train, y_train, unlabelled_X_test = create_train_test_unlabelled(df_train_sampled, df_unlabelled_test, bin_vect, False)

    best_xgb = XGBClassifier(objective="binary:logistic", random_state=42, eval_metric='logloss').fit(X_train, y_train)
    predictions = best_xgb.predict_proba(unlabelled_X_test)[:, 1]

    df_unlabelled_test['best_model_pred'] = predictions
    df_unlabelled_test[['text', 'best_model_pred']].to_csv('unlabelled_predictions_by_bestmodel.tsv', sep='\t', index=False)

def best_xgb_results(df_train, df_test, WN_synsets):
    """Train and evaluate the best XGBoost configuration.

    Configuration: heuristics 0, 1 and 2, no match removal, binary count
    features, oversampled training data. Prints the classification report,
    the confusion matrix and the 20 highest-scoring features of the rows
    predicted as class 1.

    Args:
        df_train: Training frame; relabelled in place by ``heuristics_labelling``.
        df_test: Test frame with a ``labels`` column.
        WN_synsets: Lemma list for heuristic 2.

    Returns:
        Tuple ``(best_xgb, y_pred, y_test)``.
    """
    heuristics_labelling(df_train, df_test, WN_synsets, [0,1,2], False)
    df_train_sampled = resample_data(df_train, df_train['labels'])
    bin_vect = CountVectorizer(binary=True)
    X_train, y_train, X_test, y_test = create_train_test(df_train_sampled, df_test, bin_vect, False)
    best_xgb = XGBClassifier(objective="binary:logistic", random_state=42, eval_metric='logloss').fit(X_train, y_train)
    y_pred = best_xgb.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    importance = best_xgb.feature_importances_

    result = class_feature_importance(X_test.toarray(), y_pred, importance, bin_vect)

    # Class 1 is absent from the result when no test row is predicted
    # positive; fall back to an empty mapping instead of failing on None.
    d = result.get(1, {})

    sorted_d = sorted(d.items(), key=lambda x: x[1], reverse=True)
    print(sorted_d[:20])

    return best_xgb, y_pred, y_test
    
def best_lr_results(df_train, df_test, WN_synsets):
    """Train and evaluate the best logistic-regression configuration.

    Configuration: heuristics 0 and 1, heuristic matches removed from the
    text, TF-IDF features, oversampled training data. Prints the
    classification report, the confusion matrix and the 20 highest-scoring
    features of the rows predicted as class 1.

    Args:
        df_train: Training frame; relabelled in place by ``heuristics_labelling``.
        df_test: Test frame with a ``labels`` column.
        WN_synsets: Lemma list, forwarded to ``heuristics_labelling``.

    Returns:
        Tuple ``(clf, y_pred, y_test)``.
    """
    heuristics_labelling(df_train, df_test, WN_synsets, [0,1], True)
    df_train_sampled = resample_data(df_train, df_train['labels'])
    tfidf = TfidfVectorizer()
    X_train, y_train, X_test, y_test = create_train_test(df_train_sampled, df_test, tfidf, True)
    clf = LogisticRegression(random_state=42).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Logistic Regression")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    # The coefficients of the single decision function act as feature weights.
    importance = clf.coef_[0]

    result = class_feature_importance(X_test.toarray(), y_pred, importance, tfidf)

    # Class 1 is absent from the result when no test row is predicted
    # positive; fall back to an empty mapping instead of failing on None.
    d = result.get(1, {})

    sorted_d = sorted(d.items(), key=lambda x: x[1], reverse=True)

    print(sorted_d[:20])

    return clf, y_pred, y_test

In [None]:
def create_sample_dfs():
    """Export the annotated predictions and draw two samples of unannotated ones.

    Reads ``predicted_data/unlabelled_predictions_by_xgb-binary-keepfeat.tsv``
    (no header row). Column 1 is compared against probability thresholds and
    column 3 marks a row as annotated when it is not empty; column 0 is the
    only column written to the sample files.

    Files written to ``predicted_data/``:
        * the annotated rows (columns 0, 1 and 3);
        * up to 250 unannotated rows with a score >= 0.9;
        * up to 250 unannotated rows with 0.5 <= score < 0.9.

    Returns:
        None.
    """
    bm_df = pd.read_csv('predicted_data/unlabelled_predictions_by_xgb-binary-keepfeat.tsv', sep='\t', header=None)
#     bm_df[:250].to_csv('predicted_data/top-250-unlabelled_predictions_by_xgb-binary-keepfeat.tsv', sep='\t', header=False)

    bm_df_unlabelled = bm_df.fillna('Unlabelled')
    labelled = bm_df_unlabelled[bm_df_unlabelled[3] != 'Unlabelled']
    labelled[[0, 1, 3]].to_csv('predicted_data/workshop_labelled_predictions_by_xgb-binary-keepfeat.csv', sep='\t', header=False)

    # Exclude annotated rows by row index. Comparing the score column with
    # the index values would leave the annotated rows in.
    bm_df_unlabelled = bm_df[~bm_df.index.isin(labelled.index)]

    above_unlabelled_09 = bm_df_unlabelled[bm_df_unlabelled[1] >= 0.9]
    # Cap the sample size at the pool size; sample() raises when n exceeds it.
    above_unlabelled_09_sample = above_unlabelled_09.sample(n=min(250, len(above_unlabelled_09)), random_state=50)
    above_unlabelled_09_sample[[0]].to_csv('predicted_data/sample_unlabelledpredictions_over0-9_by_xgb-binary-keepfeat.csv', sep='\t', header=False, index=False)

    below_unlabelled_09 = bm_df_unlabelled[bm_df_unlabelled[1] < 0.9]
    below_unlabelled_09 = below_unlabelled_09[below_unlabelled_09[1] >= 0.5]
    below_unlabelled_09_sample = below_unlabelled_09.sample(n=min(250, len(below_unlabelled_09)), random_state=50)
    below_unlabelled_09_sample[[0]].to_csv('predicted_data/sample_unlabelledpredictions_below0-9_by_xgb-binary-keepfeat.csv', sep='\t', header=False, index=False)

    
#     sns.histplot(below_250_unlabelled[1], stat="probability", bins=10, kde=True)
#     below_250_unlabelled['bin'] = pd.cut(below_250[1], 10)
#     print(below_250_unlabelled.bin.value_counts())

#     sample_df = below_250_unlabelled.groupby('bin').sample(n=100, random_state=1)
#     sample_df[[0, 1, 3, 'bin']].to_csv('predicted_data/sample_unlabelled_predictions_by_xgb-binary-keepfeat.tsv', sep='\t', header=False)

In [None]:
def correct_preds_index(y_test, y_pred):
    """Return a list with 1 where prediction and truth agree and 0 elsewhere.

    The two sequences are paired position by position; pairing stops at the
    end of the shorter one.
    """
    flags = []
    for truth, guess in zip(y_test, y_pred):
        flags.append(1 if truth == guess else 0)
    return flags

In [None]:
# Install a global "ignore" filter: no warning of any category is shown
# from this cell onward.
# NOTE(review): this also hides pandas and scikit-learn warnings that may
# point at real problems — confirm that the blanket filter is intended.
import warnings
warnings.filterwarnings('ignore')

In [None]:
def main():
    """Run the experiment grid, then compare the best LR and XGB models.

    Loads the lemmatized train, test and unlabelled files, builds the
    WordNet lemma list and runs ``results_models``. It then trains the best
    logistic-regression and XGBoost configurations and prints two confusion
    matrices: one over per-row correctness and one over the raw predictions
    of the two models.
    """
    train_frame = pd.read_csv('lemmatized_train_no_unlabelled_fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', sep='\t')
    test_frame = pd.read_csv('lemmatized_test_fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', sep='\t')
    # Only needed by the disabled prediction_labelling step below.
    unlabelled_frame = pd.read_csv('lemmatized_unlabelled_fb_preprocessed_FB_NOS_NU_Telegraaf_NRC_all_endFeb.tsv', sep='\t')
    wordnet_lemmas = get_hypernyms_lemmas()

#     create_lemmatized_files()
    results_models()
#     prediction_labelling(train_frame, test_frame, unlabelled_frame, wordnet_lemmas)

    print("LR")
    _lr_model, lr_predictions, gold = best_lr_results(train_frame, test_frame, wordnet_lemmas)
    lr_hits = correct_preds_index(gold, lr_predictions)

    print("XGB")
    _xgb_model, xgb_predictions, gold = best_xgb_results(train_frame, test_frame, wordnet_lemmas)
    xgb_hits = correct_preds_index(gold, xgb_predictions)
#     create_sample_dfs()

    # Agreement between the models on which rows they classify correctly
    print("correct matrix")
    print(confusion_matrix(lr_hits, xgb_hits))

    # Agreement between the models on the predicted labels themselves
    print("prediction matrix")
    print(confusion_matrix(lr_predictions, xgb_predictions))
    
    

In [None]:
# Run the whole pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()
    