In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif
import xgboost as xgb
import random
import shap
import datatable as dt
shap.initjs()
%matplotlib inline

<H1> Preparing data split: 5 fold cross validation </H1>
<br>
<h3> The procedure is identical as when splitting data for the purpose of training selected Language Models

In [None]:
# read original data file
df = pd.read_excel("./data/source_data/tweet_sentiment_input_file.xlsx", converters={'dummy_id': str})

In [None]:
# drop not needed columns
df = df.drop(["row", "dummy_id"], axis=1)

In [None]:
# 5 fold CV
# setup random state
np.random.seed(13)

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# define number of folds
fold_number = 5
kf = KFold(n_splits=fold_number, random_state=13, shuffle=True)

In [None]:
# create data splits for Deep Learning Language Models trained with Flair framework
train_indexes = {}
val_indexes = {}
test_indexes = {}

# train sets for Machine Learning
train_ml = {}
i = 0

# this split (with fold_number=5) results in: 20% test, 10% val, 70% train for Flair framework
# and the same 20% test and 80 % train for Machine Learning
indexes = list(range(0, len(df)))
for train_index, test_index in kf.split(indexes):

    test_indexes[i] = test_index
    train_ml[i] = train_index
    train_index, val_index = train_test_split(train_index, test_size=0.125, random_state=13, shuffle=True)
    train_indexes[i] = train_index
    val_indexes[i] = val_index
    i += 1

# test sets for Machine Learning are equal to those for Flair framework
test_ml = test_indexes

<h1>Reading data: tweets encoded by various Language Models

<h3> Linguistic Inquiry and Word Count (LIWC) feature file

In [None]:
dfliwc = pd.read_excel("./data/embeddings/LIWC2015_5k.xlsx", converters={'dummy_id': str})

# rename columns to get unique names
dfliwc.rename(columns={'text': 'text_liwc', "sentiment": 'liwc_sent'}, inplace=True)

In [None]:
# define LIWC features names
liwcfeatures = ['WC', 'Analytic', 'Clout', 'Authentic',
       'Tone', 'WPS', 'Sixltr', 'Dic', 'function', 'pronoun', 'ppron', 'i',
       'we', 'you', 'shehe', 'they', 'ipron', 'article', 'prep', 'auxverb',
       'adverb', 'conj', 'negate', 'verb', 'adj', 'compare', 'interrog',
       'number', 'quant', 'affect', 'posemo', 'negemo', 'anx', 'anger', 'sad',
       'social', 'family', 'friend', 'female', 'male', 'cogproc', 'insight',
       'cause', 'discrep', 'tentat', 'certain', 'differ', 'percept', 'see',
       'hear', 'feel', 'bio', 'body', 'health', 'sexual', 'ingest', 'drives',
       'affiliation', 'achieve', 'power', 'reward', 'risk', 'focuspast',
       'focuspresent', 'focusfuture', 'relativ', 'motion', 'space', 'time',
       'work', 'leisure', 'home', 'money', 'relig', 'death', 'informal',
       'swear', 'netspeak', 'assent', 'nonflu', 'filler', 'AllPunc', 'Period',
       'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote',
       'Apostro', 'Parenth', 'OtherP']

<h3> Vector representations (embeddings) created by selected Deep Learning Language Models trained previously on here addressed task

In [None]:
# define which embedding files to read
embeddings = [("FastText_lstm", "fasttext"), ("Roberta_lstm", "roberta_lstm"),
              ("Roberta_CLS", "roberta_large_ft")]

# instantiate list of data frames with features and a list of feature names for each df
dfemblist = []

# Initialize a dictionary with all features used later on in Machine Learning
allFeatures = {}

# read embedding files and define corresponding feature names (lists of names)
for emname, embedding in embeddings:
    embfeaturedict = {}
    for fold in range(fold_number):
        # read encoded sentences by the selected language model
        dfemb = dt.fread(f"./data/embeddings/{embedding}_encoded_sentences_{fold}.csv").to_pandas()
        embfeatures = [f"{emname}{fold}row"]
        
        # define number of feature columns (columns - 3)
        number_of_feature_columns = len(dfemb.columns) - 3
        
        # create unique feature (column) names
        embfeatures.extend([f"{emname}{fold}{x}" for x in range(number_of_feature_columns)])
        embfeatures.extend([f"{emname}{fold}_sentiment_", f"{emname}{fold}_dummy_id_"])
        dfemb.columns = embfeatures
        
        # append features from each language model in tuple ((model_name,fold), [features])
        embfeaturedict[fold] = [f"{emname}{fold}{x}" for x in range(number_of_feature_columns)]
        
        # append encoded sentences by the selected language model to a list of data frames
        dfemblist.append(dfemb)
    
    # create entry in dictionary with all features for each trained language model    
    allFeatures[emname] = embfeaturedict

<h3> Vector representations (embeddings) created by selected pre-trained Deep Learning Language Models. No special training was carried out for here addressed task

In [None]:
# read pooled embeddings and Universal Sentence Encoder (USE) embeddings
pooled_embeddings = [["Pooled FastText", "fasttext"], ["Pooled RoBERTa", "roberta"],
                     ["Universal Sentence Encoder", "USE"]]

for emname, embedding in pooled_embeddings:
    # two options due to naming convention
    if emname != "Universal Sentence Encoder":
        dfemb = dt.fread(f"./data/embeddings/{embedding}_encoded_sentences_pooled.csv").to_pandas()
    else:
        dfemb = dt.fread(f"./data/embeddings/USE_encoded_sentences.csv").to_pandas()
    
    embfeatures = [f"{emname}row"]
    
    # define number of feature columns (columns - 3)
    number_of_feature_columns = len(dfemb.columns) - 3
    
    # create unique feature (column) names
    embfeatures.extend([f"{emname}{x}" for x in range(number_of_feature_columns)])
    embfeatures.extend([f"{emname}_sentiment_", f"{emname}_dummy_id_"])
    dfemb.columns = embfeatures
    
    # add features from each fold to a local dictionary
    embfeaturedict = {}
    for fold in range(fold_number): 
        embfeaturedict[fold] = [f"{emname}{x}" for x in range(number_of_feature_columns)]
    
    # append encoded sentences by the selected language model to a list of data frames
    dfemblist.append(dfemb)
    
    # create entry in dictionary with all features for each language model    
    allFeatures[emname] = embfeaturedict

<h3> Vector representations (embeddings) created by Term Frequency Language Model

In [None]:
# Create a per-fold feature dictionary for Term Frequency model
foldTFfeatures = {}
allWords = []
for fold, rows in train_ml.items():
    vectorizer = CountVectorizer(min_df=4, binary=True)
    tf = vectorizer.fit_transform(dfliwc.iloc[rows]["text_liwc"])
    dftf = pd.DataFrame(tf.A, columns = vectorizer.get_feature_names())
    mi_imps = list(zip(mutual_info_classif(dftf, df.iloc[rows]["sentiment"], discrete_features=True), dftf.columns))
    mi_imps = sorted(mi_imps, reverse=True)
    topFeaturesN = 300
    foldTFfeatures[fold] = [f"TF_{y}" for x,y in mi_imps[0:topFeaturesN]].copy()
    # save all words found by TF models as important features
    allWords.extend([y for x,y in mi_imps[0:topFeaturesN]].copy())

# add the Term Frequency language model key to dictionary with allFeatures from various language models
allFeatures["Term Frequency"] = foldTFfeatures

In [None]:
# Create TF features for all the text instances and create a corresponding data frame
allWords = list(set(allWords))
vectorizer = CountVectorizer(min_df=4, binary=True, vocabulary=allWords)
tf = vectorizer.fit_transform(dfliwc["text_liwc"])
dftf = pd.DataFrame(tf.A, columns = vectorizer.get_feature_names())
dftf.columns = [f"TF_{x}" for x in dftf.columns]

In [None]:
# Create per-fold feature dictionary for LIWC model.
foldLIWCfeatures = {}
for fold, rows in train_ml.items():
    foldLIWCfeatures[fold] = liwcfeatures.copy()

# add the LIWC language model key to dictionary with allFeatures from various language models
allFeatures["LIWC"] = foldLIWCfeatures

In [None]:
LTF = {}
for fold in range(fold_number):
    LTF[fold] = foldLIWCfeatures[fold]
    LTF[fold].extend(foldTFfeatures[fold])
allFeatures["LTF"] = LTF

In [None]:
# concat all Data Frames: liwc, TF, DL embedding into one df_ml that will be used in Machine Learning
dftemp = pd.concat([dfliwc, dftf], axis=1)
for dfemb in dfemblist:
    dftemp = pd.concat([dftemp, dfemb], axis=1)
df_ml = dftemp

In [None]:
# define the target variable in the final df_ml data frame
df_ml["target_ml"] = df["sentiment"]

<h1> Machine Learning part

In [33]:
# Define separate lists of names of trained and not trained language models that can be tested
trained_LMs = ["FastText_lstm", "Roberta_lstm", "Roberta_CLS", "Term Frequency"]
not_trained_LMs = ["LIWC", "Pooled FastText", "Pooled RoBERTa", "Universal Sentence Encoder"]
explainable_LMs = ["Term Frequency", "LIWC", "LTF"]  # LTF stands for LIWC+Term Frequency features

all_language_models = trained_LMs.copy()
all_language_models.extend(not_trained_LMs)
all_language_models.append("LTF")

# a list of all language models
# trained_LMs.extend(not_trained_LMs)
# all_language_models = trained_LMs

In [31]:
# function that trains
def ML_classification(classification_model, language_model):
    """
    Function to train classification models on features provided by language models
    Example use: classification_model=RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_split=2,
                             min_samples_leaf=1, max_features='auto', n_jobs=-1, random_state=2020)
                language_model=
    possible options for language model list are: "Term Frequency", "LIWC", "Pooled FastText", "Pooled RoBERTa" or "Universal Sentence Encoder"
    """
    # list of analyzed language models
    model = classification_model
    print(type(model).__name__)
    preds = []
    trues = []

    # for each fold
    for fold in range(fold_number):
        # chose appropriate features and data
        features = set(allFeatures[language_model][fold])
        train_index = train_ml[fold]
        test_index = test_ml[fold]

        train_data = df_ml[features].iloc[train_index]
        target_train_data = df_ml["target_ml"].iloc[train_index]
        test_data = df_ml[features].iloc[test_index]
        target_test_data = df_ml.iloc[test_index]["target_ml"]
        model.fit(train_data, target_train_data)

        preds.append(model.predict(test_data).tolist())
        trues.append(target_test_data.tolist())

    print(language_model)
    mcc = metrics.matthews_corrcoef(y_true=sum(trues, []), y_pred=sum(preds, []))
    f1 = metrics.f1_score(y_true=sum(trues, []), y_pred=sum(preds, []), average="weighted")
    print("MCC: ", round(mcc, 3))
    print("F1: ", round(f1, 3))
    return sum(preds, [])

In [None]:
# instantiate dictionary for data frames with results
allPreds = {}

# define which classification models to use
models = [RandomForestClassifier(n_estimators=1, max_depth=7, min_samples_split=2,
                             min_samples_leaf=1, max_features='auto', n_jobs=-1, random_state=2020)]#,
#          xgb.XGBClassifier(objective='multi:softprob', n_jobs=24, learning_rate=0.03,
#                                  max_depth=10, subsample=0.7, colsample_bytree=0.6,
#                                 random_state=2020, n_estimators=1)]

# use features from selected language models
for language_model in all_language_models:
    
    # for training of selected classification models
    for classification_model in models:
        preds = ML_classification(classification_model, language_model)
        
        # save model predictions
        allPreds[f"{language_model}_{type(classification_model).__name__}"] = preds.copy()

RandomForestClassifier
FastText_lstm
MCC:  0.445
F1:  0.625
RandomForestClassifier
Roberta_lstm
MCC:  0.513
F1:  0.671
RandomForestClassifier
Roberta_CLS
MCC:  0.555
F1:  0.699
RandomForestClassifier
Term Frequency
MCC:  0.202
F1:  0.433
RandomForestClassifier
LIWC
MCC:  0.297
F1:  0.526
RandomForestClassifier
Pooled FastText
MCC:  0.286
F1:  0.519
RandomForestClassifier
Pooled RoBERTa
MCC:  0.275
F1:  0.513
RandomForestClassifier


In [None]:
# save model predictions together with true sentiment labels
allPreds["sentiment"] = df["sentiment"]
pd.DataFrame(allPreds).to_excel("predictions.xlsx")

<H1> Model Explanations

In [None]:
def train_model_for_shap(classification_model, language_model, fold):
    """
    Function to train a single Language Model for SHAP explanations
    Example use: classification_model=RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_split=2,
                             min_samples_leaf=1, max_features='auto', n_jobs=-1, random_state=2020),
                language_model="Term Frequency",
                fold = 2
    possible options for language model are: "Term Frequency" or "LIWC".
    possible fold values: 0, 1, 2, 3, 4
    """
    # list of analyzed language models
    language_model = language_model
    fold=fold
    model = classification_model
    print(type(model).__name__)
    results = {}
    names = []
    
    features = set(allFeatures[language_model][fold])
    preds = []
    trues = []

    train_index = train_ml[fold]
    test_index = test_ml[fold]

    train_data = df_ml[features].iloc[train_index]
    target_train_data = df_ml["target_ml"].iloc[train_index]
    test_data = df_ml[features].iloc[test_index]
    target_test_data = df_ml.iloc[test_index]["target_ml"]
    model.fit(train_data, target_train_data)

    preds.append(model.predict(test_data).tolist())
    trues.append(target_test_data.tolist())
    
    print(language_model)
    mcc = metrics.matthews_corrcoef(y_true=sum(trues, []), y_pred=sum(preds, []))
    f1 = metrics.f1_score(y_true=sum(trues, []), y_pred=sum(preds, []), average="weighted")
    print("MCC: ", round(mcc, 3))
    print("F1: ", round(f1, 3))
    return model, train_data, test_data

In [None]:
def explain_model(model, train_data, test_data):
    """
    Function that computes and displays SHAP model explanations
    """
    model_name = type(shap_model).__name__
    random.seed(13)
    if model_name not in ["RandomForestClassifier", "XGBClassifier"]:
        explainer = shap.KernelExplainer(model.predict_proba, train_data[:50], link="identity")
        samples_to_explain = 100
        shap_values = explainer.shap_values(train_data[:50], nsamples=200, l1_reg="num_features(100)")
        shap.summary_plot(shap_values, test_data, max_display=10)
    else:
        explainer = shap.TreeExplainer(model)
        samples_to_explain = 100
        shap_values = explainer.shap_values(train_data)
        shap.summary_plot(shap_values, test_data, max_display=10)

In [None]:
# prepare model for SHAP explanations
shap_model, train_data, test_data = train_model_for_shap(classification_model=RandomForestClassifier(n_estimators=30, max_depth=7, min_samples_split=2,
                             min_samples_leaf=1, max_features='auto', n_jobs=-1, random_state=2020),
                            language_model="LTF",
                            fold=4)
explain_model(model=shap_model, train_data=train_data, test_data=test_data)

In [None]:
# prepare model for SHAP explanations
shap_model, train_data, test_data = train_model_for_shap(classification_model=RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_split=2,
                             min_samples_leaf=1, max_features='auto', n_jobs=-1, random_state=2020),
                            language_model="Term Frequency",
                            fold=4)
explain_model(model=shap_model, train_data=train_data, test_data=test_data)

In [None]:
# prepare model for SHAP explanations
shap_model, train_data, test_data = train_model_for_shap(classification_model=xgb.XGBClassifier(objective='multi:softprob', n_jobs=24, learning_rate=0.03,
                                 max_depth=10, subsample=0.7, colsample_bytree=0.6,
                                random_state=2020, n_estimators=50),
                            language_model="LTF",
                            fold=4)
explain_model(model=shap_model, train_data=train_data, test_data=test_data)

In [None]:
# prepare model for SHAP explanations
shap_model, train_data, test_data = train_model_for_shap(classification_model=xgb.XGBClassifier(objective='multi:softprob', n_jobs=24, learning_rate=0.03,
                                 max_depth=10, subsample=0.7, colsample_bytree=0.6,
                                random_state=2020, n_estimators=40),
                                language_model="LIWC",
                                fold=0)
explain_model(model=shap_model, train_data=train_data, test_data=test_data)

In [None]:
# prepare model for SHAP explanations
shap_model, train_data, test_data = train_model_for_shap(classification_model=xgb.XGBClassifier(objective='multi:softprob', n_jobs=24, learning_rate=0.03,
                                 max_depth=10, subsample=0.7, colsample_bytree=0.6,
                                random_state=2020, n_estimators=20),
                                language_model="Universal Sentence Encoder",
                                fold=0)
explain_model(model=shap_model, train_data=train_data, test_data=test_data)

In [None]:
# prepare model for SHAP explanations
shap_model, train_data, test_data = train_model_for_shap(classification_model=MultinomialNB(),
                                                         language_model="LIWC",
                                                         fold=0)
explain_model(model=shap_model, train_data=train_data, test_data=test_data)

In [None]:
# prepare model for SHAP explanations
shap_model, train_data, test_data = train_model_for_shap(classification_model=MultinomialNB(),
                                    language_model="Term Frequency",
                                    fold=0)
explain_model(model=shap_model, train_data=train_data, test_data=test_data)