In [0]:
import os
from typing import Tuple
from pathlib import Path
import argparse
import pickle
import _pickle as cPickle
from zipfile import ZipFile
import string

import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

import fasttext
import flair
from flair.models import TextClassifier
from flair.data import Sentence
import allennlp
import classifiers as cl

import keras as k
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from gensim.models import KeyedVectors

from tensorflow import keras, split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Lambda, Reshape, Dropout, Embedding, GRU, LSTM, Bidirectional, Conv1D, GlobalMaxPool1D, concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers

# Data Preprocessing

In [0]:
columns =['id',	'label'	,'statement',	'subject',	'speaker', 	'job', 	'state',
        'party',	'barely_true_c',	'false_c',	'half_true_c',	'mostly_true_c',
        'pants_on_fire_c',	'venue']
train_data = pd.read_table('train.tsv', names = columns)
train_data = train_data.drop(['barely_true_c','false_c','half_true_c','mostly_true_c','pants_on_fire_c'], axis=1)
#train_data[:] = train_data[:].fillna('0')
headers = ['subject',	'speaker', 	'job', 	'state','party','venue']
for header in headers:
  frequent = train_data[header].str.lower().value_counts()[:15].reset_index().to_dict()['index']
  frequent = dict((i,k) for k,i in frequent.items())
  def get_venue_id(venue):
    if isinstance(venue, str):
      matched = [ven for ven in frequent if ven in venue.lower() ]
      if len(matched)>0:
        return frequent[matched[0]]
      else:
        return len(set(frequent.values())) 
    else:
        return len(set(frequent.values()))
  train_data[header+'_id'] = train_data[header].apply(get_venue_id)

valid_data = pd.read_table('valid.tsv', names = columns)
valid_data = valid_data.drop(['barely_true_c','false_c','half_true_c','mostly_true_c','pants_on_fire_c'], axis=1)
#valid_data[:] = valid_data[:].fillna('0')
for header in headers:
  frequent = valid_data[header].str.lower().value_counts()[:15].reset_index().to_dict()['index']
  frequent = dict((i,k) for k,i in frequent.items())
  def get_venue_id(venue):
    if isinstance(venue, str):
      matched = [ven for ven in frequent if ven in venue.lower() ]
      if len(matched)>0:
        return frequent[matched[0]]
      else:
        return len(set(frequent.values())) 
    else:
        return len(set(frequent.values()))
  valid_data[header+'_id'] = valid_data[header].apply(get_venue_id)

test_data = pd.read_table('test.tsv', names = columns)
test_data = test_data.drop(['barely_true_c','false_c','half_true_c','mostly_true_c','pants_on_fire_c'], axis=1)
#test_data[:] = test_data[:].fillna('0')
for header in headers:
  frequent = test_data[header].str.lower().value_counts()[:15].reset_index().to_dict()['index']
  frequent = dict((i,k) for k,i in frequent.items())
  def get_venue_id(venue):
    if isinstance(venue, str):
      matched = [ven for ven in frequent if ven in venue.lower() ]
      if len(matched)>0:
        return frequent[matched[0]]
      else:
        return len(set(frequent.values())) 
    else:
        return len(set(frequent.values()))
    test_data[header+'_id'] = test_data[header].apply(get_venue_id)

In [0]:
for header in headers:
  frequent = train_data[header].str.lower().value_counts()[:15].reset_index().to_dict()['index']
  frequent = dict((i,k) for k,i in frequent.items())
  choices = frequent.keys()
  header_id= []
  for value in range(len(train_data[header])):
    new, score = process.extractOne(train_data[header][value], choices,scorer=fuzz.ratio)
    header_id.append(frequent[new])
  train_data[header+'_id'] = header_id

valid_data = pd.read_table('valid.tsv', names = columns)
valid_data = valid_data.drop(['barely_true_c','false_c','half_true_c','mostly_true_c','pants_on_fire_c'], axis=1)
valid_data[:] = valid_data[:].fillna('0')
for header in headers:
  frequent = valid_data[header].str.lower().value_counts()[:15].reset_index().to_dict()['index']
  frequent = dict((i,k) for k,i in frequent.items())
  choices = frequent.keys()
  header_id= []
  for value in range(len(valid_data[header])):
    new, score = process.extractOne(valid_data[header][value], choices,scorer=fuzz.ratio)
    header_id.append(frequent[new])
  valid_data[header+'_id'] = header_id

test_data = pd.read_table('test.tsv', names = columns)
test_data = test_data.drop(['barely_true_c','false_c','half_true_c','mostly_true_c','pants_on_fire_c'], axis=1)
test_data[:] = test_data[:].fillna('0')
for header in headers:
  frequent = test_data[header].str.lower().value_counts()[:15].reset_index().to_dict()['index']
  frequent = dict((i,k) for k,i in frequent.items())
  choices = frequent.keys()
  header_id= []
  for value in range(len(test_data[header])):
    new, score = process.extractOne(test_data[header][value], choices,scorer=fuzz.ratio)
    header_id.append(frequent[new])
  test_data[header+'_id'] = header_id

#### Label

In [0]:
y_label_dict = {"pants-fire" : 0, "false" : 1, "barely-true" : 2, "half-true" : 3, "mostly-true" : 4, "true" : 5}
train_data['output'] = train_data['label'].apply(lambda x: y_label_dict[x])
valid_data['output'] = valid_data['label'].apply(lambda x: y_label_dict[x])
test_data['output'] = test_data['label'].apply(lambda x: y_label_dict[x])

#### Statements

In [0]:
import pickle
def load_tokenizer(filename = 'tokenizer.pickle'):
    with open(filename, 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer
def load_tokenizers(count=6, filename='tokenizer'):
    ts = []
    for i in range(count):
        ts.append(load_tokenizer(filename=filename+"_"+str(i)+".pickle"))
    return ts
cols = ['subject', 'speaker', 'job', 'state', 'party', 'venue']
def load_df(filename):
    return pd.read_pickle(filename)
def load_embedding_matrix(filename = 'embedding_matrix.npy'):
    return np.load(filename)
def df_to_input(df, tokenizer, max_len):
    return pad_sequences(tokenizer.texts_to_sequences(df.p_statement), maxlen=max_len, padding='post', truncating='post')

def load_x(path, keywords, size):
  master_data = np.load(path)
  x = np.empty((size,))
  for data in master_data:
    if all(f in data for f in keywords):
      x = np.column_stack([x, master_data[data]])
  x = x[:,1:]
  if any(w in keywords for w in ['LR', 'SVM', 'FT']):
    x = np.array(np.column_stack([x[:,:3], np.stack(x[:,3])]), dtype=np.float)
  return x

def df_to_meta_input(df, tokenizers, columns):
    a = np.zeros((df.shape[0], len(columns)), dtype=int)
    for i, col in enumerate(columns):
        a[:,i] = np.array(tokenizers[i].texts_to_sequences(df[col])).reshape(-1)
    return a

In [0]:
sentence_max_len = 15
t = load_tokenizer()
stmt_embedding = load_embedding_matrix()
df_train = load_df('df_train.pkl')
df_val = load_df('df_val.pkl')
df_test = load_df('df_test.pkl')
df_train = df_train.replace('', 'nan')

stmt_train = df_to_input(df_train, t, sentence_max_len)
stmt_val = df_to_input(df_val, t, sentence_max_len)
stmt_test = df_to_input(df_test, t, sentence_max_len)

#### DEP

In [0]:
dep_dict = {'punct' : 0, 'prep' : 1, 'pobj' : 2, 'compound' : 3, 'det' : 4, 
            'nsubj' : 5, 'ROOT' : 6, 'amod' : 7, 'dobj' : 8, 'aux' : 9, 
            'advmod' : 10, 'nummod' : 10, 'ccomp' : 10, 'conj' : 10, 'cc' : 10, 
            'advcl' : 10, 'poss' : 10, 'mark' : 10, 'quantmod' : 10, 'relcl' : 10, 
            'attr' : 10, 'xcomp' : 10, 'npadvmod' : 10, 'nmod' : 10, 'auxpass' : 10, 
            'acl' : 10, 'nsubjpass' : 10, 'pcomp' : 10, 'acomp' : 10, 'neg' : 10, 
            'appos' : 10, 'prt' : 10, '' : 10, 'expl' : 10, 'dative' : 10, 
            'agent' : 10, 'case' : 10, 'oprd' : 10, 'csubj' : 10, 'dep' : 10, 
            'intj' : 10, 'predet' : 10, 'parataxis' : 10, 'preconj' : 10, 
            'meta' : 10, 'csubjpass' : 10}

In [0]:
def get_dep_parse(statement):
  # doc = nlp(statement.decode('utf-8', 'ignore'))
  doc = nlp(statement)
  deplist = []
  for token in doc:
    deplist.append(dep_dict.get(token.dep_, max(dep_dict.values())))
  return deplist

train_data['dep_id'] = train_data['statement'].apply(get_dep_parse)
valid_data['dep_id'] = valid_data['statement'].apply(get_dep_parse)
test_data['dep_id'] = test_data['statement'].apply(get_dep_parse)

#### POS

In [0]:
pos_dict = {'NOUN' : 0, 'VERB' : 1, 'ADP' : 2, 'PROPN' : 3, 'PUNCT' : 4, 
            'DET' : 5, 'ADJ' : 6, 'NUM' : 7, 'ADV' : 8, 'PRON' : 9, 'X' : 9, 
            'PART' : 9, 'SYM' : 9, 'INTJ' : 9 }

In [0]:
def get_pos(statement):
  # doc = nlp(statement.decode('utf-8', 'ignore'))
  doc = nlp(statement)
  taglist = []
  deplist = []
  for token in doc:
    taglist.append(pos_dict.get(token.pos_,max(pos_dict.values())))
    #deplist.append(token.dep_)
  return taglist

train_data['pos_id'] = train_data['statement'].apply(get_pos)
valid_data['pos_id'] = valid_data['statement'].apply(get_pos)
test_data['pos_id'] = test_data['statement'].apply(get_pos)

#### Metadata

In [0]:
ts = load_tokenizers()

meta_train = df_to_meta_input(df_train, ts, cols)
meta_val = df_to_meta_input(df_val, ts, cols)
meta_test = df_to_meta_input(df_test, ts, cols)

#### Sentiment Analysis

The codes below are adopted from https://github.com/prrao87/fine-grained-sentiment

In [0]:
basic = cl.Base()
liar_train = basic.read_data(fname='./data/liar_dataset/train-clean.txt', lower_case=True)
liar_valid = basic.read_data(fname='./data/liar_dataset/valid-clean.txt', lower_case=True)
liar_test = basic.read_data(fname='./data/liar_dataset/test-clean.txt', lower_case=True)

TextBlob

In [0]:
# TextBlob
model0 = cl.TextBlobSentiment()
df0 = model0.predict(train_file=None, test_file='./data/test.txt', lower_case=True)
print('TextBlob')
model0.accuracy(df0)

TextBlob
Accuracy: 22.134
Macro F1-score: 19.036


In [0]:
liar_train['textblob_score'] = liar_train['text'].apply(model0.score)
liar_train['textblob_pred'] = pd.cut(liar_train['textblob_score'],bins=3,labels=[0, 1, 2])
liar_valid['textblob_score'] = liar_valid['text'].apply(model0.score)
liar_valid['textblob_pred'] = pd.cut(liar_valid['textblob_score'],bins=3,labels=[0, 1, 2])
liar_test['textblob_score'] = liar_test['text'].apply(model0.score)
liar_test['textblob_pred'] = pd.cut(liar_test['textblob_score'],bins=3,labels=[0, 1, 2])

Vader

In [0]:
# Vader
model1 = cl.VaderSentiment()
df1 = model1.predict(train_file=None, test_file='./data/test.txt', lower_case=True)
print('Vader')
model1.accuracy(df1)



Vader
Accuracy: 38.933
Macro F1-score: 37.135


In [0]:
liar_train['vader_score'] = liar_train['text'].apply(model1.score)
liar_train['vader_pred'] = pd.cut(liar_train['vader_score'],bins=[-float('Inf'), -0.05, 0.05, float('Inf')],labels=[0, 1, 2])
liar_valid['vader_score'] = liar_valid['text'].apply(model1.score)
liar_valid['vader_pred'] = pd.cut(liar_valid['vader_score'],bins=[-float('Inf'), -0.05, 0.05, float('Inf')],labels=[0, 1, 2])
liar_test['vader_score'] = liar_test['text'].apply(model1.score)
liar_test['vader_pred'] = pd.cut(liar_test['vader_score'],bins=[-float('Inf'), -0.05, 0.05, float('Inf')],labels=[0, 1, 2])

Logistic Regression

In [0]:
# Logistic Regression
model2 = cl.LogisticRegressionSentiment()
df2 = model2.predict(train_file='./data/train.txt', test_file='./data/test.txt', lower_case=True)
print('Logistic Regression')
model2.accuracy(df2)



Logistic Regression
Accuracy: 65.609
Macro F1-score: 57.293


In [0]:
liar_train[['lr_prob_neg','lr_prob_neu','lr_prob_pos']] = pd.DataFrame(model2.pipeline.predict_proba(liar_train['text']))
liar_train['lr_pred'] = model2.pipeline.predict(liar_train['text'])
liar_valid[['lr_prob_neg','lr_prob_neu','lr_prob_pos']] = pd.DataFrame(model2.pipeline.predict_proba(liar_valid['text']))
liar_valid['lr_pred'] = model2.pipeline.predict(liar_valid['text'])
liar_test[['lr_prob_neg','lr_prob_neu','lr_prob_pos']] = pd.DataFrame(model2.pipeline.predict_proba(liar_test['text']))
liar_test['lr_pred'] = model2.pipeline.predict(liar_test['text'])

SVM

In [0]:
# SVM
model3 = cl.SVMSentiment()
df3 = model3.predict(train_file='./data/train.txt', test_file='./data/test.txt', lower_case=True)
print('SVM')
model3.accuracy(df3)



SVM
Accuracy: 63.086
Macro F1-score: 51.346


In [0]:
liar_train[['svm_df_neg','svm_df_neu','svm_df_pos']] = pd.DataFrame(model3.pipeline.decision_function(liar_train['text']))
liar_train['svm_pred'] = model3.pipeline.predict(liar_train['text'])
liar_valid[['svm_df_neg','svm_df_neu','svm_df_pos']] = pd.DataFrame(model3.pipeline.decision_function(liar_valid['text']))
liar_valid['svm_pred'] = model3.pipeline.predict(liar_valid['text'])
liar_test[['svm_df_neg','svm_df_neu','svm_df_pos']] = pd.DataFrame(model3.pipeline.decision_function(liar_test['text']))
liar_test['svm_pred'] = model3.pipeline.predict(liar_test['text'])

FastText

In [0]:
def extract_flair(df_in, model_flair, k):
  outcome = pd.DataFrame(df_in['text'].apply(lambda x: model_flair.model.predict(x, k)).tolist())
  df_out = pd.DataFrame(columns=['flair_prob_neg','flair_prob_neu','flair_prob_pos'])
  for i in range(df_in.shape[0]):
    d = {}
    for j in range(k):
      if outcome[0][i][j] == '__label__0':
        d['flair_prob_neg'] = outcome[1][i][j]
      elif outcome[0][i][j] == '__label__1':
        d['flair_prob_neu'] = outcome[1][i][j]
      elif outcome[0][i][j] == '__label__2':
        d['flair_prob_pos'] = outcome[1][i][j]
    df_out = df_out.append(d, ignore_index=True)
  return df_out

In [0]:
#fasttext
!python3 train_fasttext.py

In [0]:
# FastText
model4 = cl.FastTextSentiment('./data/fasttext-model.bin')
df4 = model4.predict(train_file=None, test_file='./data/test.txt', lower_case=True)
print('FastText')
model4.accuracy(df4)

FastText
Accuracy: 58.832
Macro F1-score: 48.330




In [0]:
liar_train[['ft_prob_neg','ft_prob_neu','ft_prob_pos']] = extract_flair(liar_train, model4, 3)
liar_train['ft_pred'] = liar_train['text'].apply(model4.score)
liar_valid[['ft_prob_neg','ft_prob_neu','ft_prob_pos']] = extract_flair(liar_valid, model4, 3)
liar_valid['ft_pred'] = liar_valid['text'].apply(model4.score)
liar_test[['ft_prob_neg','ft_prob_neu','ft_prob_pos']] = extract_flair(liar_test, model4, 3)
liar_test['ft_pred'] = liar_test['text'].apply(model4.score)

Flair

In [0]:
#flair
!python3 train_flair.py

In [0]:
def score(model_flair, text):
  doc = Sentence(text)
  model_flair.model.predict(doc)
  output_class = doc.labels[0].value
  output_prob = doc.labels[0].score
  return [output_class, output_prob]

Flair-GloVe

In [0]:
# Flair GloVe
model51 = cl.FlairSentiment('./data/glove/best-model.pt')
df51 = model51.predict(train_file=None, test_file='./data/test.txt', lower_case=True)
print('Flair with GloVe')
model51.accuracy(df51)

2020-04-30 23:00:58,758 loading file ./data/glove/best-model.pt


100%|██████████| 1387/1387 [02:25<00:00,  9.55it/s]

Flair with GloVe
Accuracy: 59.481
Macro F1-score: 39.407





In [0]:
liar_train['flair_glove_prob'], liar_train['flair_glove_class']  = liar_train['text'].progress_apply(lambda x: score(model51, x)[1]), liar_train['text'].progress_apply(lambda x: score(model51, x)[0])
liar_valid['flair_glove_prob'], liar_valid['flair_glove_class']  = liar_valid['text'].progress_apply(lambda x: score(model51, x)[1]), liar_valid['text'].progress_apply(lambda x: score(model51, x)[0])
liar_test['flair_glove_prob'], liar_test['flair_glove_class']  = liar_test['text'].progress_apply(lambda x: score(model51, x)[1]), liar_test['text'].progress_apply(lambda x: score(model51, x)[0])

100%|██████████| 10240/10240 [21:30<00:00,  7.93it/s]
100%|██████████| 10240/10240 [21:26<00:00,  7.96it/s]
100%|██████████| 1284/1284 [02:41<00:00,  7.97it/s]
100%|██████████| 1284/1284 [02:41<00:00,  7.97it/s]
100%|██████████| 1267/1267 [02:41<00:00,  7.83it/s]
100%|██████████| 1267/1267 [02:42<00:00,  7.81it/s]


Flair-ELMo

In [0]:
# Flair ELMo
model52 = cl.FlairSentiment('./data/elmo/best-model.pt')
df52 = model52.predict(train_file=None, test_file='./data/test.txt', lower_case=True)
print('Flair with ELMo')
model52.accuracy(df52)

2020-04-30 23:57:34,423 loading file ./data/elmo/best-model.pt


100%|██████████| 1387/1387 [02:24<00:00,  9.59it/s]

Flair with ELMo
Accuracy: 60.634
Macro F1-score: 38.016





In [0]:
liar_train['flair_elmo_prob'], liar_train['flair_elmo_class']  = liar_train['text'].progress_apply(lambda x: score(model52, x)[1]), liar_train['text'].progress_apply(lambda x: score(model52, x)[0])
liar_valid['flair_elmo_prob'], liar_valid['flair_elmo_class']  = liar_valid['text'].progress_apply(lambda x: score(model52, x)[1]), liar_valid['text'].progress_apply(lambda x: score(model52, x)[0])
liar_test['flair_elmo_prob'], liar_test['flair_elmo_class']  = liar_test['text'].progress_apply(lambda x: score(model52, x)[1]), liar_test['text'].progress_apply(lambda x: score(model52, x)[0])

100%|██████████| 10240/10240 [21:26<00:00,  7.96it/s]
100%|██████████| 10240/10240 [21:23<00:00,  7.98it/s]
100%|██████████| 1284/1284 [02:40<00:00,  8.01it/s]
100%|██████████| 1284/1284 [02:40<00:00,  7.98it/s]
100%|██████████| 1267/1267 [02:41<00:00,  7.83it/s]
100%|██████████| 1267/1267 [02:42<00:00,  7.80it/s]


Flair-BERT

In [0]:
# Flair BERT
model53 = cl.FlairSentiment('./data/bert/best-model.pt')
df53 = model53.predict(train_file=None, test_file='./data/test.txt', lower_case=True)
print('Flair with BERT')
model53.accuracy(df53)

2020-05-01 01:09:09,771 loading file ./data/bert/best-model.pt


100%|██████████| 1387/1387 [23:02<00:00,  1.00it/s]

Flair with BERT
Accuracy: 61.283
Macro F1-score: 41.172





In [0]:
liar_train['flair_bert_prob'], liar_train['flair_bert_class']  = liar_train['text'].progress_apply(lambda x: score(model53, x)[1]), liar_train['text'].progress_apply(lambda x: score(model53, x)[0])
liar_valid['flair_bert_prob'], liar_valid['flair_bert_class']  = liar_valid['text'].progress_apply(lambda x: score(model53, x)[1]), liar_valid['text'].progress_apply(lambda x: score(model53, x)[0])
liar_test['flair_bert_prob'], liar_test['flair_bert_class']  = liar_test['text'].progress_apply(lambda x: score(model53, x)[1]), liar_test['text'].progress_apply(lambda x: score(model53, x)[0])

100%|██████████| 10240/10240 [21:32<00:00,  7.92it/s]
100%|██████████| 10240/10240 [21:31<00:00,  7.93it/s]
100%|██████████| 1284/1284 [02:41<00:00,  7.93it/s]
100%|██████████| 1284/1284 [02:41<00:00,  7.95it/s]
100%|██████████| 1267/1267 [02:42<00:00,  7.79it/s]
100%|██████████| 1267/1267 [02:42<00:00,  7.80it/s]


# Model Building

#### Bi-LSTM

In [0]:
def run_model_sequence(model_type, dict_X_train, y_train, parameters, embeddings, input_dims, output_dims, bidirect=False):
  X_train = []
  names = []
  for key, value in dict_X_train.items():
    names.append(key)
    X_train.append(value)
  models = {}
  list_models = []
  inputs = {}
  dict_train = {}
  list_inputs = []
  dims = {}
  for i, f in enumerate(X_train):
    dims[i] = f.shape[1]
    inputs[i] = Input(shape=(dims[i],), dtype='int32', name='{}_input'.format(names[i]))
    dict_train['{}_input'.format(names[i])] = f
    list_inputs.append(inputs[i])
    if names[i] in ['statement', 'pos', 'dep']:
      x = Embedding(input_dim=input_dims[names[i]], output_dim=output_dims[names[i]], weights=[embeddings[names[i]]], 
                    input_length=dims[i], trainable=False)(inputs[i])
      if model_type == 'LSTM':
        if bidirect:
          x = Bidirectional(LSTM(units=parameters['model_unit'], dropout = parameters['model_dropout'],
                                 activation = 'relu'))(x)
        else:
          x = LSTM(units=parameters['model_unit'], dropout = parameters['model_dropout'],
                   activation = 'relu')(x)
      elif model_type == 'GRU':
        if bidirect:
          x = Bidirectional(GRU(units=parameters['model_unit'], dropout = parameters['model_dropout'],
                                 activation = 'relu'))(x)
        else:
          x = GRU(units=parameters['model_unit'], dropout = parameters['model_dropout'],
                   activation = 'relu')(x)
    else:
      x = Dense(units=parameters['dense_unit'], activation='relu')(inputs[i])
    models[i] = x
    list_models.append(models[i])

  if len(list_models) == 1:
    x = list_models[0]
  else:
    x = concatenate(list_models)
  main_output = Dense(units=len(dlabel), activation='softmax', name='main_output')(x)
  model_dl = Model(inputs=list_inputs, outputs=[main_output])
  model_dl.compile(optimizer=optimizers.SGD(learning_rate=parameters['eta'], clipvalue=0.3, nesterov=True), 
                   loss='categorical_crossentropy', metrics=['categorical_accuracy'])
  model_dl.fit(dict_train, {'main_output': y_train}, epochs=parameters['epoch'], batch_size=parameters['batch'], verbose=0)
  return model_dl

#### CNN

In [0]:
def run_model_cnn(dict_X_train, y_train, parameters, embeddings, input_dims, output_dims):
  X_train = []
  names = []
  for key, value in dict_X_train.items():
    names.append(key)
    X_train.append(value)
  models = {}
  list_models = []
  inputs = {}
  dict_train = {}
  list_inputs = []
  dims = {}
  for i, f in enumerate(X_train):
    dims[i] = f.shape[1]
    inputs[i] = Input(shape=(dims[i],), dtype='int32', name='{}_input'.format(names[i]))
    dict_train['{}_input'.format(names[i])] = f
    list_inputs.append(inputs[i])
    if names[i] in ['statement', 'pos', 'dep']:
      x = Embedding(input_dim=input_dims[names[i]], output_dim=output_dims[names[i]], weights=[embeddings[names[i]]], 
                    input_length=dims[i], trainable=False)(inputs[i])
      #Multichannel convolution for statment input
      conv0 = Conv1D(parameters['filters'], parameters['kernels'][0])(x)
      conv1 = Conv1D(parameters['filters'], parameters['kernels'][1])(x)
      conv2 = Conv1D(parameters['filters'], parameters['kernels'][2])(x)

      #individual GlobalMaxPooling
      maxpool0 = GlobalMaxPool1D()(conv0)
      maxpool1 = GlobalMaxPool1D()(conv1)
      maxpool2 = GlobalMaxPool1D()(conv2)

      #Statement convolution input
      conv_in = concatenate([maxpool0, maxpool1, maxpool2])
      conv_in = Dropout(parameters['model_dropout'])(conv_in)
      x = Dense(units=parameters['dense_model_unit'], activation='relu')(conv_in)
    else:
      x = Dense(units=parameters['dense_unit'], activation='relu')(inputs[i])
    models[i] = x
    list_models.append(models[i])

  if len(list_models) == 1:
    x = list_models[0]
  else:
    x = concatenate(list_models)
  main_output = Dense(units=len(dlabel), activation='softmax', name='main_output')(x)
  model_dl = Model(inputs=list_inputs, outputs=[main_output])
  model_dl.compile(optimizer=optimizers.SGD(learning_rate=parameters['eta'], clipvalue=0.3, nesterov=True), 
                   loss='categorical_crossentropy', metrics=['categorical_accuracy'])
  model_dl.fit(dict_train, {'main_output': y_train}, epochs=parameters['epoch'], batch_size=parameters['batch'], verbose=0)
  return model_dl

#### Hybrid CNN-BiLSTM

In [0]:
def run_model_hybrid(dict_X_train, y_train, parameters, embeddings, input_dims, output_dims):
  cols = ['subject', 'speaker', 'job', 'state', 'party', 'venue']
  X_train = []
  names = []
  for key, value in dict_X_train.items():
    names.append(key)
    X_train.append(value)
  #models = {}
  list_models = []
  list_embeddings = []
  inputs = {}
  dict_train = {}
  list_inputs = []
  dims = {}
  for i, f in enumerate(X_train):
    dims[i] = f.shape[1]
    inputs[i] = Input(shape=(dims[i],), dtype='int32', name='{}_input'.format(names[i]))
    dict_train['{}_input'.format(names[i])] = f
    list_inputs.append(inputs[i])
    if names[i] in ['statement']:
      x = Embedding(input_dim=input_dims[names[i]], output_dim=parameters['dimensions'], weights=[embeddings[names[i]]], 
                    input_length=dims[i], trainable=False)(inputs[i])
      #Multichannel convolution for statment input
      conv00 = Conv1D(parameters['cnn_filters'], parameters['cnn_kernels'][0])(x)
      conv01 = Conv1D(parameters['cnn_filters'], parameters['cnn_kernels'][1])(x)
      conv02 = Conv1D(parameters['cnn_filters'], parameters['cnn_kernels'][2])(x)

      #individual GlobalMaxPooling
      maxpool00 = GlobalMaxPool1D()(conv00)
      maxpool01 = GlobalMaxPool1D()(conv01)
      maxpool02 = GlobalMaxPool1D()(conv02)

      #Statement convolution input
      x = concatenate([maxpool00, maxpool01, maxpool02])
      list_models.append(x)
    elif names[i] in ['dep', 'pos', 'meta']:
      if names[i] == 'meta':
        input_split = Lambda(lambda x: split(x, 6, axis=1))(inputs[i])
        for j, c in enumerate(cols):
          vocab_dim = len(ts[j].word_index) + 1
          x = Embedding(input_dim=vocab_dim, output_dim=parameters['dimensions'])(input_split[j])
          list_embeddings.append(x)
      else:
        x = Embedding(input_dim=input_dims[names[i]], output_dim=parameters['dimensions'], 
                    input_length=dims[i], trainable=False)(inputs[i])
        list_embeddings.append(x)
    else:
      x = Dense(units=parameters['dense_unit'], activation='relu')(inputs[i])
      list_models.append(x)

  if 'dep' in names or 'pos' in names or 'meta' in names:
    if len(list_embeddings) == 1:
      x = list_embeddings[0]
    else:
      x = concatenate(list_embeddings, axis=1)
    #x = concatenate(list_embeddings, axis=1)
    #Multichannel convolution for dep/pos/meta input
    conv10 = Conv1D(filters=parameters['md_filters'], kernel_size=parameters['md_kernels'][0], padding='same')(x)
    conv11 = Conv1D(filters=parameters['md_filters'], kernel_size=parameters['md_kernels'][1], padding='same')(x)
    conv12 = Conv1D(filters=parameters['md_filters'], kernel_size=parameters['md_kernels'][2], padding='same')(x)

    #individual GlobalMaxPooling
    maxpool10 = GlobalMaxPool1D()(conv10)
    maxpool11 = GlobalMaxPool1D()(conv11)
    maxpool12 = GlobalMaxPool1D()(conv12)

    x = concatenate([maxpool10, maxpool11, maxpool12])
    x_reshape = Reshape((x.shape[1], 1))(x)
    x = Bidirectional(LSTM(parameters['lstm_unit'], return_sequences=False))(x_reshape)
    list_models.append(x)

  if len(list_models) == 1:
    x = list_models[0]
  else:
    x = concatenate(list_models)
  dropout_layer = Dropout(parameters['final_dropout'])(x)
  main_output = Dense(units=len(dlabel), activation='softmax', name='main_output')(dropout_layer)
  model_dl = Model(inputs=list_inputs, outputs=[main_output])
  model_dl.compile(optimizer=optimizers.SGD(learning_rate=parameters['eta'], clipvalue=0.3, nesterov=True), 
                   loss='categorical_crossentropy', metrics=['categorical_accuracy'])
  model_dl.fit(dict_train, {'main_output': y_train}, epochs=parameters['epoch'], batch_size=parameters['batch'], verbose=0)
  return model_dl  

#### k-fold Cross Validation

In [0]:
def run_cv(model_type, dict_features_cv, dict_features_test, y_cv, y_test, parameters, embeddings, input_dims, output_dims, bidirect=False):
  if bidirect:
    print('Bi-'+model_type)
  else:
    print(model_type)
  
  for j in range(len(dict_features_cv)):
    print('features:')
    dict_X_cv = dict_features_cv[j]
    X_cv = dict_X_cv[next(iter(dict_X_cv))]
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    cv_scores = []
    names = []
    for k in dict_X_cv.keys():
      print(k)
      names.append(k)
    
    #run k-fold CV
    for train, val in kfold.split(X_cv, y_cv):
      #refine dictionary
      dict_X_train = {}
      dict_X_val = {}
      for k, v in dict_X_cv.items():
        dict_X_train[k] = v[train]
        dict_X_val[k] = v[val]
      
      y_train = to_categorical(y_cv[train], num_classes=6)
      y_val = to_categorical(y_cv[val], num_classes=6)
      if model_type in ['LSTM', 'GRU']:
        model_dl = run_model_sequence(model_type, dict_X_train, y_train, parameters, embeddings, input_dims, output_dims, bidirect)
      elif model_type == 'CNN':
        model_dl = run_model_cnn(dict_X_train, y_train, parameters, embeddings, input_dims, output_dims)
      else:
        model_dl = run_model_hybrid(dict_X_train, y_train, parameters, embeddings, input_dims, output_dims)
      #prepare validation data
      X_val = [v for v in dict_X_val.values()]
      dict_val = {}
      for i in range(len(X_val)):
        dict_val['{}_input'.format(names[i])] = X_val[i]

      scores = model_dl.evaluate(dict_val, {'main_output': y_val}, verbose=0)
      print("%s: %.2f%%" % (model_dl.metrics_names[1], scores[1]*100))
      cv_scores.append(scores[1]*100)
      
    print("%.2f%% (+/- %.2f%%)" % (np.mean(cv_scores), np.std(cv_scores)))
    #prepare test data
    dict_X_test = dict_features_test[j]
    X_test = [v for v in dict_X_test.values()]
    dict_test = {}
    for i in range(len(X_test)):
      dict_test['{}_input'.format(names[i])] = X_test[i]
    if model_type in ['LSTM', 'GRU']:
      model_dl_full = run_model_sequence(model_type, dict_X_cv, to_categorical(y_cv, num_classes=6), parameters, embeddings, input_dims, output_dims, bidirect)
    elif model_type == 'CNN':
      model_dl_full = run_model_cnn(dict_X_cv, to_categorical(y_cv, num_classes=6), parameters, embeddings, input_dims, output_dims)
    else:
      model_dl_full = run_model_hybrid(dict_X_cv, to_categorical(y_cv, num_classes=6), parameters, embeddings, input_dims, output_dims)
    #model_dl_full.summary()
    score_full = model_dl_full.evaluate(dict_test, {'main_output': y_test}, verbose=0)
    print("%s: %.2f%%" % (model_dl_full.metrics_names[1], score_full[1]*100))

# Experiments

In [0]:
embeddings = {
    'statement': stmt_embedding,
    'pos': pos_embedding,
    'dep': dep_embedding
}
input_dims = {
    'statement': len(t.word_index) + 1,
    'pos': 10,
    'dep': 11
}
output_dims = {
    'statement': 300,
    'pos': 10,
    'dep': 11,
    'meta': 300
}

In [0]:
parameters_lstm = {
        'model_unit': 256, #64,
        'dense_unit': 64,
        'model_dropout': 0.5, #0.2,
        'eta': 0.01,
        'epoch': 30,
        'batch': 32
}
parameters_cnn = {
        'kernels': [7, 7, 7], 
        'filters': 256, 
        'dense_model_unit': 256, #64,
        'dense_unit': 64,
        'model_dropout': 0.5, #0.2,
        'eta': 0.01,
        'epoch': 30,
        'batch': 32
}
parameters_hybrid = {
        'dimensions': 300,
        'cnn_kernels': [6, 7, 8], #[7, 7, 7], 
        'cnn_filters': 256, #256, 
        'md_kernels': [6, 7, 8],
        'md_filters': 256,
        'lstm_unit': 64, #256,
        'dense_unit': 64,
        'final_dropout': 0.5, #0.2,
        'eta': 0.01,
        'epoch': 30,
        'batch': 32 #32
}

In [0]:
dict_features_cv = [{'statement': stmt_cv},
                    {'statement': stmt_cv, 'dep': dep_cv},
                    {'statement': stmt_cv, 'pos': pos_cv},
                    {'statement': stmt_cv, 'meta': meta_cv},
                    {'statement': stmt_cv, 'svm': svm_cv},
                    {'statement': stmt_cv, 'bert': flair_bert_cv},
                    {'statement': stmt_cv, 'dep': dep_cv, 'pos': pos_cv},
                    {'statement': stmt_cv, 'dep': dep_cv, 'pos': pos_cv, 'meta': meta_cv},
                    {'statement': stmt_cv, 'dep': dep_cv, 'pos': pos_cv, 'meta': meta_cv, 'svm': svm_cv},
                    {'statement': stmt_cv, 'dep': dep_cv, 'pos': pos_cv, 'meta': meta_cv, 'bert': flair_bert_cv}]

dict_features_test = [{'statement': stmt_test},
                      {'statement': stmt_test, 'dep': dep_test},
                      {'statement': stmt_test, 'pos': pos_test},
                      {'statement': stmt_test, 'meta': meta_test},
                      {'statement': stmt_test, 'svm': svm_test},
                      {'statement': stmt_test, 'bert': flair_bert_test},
                      {'statement': stmt_test, 'dep': dep_test, 'pos': pos_test},
                      {'statement': stmt_test, 'dep': dep_test, 'pos': pos_test, 'meta': meta_test},
                      {'statement': stmt_test, 'dep': dep_test, 'pos': pos_test, 'meta': meta_test, 'svm': svm_test},
                      {'statement': stmt_test, 'dep': dep_test, 'pos': pos_test, 'meta': meta_test, 'bert': flair_bert_test}]

#### Bi-LSTM

In [0]:
run_cv('LSTM', dict_features_cv, dict_features_test, y_cv, y_test, parameters_lstm, embeddings, input_dims, output_dims, bidirect=True)

#### CNN

In [0]:
run_cv('CNN', dict_features_cv, dict_features_test, y_cv, y_test, parameters_cnn, embeddings, input_dims, output_dims)

#### Hybrid CNN-BiLSTM

In [0]:
run_cv('Hybrid', dict_features_cv, dict_features_test, y_cv, y_test, parameters_hybrid, embeddings, input_dims, output_dims)