# Classifier without Active Learning

This file contains all results of the classifier without Active Learning. For this, execute all cells until the headline "Model training of the proposed model and baseline model" is reached. Then, execute one of the models: Either one of the models under "Model training of the proposed model and baseline models" or one of the models under "Experiments". Note: Make sure, that the desired target domain is chosen when loading the data below.

Make sure to adjust the checkpoint paths when training the models such that the weights are saved in the desired paths.

## Importing libraries and setting configurations

In [1]:
# imports


#from keras.models import Model
#from keras.layers import Input,Dense
from keras_self_attention import SeqSelfAttention



import os
import h5py
import numpy as np
import random as rn
import pickle as pkl
import tensorflow as tf
import pandas as pd


ModuleNotFoundError: No module named 'keras_self_attention'

In [None]:
# setting seeds in order to reproduce the results
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(1)
rn.seed(2)
tf.random.set_seed(3)

# configurations so we use a single thread
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

## Loading the split data

Make sure to load the data of the desired target domain here:

In [1610]:
# importing the data for the general sentence embeddings, here corresponding data from domain 0 was chosen
with open('data/sentence_embeddings/general/sorted/train/train_data2_2.p', 'rb') as f:
    X_train_gen = pkl.load(f)

with open('data/sentence_embeddings/general/sorted/train/train_labels2_2.p', 'rb') as f:
    y_train = pkl.load(f)
    
with open('data/sentence_embeddings/general/sorted/val_test/vt_data2_2.p', 'rb') as f:
    X_val_test_spec = pkl.load(f)

with open('data/sentence_embeddings/general/sorted/val_test/vt_labels2_2.p', 'rb') as f:
    y_val_test = pkl.load(f)

labels_total = np.hstack((y_train[:,:1400], y_val_test))
X_train_gen, X_val_gen, X_test_gen = X_train_gen[:1400], X_val_test_spec[:200], X_val_test_spec[200:]
y_train, y_val, y_test = y_train[0,:1400], y_val_test[0,:200], y_val_test[0,200:]

In [2122]:
labels_total.shape

(3, 6000)

In [2131]:
# set the target domain
index_spec = 5


# importing the data for the general sentence embeddings, here corresponding data from domain 0 was chosen
with open('data/sentence_embeddings/general/sorted/train/train_data5_8.p', 'rb') as f:
    X_train_gen = pkl.load(f)

with open('data/sentence_embeddings/general/sorted/train/train_labels5_8.p', 'rb') as f:
    y_train = pkl.load(f)
    
with open('data/sentence_embeddings/general/sorted/val_test/vt_data5_8.p', 'rb') as f:
    X_val_test_spec = pkl.load(f)

with open('data/sentence_embeddings/general/sorted/val_test/vt_labels5_8.p', 'rb') as f:
    y_val_test = pkl.load(f)


labels_total = np.hstack((y_train[:,:4200], y_val_test))
X_train_gen, X_val_gen, X_test_gen = X_train_gen[:4200], X_val_test_spec[:600], X_val_test_spec[600:]
y_train, y_val, y_test = y_train[0,:4200], y_val_test[0,:600], y_val_test[0,600:]

# import the data from the specific sentence embeddings, here corresponding data from domain 0 was chosen
with open('data/sentence_embeddings/specific/sentemb/sentemb_unlabeled5_8.p', 'rb') as f:
    X_spec = pkl.load(f)
    
#X_train_spec, X_val_spec, X_test_spec = X_spec[:1400], X_spec[1400:1600], X_spec[1600:2000] 

import numpy as np
X_spec=np.repeat(X_spec,repeats=3, axis=1)

X_train_spec, X_val_spec, X_test_spec = X_spec.transpose()[:4200], X_spec.transpose()[4200:4800], X_spec.transpose()[4800:]

FileNotFoundError: [Errno 2] No such file or directory: 'data/sentence_embeddings/general/sorted/train/train_data5_8.p'

## Model training of proposed model and baseline models

Choose one fo the three models below to execute.

### Proposed Model:

In [2102]:
# initialize the proposed classifier

INPUT_SIZE = 300
LATENT_SIZE = 300

# domain-general model parts
inp_gen = tf.keras.Input(shape=(1,INPUT_SIZE))
inp_gen_att, attn_weights_gen = SeqSelfAttention(return_attention = True)(inp_gen)
out_gen = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE)))(inp_gen_att)

# domain-specific model parts
inp_spec = tf.keras.Input(shape=(1,INPUT_SIZE))
inp_spec_att, attn_weights_spec = SeqSelfAttention(return_attention = True)(inp_spec)
out_spec = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE)))(inp_spec_att)

# concatenate domain-general and domain-specific results
merged = tf.keras.layers.Concatenate()([out_gen, out_spec])

# drop out layer and dense layer
merged = tf.keras.layers.Dropout(.5)(merged)
merged = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

classifier = tf.keras.Model([inp_gen,inp_spec], merged)
#classifier.summary()

In [2105]:
# initialize the proposed classifier

INPUT_SIZE = 300
LATENT_SIZE = 300

# domain-general model parts
inp_gen = tf.keras.Input(shape=(1,INPUT_SIZE))
#inp_gen_att, attn_weights_gen = SeqSelfAttention(return_attention = True)(inp_gen)
#out_gen = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE)))(inp_gen)

# domain-specific model parts
inp_spec = tf.keras.Input(shape=(1,INPUT_SIZE))
#inp_spec_att, attn_weights_spec = SeqSelfAttention(return_attention = True)(inp_spec)
#out_spec = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE)))(inp_spec)

# concatenate domain-general and domain-specific results
merged = tf.keras.layers.Concatenate()([inp_gen, inp_spec])
merged = tf.keras.layers.Dense(300, activation='sigmoid')(merged)
# drop out layer and dense layer
merged = tf.keras.layers.Dropout(.5)(merged)
merged = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

classifier4 = tf.keras.Model([inp_gen,inp_spec], merged)
#classifier.summary()|

In [875]:
# initialize the proposed classifier

INPUT_SIZE = 300
LATENT_SIZE = 300

# domain-general model parts
inp_gen = tf.keras.Input(shape=(1,INPUT_SIZE))
inp_gen_att, attn_weights_gen = SeqSelfAttention(return_attention = True)(inp_gen)
out_gen = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE), return_sequences=True))(inp_gen)
out_gen_att, attn_weights_gen = SeqSelfAttention(return_attention = True)(out_gen)

# domain-specific model parts
inp_spec = tf.keras.Input(shape=(1,INPUT_SIZE))
inp_spec_att, attn_weights_spec = SeqSelfAttention(return_attention = True)(inp_spec)
out_spec = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE), return_sequences=True))(inp_spec)
out_spec_att, attn_weights_spec= SeqSelfAttention(return_attention = True)(out_spec)

# concatenate domain-general and domain-specific results
merged = tf.keras.layers.Concatenate()([out_gen_att, out_spec_att])

# drop out layer and dense layer
merged = tf.keras.layers.Dropout(.5)(merged)
merged = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

classifier3 = tf.keras.Model([inp_gen,inp_spec], merged)
classifier3.summary()

Model: "model_137"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_291 (InputLayer)          [(None, 1, 300)]     0                                            
__________________________________________________________________________________________________
input_292 (InputLayer)          [(None, 1, 300)]     0                                            
__________________________________________________________________________________________________
bidirectional_287 (Bidirectiona (None, 1, 600)       1442400     input_291[0][0]                  
__________________________________________________________________________________________________
bidirectional_288 (Bidirectiona (None, 1, 600)       1442400     input_292[0][0]                  
__________________________________________________________________________________________

In [873]:
# initialize the proposed classifier

INPUT_SIZE = 300
LATENT_SIZE = 300

# domain-general model parts
inp_gen = tf.keras.Input(shape=(1,INPUT_SIZE))
#inp_gen_att, attn_weights_gen = SeqSelfAttention(return_attention = True)(inp_gen)
out_gen = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE), return_sequences=True))(inp_gen)
out_gen_att, attn_weights_gen = SeqSelfAttention(return_attention = True)(out_gen)

# domain-specific model parts
inp_spec = tf.keras.Input(shape=(1,INPUT_SIZE))
#inp_spec_att, attn_weights_spec = SeqSelfAttention(return_attention = True)(inp_spec)
out_spec = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE), return_sequences=True))(inp_spec)
out_spec_att, attn_weights_spec= SeqSelfAttention(return_attention = True)(out_spec)

# concatenate domain-general and domain-specific results
merged = tf.keras.layers.Concatenate()([out_gen_att, out_spec_att])

# drop out layer and dense layer
merged = tf.keras.layers.Dropout(.5)(merged)
merged = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

classifier2 = tf.keras.Model([inp_gen,inp_spec], merged)
#classifier4.summary()

Model: "model_136"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_289 (InputLayer)          [(None, 1, 300)]     0                                            
__________________________________________________________________________________________________
input_290 (InputLayer)          [(None, 1, 300)]     0                                            
__________________________________________________________________________________________________
bidirectional_285 (Bidirectiona (None, 1, 600)       1442400     input_289[0][0]                  
__________________________________________________________________________________________________
bidirectional_286 (Bidirectiona (None, 1, 600)       1442400     input_290[0][0]                  
__________________________________________________________________________________________

In [None]:
# visualisation of the model
plot_model(classifier, to_file='classifier_plot.png', show_shapes=True, show_layer_names=False)

In [2109]:
classifier4.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.01), metrics=['accuracy'])
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath="weights/classifier/classifier_without_al/certainty_sampling/classifier_domain_data4_4+.h5")
    #print(X_train_gen.shape, X_train_spec.shape)
history = classifier4.fit([np.expand_dims(np.asarray(X_train_gen).astype(np.float32), 1), np.expand_dims(np.asarray(X_train_spec).astype(np.float32), 1)], np.asarray(y_train).astype(np.float32), epochs=30, validation_data = ([np.expand_dims(np.asarray(X_val_gen).astype(np.float32), 1), np.expand_dims(np.asarray(X_val_spec).astype(np.float32), 1)], np.asarray(y_val).astype(np.float32)), callbacks = [checkpoint, es], batch_size=32)

# evaluating the model
score = classifier4.evaluate([np.expand_dims(np.asarray(X_test_gen).astype(np.float32), 1), np.expand_dims(np.asarray(X_test_spec).astype(np.float32), 1)], np.asarray(y_test).astype(np.float32), verbose=0) 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30


In [495]:
X_test_gen.shape

(0, 300)

### Baseline classifier (using specific embeddings):

In [None]:
# initialize the baseline classifier

INPUT_SIZE = 300
LATENT_SIZE = 300

classifier_baseline = tf.keras.Sequential()
classifier_baseline.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(INPUT_SIZE, input_shape=(None,1,LATENT_SIZE))))
classifier_baseline.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
classifier_baseline.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath="weights/classifier/classifier_without_al/baseline_specific/classifier_domain_0.h5")
history = classifier_baseline.fit(np.expand_dims(X_train_spec, 1), y_train, epochs=100,validation_data = (np.expand_dims(X_val_spec, 1), y_val),callbacks = [checkpoint,es],batch_size=32)


# evaluating the model
score = classifier_baseline.evaluate(np.expand_dims(X_test_spec, 1), y_test, verbose=0) 
print('Final accuracy score: '+str(score[1]))                                                    

### Baseline classifier (using general embeddings):

In [None]:
# initialize the baseline classifier

INPUT_SIZE = 300
LATENT_SIZE = 300

classifier_baseline = Sequential()
classifier_baseline.add(Bidirectional(LSTM(INPUT_SIZE, input_shape=(None,1,LATENT_SIZE))))
classifier_baseline.add(Dense(1, activation='sigmoid'))

In [None]:
# load the original, unsorted data
with open('data/sentence_embeddings/general/unsorted/sentemb/sentemb.p', 'rb') as f:
    data_general = pkl.load(f)

with open('data/sentence_embeddings/general/unsorted/label_domain/label_domain_train_sentemb.p', 'rb') as f:
    temp_train = pkl.load(f)

with open('data/sentence_embeddings/general/unsorted/label_domain/label_domain_test_sentemb.p', 'rb') as f:
    temp_test = pkl.load(f)
    
labels_general = np.hstack((temp_train, temp_test))

In [None]:
# get the indices to reverse the shuffling that was done before the autoencoder
# this is necessary to keep the sample order the exact same to the other baseline
idx_train = np.random.RandomState(seed=42).permutation(25358)
idx_train_new = np.argsort(idx_train)

idx_test = np.random.RandomState(seed=43).permutation(6395)
idx_test_new = np.argsort(idx_test)

In [None]:
# reverse the shuffling that was done to the general embeddings before the autoencoder and shuffle them the same way as the specific embeddings
# this is necessary in order to sustain the order
data_train, data_test = data_general[:temp_train.shape[1]], data_general[temp_train.shape[1]:]
temp_train, temp_test, data_train, data_test = temp_train[:,idx_train_new], temp_test[:,idx_test_new], data_train[idx_train_new], data_test[idx_test_new]
temp_train, temp_test,data_train, data_test = temp_train[:,idx_train], temp_test[:,idx_test], data_train[idx_train], data_test[idx_test]

index_to_keep = [index for index, value in enumerate(temp_train[1]) if int(value) == index_spec]
temp_train, data_train = temp_train[:, index_to_keep], data_train[index_to_keep]

index_to_keep = [index for index, value in enumerate(temp_test[1]) if int(value) == index_spec]
temp_test, data_test = temp_test[:, index_to_keep], data_test[index_to_keep]

In [None]:
# data splitting
X_spec = np.concatenate([data_train,data_test])
X_train_spec, X_val_spec, X_test_spec = X_spec[:1400], X_spec[1400:1600], X_spec[1600:] 

In [None]:
classifier_baseline.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', patience=3)
checkpoint = ModelCheckpoint(filepath="weights/classifier/classifier_without_al/baseline_general/classifier_domain_0.h5")
history = classifier_baseline.fit(np.expand_dims(X_train_spec, 1), y_train, epochs=50,validation_data = (np.expand_dims(X_val_spec, 1), y_val),callbacks = [checkpoint,es],batch_size=32)


# evaluating the model
score = classifier_baseline.evaluate(np.expand_dims(X_test_spec, 1), y_test, verbose=0) 
print('Final accuracy score: '+str(score[1]))                                                    

## Experiments

Now, choose one of the three classifiers to train (Model using general embeddings instead of specific embeddings, Model using BERT embeddings, Model choosing general embeddings out of the most similar domains to the target domain):

### Model using general embeddings instead of specific embeddings:

In [None]:
# initialize the proposed classifier

INPUT_SIZE = 300
LATENT_SIZE = 300

# domain-general model parts
inp_gen = Input(shape=(1,INPUT_SIZE))
inp_gen_att, attn_weights_gen = SeqSelfAttention(return_attention = True)(inp_gen)
out_gen = Bidirectional(LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE)))(inp_gen_att)

# domain-specific model parts
inp_spec = Input(shape=(1,INPUT_SIZE))
inp_spec_att, attn_weights_spec = SeqSelfAttention(return_attention = True)(inp_spec)
out_spec = Bidirectional(LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE)))(inp_spec_att)

# concatenate domain-general and domain-specific results
merged = Concatenate()([out_gen, out_spec])

# drop out layer and dense layer
merged = Dropout(.5)(merged)
merged = Dense(1, activation='sigmoid')(merged)

classifier = Model([inp_gen,inp_spec], merged)
#classifier.summary()

In [None]:
# load the original, unsorted data
with open('data/sentence_embeddings/general/unsorted/sentemb/sentemb.p', 'rb') as f:
    data_general = pkl.load(f)

with open('data/sentence_embeddings/general/unsorted/label_domain/label_domain_train_sentemb.p', 'rb') as f:
    temp_train = pkl.load(f)

with open('data/sentence_embeddings/general/unsorted/label_domain/label_domain_test_sentemb.p', 'rb') as f:
    temp_test = pkl.load(f)
    
labels_general = np.hstack((temp_train, temp_test))

In [None]:
# get the indices to reverse the shuffling that was done to the embeddings before the autoencoder
# this is necessary in order to ensure that the order of the samples of the new input 
# is equivalent to the order of the sentence embeddings of the original model (specific sentence embeddings)
idx_train = np.random.RandomState(seed=42).permutation(25358)
idx_train_new = np.argsort(idx_train)

idx_test = np.random.RandomState(seed=43).permutation(6395)
idx_test_new = np.argsort(idx_test)

In [None]:
# reverse the shuffling that was done to the general embeddings before the autoencoder and shuffle them the same way as the specific embeddings
# this is necessary in order to sustain the original order
data_train, data_test = data_general[:temp_train.shape[1]], data_general[temp_train.shape[1]:]
temp_train, temp_test, data_train, data_test = temp_train[:,idx_train_new], temp_test[:,idx_test_new], data_train[idx_train_new], data_test[idx_test_new]
temp_train, temp_test,data_train, data_test = temp_train[:,idx_train], temp_test[:,idx_test], data_train[idx_train], data_test[idx_test]

index_to_keep = [index for index, value in enumerate(temp_train[1]) if int(value) == index_spec]
temp_train, data_train = temp_train[:, index_to_keep], data_train[index_to_keep]

index_to_keep = [index for index, value in enumerate(temp_test[1]) if int(value) == index_spec]
temp_test, data_test = temp_test[:, index_to_keep], data_test[index_to_keep]

In [None]:
# data splitting
X_spec = np.concatenate([data_train,data_test])
X_train_spec, X_val_spec, X_test_spec = X_spec[:1400], X_spec[1400:1600], X_spec[1600:] 

In [None]:
# training the model
classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', patience=3)
checkpoint = ModelCheckpoint(filepath="weights/classifier/classifier_without_al/only_general_embeddings/classifier_domain_0.h5")
history = classifier.fit([np.expand_dims(X_train_gen, 1), np.expand_dims(X_train_spec, 1)], y_train, epochs=50, validation_data = ([np.expand_dims(X_val_gen, 1), np.expand_dims(X_val_spec, 1)], y_val), callbacks = [checkpoint, es], batch_size=32)

# evaluating the model
score = classifier.evaluate([np.expand_dims(X_test_gen, 1), np.expand_dims(X_test_spec, 1)], y_test, verbose=0) 
print('Final accuracy score: '+str(score[1]))

### Model using BERT embeddings:

In [None]:
# model for BERT embeddings

INPUT_SIZE_BERT = 768
INPUT_SIZE_SPEC = 300
LATENT_SIZE = 300

# domain-general model parts
input_gen = Input(shape=(1,INPUT_SIZE_BERT))
input_att_gen, attn_weights_gen = SeqSelfAttention(return_attention = True)(input_gen)
output_gen = Bidirectional(LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE_BERT)))(input_att_gen)

# domain-specific model parts
input_spec = Input(shape=(1,INPUT_SIZE_SPEC))
input_att_spec, attn_weights_spec = SeqSelfAttention(return_attention = True)(input_spec)
output_spec = Bidirectional(LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE_SPEC)))(input_att_spec)

# concatenate domain-general and domain-specific results
merged = Concatenate()([output_gen, output_spec])

# drop out layer and dense layer
merged = Dropout(.5)(merged)
merged = Dense(1, activation='sigmoid')(merged)

classifier_bert_embeddings = Model([input_gen,input_spec], merged)
#classifier_bert_embeddings.summary()

In [None]:
# load and prepare the data
df_train, df_test = load_data('data/bert_embeddings/train/*'), load_data('data/bert_embeddings/test/*')
df_train['domain'], df_test['domain'] = [re.sub('.task.train.pkl$', '', word) for word in np.array(df_train['domain'])], [re.sub('.task.test.pkl$', '', word) for word in np.array(df_test['domain'])]

# create a dictionary that numerically encodes the domain
dict_domain = dict(zip(sorted(set(np.array(df_train['domain']))), np.arange(16)))

# divide the data in embeddings and encodings of label/domain
X_train, label_domain_train = divide_data(df_train, dict_domain)
X_test, label_domain_test = divide_data(df_test, dict_domain)

In [None]:
# shuffle the data (same shuffling as sentence embeddings generated by the autoencoder)
idx_train = np.random.RandomState(seed=42).permutation(X_train.shape[0])
X_train,label_domain_train = X_train[idx_train], label_domain_train[:,idx_train]

idx_test = np.random.RandomState(seed=43).permutation(X_test.shape[0])
X_test,label_domain_test = X_test[idx_test], label_domain_test[:,idx_test]

labels_general, data_general = np.hstack((label_domain_train, label_domain_test)), np.hstack((X_train, X_test))

In [None]:
# get indices for sorting the array
ind = sort_array(labels_general, labels_total)

# sort general sentence embeddings
data_general, labels_general = data_general[ind], labels_general[:, ind]

In [None]:
# getting the indices of all samples that aren't nan in order to eliminate them in both general and specific embeddings to sustain the input pairs
indices_to_remove = []
for i in range(data_general.shape[0]):
    if np.isnan(data_general[i]).any():
        indices_to_remove.append(i)
        
indices_to_keep = list(set(np.arange(0,data_general.shape[0])) - set(indices_to_remove))

In [None]:
# split the data
data_general, labels_general, X_spec = data_general[indices_to_keep], labels_general[:, indices_to_keep], X_spec[indices_to_keep]
    
X_train_spec, X_val_spec, X_test_spec = X_spec[:1400], X_spec[1400:1600], X_spec[1600:] 
y_train, y_valid, y_test = labels_general[0, :1400], labels_general[0, 1400:1600], labels_general[0, 1600:]
X_train_gen, X_val_gen, X_test_gen = data_general[:1400], data_general[1400:1600], data_general[1600:]
X_train_gen, X_val_gen, X_test_gen = np.vstack(X_train_gen), np.vstack(X_val_gen), np.vstack(X_test_gen)

In [None]:
# training the model
classifier_bert_embeddings.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', patience=3)
checkpoint = ModelCheckpoint(filepath="weights/classifier/classifier_without_al/bert_embedding_usage/classifier_domain_0.h5")
history = classifier_bert_embeddings.fit([np.expand_dims(X_train_gen, 1), np.expand_dims(X_train_spec, 1)], y_train, epochs=50, validation_data = ([np.expand_dims(X_val_gen, 1), np.expand_dims(X_val_spec, 1)], y_val), callbacks = [checkpoint, es], batch_size=32)

# evaluating the model
score = classifier_bert_embeddings.evaluate([np.expand_dims(X_test_gen, 1), np.expand_dims(X_test_spec, 1)], y_test, verbose=0) 
print('Final accuracy score: '+str(score[1]))

### Model choosing general embeddings out of the most similar domains to the target domain:

In [746]:
# initialize the proposed classifier

INPUT_SIZE = 300
LATENT_SIZE = 300

# domain-general model parts
inp_gen = Input(shape=(1,INPUT_SIZE))
inp_gen_att, attn_weights_gen = SeqSelfAttention(return_attention = True)(inp_gen)
out_gen = Bidirectional(LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE)))(inp_gen_att)

# domain-specific model parts
inp_spec = Input(shape=(1,INPUT_SIZE))
inp_spec_att, attn_weights_spec = SeqSelfAttention(return_attention = True)(inp_spec)
out_spec = Bidirectional(LSTM(LATENT_SIZE, input_shape=(None,1,INPUT_SIZE)))(inp_spec_att)

# concatenate domain-general and domain-specific results
merged = Concatenate()([out_gen, out_spec])

# drop out layer and dense layer
merged = Dropout(.5)(merged)
merged = Dense(1, activation='sigmoid')(merged)

classifier = Model([inp_gen,inp_spec], merged)
#classifier.summary()

NameError: name 'Input' is not defined

In [2123]:
# load the original, unsorted data
with open('data/sentence_embeddings/general/unsorted/sentemb/sentemb_unlabeled3.p', 'rb') as f:
    data_general = pkl.load(f)

with open('data/sentence_embeddings/general/unsorted/label_domain/label_domain_train_sentemb_unlabeled3.p', 'rb') as f:
    labels_train = pkl.load(f)
    
with open('data/sentence_embeddings/general/unsorted/label_domain/label_domain_test_sentemb_unlabeled3.p', 'rb') as f:
    labels_test = pkl.load(f)
    
labels_general = np.hstack((labels_train, labels_test))

data_general = data_general.transpose()

# load the cleaned data
with open('data/cleaned_data/merged_cleaned.p', 'rb') as f:
    df_train = pkl.load(f)
with open('data/cleaned_data/test_cleaned.p', 'rb') as f:
    df_test = pkl.load(f)

# create a list of data frames dfs, each data frame represents one domain
#df = pd.concat([df_train, df_test],ignore_index=True)
#dfs = [x for _, x in df.groupby('domain')]

list_unlabel = df_train.index[df_train['label'] == 3].to_list()

df_train = df_train[~df_train.index.isin(list_unlabel)].reset_index(drop=True)



# create a list of data frames dfs, each data frame represents one domain
df = pd.concat([df_train, df_test],ignore_index=True)
dfs = [x for _, x in df.groupby('domain')]

# get the word distribution of each domain
# the frequency of each existing word is computed in every domain
import collections
import regex as re
word_counter = []
for df in dfs:
    counts = collections.Counter()
    words = re.compile(r'\w+')
    reviews = np.array([s for s in df['text']])
    for review in reviews:
        counts.update(words.findall(review.lower()))
    word_counter.append(counts)

# the rows of df are the 16 domains, the columns are all existing words
# the number of the cells of df is the word frequency for the word in the domain
df_dist = pd.DataFrame(word_counter)
df_dist = df_dist.fillna(0)

In [2130]:
import re
import os
import glob
import numpy as np
import pandas as pd
import random as rn
import pickle as pkl
from collections import Counter
import matplotlib.pyplot as plt
from scipy.spatial import distance


# get list js_d of jensen_shannon distances to the target domain
js_d = []
for i in range(df_dist.shape[0]):
    d = distance.jensenshannon(np.array(df_dist.iloc[index_spec]), np.array(df_dist.iloc[i]))
    js_d.append(d)
    
# take 5 most similiar distributions
# most_sim_dist is a list of 5 elements with the 5 closest domains to the target domain
most_sim_dist = sorted(range(len(js_d)), key=lambda i: js_d[i], reverse=True)[-6:]
most_sim_dist.remove(index_spec)
#print(labels_general.shape)
# remove general embeddings that aren't from these 5 domains
index_to_keep = [index for index, value in enumerate(labels_general[1]) if int(value) in most_sim_dist]
labels_general, data_general = labels_general[:, index_to_keep], data_general[index_to_keep]
#print(labels_general.shape)
# function for sorting two arrays such that both arrays have the same labels
# returns indeces_sorted which consists of indices and is used for sorting array_to_sort
def sort_array(array_to_sort, array_ref):
    
    y, y_ref = array_to_sort[0].astype(int), array_ref[0].astype(int)
    indeces_zeros, indeces_ones = [], []

    # get indices when array_to_sort is 0 (indeces_zeros) and when it is 1 (indeces_ones)
    for i in np.arange(y.shape[0]):
        if y[i] == 0:
            indeces_zeros.append(i)
        else:
            indeces_ones.append(i)

    indeces_sorted = np.zeros(y_ref.shape[0])
    cnt_zeros, cnt_ones = 0,0
    
    # get sorted indeces
    # pair the first positive (/negative) instance of both arrays, etc. 
    for i in np.arange(y_ref.shape[0]):
        if y_ref[i] == 0:
            indeces_sorted[i] = indeces_zeros[cnt_zeros]
            cnt_zeros += 1
        else:
            indeces_sorted[i] = indeces_ones[cnt_ones]
            cnt_ones += 1
    
    return indeces_sorted.astype(int)
#print(labels_general.shape, labels_total.shape)
# get indices for sorting the array
print(labels_general, labels_total)
ind = sort_array(labels_general, labels_total)

# sort general sentence embeddings
data_general, labels_general = data_general[ind], labels_general[:, ind]

# data splitting
X_train_gen, X_val_gen, X_test_gen = data_general[:4200], data_general[4200:4800], data_general[4800:]



[[   0    0    0 ...    0    0    0]
 [  14   14    7 ...    9   13    2]
 [1246  852  405 ...  552 1349 1230]] [[0 0 0 ... 0 0 0]
 [4 10 15 ... 11 11 11]
 [96 916 1208 ... 352 1303 490]]


In [1560]:
import keras_tuner as kt

In [2088]:
 class MyHyperModel(kt.HyperModel):
    def build(self, hp):
        INPUT_SIZE = 300
        LATENT_SIZE = 300
   # hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    
        hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
        hp_units1 = hp.Int('units1', min_value=100, max_value=500, step=100)
       # hp_units2 = hp.Int('units2', min_value=60, max_value=130, step=10)
# domain-general model parts
        inp_gen = tf.keras.Input(shape=(1,INPUT_SIZE))
      #  inp_gen_att, attn_weights_gen = SeqSelfAttention(return_attention = True)(inp_gen)
        #out_gen = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hp_units1, input_shape=(None,1,INPUT_SIZE)))(inp_gen)
        #out_gen = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hp_units2, input_shape=(None,1,INPUT_SIZE)))(out_gen1)
# domain-specific model parts
        inp_spec = tf.keras.Input(shape=(1,INPUT_SIZE))
       # inp_spec_att, attn_weights_spec = SeqSelfAttention(return_attention = True)(inp_spec)
        #out_spec = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(300, input_shape=(None,1,INPUT_SIZE)))(inp_spec)
       # out_spec = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(120, input_shape=(None,1,INPUT_SIZE)))(out_spec1)
# concatenate domain-general and domain-specific results
        merged = tf.keras.layers.Concatenate()([inp_gen, inp_spec])
        merged = tf.keras.layers.Dense(hp_units1, activation='sigmoid')(merged)
# drop out layer and dense layer
        merged = tf.keras.layers.Dropout(hp.Float('dropout', 0, 0.5, step=0.1, default=0.5))(merged)
        merged = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

        classifier = tf.keras.Model([inp_gen,inp_spec], merged)
        classifier.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate), metrics=['accuracy'])
#classifier.summary()
        return classifier
        

    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            batch_size=hp.Choice("batch_size", values=[32, 64]),
            epochs= hp.Int('epochs', min_value=20, max_value=70, step=10),
            **kwargs,
        )

In [2089]:
tuner2=kt.BayesianOptimization(
    MyHyperModel(),
    objective="val_accuracy",
    max_trials=50,
    overwrite=True,
    num_initial_points=25,
    alpha=0.001,
    beta=2.6
    
)

In [2090]:
#NUM_EPOCHS = 20
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
#checkpoint = ModelCheckpoint(filepath="weights/classifier/classifier_without_al/standard_model/classifier_domain_1.h5")

tuner2.search([np.expand_dims(X_train_gen, 1), np.expand_dims(X_train_spec, 1)], y_train, validation_data = ([np.expand_dims(X_val_gen, 1), np.expand_dims(X_val_spec, 1)], y_val), callbacks = [es])

Trial 42 Complete [00h 00m 07s]
val_accuracy: 0.8833333253860474

Best val_accuracy So Far: 0.8899999856948853
Total elapsed time: 00h 08m 46s

Search: Running Trial #43

Value             |Best Value So Far |Hyperparameter
0.01              |0.01              |learning_rate
100               |300               |units1
0.5               |0.3               |dropout
64                |32                |batch_size
70                |40                |epochs

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70

KeyboardInterrupt: 

In [None]:
# Get the optimal hyperparameters
best_hps=tuner2.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal batch_size
layer is {best_hps.get('batch_size')}, the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}, the optimal dropout rate is {best_hps.get('dropout')}, the optimal number of epochs is {best_hps.get('epochs')} the optimal number of units1 is {best_hps.get('units1')} and th.
""")

In [2133]:


#hp_units1 = hp.Int('units', min_value=200, max_value=350, step=10)
#hp_units2 = hp.Int('units', min_value=50, max_value=180, step=10)
# domain-general model parts
inp_gen = tf.keras.Input(shape=(1,INPUT_SIZE))
#inp_gen_att, attn_weights_gen = SeqSelfAttention(return_attention = True)(inp_gen)
#out_gen = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(300, input_shape=(None,1,INPUT_SIZE)))(inp_gen)
#out_gen = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(120, input_shape=(None,1,INPUT_SIZE)))(out_gen1)


# domain-specific model parts
inp_spec = tf.keras.Input(shape=(1,INPUT_SIZE))
#inp_spec_att, attn_weights_spec = SeqSelfAttention(return_attention = True)(inp_spec)
#out_spec = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(300, input_shape=(None,1,INPUT_SIZE)))(inp_spec)
#out_spec = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(120, input_shape=(None,1,INPUT_SIZE)))(out_spec1)


# concatenate domain-general and domain-specific results
merged = tf.keras.layers.Concatenate()([inp_gen, inp_spec])
#merged = tf.keras.layers.AveragePooling1D()([out_gen, out_spec])
merged = tf.keras.layers.Dense(100, activation='sigmoid')(merged)
# drop out layer and dense layer
#merged = tf.keras.layers.Dense(300, activation='sigmoid')(merged)
merged = tf.keras.layers.Dropout(0.3)(merged)
merged = tf.keras.layers.Dense(1, activation='sigmoid')(merged)

classifier = tf.keras.Model([inp_gen,inp_spec], merged)
classifier.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.001), metrics=['accuracy'])

In [2134]:
# training the model
print(X_train_gen.shape, X_train_spec.shape, y_train.shape)
classifier.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.001), metrics=['accuracy'])
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath="weights/classifier/classifier_without_al/standard_model/classifier_domain_3_15.h5")
history = classifier.fit([np.expand_dims(X_train_gen, 1), np.expand_dims(X_train_spec, 1)], y_train, epochs=30, validation_data = ([np.expand_dims(X_val_gen, 1), np.expand_dims(X_val_spec, 1)], y_val), callbacks = [checkpoint, es], batch_size=64)

# evaluating the model
score = classifier.evaluate([np.expand_dims(X_test_gen, 1), np.expand_dims(X_test_spec, 1)], y_test, verbose=1) 
print('Final accuracy score: '+str(score[1]))

(4200, 300) (4200, 300) (4200,)


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).