# Training classifiers based on BERT embeddings and other features
A version of this notebook was used to train some of the models presented in our paper. All models trained here are based either on the fine-tuned BERT embeddings alone (extracted as fixed vectors) or on those same embeddings enhanced with the speaker information. The trained models can be found in the GitHub repository.

In [None]:
# import libraries
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf

from tqdm import tqdm, trange
import pandas as pd
import io
import random
import numpy as np

In [None]:
# Load the fine-tuned BERT embeddings, one comma-separated file per split.
# The first row of each file is used as the column header.
df_embeds_train = pd.read_csv(
    "fine-tuned_bert_embeddings/fine-tuned_bert_embeds_on_queAndCtxAfter_trainset.csv",
    sep=",",
    header=0,
)
df_embeds_test = pd.read_csv(
    "fine-tuned_bert_embeddings/fine-tuned_bert_embeds_on_queAndCtxAfter_testset.csv",
    sep=",",
    header=0,
)

In [None]:
# Sanity check: preview the first five rows of the test embeddings to confirm
# that the file was parsed as intended.
df_embeds_test.head(n=5)

In [None]:
# Load the extra per-question features (here: the speaker-before and
# speaker-after information). These files are tab-separated with a header row.
train_df = pd.read_csv(
    "simple_features_annotations/rquet_trainset_simple_features.csv",
    sep="\t",
    header=0,
)
test_df = pd.read_csv(
    "simple_features_annotations/rquet_testset_simple_features.csv",
    sep="\t",
    header=0,
)

# Preview the test-split features.
test_df.head(n=5)

In [None]:
# Join the simple features (left) with the BERT embeddings (right) on the shared
# 'ID' column. An inner join is the pandas default and is spelled out here.
merged_df_train = pd.merge(left=train_df, right=df_embeds_train, how="inner", on="ID")
merged_df_test = pd.merge(left=test_df, right=df_embeds_test, how="inner", on="ID")

# Preview the merged test frame.
merged_df_test.head(n=5)


In [None]:
# Feature selection by column position in the merged dataframes.
#
# Column positions used in this notebook:
#   1       -> target label (read later when building Y_train / Y_test)
#   2       -> speaker-after feature
#   3       -> speaker-before feature
#   4..771  -> fine-tuned BERT embedding dimensions
#
# all_feats  : one speaker feature + the BERT embeddings. Set SPEAKER_COL to
#              SPEAKER_AFTER_COL or SPEAKER_BEFORE_COL to choose which one.
# bert_feats : the BERT embeddings only. This previously started at column 2 and
#              therefore silently included both speaker features; it now starts
#              at the first embedding column, matching the slice in all_feats.
SPEAKER_AFTER_COL = 2
SPEAKER_BEFORE_COL = 3
EMBED_START_COL = 4
EMBED_END_COL = 772  # exclusive

SPEAKER_COL = SPEAKER_AFTER_COL

all_feats = np.r_[SPEAKER_COL, EMBED_START_COL:EMBED_END_COL]
bert_feats = np.r_[EMBED_START_COL:EMBED_END_COL]


In [None]:
# Build the model inputs from the merged frames.
# Use bert_feats to train on the BERT embeddings only, or all_feats to train on
# the embeddings plus the speaker information. Column 1 is taken as the target.
train_values = merged_df_train.values
test_values = merged_df_test.values

X_train = train_values[:, all_feats].astype("float32")
Y_train = train_values[:, 1].astype("float32")
X_test = test_values[:, all_feats].astype("float32")
Y_test = test_values[:, 1].astype("float32")

# Report the shapes in the order: X_train, X_test, Y_train, Y_test.
for split_array in (X_train, X_test, Y_train, Y_test):
    print(split_array.shape)


In [None]:
# import classifiers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import metrics 
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier

In [None]:
#### Gaussian
# Gaussian naive Bayes: fit on the training split, predict the test split and
# report the test accuracy.
clf = GaussianNB()
clf.fit(X_train, Y_train)
y_pred_gaus_test = clf.predict(X_test)

gaus_test_accuracy = accuracy_score(Y_test, y_pred_gaus_test)
print("Accuracy:", gaus_test_accuracy)

In [None]:
### SVM
# Support vector classifier with an RBF kernel (C=10, gamma='scale'): fit on the
# training split, predict the test split and report the test accuracy.
svm_final = svm.SVC(kernel='rbf', C=10, gamma='scale')
svm_final.fit(X_train, Y_train)
y_pred_svm_test = svm_final.predict(X_test)

svm_test_accuracy = accuracy_score(Y_test, y_pred_svm_test)
print("Accuracy:", svm_test_accuracy)

In [None]:
### MLP
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Multi-layer perceptron with a single hidden layer of 7 units.
# random_state is fixed because weight initialisation and batch shuffling are
# random; without it every run of this cell reports a different accuracy.
model_mlp = MLPClassifier(
    hidden_layer_sizes=(7,),
    activation='relu',
    solver='adam',
    # NOTE(review): per the scikit-learn docs, `learning_rate` only takes effect
    # with solver='sgd'; kept unchanged so the configuration matches the original.
    learning_rate='adaptive',
    max_iter=1000,
    learning_rate_init=0.01,
    alpha=0.01,
    random_state=42,
)
model_mlp.fit(X_train, Y_train)
y_pred_mlp_test = model_mlp.predict(X_test)

print("Accuracy:", metrics.accuracy_score(Y_test, y_pred_mlp_test))

In [None]:
### decision tree
# Gini-criterion decision tree limited to depth 5.
# random_state is fixed so that the choice between equally good splits is the
# same on every run and the reported accuracy can be reproduced.
clf_gini = tree.DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
clf_gini.fit(X_train, Y_train)

y_pred_test = clf_gini.predict(X_test)

print("Accuracy:", metrics.accuracy_score(Y_test, y_pred_test))

In [None]:
### FF neural net
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.layers import Embedding, SimpleRNN
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional

# Take the input width from the training matrix itself, so the network always
# matches whichever feature list (all_feats or bert_feats) X_train was built
# from. No manual switching of in_shape is needed any more.
in_shape = X_train.shape[1]

# Fully connected binary classifier: six ReLU hidden layers of decreasing width,
# followed by a single sigmoid output unit.
model = models.Sequential()
model.add(layers.Dense(512, activation='relu', input_shape=(in_shape,)))
for hidden_units in (256, 128, 128, 64, 32):
    model.add(layers.Dense(hidden_units, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# The last 20% of the training data is held out by Keras for validation.
history = model.fit(
    X_train,
    Y_train,
    epochs=3,
    batch_size=32,
    validation_split=0.2,
)

# X_test / Y_test were already cast to float32 when they were built, so no
# further cast is needed here. evaluate() returns [loss, accuracy].
score = model.evaluate(X_test, Y_test, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

In [None]:
### LSTM

# Take the feature width and the sample counts from the arrays themselves. The
# original hard-coded 1588 training and 180 test samples, which made the reshape
# fail for any other dataset size, and in_shape had to be switched by hand to
# match the feature list used to build X_train.
in_shape = X_train.shape[1]

# The LSTM layers take 3-D input (samples, timesteps, features); each sample is
# presented as a sequence of length 1.
reshaped_X = X_train.reshape((X_train.shape[0], 1, in_shape))
reshaped_X_test = X_test.reshape((X_test.shape[0], 1, in_shape))

# Two stacked LSTM layers followed by a single sigmoid output unit.
rnn = Sequential()
rnn.add(layers.LSTM(200, return_sequences=True))  # pass the full sequence to the next LSTM
rnn.add(layers.LSTM(100))
rnn.add(Dense(1, activation='sigmoid'))
rnn.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# The last 20% of the training data is held out by Keras for validation.
history = rnn.fit(
    reshaped_X.astype("float32"),
    Y_train.astype("float32"),
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

# evaluate() returns [loss, accuracy].
score = rnn.evaluate(reshaped_X_test.astype("float32"), Y_test.astype("float32"), verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')