In [1]:
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold

import numpy as np
import csv
import re
import pickle
import time
from datetime import timedelta
import pandas as pd
from pathlib import Path
from sklearn import svm

import sys
# add parent directory to the path as well, if running from the finetune folder
parent_dir = os.path.dirname(os.getcwd())
sys.path.insert(0, parent_dir)
sys.path.insert(0, os.getcwd())

import utils.gen_utils as utils

In [2]:
def get_inputs(inp_dir, dataset, embed, embed_mode, mode, layer):
    """Load pickled language-model features and collapse their layer axis.

    The file ``<inp_dir><dataset>-<embed>-<embed_mode>-<mode>.pkl`` must
    unpickle to a sequence of ``(author_ids, batch_x, batch_y)`` triples.
    Every ``batch_x`` is a 3-D array whose first axis is contracted with a
    weight vector (``einsum("k,kij->ij")``); the resulting rows of all batches
    are stacked into one matrix.

    Args:
        inp_dir: Directory prefix. It is concatenated verbatim with the file
            name, so it has to end with a path separator.
        dataset: First file-name component.
        embed: Second file-name component.
        embed_mode: Third file-name component.
        mode: Fourth file-name component (may be an empty string).
        layer: ``"all"`` to weight every layer equally (a plain average), or
            the 1-based index of the single layer to keep.

    Returns:
        Tuple ``(inputs, full_targets)`` of NumPy arrays: the stacked feature
        rows and the stacked target rows, in batch order.
    """
    path = inp_dir + dataset + "-" + embed + "-" + embed_mode + "-" + mode + ".pkl"

    # NOTE: pickle.load can execute arbitrary code -- only open trusted files.
    # The context manager guarantees the handle is closed even if loading fails.
    with open(path, "rb") as file:
        data = pickle.load(file)
    _author_ids, data_x, data_y = list(zip(*data))

    # The number of layers is read from the data itself rather than from a
    # notebook-level global, so the function works on a fresh kernel and can
    # never disagree with the stored arrays.
    n_layers = np.shape(data_x[0])[0]

    # alphaW decides which layer embedding(s) contribute to the output
    if layer == "all":
        alphaW = np.full([n_layers], 1 / n_layers)
    else:
        alphaW = np.zeros([n_layers])
        alphaW[int(layer) - 1] = 1

    # flatten the mini-batch structure while applying the layer weights
    inputs = []
    targets = []
    for batch_x, batch_y in zip(data_x, data_y):
        inputs.extend(np.einsum("k,kij->ij", alphaW, batch_x))
        targets.extend(batch_y)

    return np.array(inputs), np.array(targets)

# SVM Classification

In [3]:
def SVMclassification(X_train, X_test, y_train, y_test, file_name):
    """Fit an SVM on the training split and score it on the test split.

    ``file_name`` is accepted but currently unused; it is the base name under
    which the fitted model could be persisted (see the comment below).

    Returns:
        The value of ``SVC.score`` on ``(X_test, y_test)``.
    """
    model = svm.SVC(gamma="scale")
    model.fit(X_train, y_train)
    # To persist the model: joblib.dump(model, file_name + '.joblib')
    return model.score(X_test, y_test)


def SVMtraining(dataset, inputs, full_targets):
    """Evaluate an SVM for each trait with 10-fold stratified cross-validation.

    Args:
        dataset: Dataset name; only used to build the per-fold model name that
            is handed to ``SVMclassification``.
        inputs: Feature array, indexed by sample along its first axis.
        full_targets: 2-D label array; column ``t`` holds the labels of trait
            ``t``. Only four trait labels are defined, so more than four
            columns raise ``IndexError``.

    Returns:
        ``pandas.DataFrame`` with columns ``acc`` (accuracy in percent),
        ``trait`` and ``fold`` (1-based), one row per (trait, fold) pair.

    Note:
        The model name also uses the notebook-level variable ``embed``, which
        therefore has to be defined before this function is called.
    """
    trait_labels = ["E", "N", "F", "J"]
    n_splits = 10
    expdata = {"acc": [], "trait": [], "fold": []}

    for trait_idx in range(full_targets.shape[1]):
        # labels of the current trait only (one column of the target matrix)
        targets = full_targets[:, trait_idx]

        expdata["trait"].extend([trait_labels[trait_idx]] * n_splits)
        expdata["fold"].extend(np.arange(1, n_splits + 1))

        # shuffle=False keeps the folds identical from run to run
        skf = StratifiedKFold(n_splits=n_splits, shuffle=False)
        for k, (train_index, test_index) in enumerate(skf.split(inputs, targets)):
            acc = SVMclassification(
                inputs[train_index],
                inputs[test_index],
                targets[train_index],
                targets[test_index],
                "SVM-" + dataset + "-" + embed + "-" + str(k) + "_t" + str(trait_idx),
            )
            expdata["acc"].append(100 * acc)

    return pd.DataFrame.from_dict(expdata)

# MLP Model

In [4]:
def MLPtraining(dataset, inputs, full_targets):
    """Evaluate a small MLP for each trait with 10-fold stratified cross-validation.

    For every trait column and every fold a fresh network (one hidden layer of
    50 ReLU units followed by a 2-unit logit layer) is trained for 20 epochs,
    with the held-out fold passed as validation data.

    Args:
        dataset: Dataset name; not used by the current implementation.
        inputs: 2-D feature array of shape ``(n_samples, n_features)``.
        full_targets: 2-D label array; column ``t`` holds the binary labels of
            trait ``t``. Only four trait labels are defined, so more than four
            columns raise ``IndexError``.

    Returns:
        ``pandas.DataFrame`` with columns ``acc`` (accuracy in percent),
        ``trait`` and ``fold`` (1-based), one row per (trait, fold) pair.
        The collected results are also printed before being returned.
    """
    trait_labels = ["E", "N", "F", "J"]

    n_classes = 2
    lr = 3e-4
    epochs = 20
    batch_size = 2
    n_splits = 10

    # Take the input width from the data instead of a notebook-level global so
    # the network always matches the features it is trained on.
    n_features = inputs.shape[1]

    expdata = {"acc": [], "trait": [], "fold": []}

    for trait_idx in range(full_targets.shape[1]):
        # labels of the current trait only (one column of the target matrix)
        targets = full_targets[:, trait_idx]

        expdata["trait"].extend([trait_labels[trait_idx]] * n_splits)
        expdata["fold"].extend(np.arange(1, n_splits + 1))

        # shuffle=False keeps the folds identical from run to run
        skf = StratifiedKFold(n_splits=n_splits, shuffle=False)
        for train_index, test_index in skf.split(inputs, targets):
            x_train, x_test = inputs[train_index], inputs[test_index]
            # convert the labels of this fold to one-hot encoding
            y_train = tf.keras.utils.to_categorical(
                targets[train_index], num_classes=n_classes
            )
            y_test = tf.keras.utils.to_categorical(
                targets[test_index], num_classes=n_classes
            )

            # define the neural network architecture
            model = tf.keras.models.Sequential()
            model.add(
                tf.keras.layers.Dense(50, input_dim=n_features, activation="relu")
            )
            model.add(tf.keras.layers.Dense(n_classes))

            model.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=["mse", "accuracy"],
            )

            history = model.fit(
                x_train,
                y_train,
                epochs=epochs,
                batch_size=batch_size,
                validation_data=(x_test, y_test),
                verbose=0,
            )

            # NOTE(review): the reported score is the best epoch on the same
            # fold that is used for evaluation, so it is an optimistic estimate.
            expdata["acc"].append(100 * max(history.history["val_accuracy"]))
    print(expdata)
    return pd.DataFrame.from_dict(expdata)


In [5]:
# Experiment configuration: the first five values select the feature file
# "<inp_dir><dataset>-<embed>-<embed_mode>-<mode>.pkl" read by get_inputs().
inp_dir = "pk1_data/"
dataset = "reddit"
embed = "albert-base"
embed_mode = "mean"
mode = ""
# NOTE(review): `network` and `MODEL_INPUT` are not read by any code in this
# notebook section -- confirm whether they are still needed.
network = "SVM"
MODEL_INPUT = "LM_features"
seed = 1337
# "all" averages every layer; otherwise the 1-based index of a single layer
layer = "all"
print("{} : {} : {} : {} : {}".format(dataset, embed, layer, mode, embed_mode))

# seed both NumPy and TensorFlow so that repeated runs are comparable
np.random.seed(seed)
tf.random.set_seed(seed)

# n_hl is the length of the layer-weight vector built in get_inputs();
# hidden_dim is the input width of the MLP.
if re.search(r"base", embed):
    n_hl = 12
    hidden_dim = 768
elif re.search(r"large", embed):
    n_hl = 24
    hidden_dim = 1024
else:
    # Fail here instead of with a NameError deep inside a later cell.
    raise ValueError(
        "Cannot derive n_hl/hidden_dim from embed={!r}; "
        "expected a name containing 'base' or 'large'".format(embed)
    )


reddit : albert-base : all :  : mean


In [6]:
def testSVM():
    """Load the configured features, run the SVM evaluation and time it.

    Reads the notebook-level settings ``inp_dir``, ``dataset``, ``embed``,
    ``embed_mode``, ``mode`` and ``layer``; prints the elapsed wall-clock time
    in whole seconds and returns the DataFrame produced by ``SVMtraining``.
    """
    start = time.time()
    results = SVMtraining(
        dataset, *get_inputs(inp_dir, dataset, embed, embed_mode, mode, layer)
    )
    end = time.time()
    print(f"Took: {int(end - start)} seconds")
    return results

In [7]:
def testMLP():
    """Load the configured features, run the MLP evaluation and time it.

    Reads the notebook-level settings ``inp_dir``, ``dataset``, ``embed``,
    ``embed_mode``, ``mode`` and ``layer``; prints the elapsed wall-clock time
    in whole seconds and returns the DataFrame produced by ``MLPtraining``.
    """
    start = time.time()
    results = MLPtraining(
        dataset, *get_inputs(inp_dir, dataset, embed, embed_mode, mode, layer)
    )
    end = time.time()
    print(f"Took: {int(end - start)} seconds")
    return results

In [8]:
# SVM results: one accuracy row per (trait, fold)
df1 = testSVM()

Took: 5 seconds


In [9]:
# MLP results: one accuracy row per (trait, fold)
df2 = testMLP()

{'acc': [72.00000286102295, 73.46938848495483, 73.46938848495483, 75.51020383834839, 75.51020383834839, 75.51020383834839, 73.46938848495483, 73.46938848495483, 73.46938848495483, 73.46938848495483, 93.99999976158142, 95.91836929321289, 95.91836929321289, 93.87755393981934, 93.87755393981934, 93.87755393981934, 93.87755393981934, 93.87755393981934, 93.87755393981934, 93.87755393981934, 54.00000214576721, 57.14285969734192, 59.183675050735474, 57.14285969734192, 57.14285969734192, 67.34693646430969, 73.46938848495483, 53.06122303009033, 57.14285969734192, 55.10203838348389, 68.00000071525574, 65.30612111091614, 63.26530575752258, 63.26530575752258, 67.34693646430969, 69.38775777816772, 65.30612111091614, 65.30612111091614, 61.22449040412903, 63.26530575752258], 'trait': ['E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'J', 'J', 'J', 'J', 'J', 'J', 'J', 'J', 'J', 'J'], 'fold': [1, 2, 3, 

In [10]:
# SVM: accuracy (%) per trait, averaged over the folds
df1.groupby(['trait']).mean().drop(columns=['fold'])

Unnamed: 0_level_0,acc
trait,Unnamed: 1_level_1
E,73.118367
F,52.959184
J,61.097959
N,94.297959


In [11]:
# MLP: accuracy (%) per trait, averaged over the folds
df2.groupby(['trait']).mean().drop(columns=['fold'])

Unnamed: 0_level_0,acc
trait,Unnamed: 1_level_1
E,73.934695
F,59.07347
J,65.167347
N,94.297962
