In [1]:
# Todo: Fix dataset imbalance. There are 1244 positive training examples, and 901 negative training examples

In [2]:
# Install sentencenpiece to help in tokenization
!pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import torch
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
import transformers
import numpy as np
from os import path
import pandas as pd
from transformers import AutoTokenizer, TFXLNetModel, TFAutoModel, TFXLNetForSequenceClassification

# --- Dataset locations; the file names below are joined onto DATA_DIR ---
DATA_DIR = 'datasets'
DATASET_PATH = 'labels_with_stackx.csv'                  # raw labelled CSV
TOKENIZED_DATASET_PATH = 'labels_with_stackx_tokenized'  # pickled frame with token columns
TRAIN_SET = 'train_set'
DEV_SET = 'dev_set'
TEST_SET = 'test_set'

# --- 'question_*' feature columns fed to the model alongside the tokens ---
STACKX_COLUMNS = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
]
NUM_SX_FEATURES = len(STACKX_COLUMNS)

# --- Fractions of the shuffled data assigned to each split ---
TRAIN_RATIO = 0.6
DEV_RATIO = 0.2
TEST_RATIO = 0.2

In [4]:
"""
A learning rate of 0.000001 and 2 epochs led to 58% accuracy on training and 54.5% on dev.
Further training did not lead to higher accuracy.

Tried seeing if the model had the capacity to fit. Ran 6 epochs on 100 training examples
to see if I could at least get it to overfit. loss: 0.6451 - accuracy: 0.6222
Dev: loss: 0.6841 - accuracy: 0.5566

Ran 6 epochs on 200 training examples. loss: 0.6424 - accuracy: 0.6300
Dev: loss: 0.6874 - accuracy: 0.5524
Then another 6 epochs on another 100 training examples. loss: 0.6233 - accuracy: 0.6200
Dev: loss: 0.7047 - accuracy: 0.5594
Then another 3 epochs on another 400 examples. loss: 0.6606 - accuracy: 0.5725
Dev: loss: 0.7007 - accuracy: 0.5650
Then another 3 epochs on another 500 examples. loss: 0.6486 - accuracy: 0.6300
Dev: loss: 0.6761 - accuracy: 0.5944
Then another epoch on entire training set. loss: 0.6507 - accuracy: 0.6247
Dev: loss: 0.6624 - accuracy: 0.6238
Then another epoch on entire training set. loss: 0.6324 - accuracy: 0.6424
Dev: loss: 0.7089 - accuracy: 0.5986 <- We've reached the point of overfit

Ran 8 epochs on 100 examples with 
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    1e-5,
    decay_steps=50,
    decay_rate=0.9,
    staircase=True
)
Successfully overfit the model

With exponential decay, model appears to start overfitting after epoch 2.
Holding back 60 examples from the test set for validation after each epoch seems to match dev set well.

After further inspection, it appears that the model just leans toward always predicting positive because
there are more positive than negative examples. This suggests that there is not a meaningful correlation in the data.
For example, trained a model and got 54% accuracy on dev set. Number of positive and negative predictions were
Positives: 711 Negatives: 4


"""

# Training hyperparameters.
MAX_LEN = 1024        # tokenizer max length, and width of the model's token inputs
LEARNING_RATE = 1e-6
BATCH_SIZE = 2
EPOCHS = 2

## Helper functions for tokenizing a dataset

In [5]:
def get_processed_dataframe(path: str, max_len: int = None) -> pd.DataFrame:
    """Load the labelled CSV and keep only rows usable for binary classification.

    A row is kept when all of the following hold:
      * ``understandable`` is not 0,
      * ``passage`` is present and has at most ``max_len`` space-separated words,
      * ``comprehension`` is not 3 (that value does not fit a binary label).

    The ``understandable`` column is removed from the result because every
    remaining row is understandable.

    Args:
        path: location of the CSV file to read.
        max_len: maximum number of words allowed in a passage. Defaults to the
            notebook-level ``MAX_LEN`` when omitted.

    Returns:
        A new, filtered DataFrame. Surviving rows keep the index labels they
        had in the CSV.
    """
    if max_len is None:
        max_len = MAX_LEN

    data = pd.read_csv(path)

    # Rows with no passage cannot be tokenized later, so they are dropped here
    # rather than crashing the word count.
    has_passage = data['passage'].notna()
    # Word count is approximated as (number of spaces + 1).
    word_counts = data['passage'].fillna('').astype(str).str.count(' ') + 1

    keep = (
        (data['understandable'] != 0)
        & has_passage
        & (word_counts <= max_len)
        & (data['comprehension'] != 3)
    )

    return data.loc[keep].drop(columns='understandable')

# Build a per-row tokenization closure

def get_tokenize_func():
    """Return a function that tokenizes one row's passage/question pair.

    The 'xlnet-base-cased' tokenizer is loaded once here and captured by the
    returned function, so it is not reloaded for every row.

    Returns:
        A callable taking a row with 'passage' and 'question' entries and
        returning the tuple (input_ids, token_type_ids, attention_mask).
    """
    xlnet_tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased', model_max_length=MAX_LEN)

    def tokenize_row(row):
        # Encode passage and question together, padded/truncated to the max length.
        encoded = xlnet_tokenizer(row['passage'], row['question'], padding='max_length', truncation=True)
        return tuple(encoded[key] for key in ('input_ids', 'token_type_ids', 'attention_mask'))

    return tokenize_row

def get_tokens(data: pd.DataFrame):
    """Tokenize every row of ``data`` and add the results as columns, in place.

    Adds 'output_ids', 'token_type_ids' and 'attention_mask', each cell holding
    a numpy array. Note that the tokenizer's 'input_ids' are stored under the
    column name 'output_ids'; get_features reads that same name.
    """
    token_columns = ['output_ids', 'token_type_ids', 'attention_mask']

    # One (input_ids, token_type_ids, attention_mask) tuple per row ...
    per_row_tokens = data.apply(get_tokenize_func(), axis=1)
    # ... transposed into one sequence per output column.
    ids, type_ids, masks = zip(*per_row_tokens)

    for col, values in zip(token_columns, (ids, type_ids, masks)):
        data[col] = values

    # Convert each cell from a Python list to a numpy array.
    for col in token_columns:
        data[col] = data[col].apply(lambda token_list: np.array(token_list))
    
def tokenize_and_save(load_path: str, save_path: str):
    """Read and filter the CSV at ``load_path``, tokenize it, and pickle it to ``save_path``."""
    processed = get_processed_dataframe(load_path)
    get_tokens(processed)  # adds the token columns to `processed` in place
    processed.to_pickle(save_path)


## Helper functions for splitting numpy arrays into train, dev and test sets

In [6]:
def create_datasets(rerun_tokenization=False):
    """Shuffle the tokenized dataset and write train/dev/test pickles under DATA_DIR.

    Args:
        rerun_tokenization: when True, re-tokenize the raw CSV and overwrite the
            tokenized pickle before splitting; otherwise the existing pickle is
            read as-is.
    """
    tokenized_path = path.join(DATA_DIR, TOKENIZED_DATASET_PATH)
    if rerun_tokenization:
        tokenize_and_save(path.join(DATA_DIR, DATASET_PATH), tokenized_path)

    # Sampling the whole frame with a fixed seed gives a repeatable shuffle.
    shuffled = pd.read_pickle(tokenized_path).sample(frac=1, random_state=1)

    # Positional cut-offs for the splits.
    total = len(shuffled)
    train_end = int(total * TRAIN_RATIO)
    dev_end = train_end + int(total * DEV_RATIO)

    # The test split is whatever remains after train and dev.
    splits = (
        (TRAIN_SET, shuffled.iloc[:train_end]),
        (DEV_SET, shuffled.iloc[train_end:dev_end]),
        (TEST_SET, shuffled.iloc[dev_end:]),
    )
    for file_name, split in splits:
        split.to_pickle(path.join(DATA_DIR, file_name))
    
def get_features(df: pd.DataFrame) -> list:
    """Collect the model inputs from ``df`` as a list of stacked numpy arrays.

    Returns:
        [output_ids, token_type_ids, attention_mask, stackx_features], in that
        order. Each token entry stacks the per-row arrays of its column; the
        last entry stacks the rows of the STACKX_COLUMNS columns.
    """
    token_columns = ('output_ids', 'token_type_ids', 'attention_mask')
    features = [np.stack(df[name].values) for name in token_columns]
    features.append(np.stack(df[STACKX_COLUMNS].values))
    return features
    
def get_labels(df: pd.DataFrame, label_name: str):
    """Return column ``label_name`` of ``df`` as a rank-2 array of shape (n_rows, 1)."""
    column_values = np.asarray(df[label_name].values)
    return column_values.reshape(-1, 1)
    
def get_datasets():
    """Load the pickled splits from DATA_DIR and return them as (train, dev, test)."""
    split_files = (TRAIN_SET, DEV_SET, TEST_SET)
    return tuple(pd.read_pickle(path.join(DATA_DIR, file_name)) for file_name in split_files)


def get_subset(ftrs: list, start: int, stop: int) -> list:
    """Slice every feature array in ``ftrs`` to the rows ``start`` up to (not including) ``stop``.

    Works for any number of feature arrays, not only the four produced by
    get_features.

    Args:
        ftrs: list of sliceable feature arrays that share their first axis.
        start: index of the first row to keep.
        stop: index one past the last row to keep.

    Returns:
        A new list with one sliced array per entry of ``ftrs``, in the same order.
    """
    return [feature[start:stop] for feature in ftrs]





In [7]:
# Re-split the already tokenized data (no re-tokenization), then load the splits.
create_datasets(rerun_tokenization=False)
train_set, dev_set, test_set = get_datasets()

# Model inputs and 'comprehension binary' labels for each split.
train_features = get_features(train_set)
train_labels = get_labels(train_set, 'comprehension binary')

dev_features = get_features(dev_set)
dev_labels = get_labels(dev_set, 'comprehension binary')

test_features = get_features(test_set)
test_labels = get_labels(test_set, 'comprehension binary')



In [8]:
def create_model(max_len: int, num_sx_features: int) -> tf.keras.Model:
    """Build the classifier: XLNet sequence classifier plus the extra feature vector.

    The encoder's single logit goes through a 1-unit sigmoid layer, is
    concatenated with the ``stackx_features`` input, and a final 1-unit sigmoid
    layer produces the prediction.

    Args:
        max_len: width of each of the three token inputs.
        num_sx_features: width of the ``stackx_features`` input.

    Returns:
        A Keras model named 'classifier' with inputs
        [input_ids, token_type_ids, attention_mask, stackx_features] and one output.
    """
    # Pretrained XLNet with a single-logit classification head.
    xlnet = TFXLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=1)

    # Three int32 token inputs followed by the float32 feature input.
    input_ids, token_type_ids, attention_mask = (
        layers.Input(shape=(max_len,), dtype=tf.int32, name=input_name)
        for input_name in ('input_ids', 'token_type_ids', 'attention_mask')
    )
    stackx_features = layers.Input(shape=(num_sx_features,), dtype=tf.float32, name='stackx_features')

    xlnet_output = xlnet(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    encoder_preds = layers.Dense(1, activation='sigmoid', name='encoder_preds')(xlnet_output.logits)

    # Combine the encoder's prediction with the extra features for the final prediction.
    combined = layers.Concatenate()([encoder_preds, stackx_features])
    final_pred = layers.Dense(1, activation='sigmoid', name='final_prediction')(combined)

    return tf.keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask, stackx_features],
        outputs=[final_pred],
        name='classifier',
    )

In [9]:
# Reset Keras global state before building a fresh model.
keras.backend.clear_session()
model = create_model(MAX_LEN, NUM_SX_FEATURES)

# Learning rate starts at 1e-5 and is multiplied by 0.9 every 50 steps (staircase).
# Note: the LEARNING_RATE constant is not used by this schedule.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-5,
    decay_steps=50,
    decay_rate=0.9,
    staircase=True,
)

model.compile(
    optimizer=keras.optimizers.Adam(lr_schedule),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetForSequenceClassification: ['lm_loss']
- This IS expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFXLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary', 'logits_proj']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


In [10]:
# Restrict training to the first 300 training examples.
feat_subset = get_subset(train_features, start=0, stop=300)
lab_subset = train_labels[:300]

In [16]:
# Train on the subset, holding out 10% of it for validation.
hist = model.fit(
    x=feat_subset,
    y=lab_subset,
    validation_split=0.1,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    shuffle=True,
)

Epoch 1/2
Epoch 2/2


In [17]:
# Loss and accuracy on the dev set.
model.evaluate(x=dev_features, y=dev_labels, batch_size=BATCH_SIZE)



[0.6972507834434509, 0.5076923370361328]

In [18]:
#model.save('model_checkpoints/dev62pct')

In [19]:
# Predictions for every dev example.
out = model.predict(x=dev_features, batch_size=BATCH_SIZE)

In [20]:
# Count how many dev predictions fall on each side of the 0.5 threshold,
# to check whether the model predicts (almost) only one class.
pos = np.nonzero(out >= .5)
neg = np.nonzero(out < .5)
print(f'Positives: {pos[0].shape[0]}')
print(f'Negatives: {neg[0].shape[0]}')

Positives: 553
Negatives: 162


dev62pct  dev_ac_59pct
