In [1]:
# Install sentencenpiece to help in tokenization
!pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import torch
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
import transformers
import numpy as np
from os import path
import pandas as pd
from transformers import AutoTokenizer, TFXLNetModel, TFAutoModel, TFXLNetForSequenceClassification

DATA_DIR = 'datasets'
DATASET_PATH = 'labels_with_stackx.csv'
TOKENIZED_DATASET_PATH = 'labels_with_stackx_tokenized'
TRAIN_SET = 'train_set'
DEV_SET = 'dev_set'
TEST_SET = 'test_set'

TRAIN_RATIO = .6
DEV_RATIO = .2
TEST_RATIO = .2

In [3]:
MAX_LEN = 1024
LEARNING_RATE = 0.000001
BATCH_SIZE = 2
EPOCHS = 2

## Helper functions for tokenizing a dataset

In [4]:
def get_processed_dataframe(path: str) -> pd.DataFrame:
    
    data = pd.read_csv(path)
    
    # Drop all rows with questions that are not understandable
    data.drop(data[data.understandable == 0].index, inplace=True)
    # All questions are understandable so we can remove the 'understandable' column
    data.drop(columns="understandable", inplace=True)
    # Remove rows whos passage has more than 'max_len' words
    data.drop(data[data.passage.map(lambda x: x.count(" ") + 1) > MAX_LEN].index, inplace=True)
    # Remove rows with a comprehension value of 3, because these values won't work for binary classification
    data.drop(data[data.comprehension == 3].index, inplace=True)
    
    return data

# Create a closure for tokenization

def get_tokenize_func():

  tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased', model_max_length=MAX_LEN)
  
  def tk(df):
    out = tokenizer(df['passage'], df['question'], padding='max_length', truncation=True)
    return out['input_ids'], out['token_type_ids'], out['attention_mask']
  
  return tk

def get_tokens(data: pd.DataFrame):
    data['output_ids'], data['token_type_ids'], data['attention_mask'] = zip(*data.apply(get_tokenize_func(), axis=1))
    for col in ['output_ids', 'token_type_ids', 'attention_mask']:
        data[col] = data[col].apply(lambda cell: np.array(cell))
    
def tokenize_and_save(load_path: str, save_path: str):
    data = get_processed_dataframe(load_path)
    get_tokens(data)
    data.to_pickle(save_path)


## Helper functions for splitting a numpy arrays for train, dev and test

In [5]:
def create_datasets(rerun_tokenization=False):
    # Uncomment the following line of code if you would like to re-run the tokenization process
    if rerun_tokenization:
        tokenize_and_save(path.join(DATA_DIR, DATASET_PATH), path.join(DATA_DIR, TOKENIZED_DATASET_PATH))
    
    tokenized_data = pd.read_pickle(path.join(DATA_DIR, TOKENIZED_DATASET_PATH))
    
    # Randomly shuffle the data, but seeded so it's repeatable
    tokenized_data = tokenized_data.sample(frac=1, random_state=1)

    # Get train, test, dev splits
    num_entries = len(tokenized_data)
    train_cutoff = int(num_entries * TRAIN_RATIO)
    dev_cutoff = train_cutoff + int(num_entries * DEV_RATIO)

    train_set = tokenized_data[:train_cutoff]
    dev_set = tokenized_data[train_cutoff:dev_cutoff]
    test_set = tokenized_data[dev_cutoff:]

    train_set.to_pickle(path.join(DATA_DIR, TRAIN_SET))
    dev_set.to_pickle(path.join(DATA_DIR, DEV_SET))
    test_set.to_pickle(path.join(DATA_DIR,TEST_SET))
    
def get_features(df: pd.DataFrame) -> list:
    output_ids = np.stack(df['output_ids'].values)
    token_type_ids = np.stack(df['token_type_ids'].values)
    attention_mask = np.stack(df['attention_mask'].values)
    return [output_ids, token_type_ids, attention_mask]
    
def get_labels(df: pd.DataFrame, label_name: str):
    # Extract numpy array, and reshape to be rank-2
    return np.reshape(df[label_name].values, (-1, 1))
    
def get_datasets():
    train_set = pd.read_pickle(path.join(DATA_DIR, TRAIN_SET))
    dev_set = pd.read_pickle(path.join(DATA_DIR, DEV_SET))
    test_set = pd.read_pickle(path.join(DATA_DIR,TEST_SET))
    
    return train_set, dev_set, test_set


def get_subset(ftrs: list, size: int) -> list:
    return [ftrs[0][:size], ftrs[1][:size], ftrs[2][:size]]





In [6]:
create_datasets(rerun_tokenization=False)
train_set, dev_set, test_set = get_datasets()
train_features, train_labels = get_features(train_set), get_labels(train_set, 'comprehension binary')
dev_features, dev_labels = get_features(dev_set), get_labels(dev_set, 'comprehension binary')
test_features, test_labels = get_features(test_set), get_labels(test_set, 'comprehension binary')



In [7]:
def create_model(max_len: int) -> tf.keras.Model:
    encoder = TFXLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=1) #Input is tokenized strings, output is embeddings and logits (logits means pre-sigmoid label)
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name='token_type_ids')
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')

    preds = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    ).logits

    model = tf.keras.Model(
            inputs=[input_ids, token_type_ids, attention_mask],
            outputs=[preds], name='classifier'
        )

    return model

In [8]:
keras.backend.clear_session()
model = create_model(MAX_LEN)
model.compile(optimizer=keras.optimizers.Adam(LEARNING_RATE), loss=keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

Some layers from the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetForSequenceClassification: ['lm_loss']
- This IS expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFXLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary', 'logits_proj']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


In [9]:
hist = model.fit(train_features, train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS, shuffle=True)

Epoch 1/2
Epoch 2/2

KeyboardInterrupt: 

In [None]:
model.evaluate(dev_features, dev_labels, batch_size=BATCH_SIZE)

In [11]:
hist.history

{'loss': [1.7176302671432495, 4.259496688842773],
 'accuracy': [0.5039626955986023, 0.48344987630844116]}