In [1]:
# TODO: Fix dataset imbalance. There are 1244 positive and 901 negative training examples.

In [3]:
# Install sentencenpiece to help in tokenization
!pip install sentencepiece

Collecting sentencepiece
  Using cached sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2 MB)
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95


In [4]:
import torch
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
import transformers
import numpy as np
from os import path
import pandas as pd
from transformers import AutoTokenizer, TFXLNetModel, TFAutoModel, TFXLNetForSequenceClassification
from constants import *
from dataframe_utils import tokenize_dataframe
from new_model import create_model

In [14]:
"""
Value of 0.000001 and 2 epochs lead to 58% on training and 54.5% on dev. 
Further training did not lead to higher accuracy

Tried seeing if the model had capacity to fit. Ran 6 epochs on 100 training examples
to see if I could at least get it to overfit. loss: 0.6451 - accuracy: 0.6222
Dev: loss: 0.6841 - accuracy: 0.5566

Ran 6 epochs on 200 training examples. loss: 0.6424 - accuracy: 0.6300
Dev: loss: 0.6874 - accuracy: 0.5524
Then another 6 epochs on another 100 training examples. loss: 0.6233 - accuracy: 0.6200
Dev: loss: 0.7047 - accuracy: 0.5594
Then another 3 epochs on another 400 examples. loss: 0.6606 - accuracy: 0.5725
Dev: loss: 0.7007 - accuracy: 0.5650
Then another 3 epochs on another 500 examples. loss: 0.6486 - accuracy: 0.6300
Dev: loss: 0.6761 - accuracy: 0.5944
Then another epoch on entire training set. loss: 0.6507 - accuracy: 0.6247
Dev: loss: 0.6624 - accuracy: 0.6238
Then another epoch on entire training set. 0.6324 - accuracy: 0.6424
Dev: loss: 0.7089 - accuracy: 0.5986 <- We've reached the point of overfit

Ran 8 epochs on 100 examples with 
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    1e-5,
    decay_steps=50,
    decay_rate=0.9,
    staircase=True
)
Successfully overfit the model

With exponential decay, model appears to start overfitting after epoch 2.
Holding back 60 examples from the test set for validation after each epoch seems to match dev set well.

After further inspection, it appears that the model just leans toward always predicting positive because
there are more positive than negative examples. This suggests that there is not a meaningful correlation in the data.
For example, trained a model and got 54% accuracy on dev set. Number of positive and negative predictions were
Positives: 711 Negatives: 4


"""

# Split fractions used by create_datasets: the first TRAIN_RATIO of the shuffled
# rows become the train set, the next DEV_RATIO the dev set, and whatever is
# left over becomes the test set.
TRAIN_RATIO = 0.6
DEV_RATIO = 0.2

# NOTE(review): LEARNING_RATE is not referenced by the visible compile cell,
# which builds its own ExponentialDecay schedule starting at 1e-5 — confirm
# whether this constant is still needed.
LEARNING_RATE = 1e-6

# Passed to model.fit / model.evaluate / model.predict.
BATCH_SIZE = 2
# Number of passes over the data requested from model.fit.
EPOCHS = 2

In [15]:
def create_datasets(rerun_tokenization=False):
    """Shuffle the tokenized dataset and write train/dev/test splits to disk.

    The tokenized pickle at ``DATA_DIR/TOKENIZED_DATASET_PATH`` is loaded,
    shuffled with a fixed seed (``random_state=1``) so the split is repeatable,
    and cut into three consecutive row ranges: the first ``TRAIN_RATIO`` of the
    rows, the next ``DEV_RATIO``, and the remainder. Each range is pickled to
    ``DATA_DIR`` under ``TRAIN_SET``, ``DEV_SET`` and ``TEST_SET`` respectively.

    Args:
        rerun_tokenization: When true, regenerate the tokenized pickle from
            ``DATA_DIR/DATASET_PATH`` before loading it.

    Returns:
        None. The splits are only written to disk.
    """
    tokenized_path = path.join(DATA_DIR, TOKENIZED_DATASET_PATH)

    if rerun_tokenization:
        # NOTE(review): `tokenize_and_save` is neither defined in the visible
        # cells nor imported by name (only `tokenize_dataframe` is) — confirm it
        # is provided elsewhere, otherwise this branch raises NameError.
        tokenize_and_save(path.join(DATA_DIR, DATASET_PATH), tokenized_path)

    # frac=1 returns every row in a new random order; the seed keeps it stable.
    shuffled = pd.read_pickle(tokenized_path).sample(frac=1, random_state=1)

    total_rows = len(shuffled)
    train_end = int(total_rows * TRAIN_RATIO)
    dev_end = train_end + int(total_rows * DEV_RATIO)

    # Pair each output filename with its row range; the test split takes
    # everything after the dev cutoff.
    splits = (
        (TRAIN_SET, shuffled[:train_end]),
        (DEV_SET, shuffled[train_end:dev_end]),
        (TEST_SET, shuffled[dev_end:]),
    )
    for filename, frame in splits:
        frame.to_pickle(path.join(DATA_DIR, filename))

def get_subset(ftrs: list, start: int, stop: int) -> list:
    """Return the ``start:stop`` slice of each of the first four entries of ``ftrs``.

    Args:
        ftrs: Sequence holding at least four sliceable feature containers.
        start: Slice start index applied to every container.
        stop: Slice end index (exclusive) applied to every container.

    Returns:
        A four-element list with the sliced containers, in the same order.

    Raises:
        IndexError: If ``ftrs`` has fewer than four entries.
    """
    window = slice(start, stop)
    # Exactly four feature inputs are taken; any further entries are ignored.
    return [ftrs[position][window] for position in range(4)]

def get_processed_dataframe(path: str) -> pd.DataFrame:
    """Load the raw CSV and filter it down to rows usable for binary classification.

    Rows are removed, in order, when:
      * ``understandable`` equals 0 (the column itself is dropped afterwards),
      * the passage has more than ``MAX_LEN`` space-separated words,
      * ``comprehension`` equals 3, since that value won't work for binary
        classification.

    Args:
        path: Location of the CSV file to read. Note that inside this function
            the name shadows the ``os.path`` module imported by the notebook.

    Returns:
        The filtered DataFrame, keeping the original row index labels.
    """
    frame = pd.read_csv(path)

    # Discard questions flagged as not understandable; once they are gone the
    # flag column carries no information, so remove it as well.
    not_understandable = frame.loc[frame.understandable == 0].index
    frame = frame.drop(index=not_understandable).drop(columns="understandable")

    # Word count is approximated as (number of spaces + 1).
    word_counts = frame.passage.map(lambda text: text.count(" ") + 1)
    too_long = frame.loc[word_counts > MAX_LEN].index
    frame = frame.drop(index=too_long)

    # A comprehension value of 3 has no place in a binary label.
    ambiguous = frame.loc[frame.comprehension == 3].index
    frame = frame.drop(index=ambiguous)

    return frame







In [16]:
# Rebuild the train/dev/test pickles from the already-tokenized data.
create_datasets(rerun_tokenization=False)

# NOTE(review): the recorded output of this cell is
# "NameError: name 'get_datasets' is not defined". `get_datasets`,
# `get_features` and `get_labels` are not defined in the visible cells nor
# imported by name — presumably they lived in a deleted cell or a helper
# module; confirm and import them explicitly.
train_set, dev_set, test_set = get_datasets()

# Model inputs and binary labels for each split.
train_features = get_features(train_set)
train_labels = get_labels(train_set, 'comprehension binary')

dev_features = get_features(dev_set)
dev_labels = get_labels(dev_set, 'comprehension binary')

test_features = get_features(test_set)
test_labels = get_labels(test_set, 'comprehension binary')



NameError: name 'get_datasets' is not defined

In [None]:
# Reset Keras' global state so that rebuilding the model starts clean.
keras.backend.clear_session()

model = create_model(MAX_LEN, NUM_SX_FEATURES)

# Staircase exponential decay: start at 1e-5 and multiply the rate by 0.9
# after every 50 optimizer steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-5,
    decay_steps=50,
    decay_rate=0.9,
    staircase=True,
)

# Binary cross-entropy loss; accuracy is tracked alongside it.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [16]:
# NOTE(review): `feat_subset` and `lab_subset` are not assigned in any visible
# cell — presumably they were produced with get_subset(...) on the training
# features/labels in a cell that no longer exists; confirm before re-running.
# 10% of the supplied examples are held out for per-epoch validation.
hist = model.fit(
    feat_subset,
    lab_subset,
    validation_split=0.1,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    shuffle=True,
)

Epoch 1/2
Epoch 2/2


In [10]:
# NOTE(review): ad-hoc inspection of the first dev label. The recorded output is
# a NameError and the execution count (10) is out of order with the surrounding
# cells, so this cell was run before `dev_labels` existed.
dev_labels[0]

NameError: name 'dev_labels' is not defined

In [17]:
# Score the model on the dev split. The model is compiled with
# metrics=['accuracy'], so the displayed pair is [loss, accuracy].
model.evaluate(dev_features, dev_labels, batch_size=BATCH_SIZE)



[0.6972507834434509, 0.5076923370361328]

In [18]:
#model.save('model_checkpoints/dev62pct')

In [19]:
# Raw model outputs for the dev split; thresholded at 0.5 in the next cell.
# presumably one score per dev example — TODO confirm the output shape.
out = model.predict(dev_features, batch_size=BATCH_SIZE)

In [20]:
# Split the predictions at the 0.5 decision threshold and report how many land
# on each side; a heavily lopsided count means the model is mostly predicting a
# single class.
pos = np.nonzero(out >= 0.5)
neg = np.nonzero(out < 0.5)
print(f'Positives: {len(pos[0])}')
print(f'Negatives: {len(neg[0])}')

Positives: 553
Negatives: 162


dev62pct  dev_ac_59pct
