In [117]:
# A separate notebook for dealing with all data
import numpy as np
import pandas as pd

In [2]:
def get_label_file():
    data_type = np.uint8
    g = np.memmap('../data/ONE_HOT_LABELS.dat', dtype=data_type, shape=(74165, 100, 33))
    
    return g

In [3]:
def get_embed_file():
    data_type = np.float64
    f = np.memmap('../data/MMAP_MATRIX.dat', dtype=data_type, shape=(74165, 100, 768))
    return f

In [143]:
from keras.layers import LSTM, Bidirectional, TimeDistributed, Dropout, Dense
from keras.models import Model, Input
def bilstm_model():
    """
    The model selection was very basic. Testing was performed on the same 5000 sentences with number of units = 100, 200, 300, 400
    Each subsequent increase in units massivly increased complexity for minor gains in performance.
    10% of the data was used for validation
    
    500
    loss: 0.0266 - accuracy: 0.9924 - val_loss: 0.0321 - val_accuracy: 0.9909
    
    400
    loss: 0.0279 - accuracy: 0.9921 - val_loss: 0.0326 - val_accuracy: 0.9906
    
    300
    loss: 0.0296 - accuracy: 0.9918 - val_loss: 0.0333 - val_accuracy: 0.9905
    
    200
    loss: 0.0328 - accuracy: 0.9911 - val_loss: 0.0349 - val_accuracy: 0.9904
    
    100
    loss: 0.0403 - accuracy: 0.9897 - val_loss: 0.0394 - val_accuracy: 0.9899
    
    For all choices of unit, validation accuracy peaked around 4-7 epochs
    
    
    """
    #Define the input shape. Each datapoint is a sentence consisting of seqlength words, each word 784dims
    input = Input(shape=(train_x.shape[1], train_x.shape[2]))
    # Pass it through a bidirectional lstm
    model = Bidirectional(LSTM(units=200, return_sequences=True, recurrent_dropout=0.1))(input)

    # add a timedistributed layer
    out = TimeDistributed(Dense(train_y.shape[2], activation="softmax"))(model)  # softmax output layer
    
    model = Model(input, out)
    
    # Compile it
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    print(model.summary())
    return model


In [144]:
import keras
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, batch_size=32, shuffle=False):
        'Initialization'
        self.batch_size = batch_size
        self.list_IDs = list_IDs
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """
        Denotes the number of batches per epoch
        A common way to decide this is simply total amount of samples over batch size
        the batch size you can handle is, of course, determined by your computer, so set that accordingly
        """
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """
        This function is responsible for grabbing the indices to load for each batch.
        And then calling the data generator for those ids
        """
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        """
        If desired, shuffles the indices after each epoch
        """
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """
        This function brings data from disk into RAM.
        It is called once per batch, and thus brings in one batch of data.
        """
        X = get_embed_file()[list_IDs_temp,:,:]
        y = get_label_file()[list_IDs_temp,:,:]
        
        # We take the data at the specified indices and retrieve it
        return X, y


In [145]:
def cross_validate(classifier_function, nr_folds, indices):
    """
    Input:
        Classifier_function; a function that generates a classifier with the desired attributes. Will be used to reset the classifier between folds
        nr_folds, how many folds to split into
        indices, a list of the indices we will pass to our data generators
        
    Output:
        For each fold, three saved files.
        One containing the model
        One containing the model weights
        One containing the history dataframe
    """
    fold_size = int(np.floor(len(indices)/nr_folds))
    
    for i in range(nr_folds):
        print(f"Working on fold {i + 1} of {nr_folds}")
        
        #Select the indices to validate on
        validation_indices = indices[i * fold_size : (i+1) * fold_size]
        
        #Select all indices except those we validate on for training
        train_indices =  [x for x in indices if x not in validation_indices]
        
        print(f"Training on {len(train_indices)}, validating on {len(validation_indices)}")
        
        print("Resetting model...")
        classifier = classifier_function()

        print("Created datagenerators")
        dataGen = DataGenerator(train_indices)
        valiGen = DataGenerator(validation_indices)
        
        print("Commencing training")
        history = classifier.fit_generator(generator=dataGen,
                                           validation_data=valiGen,
                                           epochs = 10)
        
        print("TRAINING COMPLETE")
        
        #Pickle the training data in case it will be neeeded
        training_df = pd.DataFrame()
        for metric in history.history.keys():
            training_df[metric] = history.history[metric]
            
        training_df.to_pickle(f'../data/fold_{i}_history')
        print(f"SAVING HISTORY '../data/fold_{i}_history'")
        
        # Serialize the model in case we'll need it
        model_json = classifier.to_json()
        with open(f"../data/fold_{i}_model.json", "w+") as json_file:
            json_file.write(model_json)
        print(f"SAVING MODEL ../data/fold_{i}_model.json")

            
        # Serialize weights to HDF5, becuse we WILL need them
        classifier.save_weights(f'../data/fold_{i}_weights.h5')

        print(f"SAVING MODEL WEIGHTS AS ../data/fold_{i}_weights.h5")
    
    print("PROCESS COMPLETED")


# SUBSET TESTING

In [14]:
# Get the memory mapped arrays
embedding_pointer = get_embed_file()
label_pointer = get_label_file()

#Select a subset
train_x = embedding_pointer[20000:21000]
train_y = label_pointer[20000:21000]

In [16]:
# Create the model
classifier = bilstm_model()

# Train, setting some data aside for evaluation
history = classifier.fit(
        train_x,
        train_y,
        epochs=10,
        batch_size=50,
        validation_split=0.1)
print('\nhistory dict:', history.history.keys())

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 100, 768)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 400)          1550400   
_________________________________________________________________
time_distributed_3 (TimeDist (None, 100, 33)           13233     
Total params: 1,563,633
Trainable params: 1,563,633
Non-trainable params: 0
_________________________________________________________________
None
Train on 1800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

history dict: dict_keys(['val_loss', 'val_accuracy', 'loss', 'accuracy'])


# CROSSVAL TESTING

The test indices will be the last 10 % and we won't touch those at all for the moment.

We will make use of 5-fold cross validation to reduce bias in our results. 


In [146]:
sentence_indices = [i for i in range(get_embed_file().shape[0])]

#Test consist of the last 10 %
TEST = sentence_indices[66749:]
CROSSVAL = sentence_indices[:66749]

In [147]:
import time
start = time.time()
cross_validate(bilstm_model, 5, CROSSVAL)
end = time.time()
print(f"Time needed for CV on entire dataset = {(end-start)/3600} hours")

Working on fold 1 of 5
Training on 53400, validating on 13349
Resetting model...
Model: "model_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        (None, 100, 768)          0         
_________________________________________________________________
bidirectional_27 (Bidirectio (None, 100, 400)          1550400   
_________________________________________________________________
time_distributed_27 (TimeDis (None, 100, 33)           13233     
Total params: 1,563,633
Trainable params: 1,563,633
Non-trainable params: 0
_________________________________________________________________
None
Created datagenerators
Commencing training
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
TRAINING COMPLETE
SAVING HISTORY '../data/fold_0_history'
SAVING MODEL ../data/fold_0_model.json
SAVING MODEL WEIGHTS AS ../data/fold_0_we

In [148]:
#vTime needed for CV on entire dataset = 3.1499764224555755 hours


# LOAD STORED INFORMATION

In [133]:
# Training history
history_df = pd.read_pickle('../data/fold_0_history')

In [135]:
# Model and its weights
classifier = bilstm_model()

# load weights into new model
classifier.load_weights("../data/fold_0_weights.h5")
print("Loaded model from disk")

Model: "model_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_26 (InputLayer)        (None, 100, 768)          0         
_________________________________________________________________
bidirectional_26 (Bidirectio (None, 100, 400)          1550400   
_________________________________________________________________
time_distributed_26 (TimeDis (None, 100, 33)           13233     
Total params: 1,563,633
Trainable params: 1,563,633
Non-trainable params: 0
_________________________________________________________________
None
Loaded model from disk


In [140]:

evalGen = DataGenerator(TRAIN[:5000])
end = time.time()
print('\nEvaluate on test data')
results = classifier.evaluate_generator(evalGen)
for i, metric in enumerate(classifier.metrics_names):
    print(f"{metric}: {results[i]}")
elapsed = start - end


Evaluate on test data
loss: 0.035807523876428604
accuracy: 0.9876822829246521


In [141]:
print(elapsed)

-0.00024199485778808594
