In [None]:
# Directory holding the input CSV files; train.csv and test.csv are read from here.
CSV_PATH = './data/'

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Activation, GRU
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras import activations

In [None]:
from tqdm import tqdm
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold

In [None]:
#This code snippet is inspired by https://github.com/chakki-works/chakin
from six.moves.urllib.request import urlretrieve
from progressbar import Bar, ETA, FileTransferSpeed, ProgressBar, Percentage, RotatingMarker

ROOT_DIR = './'
url = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
file_name = url.split('/')[-1]
save_path = os.path.join(ROOT_DIR, file_name)

widgets = ['Test: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', ETA(), ' ', FileTransferSpeed()]
pbar = ProgressBar(widgets=widgets)

def dlProgress(count, blockSize, totalSize):
    """Report hook for ``urlretrieve`` that drives the progress bar.

    On the first call the bar's maximum is set to ``totalSize`` and the bar is
    started; every call then advances it to the number of bytes received so
    far, capped at ``totalSize``.
    """
    if pbar.max_value is None:
        pbar.max_value = totalSize
        pbar.start()

    pbar.update(min(count * blockSize, totalSize))

if os.path.exists(save_path):
    # Already downloaded: re-running the cell must not fetch the archive again.
    path = save_path
else:
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated archive at save_path.
    partial_path = save_path + '.part'
    urlretrieve(url, partial_path, reporthook=dlProgress)
    pbar.finish()
    os.replace(partial_path, save_path)
    path = save_path

In [None]:
!unzip ./glove.840B.300d.zip

In [None]:
def load_glove(word_dict, embedding_file='./glove.840B.300d.txt', stemmers=None):#word_index {'word': index} dictionary
    """Build an embedding matrix for ``word_dict`` from a GloVe-style text file.

    Each line of ``embedding_file`` is ``<token> <v1> <v2> ...`` separated by
    single spaces. For every vocabulary word the lookup tries, in order: the
    word itself, its lower-case, upper-case and capitalised forms, and then the
    output of each stemmer. The first variant found in the file wins.

    Args:
        word_dict: mapping ``{word: row_index}``; indices must lie in
            ``1 .. len(word_dict)`` (row 0 is left for padding).
        embedding_file: path to the embedding text file.
        stemmers: iterable of objects exposing ``stem(word)``. When ``None``
            the module-level names ``ps``, ``lc`` and ``sb`` are used.
            NOTE(review): those names are not defined anywhere in the visible
            notebook (presumably NLTK Porter/Lancaster/Snowball stemmers) --
            define them or pass ``stemmers`` explicitly.

    Returns:
        ``(embedding_matrix, nb_words)`` where ``embedding_matrix`` is a
        float32 array of shape ``(nb_words, embed_size)`` and
        ``nb_words == len(word_dict) + 1``. Rows without a match (including
        row 0) are filled with ``-1``.

    Raises:
        ValueError: if the file contains no embedding vectors.
    """
    import gc

    embeddings_index = {}
    # Explicit UTF-8 and a context manager: the handle is closed promptly and
    # parsing does not depend on the platform's default encoding.
    with open(embedding_file, encoding='utf-8') as emb_file:
        for line in emb_file:
            word, *coefs = line.rstrip('\n').split(' ')
            if not coefs:
                continue  # blank or malformed line
            embeddings_index[word] = np.asarray(coefs, dtype='float32')

    if not embeddings_index:
        raise ValueError('no embedding vectors found in {}'.format(embedding_file))

    # Read the dimensionality from one vector instead of stacking every vector
    # into a second full-size array.
    embed_size = len(next(iter(embeddings_index.values())))
    nb_words = len(word_dict) + 1

    embedding_matrix = np.full((nb_words, embed_size), -1., dtype=np.float32)

    def candidates(key):
        """Yield lookup variants of ``key`` lazily, cheapest first."""
        yield key
        yield key.lower()
        yield key.upper()
        yield key.capitalize()
        active_stemmers = (ps, lc, sb) if stemmers is None else stemmers
        for stemmer in active_stemmers:
            yield stemmer.stem(key)

    for key, i in tqdm(word_dict.items()):
        for candidate in candidates(key):
            embedding_vector = embeddings_index.get(candidate)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                break

    # The raw index is large; release it before returning.
    del embeddings_index
    gc.collect()
    return embedding_matrix, nb_words

In [None]:
def load_glove_simple(word_dict, embedding_file='./glove.840B.300d.txt'):#word_index {'word': index} dictionary
    """Build an embedding matrix for ``word_dict`` using case variants only.

    Each line of ``embedding_file`` is ``<token> <v1> <v2> ...`` separated by
    single spaces. For every vocabulary word the lookup tries, in order: the
    word itself, its lower-case, upper-case and capitalised forms. The first
    variant found in the file wins.

    Args:
        word_dict: mapping ``{word: row_index}``; indices must lie in
            ``1 .. len(word_dict)`` (row 0 is left for padding).
        embedding_file: path to the embedding text file.

    Returns:
        ``(embedding_matrix, nb_words)`` where ``embedding_matrix`` is a
        float32 array of shape ``(nb_words, embed_size)`` and
        ``nb_words == len(word_dict) + 1``. Rows without a match (including
        row 0) are filled with ``-1``.

    Raises:
        ValueError: if the file contains no embedding vectors.
    """
    import gc

    embeddings_index = {}
    # Explicit UTF-8 and a context manager: the handle is closed promptly and
    # parsing does not depend on the platform's default encoding.
    with open(embedding_file, encoding='utf-8') as emb_file:
        for line in emb_file:
            word, *coefs = line.rstrip('\n').split(' ')
            if not coefs:
                continue  # blank or malformed line
            embeddings_index[word] = np.asarray(coefs, dtype='float32')

    if not embeddings_index:
        raise ValueError('no embedding vectors found in {}'.format(embedding_file))

    # Read the dimensionality from one vector instead of stacking every vector
    # into a second full-size array.
    embed_size = len(next(iter(embeddings_index.values())))
    nb_words = len(word_dict) + 1

    embedding_matrix = np.full((nb_words, embed_size), -1., dtype=np.float32)

    # Case variants are produced lazily so later ones are only computed on a miss.
    case_variants = (lambda k: k, str.lower, str.upper, str.capitalize)

    for key, i in tqdm(word_dict.items()):
        for make_variant in case_variants:
            embedding_vector = embeddings_index.get(make_variant(key))
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                break

    # The raw index is large; release it before returning.
    del embeddings_index
    gc.collect()
    return embedding_matrix, nb_words

In [None]:
# Read the train and test CSV files from CSV_PATH and report their dimensions.
train_csv = os.path.join(CSV_PATH, 'train.csv')
test_csv = os.path.join(CSV_PATH, 'test.csv')

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Collect every question (train followed by test), replacing missing text with
# a single space, and split each one on whitespace into a list of tokens.
train_X = train_df["question_text"].fillna(" ")
test_X = test_df["question_text"].fillna(" ")
all_questions = pd.concat([train_X, test_X])
text_list = all_questions.apply(lambda question: question.split())

In [None]:
# Hold out 8% of the rows as a validation set, stratified on `target`, with a fixed seed.
# NOTE(review): this rebinds `train_df`, so re-running the cell splits the already-reduced frame again.
train_df, val_df = train_test_split(train_df, test_size=0.08, stratify=train_df['target'] ,random_state=2018)

In [None]:
max_features = 95000  # vocabulary cap passed to Tokenizer(num_words=...)
max_len = 100         # sequence length used when padding


#train_X = np.array(train_df.pop('question_text')) --> This might help to save memory
train_X = train_df['question_text'].fillna(" ")
val_X = val_df['question_text'].fillna(" ")
test_X = test_df['question_text'].fillna(" ")

train_y = train_df['target'].values
val_y = val_df['target'].values

# Boolean mask of positive-class training rows.
bool_train_labels = train_y != 0

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
# Fit on the raw strings (train + validation + test), i.e. the same input form
# that texts_to_sequences receives below. Fitting on pre-split token lists
# skips the Tokenizer's own splitting/filtering, so the vocabulary would hold
# tokens (e.g. with punctuation attached) that the string path never produces.
tokenizer.fit_on_texts(list(pd.concat([train_X, val_X, test_X])))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

In [None]:
# Right-pad (or truncate) every sequence to max_len; the length comes from the
# shared constant instead of a repeated literal so the two cannot drift apart.
train_X_pad = pad_sequences(train_X, maxlen=max_len, padding='post')
val_X_pad = pad_sequences(val_X, maxlen=max_len, padding='post')

In [None]:
# Sanity check: (number of training rows, padded sequence length).
train_X_pad.shape

In [None]:
# Class balance of the training split (the "6.19%" note is presumably the
# positive share observed on a previous run -- confirm against the output).
#6.19%
class_counts = np.bincount(train_df['target'])
neg, pos = class_counts
total = neg + pos
positive_pct = 100 * pos / total
summary_template = 'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'
print(summary_template.format(total, pos, positive_pct))

In [None]:
# Baseline pipeline over the pre-padded arrays: shuffle with a 1000-element buffer, then batch by 16.
dataset_v1 = tf.data.Dataset.from_tensor_slices((train_X_pad, train_y)).shuffle(1000).batch(16)

In [None]:
def data_generator():
    """Yield one ``(token_ids, label)`` pair per training example, unpadded.

    Reads the module-level ``train_X`` (list of token-id lists) and
    ``train_y`` (labels) in order. Both arrays are emitted as int32 to match
    the ``output_types`` declared for the dataset below.
    """
    for token_ids, label in zip(train_X, train_y):
        yield np.array(token_ids, dtype=np.int32), np.array(label, dtype=np.int32)

# `(None,)` declares a rank-1, variable-length feature vector. The previous
# `(None)` is just `None`, i.e. a completely unknown shape.
dataset_v2 = tf.data.Dataset.from_generator(data_generator, output_types=(tf.int32,tf.int32), output_shapes=((None,), ()))
# Pad each batch of 16 only up to the longest sequence in that batch.
padded_dataset_v2 = dataset_v2.padded_batch(16, padded_shapes=((None,), ()))

In [None]:
def class_func(features, label):
    """Return the class of a ``(features, label)`` element, i.e. its label."""
    _ = features  # the features play no part in choosing the class
    return label

# Observed class proportions [negative, positive] as float32 scalars.
fractions = [(share).astype(np.float32) for share in (neg/total, pos/total)]
print(fractions)

In [None]:
# Rejection-resampling transformation: class_func supplies each element's class,
# `fractions` is the observed [negative, positive] mix, and the target mix is 70% / 30%.
resampler = tf.data.experimental.rejection_resample(
    class_func, target_dist=[0.7, 0.3], initial_dist=fractions)

In [None]:
# Display the transformation object returned by rejection_resample.
resampler

In [None]:
# Apply the resampler to the unbatched dataset. The map below receives two
# arguments per element and keeps only the second (the original
# (features, label) pair) before padding into batches of 16.
resample_ds = dataset_v2.apply(resampler)
examples_only_ds = resample_ds.map(lambda extra_label, features_and_label: features_and_label)
balanced_ds = examples_only_ds.padded_batch(16, padded_shapes=((None,), ()))

The requirements to use the cuDNN implementation are:


activation == 'tanh'

recurrent_activation == 'sigmoid'

recurrent_dropout == 0

unroll is False

use_bias is True

Inputs are not masked or strictly right padded.


In [None]:
# Build the embedding matrix for the tokenizer's whole vocabulary (case-variant lookup only);
# n_words is len(tokenizer.word_index) + 1.
glove, n_words = load_glove_simple(tokenizer.word_index)

In [None]:
class Attention(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.

    Input is a rank-3 tensor ``(batch, timestep, features)``; output is
    ``(batch, features)``, plus the ``(batch, timestep)`` attention weights
    when ``return_attention`` is True.
    """

    def __init__(self, return_attention=False, **kwargs):
        # Initialise the base Layer first so attribute tracking is set up
        # before any attribute is assigned (and so the base class cannot
        # overwrite supports_masking afterwards).
        super(Attention, self).__init__(** kwargs)
        # `initializers` was never imported in this notebook; go through tf.keras.
        self.init = tf.keras.initializers.RandomUniform(seed=10000)
        self.supports_masking = True
        self.return_attention = return_attention

    def build(self, input_shape):
        """Create the single projection vector W of shape (features, 1)."""
        #self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3 #(batch, timestep, features)

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init, trainable=True)

        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the timestep axis.

        Args:
            x: tensor of shape (batch, timestep, features).
            mask: optional (batch, timestep) mask; masked steps get zero weight.
        """
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # squeeze (not reshape with static dims) drops the trailing size-1
        # axis, so an unknown (None) batch dimension is handled correctly
        logits = tf.squeeze(tf.matmul(x, self.W), axis=-1)
        ai = tf.math.exp(logits - tf.math.reduce_max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            ai = ai * tf.cast(mask, ai.dtype)
        # small epsilon guards against division by zero when every step is masked
        att_weights = ai / (tf.reduce_sum(ai, axis=1, keepdims=True) + 1e-07)
        weighted_input = x * tf.expand_dims(att_weights, axis=-1)
        result = tf.math.reduce_sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that forwards to compute_output_shape."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """(batch, features), plus (batch, timestep) when returning attention."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """The timestep axis is reduced away, so no mask is propagated."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

    def get_config(self):
        """Include return_attention so the layer can be re-created from its config."""
        config = super(Attention, self).get_config()
        config['return_attention'] = self.return_attention
        return config

In [None]:
class QIQModel(tf.keras.Model):
    """Binary classifier: frozen embedding -> LSTM -> global max pool -> Dense(16) -> Dense(1) -> sigmoid.

    Args:
        embedding_matrix: 2-D array ``(vocab_size, embed_dim)`` used as the
            constant, non-trainable embedding weights.
        init_out_bias: optional initial bias for the output unit. May be a
            Keras initializer (instance or string name) or a number/array,
            which is wrapped in a Constant initializer. ``None`` keeps the
            Dense default.
        dropout: accepted for interface compatibility.
            NOTE(review): not used by any layer in this class.
    """

    def __init__(self, embedding_matrix, *args, init_out_bias=None, dropout=0.1, **kargs):
        super(QIQModel, self).__init__(*args, **kargs)

        self.embedding_matrix = embedding_matrix
        self.emb_shape = embedding_matrix.shape
        self.embedding = Embedding(*self.emb_shape, embeddings_initializer=tf.keras.initializers.Constant(self.embedding_matrix), trainable=False)
        self.LSTM = LSTM(64, return_sequences=True)
        self.Gmaxpool = GlobalMaxPool1D()
        self.linear1 = Dense(16)
        # Compare against None explicitly: a truthiness test raises for
        # multi-element arrays and silently ignores a bias of 0.
        if init_out_bias is None:
            self.linear2 = Dense(1)
        else:
            if not (callable(init_out_bias) or isinstance(init_out_bias, str)):
                # Plain numbers / arrays are not valid initializers by themselves.
                init_out_bias = tf.keras.initializers.Constant(init_out_bias)
            self.linear2 = Dense(1, bias_initializer=init_out_bias)

    def call(self, inputs, perturb=False):
        """Return sigmoid probabilities; with ``perturb=True`` also return the embedding output."""
        emb_out = self.embedding(inputs)
        x = self.LSTM(emb_out)
        x = self.Gmaxpool(x)
        x = self.linear1(x)
        x = self.linear2(x)

        if perturb:
            return activations.sigmoid(x), emb_out
        return activations.sigmoid(x)

    def compute_output_shape(self, input_shape):
        """One probability per example: (batch, 1)."""
        return tf.TensorShape((None, 1))

In [None]:
# Log-odds of the positive class, log(pos / neg), as a one-element array.
# NOTE(review): the QIQModel(glove) instantiation in this notebook does not pass it as init_out_bias.
initial_bias = np.log([pos/neg])
initial_bias

In [None]:
# Instantiate the classifier with the GloVe matrix as frozen embedding weights (default output bias).
model = QIQModel(glove)

In [None]:
# https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate/code

class CyclicLR(tf.keras.callbacks.Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail here rather than with an AttributeError on the first
                # clr() call, where scale_fn would be missing.
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is None, got {!r}".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """Return the learning rate for the current value of clr_iterations."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # x is the normalised distance from the cycle's peak: 1 at the
        # boundaries of a cycle, 0 at its midpoint.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs=None):
        """Set the optimizer's learning rate to the schedule's current value."""
        logs = logs or {}

        if self.clr_iterations == 0:
            tf.keras.backend.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            tf.keras.backend.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        """Record the lr and logs for this batch, then advance the schedule.

        The first positional argument is named ``epoch`` for backward
        compatibility; it is not used by this method.
        """
        
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(tf.keras.backend.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        tf.keras.backend.set_value(self.model.optimizer.lr, self.clr())

In [None]:
# Cycle the learning rate between 0.001 and 0.002 with a 300-iteration half cycle;
# 'exp_range' scales the amplitude by gamma**iterations.
clr = CyclicLR(base_lr=0.001, max_lr=0.002,
               step_size=300., mode='exp_range',
               gamma=0.99994)

In [None]:
# Training callbacks: the cyclical learning-rate schedule plus a per-epoch checkpoint.
# QIQModel is a subclassed tf.keras.Model, which cannot be serialised as a whole
# model to an .h5 file, so only the weights are checkpointed.
callbacks = [
    clr,
    tf.keras.callbacks.ModelCheckpoint(
        filepath='/content/drive/My Drive/Kaggle/Quora Insincere Question/model_{epoch}.h5',
        mode='min',
        monitor='val_loss',
        save_weights_only=True),
]

In [None]:
# Metrics reported during training. They are referenced through `tf.keras`
# because the bare name `keras` is never imported in this notebook.
metrics = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc')
]

In [None]:
# Inverse-frequency class weights, scaled by total/2.
# NOTE(review): class_weight is not passed to the model.fit call in this notebook.
weight_for_0, weight_for_1 = [(1 / count)*(total)/2.0 for count in (neg, pos)]

class_weight = dict(enumerate((weight_for_0, weight_for_1)))

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

In [None]:
# Compile with Adam, binary cross-entropy, and the metrics listed above.
# The CyclicLR callback sets the optimizer's learning rate at train begin, so
# learning_rate=0.01 is overridden whenever that callback is passed to fit().
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01, beta_1=0.99, epsilon=1e-1), loss='binary_crossentropy', metrics=metrics)

In [None]:
# Train on the per-batch padded dataset. The original name `padded_dataset2`
# is not defined anywhere; the dataset built earlier is `padded_dataset_v2`.
model.fit(padded_dataset_v2, verbose=1, epochs=2, callbacks=callbacks)

In [None]:
# Print the layer and parameter summary of the model.
model.summary()

Try common techniques for dealing with imbalanced data, such as:

Class weighting

Oversampling

In [None]:
@tf.function
def train_step(inputs, targets):
    """Run one optimisation step on a batch and return its loss.

    Uses the module-level ``model`` and the optimizer attached to it by
    ``model.compile``. The earlier references to ``pixelrnn`` and a bare
    ``optimizer`` pointed at names that do not exist in this notebook.

    Args:
        inputs: batch of padded token-id sequences.
        targets: batch of binary labels.

    Returns:
        Scalar binary cross-entropy loss for the batch.
    """
    with tf.GradientTape() as tape:
        # The model already applies a sigmoid, so its outputs are
        # probabilities and the loss keeps its default from_logits=False.
        probabilities = model(inputs)
        bce = tf.keras.losses.BinaryCrossentropy()
        loss = bce(targets, probabilities)
    grads = tape.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(grads, model.trainable_variables))

    return loss

In [None]:
import random
import time

EPOCHS = 1
BATCH_SIZE = 16
loss_trace = []  # summed batch loss, one entry per epoch

# `ckpt_manager` was referenced below but never created in this notebook.
ckpt = tf.train.Checkpoint(model=model, optimizer=model.optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, './checkpoints', max_to_keep=3)

for epoch in range(EPOCHS):

    total_loss = 0.0
    i = 0
    epoch_start = time.time()
    # NOTE(review): the original iterated over an undefined `train_dataset`;
    # `padded_dataset_v2` (batches of 16) is assumed to be the intended input.
    for x, y in padded_dataset_v2:
        loss = train_step(x, y)

        total_loss += loss
        i += 1

        # Report every 100 steps, ending the line every 500 steps. The `else`
        # used to hang off the outer `if`, so it printed on almost every step.
        # NOTE(review): the i/30 scaling assumes 3000 steps per epoch -- confirm.
        if i % 100 == 0:
            if i % 500 == 0:
                print('{}% complete'.format(i/30))
            else:
                print('{}% complete'.format(i/30), end=' ')

    epoch_elapsed = time.time() - epoch_start
    print("epoch {} : elapsed: {}".format(epoch,epoch_elapsed))
    print("epoch {} : loss: {}".format(epoch,total_loss))

    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))

    # Previously dedented to an indentation level that matched no enclosing
    # block (an IndentationError); it belongs to the epoch loop.
    loss_trace.append(total_loss)