In [None]:
# Directory that holds train.csv and test.csv; joined with the file names when the CSVs are read.
CSV_PATH = './data/'

In [None]:
# List the contents of the trained-checkpoint folder on Drive.
# NOTE(review): absolute Colab path — presumably requires Drive to be mounted at /content/drive; confirm.
!ls '/content/drive/My Drive/Kaggle/Quora Insincere Question/perturbation/trained'

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Activation, GRU
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras import activations

In [None]:
from tqdm import tqdm
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold

In [None]:
#This code snippet is inspired by https://github.com/chakki-works/chakin
from six.moves.urllib.request import urlretrieve
from progressbar import Bar, ETA, FileTransferSpeed, ProgressBar, Percentage, RotatingMarker

ROOT_DIR = './'
url = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
file_name = url.split('/')[-1]
save_path = os.path.join(ROOT_DIR, file_name)

widgets = ['Test: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', ETA(), ' ', FileTransferSpeed()]
pbar = ProgressBar(widgets=widgets)

def dlProgress(count, blockSize, totalSize):
    """``urlretrieve`` report hook that drives the module-level progress bar.

    Args:
        count: number of blocks transferred so far.
        blockSize: size of one block in bytes.
        totalSize: total size of the download in bytes.
    """
    # The bar is started lazily because the total size is only known once
    # the first block arrives.
    if pbar.max_value is None:
        pbar.max_value = totalSize
        pbar.start()

    # count * blockSize can overshoot on the last block, so clamp to the total.
    pbar.update(min(count * blockSize, totalSize))

if os.path.exists(save_path):
    # The archive is already here: do not fetch it again on every run.
    path = save_path
else:
    # Download under a temporary name and rename only when complete, so an
    # interrupted transfer never leaves a truncated archive at save_path.
    partial_path = save_path + '.part'
    urlretrieve(url, partial_path, reporthook=dlProgress)
    pbar.finish()
    os.replace(partial_path, save_path)
    path = save_path

In [None]:
!unzip ./glove.840B.300d.zip

In [None]:
def load_glove_simple(word_dict, embedding_file='./glove.840B.300d.txt'):
    """Build an embedding matrix for a vocabulary from a GloVe text file.

    Row ``i`` of the result holds the vector of the word whose index in
    ``word_dict`` is ``i``. Each word is looked up as given, then lower-cased,
    upper-cased and capitalized; the first spelling present in the file wins.
    Rows without any match keep the fill value ``-1``.

    Args:
        word_dict: ``{'word': index}`` dictionary. Indices are used directly as
            row numbers, so they must lie in ``[0, len(word_dict)]``.
        embedding_file: path of the space-separated embedding file, one
            ``word v1 v2 ...`` entry per line. Defaults to the GloVe 840B file
            extracted next to the notebook.

    Returns:
        ``(embedding_matrix, nb_words)`` where ``embedding_matrix`` is a
        float32 array of shape ``(nb_words, embed_size)`` and
        ``nb_words == len(word_dict) + 1``.

    Raises:
        ValueError: if ``embedding_file`` contains no vectors.
    """
    import gc

    embeddings_index = {}
    # Explicit UTF-8 (the file contains non-ASCII tokens) and a context manager
    # so the handle is closed even if parsing fails.
    with open(embedding_file, encoding='utf-8') as handle:
        for line in handle:
            # Split on a literal space only: some tokens contain other
            # whitespace characters that a bare split() would break apart.
            word, *coefs = line.rstrip('\n').split(' ')
            if not coefs:
                continue  # blank or malformed line: nothing to store
            embeddings_index[word] = np.asarray(coefs, dtype='float32')

    if not embeddings_index:
        raise ValueError('no embedding vectors found in {}'.format(embedding_file))

    # The vector width is read from one entry instead of stacking every vector
    # into a second full-size array just to look at its shape.
    embed_size = len(next(iter(embeddings_index.values())))
    nb_words = len(word_dict) + 1

    # Unmatched rows stay at -1.
    embedding_matrix = np.full((nb_words, embed_size), -1., dtype=np.float32)

    for key, i in tqdm(word_dict.items()):
        # Try progressively different casings; stop at the first hit.
        for candidate in (key, key.lower(), key.upper(), key.capitalize()):
            embedding_vector = embeddings_index.get(candidate)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                break

    # The lookup table is large; release it before returning.
    del embeddings_index
    gc.collect()
    return embedding_matrix, nb_words

In [None]:
# Read the train and test CSVs from CSV_PATH and report their shapes.
train_df = pd.read_csv(os.path.join(CSV_PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(CSV_PATH, 'test.csv'))
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

In [None]:
# Replace missing questions with a single space, then whitespace-split every
# train and test question. text_list (a Series of token lists) is what the
# tokenizer is fitted on in the next cell.
train_X = train_df["question_text"].fillna(" ")
test_X = test_df["question_text"].fillna(" ")
text_list = pd.concat([train_X,test_X]).apply(lambda x: x.split())

In [None]:
# Hold out 8% of the labelled data for validation, stratified on the target.
# NOTE(review): train_df is rebound to the reduced split, so re-running this
# cell splits the already-split frame again.
train_df, val_df = train_test_split(train_df, test_size=0.08, stratify=train_df['target'] ,random_state=2018)
max_features = 95000
max_len = 100


# Alternative kept for reference: popping the column might help to save memory.
#train_X = np.array(train_df.pop('question_text')) --> This might help to save memory
train_X = train_df['question_text'].fillna(" ")
val_X = val_df['question_text'].fillna(" ")
test_X = test_df['question_text'].fillna(" ")

train_y = train_df['target'].values
val_y = val_df['target'].values

# Boolean mask of the positive training labels.
bool_train_labels = train_y != 0

## Tokenize the sentences
# NOTE(review): the tokenizer is fitted on text_list (pre-split token lists,
# train + test) but the sequences below are built from raw strings. The Keras
# Tokenizer presumably handles lists and strings differently (lists taken as
# already tokenised, strings filtered and split) — confirm the vocabulary
# matches the tokens produced here.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(text_list))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)



In [None]:
# Embedding matrix with one row per tokenizer index, plus the row count.
glove, n_words = load_glove_simple(tokenizer.word_index)

In [None]:
# Sanity check: (number of rows, embedding width).
glove.shape

In [None]:
# Pad (or truncate) every token sequence to the shared max_len constant;
# padding='post' appends the padding after the tokens.
train_X_pad = pad_sequences(train_X, maxlen=max_len, padding='post')
val_X_pad = pad_sequences(val_X, maxlen=max_len, padding='post')

In [None]:
# Class balance of the training split: count negatives/positives and print the
# positive share (the original note records 6.19%).
neg, pos = np.bincount(train_df['target'])
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

In [None]:
# Unbatched dataset of (padded token ids, label); print the first two inputs as a sanity check.
dataset_v1 = tf.data.Dataset.from_tensor_slices((train_X_pad, train_y))
for x,y in dataset_v1.take(2):
    print(x)

In [None]:
# Training dataset of padded token ids: shuffle buffer of 10000, batches of 512.
dataset_v1 = tf.data.Dataset.from_tensor_slices((train_X_pad, train_y)).shuffle(10000).batch(512)

In [None]:
# Validation dataset of padded token ids, built the same way as the training one.
dataset_val_v1 = tf.data.Dataset.from_tensor_slices((val_X_pad, val_y)).shuffle(10000).batch(512)

In [None]:
# Stand-alone lookup model: maps token ids to their frozen (trainable=False)
# GloVe vectors. Used by the data generators to embed questions up front.
embedding_input = tf.keras.Input(shape=(None,))
embedding_output = Embedding(*glove.shape, embeddings_initializer=tf.keras.initializers.Constant(glove), trainable=False)(embedding_input)
embedding = tf.keras.Model(embedding_input, embedding_output)

In [None]:
# Length of the first (unpadded) token sequence.
np.array(train_X[0]).shape

In [None]:
#@tf.function
def gradient_perturbation(inputs, targets, trained_model):
    """Shift the embedded inputs along the normalised loss gradient.

    Calls ``trained_model(inputs, perturb=True)``, which must return the pair
    ``(predictions, embedded_inputs)``, computes the binary cross-entropy
    between ``targets`` and the predictions, and differentiates it with
    respect to the embedded inputs.

    Args:
        inputs: model inputs forwarded unchanged to ``trained_model``.
        targets: labels compared against the model's predictions.
        trained_model: callable accepting ``perturb=True``.

    Returns:
        The embedded inputs plus the gradient divided by its norm along axis 2.
    """
    loss_fn = tf.keras.losses.BinaryCrossentropy(name='loss_pre_trained')

    with tf.GradientTape() as tape:
        predictions, embedded = trained_model(inputs, perturb=True)
        loss_value = loss_fn(targets, predictions)

    # NOTE(review): `embedded` is never passed to tape.watch(); confirm the
    # gradient is not None when the model's variables are frozen.
    grad_wrt_emb = tape.gradient(loss_value, embedded)

    # 1e-7 is added to every squared component before the sum over axis 2,
    # which keeps the norm strictly positive.
    stabilised_squares = tf.math.square(grad_wrt_emb) + 1e-7
    norms = tf.math.sqrt(tf.math.reduce_sum(stabilised_squares, axis=2))
    unit_grads = tf.math.truediv(grad_wrt_emb, tf.expand_dims(norms, axis=-1))
    return unit_grads + embedded

In [None]:
def data_generator_v1():
    """Yield one ``(embedded_question, label)`` pair per training example.

    Each token-id sequence in the module-level ``train_X`` is passed through
    the frozen ``embedding`` model and paired with the matching ``train_y``
    entry.
    """
    for token_ids, label in zip(train_X, train_y):
        yield embedding(np.array(token_ids)), np.array(label)

# Deliberately NOT named `dataset_v1`: that name is the batched token-id
# dataset that the QIQModel cells feed to train_on_batch()/fit(). Rebinding it
# here would hand them pre-embedded float sequences instead.
embedded_dataset_v1 = tf.data.Dataset.from_generator(data_generator_v1, output_types=(tf.float32,tf.int32), output_shapes=((None, 300), ()))
# Batches of 16, each padded along the sequence axis to its longest question.
padded_dataset_v1 = embedded_dataset_v1.padded_batch(16, padded_shapes=((None,300), ()))

In [None]:
import random

In [None]:
def data_generator_v2():
    """Yield ``(embedded_question, label)`` pairs, perturbing some positives.

    A positive example (label 1) is replaced, with probability 0.3, by the
    output of ``gradient_perturbation`` on the module-level ``trained_model``.
    Every other example is embedded with the frozen ``embedding`` model.
    """
    for token_ids, label in zip(train_X, train_y):
        if label == 1 and random.random() < 0.3:
            # gradient_perturbation is called on a batch of one.
            x, y = np.expand_dims(token_ids, 0), np.expand_dims(label, 0)
            perturbation = gradient_perturbation(x, y, trained_model)
            # Drop only the batch axis. A bare squeeze would also remove the
            # sequence axis of a one-token question and yield shape (300,),
            # which violates the declared (None, 300) output shape.
            yield np.squeeze(perturbation, axis=0), label
        else:
            yield embedding(np.array(token_ids)), np.array(label)

dataset_v2 = tf.data.Dataset.from_generator(data_generator_v2, output_types=(tf.float32,tf.int32), output_shapes=((None, 300), ()))
# Batches of 16, each padded along the sequence axis to its longest question.
padded_dataset_v2 = dataset_v2.padded_batch(16, padded_shapes=((None,300), ()))

In [None]:
class QIQModel(tf.keras.Model):
    """Token-id classifier: frozen embedding -> LSTM -> max-pool -> two Dense layers -> sigmoid.

    Args:
        embedding_matrix: 2-D array whose shape gives the Embedding layer's
            dimensions and whose values initialise its (non-trainable) weights.
        init_out_bias: optional bias initializer for the final Dense layer;
            ignored when falsy.
        dropout: accepted but not used by this class.
        *args, **kargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(self, embedding_matrix, *args, init_out_bias=None, dropout=0.1, **kargs):
        super().__init__(*args, **kargs)

        self.embedding_matrix = embedding_matrix
        self.emb_shape = embedding_matrix.shape

        frozen_init = tf.keras.initializers.Constant(self.embedding_matrix)
        self.embedding = Embedding(*self.emb_shape, embeddings_initializer=frozen_init, trainable=False)
        self.LSTM = LSTM(64, return_sequences=True)
        self.Gmaxpool = GlobalMaxPool1D()
        self.linear1 = Dense(16)

        # Only pass a bias initializer to the output layer when one was given.
        head_kwargs = {'bias_initializer': init_out_bias} if init_out_bias else {}
        self.linear2 = Dense(1, **head_kwargs)

    def call(self, inputs, perturb=False):
        """Return the sigmoid output; with ``perturb`` also return the embedded inputs."""
        emb_out = self.embedding(inputs)
        pooled = self.Gmaxpool(self.LSTM(emb_out))
        probs = activations.sigmoid(self.linear2(self.linear1(pooled)))
        return (probs, emb_out) if perturb else probs

    def compute_output_shape(self, input_shape):
        """One output value per example, whatever the input shape."""
        return tf.TensorShape((None, 1))

In [None]:
# List the checkpoints saved under the local trained-model folder.
!ls './perturbation/trained'

In [None]:
# To restore optimizer and metric state as well as the weights, follow this cell.
trained_model_2 = QIQModel(glove)
# This initializes the variables used by the optimizers,
# as well as any stateful metric variables
trained_model_2.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01, beta_1=0.99, epsilon=1e-1), loss='binary_crossentropy')
# One training batch is run so those variables exist before the weights are loaded.
# NOTE(review): QIQModel embeds its inputs itself, so `dataset_v1` must be the
# batched token-id dataset here, not a dataset of pre-embedded sequences.
trained_model_2.train_on_batch(*next(iter(dataset_v1)))
trained_model_2.load_weights('./perturbation/trained/model_1.ckpt')

In [None]:
# To load only the model weights (no optimizer state), build the model and load them directly.
trained_model = QIQModel(glove)
trained_model.load_weights('./perturbation/trained/model_1.ckpt')

In [None]:
# Freeze the pre-trained model; it is passed to gradient_perturbation by data_generator_v2.
trained_model.trainable = False

In [None]:
# Write one weights-only checkpoint per epoch. save_weights_only=True matters
# because the other cells restore these paths with model.load_weights(); the
# callback's default saves the whole model instead.
# NOTE(review): monitor/mode presumably only take effect together with
# save_best_only; as configured, a checkpoint is written every epoch — confirm.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath='./perturbation/trained/model_{epoch}.ckpt',
        monitor='val_loss',
        mode='min',
        save_weights_only=True,
    )
]

In [None]:
# Train the token-id classifier from scratch for one epoch; the checkpoint
# callback saves it under ./perturbation/trained/.
# NOTE(review): QIQModel embeds its inputs itself, so `dataset_v1` must be the
# batched token-id dataset here, not a dataset of pre-embedded sequences.
trained_model = QIQModel(glove)
#load weights

trained_model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01, beta_1=0.99, epsilon=1e-1), loss='binary_crossentropy')
trained_model.fit(dataset_v1, epochs=1, validation_data=dataset_val_v1, callbacks=callbacks)

In [None]:
class QIQModel_v2(tf.keras.Model):
    """Classifier over pre-embedded inputs: LSTM -> max-pool -> two Dense layers -> sigmoid.

    Unlike ``QIQModel`` it has no Embedding layer; ``call`` receives the
    embedded sequences directly.

    Args:
        init_out_bias: optional bias initializer for the final Dense layer;
            ignored when falsy.
        dropout: accepted but not used by this class.
        *args, **kargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(self, *args, init_out_bias=None, dropout=0.1, **kargs):
        super().__init__(*args, **kargs)

        self.LSTM = LSTM(64, return_sequences=True)
        self.Gmaxpool = GlobalMaxPool1D()
        self.linear1 = Dense(16)

        # Only pass a bias initializer to the output layer when one was given.
        head_kwargs = {'bias_initializer': init_out_bias} if init_out_bias else {}
        self.linear2 = Dense(1, **head_kwargs)

    def call(self, emb_out):
        """Return the sigmoid output for a batch of embedded sequences."""
        pooled = self.Gmaxpool(self.LSTM(emb_out))
        return activations.sigmoid(self.linear2(self.linear1(pooled)))

    def compute_output_shape(self, input_shape):
        """One output value per example, whatever the input shape."""
        return tf.TensorShape((None, 1))

In [None]:
model.fit(padded_dataset_v2, epochs=1)

In [None]:
model = QIQModel_v2()

In [None]:
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01, beta_1=0.99, epsilon=1e-1), loss='binary_crossentropy')
model.fit(padded_dataset_v2, epochs=1)

In [None]:
# Fresh, untrained model and an RMSprop optimizer for the custom training loops below.
model = QIQModel_v2()
optimizer = tf.optimizers.RMSprop(learning_rate=0.001)

In [None]:
#@tf.function
def train_step(inputs, targets):
    """Run one optimisation step on the module-level ``model`` and ``optimizer``.

    Computes the binary cross-entropy between ``targets`` and ``model(inputs)``,
    applies the gradients to the model's trainable variables and returns the
    loss.

    NOTE(review): a later cell defines another ``train_step`` with the
    signature ``(model, example, optimizer)``; when the cells run in order that
    definition replaces this one.
    """
    loss_fn = tf.keras.losses.BinaryCrossentropy(name='loss_with_perturb')

    with tf.GradientTape() as tape:
        predictions = model(inputs)
        step_loss = loss_fn(targets, predictions)

    weights = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(step_loss, weights), weights))

    return step_loss

In [None]:
@tf.function
def train_step(model, example, optimizer):
    """Run one optimisation step of ``model`` on a single batch.

    Args:
        model: callable model exposing ``trainable_variables``.
        example: ``(inputs, targets)`` pair.
        optimizer: optimizer whose ``apply_gradients`` updates the model.

    Returns:
        The binary cross-entropy loss of the batch.
    """
    batch_inputs, batch_targets = example
    loss_fn = tf.keras.losses.BinaryCrossentropy(name='loss_with_perturb')

    with tf.GradientTape() as tape:
        predictions = model(batch_inputs)
        step_loss = loss_fn(batch_targets, predictions)

    weights = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(step_loss, weights), weights))

    return step_loss

In [None]:
# Checkpoint tracking a step counter, the model and its optimizer; the manager
# keeps at most five checkpoints under ./checkpoint/quora2.
ckpt = tf.train.Checkpoint(step=tf.Variable(1), net=model, optimizer=optimizer)
checkpoint_path = './checkpoint/quora2'
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

In [None]:
def train_and_checkpoint(net, manager, epochs=1):
    """Train ``net`` on ``padded_dataset_v1``, saving a checkpoint every 30 steps.

    The checkpoint and optimizer are taken from ``manager`` rather than from
    module-level names, so the optimizer that steps ``net`` is the same one
    that gets saved and restored with it.

    Args:
        net: model passed to ``train_step``.
        manager: ``tf.train.CheckpointManager`` whose checkpoint carries a
            ``step`` counter and, normally, an ``optimizer``.
        epochs: number of passes over ``padded_dataset_v1``.
    """
    # Fall back to the module-level objects when the manager does not expose them.
    checkpoint = getattr(manager, 'checkpoint', ckpt)
    step_optimizer = getattr(checkpoint, 'optimizer', None)
    if step_optimizer is None:
        step_optimizer = optimizer

    checkpoint.restore(manager.latest_checkpoint)
    if manager.latest_checkpoint:
        print("Restored from {}".format(manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    print(checkpoint.step)
    for _ in range(epochs):
        for example in padded_dataset_v1:
            loss = train_step(net, example, step_optimizer)
            checkpoint.step.assign_add(1)
            if int(checkpoint.step) % 30 == 0:
                save_path = manager.save()
                print("Saved checkpoint for step {}: {}".format(int(checkpoint.step), save_path))
                print("loss {:1.2f}".format(loss.numpy()))

In [None]:
# Train `model` for the default single epoch, checkpointing through ckpt_manager.
train_and_checkpoint(model, ckpt_manager)

In [None]:
# Second untrained model with its own RMSprop optimizer.
model_2 = QIQModel_v2()
optimizer_2 = tf.optimizers.RMSprop(learning_rate=0.001)

In [None]:
# Checkpoint for model_2/optimizer_2, kept (at most five) under ./checkpoint/quora.
# This rebinds the module-level name `ckpt`.
ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer_2, net=model_2)
manager = tf.train.CheckpointManager(ckpt, './checkpoint/quora', max_to_keep=5)

In [None]:
# Train model_2 for the default single epoch, checkpointing through `manager`.
train_and_checkpoint(model_2, manager)

In [None]:
import random
import time

EPOCHS = 1
BATCH_SIZE = 16
loss_trace = []  # summed loss of every epoch, in order

# Manual loop over the partly perturbed batches, training the module-level
# `model` with the module-level `optimizer`.
for epoch in range(EPOCHS):

    total_loss = 0.0
    i=0
    epoch_start = time.time()
    for x,y in padded_dataset_v2:

        # The train_step in effect after the cells above is the three-argument
        # version (model, example, optimizer); the older two-argument call
        # raises a TypeError against it.
        loss = train_step(model, (x, y), optimizer)

        total_loss += loss
        i += 1


    epoch_elapsed = time.time() - epoch_start
    print("epoch {} : elapsed: {}".format(epoch,epoch_elapsed))
    print("epoch {} : loss: {}".format(epoch,total_loss))

    # Save a checkpoint every fifth epoch.
    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))

    loss_trace.append(total_loss)