In [1]:
import dill
import gc

In [2]:
# NOTE(review): dill/pickle deserialisation can execute arbitrary code -- only load trusted files.
with open('./data/gh_descs_norm.pkl', 'rb') as f:
    data = []
    # Keep unpickling objects from the same file and concatenating them until it is exhausted.
    while True:
        try:
            data.extend(dill.load(f))
        except EOFError:
            # Only a genuine end-of-file ends the loop; any other error (corrupt pickle,
            # MemoryError, KeyboardInterrupt, ...) now propagates instead of being
            # silently reported as a clean EOF with truncated data.
            print('EOF reached')
            break

with open('./data/gh_labels.pkl', 'rb') as f:
    labels = dill.load(f)

# Sanity check: the two lengths are expected to match.
len(data), len(labels)

EOF reached


(747518, 747518)

In [3]:
# Collapse the labels to two classes: 0 stays 0, every other value becomes 1.
labels = list(map(lambda raw_label: 0 if raw_label == 0 else 1, labels))
set(labels)

{0, 1}

In [4]:
import numpy as np
import tensorflow as tf
import keras


SEED = 42
np.random.seed(SEED)
tf.set_random_seed(SEED)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the split is seeded with SEED.
# NOTE(review): the split is not stratified on `labels` -- confirm this is acceptable
# given the class balance of the data.
X_train, X_test, y_train, y_test = train_test_split(data, labels, 
                                                    test_size=0.2, random_state=SEED)
len(X_train), len(X_test)

(598014, 149504)

In [6]:
## Configuration values shared by the tokenizer, the embedding matrices and the models
EMBED_SIZE = 300 # dimensionality of each word vector
MAX_FEATURES = 800000 # vocabulary size cap (i.e. number of rows in the embedding matrix)
MAX_LEN = 1000 # number of tokens each document is padded/truncated to

In [7]:
import os.path

CVE_WORD2IDX_MAP_FILE = 'sec_tokenizer_word2idx.pkl'

# The tokenizer is configured the same way whether its vocabulary is fitted or restored.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_FEATURES, oov_token='<UNK>')

if os.path.isfile(CVE_WORD2IDX_MAP_FILE):
    # A cached word -> index mapping exists: reuse it instead of re-fitting.
    with open(CVE_WORD2IDX_MAP_FILE, 'rb') as f:
        word2idx = dill.load(f)
    tokenizer.word_index = word2idx
else:
    # No cache yet: fit on the training texts, register '<PAD>' at index 0
    # and persist the mapping for later runs.
    tokenizer.fit_on_texts(list(X_train))
    tokenizer.word_index['<PAD>'] = 0
    with open(CVE_WORD2IDX_MAP_FILE, 'wb') as f:
        dill.dump(tokenizer.word_index, f)

In [8]:
len(tokenizer.word_index)

1096194

In [9]:
#MAX_FEATURES = len(tokenizer.word_index)
#MAX_FEATURES

In [10]:
## Tokenize the sentences
train_X = tokenizer.texts_to_sequences(X_train)
test_X = tokenizer.texts_to_sequences(X_test)

In [11]:
## Pad the sentences 
train_X = keras.preprocessing.sequence.pad_sequences(train_X, maxlen=MAX_LEN)
test_X = keras.preprocessing.sequence.pad_sequences(test_X, maxlen=MAX_LEN)

In [12]:
train_X.shape, test_X.shape

((598014, 1000), (149504, 1000))

In [13]:
train_y = np.array(y_train)
test_y = np.array(y_test)

In [14]:
train_idx = np.random.permutation(len(train_X))
train_idx

array([115007, 562806, 319482, ..., 365838, 131932, 121958])

In [15]:
train_X = train_X[train_idx]
train_y = train_y[train_idx]
train_X.shape, train_y.shape

((598014, 1000), (598014,))

In [16]:
word2idx = tokenizer.word_index

In [17]:
def load_pretrained_embeddings(word_to_index, max_features, embedding_size, embedding_file_path):
    """Build an initial embedding matrix from a text file of pretrained word vectors.

    Each usable row of the file is ``word v1 v2 ... vN`` separated by single spaces.
    Rows of 100 characters or fewer are skipped (presumably to drop a header line
    such as the one at the top of fastText ``.vec`` files -- TODO confirm).

    Rows of the returned matrix are first drawn from a normal distribution with the
    mean and standard deviation of all pretrained values (using NumPy's global
    random state), then overwritten with the pretrained vector wherever the word
    is found in the file.

    :param word_to_index: mapping ``word -> row index`` (e.g. a tokenizer's word index).
    :param max_features: upper bound on the number of rows in the matrix.
    :param embedding_size: number of columns of the matrix; must equal the
        dimensionality of the vectors in the file.
    :param embedding_file_path: path to the pretrained embeddings text file.
    :return: ``numpy.ndarray`` of shape
        ``(min(max_features, len(word_to_index)), embedding_size)``.
    :raises ValueError: if no usable vector row is found in the file.
    """

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Use a context manager so the (potentially multi-GB) file handle is closed promptly.
    # rstrip() drops the trailing newline / stray trailing space so that the last
    # token of each row is always a clean number.
    with open(embedding_file_path, encoding="utf8", errors='ignore') as embedding_file:
        embeddings_index = dict(get_coefs(*row.rstrip().split(" "))
                                for row in embedding_file
                                if len(row) > 100)

    if not embeddings_index:
        raise ValueError("No embedding vectors could be read from {!r}".format(embedding_file_path))

    # np.stack needs a real sequence; a dict view is rejected by recent NumPy versions.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    nb_words = min(max_features, len(word_to_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))

    for word, idx in word_to_index.items():
        # Bound by the actual number of rows (not max_features) so a small
        # vocabulary with sparse/large indices cannot index past the matrix.
        if idx >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [18]:
FASTTEXT_INIT_EMBEDDINGS_FILE = 'fasttext_init_embeddings_model1.pkl'

if not os.path.isfile(FASTTEXT_INIT_EMBEDDINGS_FILE):
    FASTTEXT_EMBEDDINGS_PATH = './embeddings/fasttext/crawl-300d-2M.vec'
    ft_embeddings = load_pretrained_embeddings(word_to_index=word2idx, max_features=MAX_FEATURES, 
                                               embedding_size=EMBED_SIZE, 
                                               embedding_file_path=FASTTEXT_EMBEDDINGS_PATH)
    with open(FASTTEXT_INIT_EMBEDDINGS_FILE, 'wb') as f:
        dill.dump(ft_embeddings, f)
else:
    with open(FASTTEXT_INIT_EMBEDDINGS_FILE, 'rb') as f:
        ft_embeddings = dill.load(f)
        
ft_embeddings.shape    

(800000, 300)

In [19]:
PARAGRAM_INIT_EMBEDDINGS_FILE = 'paragram_init_embeddings_model1.pkl'

if not os.path.isfile(PARAGRAM_INIT_EMBEDDINGS_FILE):
    PARAGRAM_EMBEDDINGS_PATH = './embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    pg_embeddings = load_pretrained_embeddings(word_to_index=word2idx, max_features=MAX_FEATURES, 
                                               embedding_size=EMBED_SIZE, 
                                               embedding_file_path=PARAGRAM_EMBEDDINGS_PATH)
    with open(PARAGRAM_INIT_EMBEDDINGS_FILE, 'wb') as f:
        dill.dump(pg_embeddings, f)
else:
    with open(PARAGRAM_INIT_EMBEDDINGS_FILE, 'rb') as f:
        pg_embeddings = dill.load(f)
        
pg_embeddings.shape    

(800000, 300)

In [20]:
avg_pretrained_embeddings = np.mean([ft_embeddings, pg_embeddings], axis = 0)
avg_pretrained_embeddings.shape

(800000, 300)

In [17]:
from keras.engine.topology import Layer
from keras import backend as K


class AttentionLayer(Layer):
    """Attention-weighted pooling over the time axis of a 3D tensor.

    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The feature dimension is inferred from the output shape of the RNN; the number
    of steps must be passed explicitly as `step_dim`.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        :param step_dim: number of time steps of the input (size of axis 1).
        :param W_regularizer: regularizer (or identifier) for the attention weight vector.
        :param b_regularizer: regularizer (or identifier) for the per-step bias.
        :param W_constraint: constraint (or identifier) for the attention weight vector.
        :param b_constraint: constraint (or identifier) for the per-step bias.
        :param bias: whether to add a learnable per-step bias to the attention scores.
        :param kwargs: forwarded to the base `Layer` constructor.
        """
        # Initialise the base Layer first: attributes assigned before this call can be
        # overwritten by it (Keras 2 resets `supports_masking` to False in Layer.__init__).
        super(AttentionLayer, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = keras.initializers.get('glorot_uniform')

        self.W_regularizer = keras.regularizers.get(W_regularizer)
        self.b_regularizer = keras.regularizers.get(b_regularizer)

        self.W_constraint = keras.constraints.get(W_constraint)
        self.b_constraint = keras.constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0

    def build(self, input_shape):
        """Create the attention weight vector (one entry per feature) and optional bias (one per step)."""
        assert len(input_shape) == 3

        # Pass name/shape by keyword rather than relying on the legacy positional-shape call.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zeros',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        # TF backend doesn't support it
        # eij = K.dot(x, self.W)
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every time step: flatten to (samples*steps, features), project onto W,
        # then fold back to (samples, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a

        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim

    def get_config(self):
        """Return every constructor argument so a saved model restores the layer faithfully."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': keras.regularizers.serialize(self.W_regularizer),
            'b_regularizer': keras.regularizers.serialize(self.b_regularizer),
            'W_constraint': keras.constraints.serialize(self.W_constraint),
            'b_constraint': keras.constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [22]:
import keras
from keras.utils import multi_gpu_model

def build_gru_model(embedding_matrix, embedding_size, max_len, max_features, gru_units=32):
    """Build and compile the GPU (CuDNN) bidirectional-GRU classifier with attention pooling.

    :param embedding_matrix: initial weights for the (trainable) embedding layer.
    :param embedding_size: dimensionality of each word vector.
    :param max_len: length of the input token-id sequences.
    :param max_features: number of rows of the embedding layer.
    :param gru_units: units per GRU direction; also the width of the hidden dense layer.
    :return: compiled Keras model emitting one sigmoid probability per sample.
    """
    layers = keras.layers

    token_ids = layers.Input(shape=(max_len,))
    embedded = layers.Embedding(max_features, embedding_size,
                                weights=[embedding_matrix], trainable=True)(token_ids)
    encoded = layers.Bidirectional(layers.CuDNNGRU(gru_units, return_sequences=True))(embedded)
    pooled = AttentionLayer(max_len)(encoded)
    hidden = layers.Dense(gru_units, activation='relu')(pooled)
    hidden = layers.Dropout(rate=0.2)(hidden)
    probability = layers.Dense(1, activation='sigmoid')(hidden)

    model = keras.models.Model(inputs=token_ids, outputs=probability)
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [23]:
# TensorFlow wizardry
#config = tf.ConfigProto()
 
# Don't pre-allocate memory; allocate as-needed
#config.gpu_options.allow_growth = True
 
 
# Create a session with the above options specified.
#K.tensorflow_backend.set_session(tf.Session(config=config))

In [24]:
gru_model = build_gru_model(embedding_matrix=avg_pretrained_embeddings, embedding_size=EMBED_SIZE, 
                            max_len=MAX_LEN, max_features=MAX_FEATURES, gru_units=32)
gru_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 300)         240000000 
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1000, 64)          64128     
_________________________________________________________________
attention_layer_1 (Attention (None, 64)                1064      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total para

In [25]:
from sklearn.utils import class_weight

# Extra factor applied on top of the 'balanced' weight of the positive class (label 1).
POSITIVE_CLASS_BOOST = 2

train_classes = np.unique(train_y)
# Keyword arguments: recent scikit-learn releases reject the positional form of this call.
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=train_classes,
                                                     y=train_y)
# Key each weight by its actual class label rather than by its position in the array.
class_weights = dict(zip(train_classes.tolist(), balanced_weights))
class_weights[1] *= POSITIVE_CLASS_BOOST
class_weights

{0: 0.5841669076217344, 1: 6.940577052528957}

In [27]:
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Scale the learning rate by 0.4 when val_loss has not improved for 2 epochs,
# with a floor of 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.4,
                              patience=2, min_lr=0.00001)

# Stop training when val_loss has not improved for 4 epochs.
# NOTE(review): restore_best_weights=False keeps the weights of the last epoch run,
# not those of the best val_loss epoch -- confirm this is intended.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=4, 
                           mode='auto', baseline=None, restore_best_weights=False)

callbacks = [reduce_lr, early_stop]

In [28]:
history = gru_model.fit(train_X, train_y, batch_size=256, epochs=10, callbacks=callbacks,
                        class_weight=class_weights, validation_split=0.1)

  num_elements)


Train on 538212 samples, validate on 59802 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [31]:
pred_y = gru_model.predict([test_X], batch_size=512, verbose=1)
pred_y



array([[7.93378218e-04],
       [7.51256011e-05],
       [1.51747945e-05],
       ...,
       [9.99999762e-01],
       [4.60618287e-01],
       [1.29677443e-04]], dtype=float32)

In [40]:
# Flatten the predictions to 1-D and label a sample 1 when its probability exceeds 0.5.
pred_yr = pred_y.ravel()
pred_yl = (pred_yr > 0.5).astype(int).tolist()

In [31]:
from sklearn.metrics import confusion_matrix, classification_report

In [42]:
confusion_matrix(y_true=test_y, y_pred=pred_yl)

array([[124759,   3307],
       [  1435,  20003]])

In [36]:
print(classification_report(y_true=test_y, y_pred=pred_yl))

             precision    recall  f1-score   support

          0       0.99      0.97      0.98    128066
          1       0.86      0.93      0.89     21438

avg / total       0.97      0.97      0.97    149504



In [None]:
from sklearn import metrics

# Predict on the training set and print the F1 score for every decision threshold
# from 0.10 to 0.50 in steps of 0.01.
# NOTE(review): train_X also contains the samples held out by validation_split during fit.
pred_train_y = gru_model.predict([train_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    # Round away floating-point noise from np.arange so the printed threshold is clean.
    thresh = np.round(thresh, 2)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(train_y, 
                                                                             (pred_train_y>thresh).astype(int))))

In [None]:
# Sweep decision thresholds over the test-set predictions and print the F1 score at each one.
# The prediction is computed here so the cell no longer depends on an undefined
# `pred_test_y`; ravel() gives f1_score a 1-D array matching test_y.
pred_test_y = gru_model.predict([test_X], batch_size=1024, verbose=1).ravel()
for thresh in np.arange(0.0001, 0.501, 0.0001):
    # Round to the grid resolution (4 decimals). Rounding to 2 decimals collapsed the
    # 0.0001-step grid into repeated thresholds, starting at 0.0.
    thresh = np.round(thresh, 4)
    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(test_y,
                                                                             (pred_test_y > thresh).astype(int))))

In [29]:
gru_model.save('./models/model1_sec_nonsec_demo2.h5')

In [30]:
gru_model.save_weights('./models/model1_sec_nonsec_demo_weights2.h5')

In [None]:
gru_model.save('./models/model1_sec_nonsec_demo.h5')

In [None]:
gru_model.save_weights('./models/model1_sec_nonsec_demo_weights.h5')

In [18]:
def build_gru_cpumodel(embedding_size, max_len, max_features, gru_units=32):
    """Build and compile a two-layer bidirectional GRU classifier using the non-CuDNN GRU.

    The embedding layer is trainable and created without pretrained weights.

    :param embedding_size: dimensionality of each word vector.
    :param max_len: length of the input token-id sequences.
    :param max_features: number of rows of the embedding layer.
    :param gru_units: units per direction of the second GRU (the first uses twice as many).
    :return: compiled Keras model emitting one sigmoid probability per sample.
    """
    layers = keras.layers
    gru_options = dict(return_sequences=True, reset_after=True, recurrent_activation='sigmoid')

    inp = layers.Input(shape=(max_len,))

    # Layers are instantiated in the order they are applied.
    pipeline = [
        layers.Embedding(max_features, embedding_size, trainable=True),
        layers.Bidirectional(layers.GRU(gru_units * 2, **gru_options)),
        layers.Bidirectional(layers.GRU(gru_units, **gru_options)),
        AttentionLayer(max_len),
        layers.Dense(gru_units, activation='relu'),
        layers.Dropout(rate=0.2),
        layers.Dense(gru_units // 2, activation='relu'),
        layers.Dropout(rate=0.2),
        layers.Dense(1, activation='sigmoid'),
    ]

    outp = inp
    for layer in pipeline:
        outp = layer(outp)

    model = keras.models.Model(inputs=inp, outputs=outp)
    model.compile(optimizer=keras.optimizers.Adam(),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [23]:
def build_gru_cpumodel2(embedding_size, max_len, max_features, gru_units=32):
    """Build and compile a single-layer bidirectional GRU classifier using the non-CuDNN GRU.

    The GRU is configured with reset_after=True and a sigmoid recurrent activation.
    The embedding layer is trainable and created without pretrained weights.

    :param embedding_size: dimensionality of each word vector.
    :param max_len: length of the input token-id sequences.
    :param max_features: number of rows of the embedding layer.
    :param gru_units: units per GRU direction; also the width of the hidden dense layer.
    :return: compiled Keras model emitting one sigmoid probability per sample.
    """
    sequence_input = keras.layers.Input(shape=(max_len,))

    features = keras.layers.Embedding(max_features, embedding_size, trainable=True)(sequence_input)

    gru_cell = keras.layers.GRU(gru_units,
                                return_sequences=True,
                                reset_after=True,
                                recurrent_activation='sigmoid')
    features = keras.layers.Bidirectional(gru_cell)(features)

    features = AttentionLayer(max_len)(features)
    features = keras.layers.Dense(gru_units, activation='relu')(features)
    features = keras.layers.Dropout(rate=0.2)(features)
    prediction = keras.layers.Dense(1, activation='sigmoid')(features)

    classifier = keras.models.Model(inputs=sequence_input, outputs=prediction)
    classifier.compile(optimizer=keras.optimizers.Adam(),
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
    return classifier

In [56]:
#with tf.device('cpu:0'):
# Rebuild the single-layer (non-CuDNN) GRU architecture and load into it the weights
# file that was written from `gru_model` via save_weights.
gru_cpu_model = build_gru_cpumodel2(embedding_size=EMBED_SIZE, 
                               max_len=MAX_LEN, max_features=MAX_FEATURES, gru_units=32)
gru_cpu_model.load_weights('./models/model1_sec_nonsec_demo_weights2.h5')

In [57]:
gru_cpu_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 1000, 300)         240000000 
_________________________________________________________________
bidirectional_9 (Bidirection (None, 1000, 64)          64128     
_________________________________________________________________
attention_layer_6 (Attention (None, 64)                1064      
_________________________________________________________________
dense_14 (Dense)             (None, 32)                2080      
_________________________________________________________________
dropout_9 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 33        
Total para

In [51]:
pred_y = gru_cpu_model.predict([test_X], batch_size=2048, verbose=1)
pred_y



array([[0.04563026],
       [0.20279008],
       [0.01513663],
       ...,
       [1.        ],
       [0.45771995],
       [0.01318576]], dtype=float32)

In [54]:
pred_yr = pred_y.ravel()
pred_yl = [1 if prob > 0.5 else 0 for prob in pred_yr]

In [55]:
confusion_matrix(y_true=test_y, y_pred=pred_yl)

array([[123814,   4252],
       [  1216,  20222]])

In [35]:
confusion_matrix(y_true=test_y, y_pred=pred_yl)

array([[105161,  22905],
       [   772,  20666]])

In [36]:
print(classification_report(y_true=test_y, y_pred=pred_yl))

             precision    recall  f1-score   support

          0       0.99      0.82      0.90    128066
          1       0.47      0.96      0.64     21438

avg / total       0.92      0.84      0.86    149504



In [47]:
pred_yl = [1 if prob > 0.5 else 0 for prob in pred_yr]

In [48]:
confusion_matrix(y_true=test_y, y_pred=pred_yl)

array([[124759,   3307],
       [  1435,  20003]])