In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import functools
import ast
import keras
import pickle
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D,MaxPool1D,GlobalAveragePooling1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from keras.metrics import top_k_categorical_accuracy 
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Create the TF1 session used by Keras. `allow_growth` asks TensorFlow to
# allocate GPU memory on demand rather than reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
keras.backend.set_session(session)

# Report which GPU (if any) TensorFlow picked up; an empty name means none.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device: {}'.format(gpu_name))

Default GPU Device: /device:GPU:0


In [3]:
# The 'Tags' column holds Python literals serialised as text (presumably lists
# of tag names -- verify against the CSV); parse them back while reading.
generic = ast.literal_eval
conv = {'Tags': generic}
df = pd.read_csv('data/data_final.csv', converters=conv)

In [4]:
# Number of leading characters of each body that get encoded.
max_length = 200

# Work on a reproducible 20% sample of the full data set.
sample = df.sample(frac=0.2, random_state=2020)
x1, y1 = sample['Body'], sample['Tags']

In [5]:
# Character vocabulary: every distinct character found in any body of the
# FULL data frame (not only the sample), in sorted order so that a
# character's position in the list is a stable integer id.
all_text = ''.join(df['Body'].values)
char_table = sorted(set(all_text))
vocab_size = len(char_table)

In [6]:
# Replace the index labels inherited from `df` with a fresh 0..n-1 range so
# that positions reported while encoding refer to rows of the sample.
x2 = x1.reset_index(drop=True)

In [7]:
# Encode every body as a fixed-length vector of character ids.
#
# A dict gives constant-time lookups; the ids are identical to
# `char_table.index(char)` because `char_table` holds unique characters.
char_to_index = {char: index for index, char in enumerate(char_table)}

# NOTE: 0 is used as the padding value AND is the id of `char_table[0]`, so the
# model cannot tell that character from padding. Reserving a dedicated padding
# id would also require enlarging the Embedding input size, so it is left as is.
xs = []
for enum, code in enumerate(x2):
    if not isinstance(code, str):
        # Non-text bodies (e.g. NaN from an empty CSV field) cannot be encoded.
        # Report them as before, but still emit an all-padding row: silently
        # dropping the row would leave `x` shorter than, and misaligned with,
        # the label matrix built from the same sample.
        print(enum, code)
        code = ''
    code_char = [char_to_index[char] for char in code[:max_length] if char in char_to_index]
    # Pad by the number of ids actually produced (not by len(code)), so every
    # row has exactly `max_length` entries even if a character was skipped.
    code_char.extend([0] * (max_length - len(code_char)))
    xs.append(np.array(code_char))
x = np.array(xs)

In [8]:
# Fit the label binarizer on the sampled tag collections and build the
# multi-hot target matrix in the same pass (previously the fit_transform
# result was thrown away and the data was transformed a second time).
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(y1)

# saving -- keep the fitted binarizer so predictions can be mapped back to tags
with open('binarizer.pickle', 'wb') as handle:
    pickle.dump(multilabel_binarizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading -- continue with the persisted copy, i.e. the same artifact a later
# session would load. Only unpickle files you created yourself: pickle.load
# can execute arbitrary code.
with open('binarizer.pickle', 'rb') as handle:
    multilabel_binarizer = pickle.load(handle)

In [9]:
# Sanity check: label matrix shape, input matrix shape, vocabulary size and
# sequence length, one per line.
for value in (y.shape, x.shape, vocab_size, max_length):
    print(value)

(148964, 342)
(148964, 200)
4435
200


In [10]:
# 80/10/10 train / validation / test split.
# The 20% hold-out gets its own names so the second split never overwrites its
# own input: re-running this cell always yields the same three partitions
# (previously, re-running the second split halved the test set again).
x_train, x_holdout, y_train, y_holdout = train_test_split(x, y, test_size=0.2, random_state=9000)
x_val, x_test, y_val, y_test = train_test_split(x_holdout, y_holdout, test_size=0.5, random_state=9000)

In [12]:
# Row/column counts of the train, validation and test inputs, in that order.
for split in (x_train, x_val, x_test):
    print(split.shape)

(119171, 200)
(14896, 200)
(14897, 200)


# Basic CNN model

In [28]:
# Basic CNN model
# `filter_length` is passed as Conv1D's first argument, i.e. the number of
# convolution filters; the kernel spans 15 characters.
filter_length = 500

# Embedding sizes to build; with several entries only the model built for the
# last one survives the loop.
output_dims = [100]
for output_dim in output_dims:
    model = Sequential([
        Embedding(vocab_size, output_dim, input_length=x_train.shape[1]),
        Dropout(0.1),
        Conv1D(filter_length, 15, padding='valid', activation='relu', strides=1),
        GlobalMaxPool1D(),
        Dropout(0.1),
        Dense(400, activation='relu'),
        Dropout(0.1),
        # One sigmoid unit per tag: independent per-label probabilities.
        Dense(y_train.shape[1], activation='sigmoid'),
    ])

# Top-k accuracy metrics. Keras labels a metric by its __name__, which a
# functools.partial object lacks, so it is assigned explicitly.
top1_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=1)
top1_acc.__name__ = 'top1_acc'
top5_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=5)
top5_acc.__name__ = 'top5_acc'

def auc(y_true, y_pred):
    """Keras metric wrapper around the TF1 streaming AUC.

    Calls ``tf.metrics.auc`` and returns the second element of its result
    (the update op, per the TensorFlow 1.x API). The local variables created
    by that call are initialised in the current Keras session before
    returning.

    Args:
        y_true: ground-truth label tensor.
        y_pred: predicted score tensor.

    Returns:
        The second element of ``tf.metrics.auc(y_true, y_pred)``.
    """
    _value, update_op = tf.metrics.auc(y_true, y_pred)
    active_session = keras.backend.get_session()
    active_session.run(tf.local_variables_initializer())
    return update_op

# Metrics reported during training and evaluation.
# NOTE(review): 'top_k_categorical_accuracy' and top5_acc report identical
# values in the recorded test output, so one of them is redundant.
# NOTE(review): with a per-label sigmoid output, 'accuracy' is presumably
# computed element-wise over all labels, which inflates it -- confirm.
tracked_metrics = ['accuracy', 'top_k_categorical_accuracy', top1_acc, top5_acc, auc]

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=tracked_metrics)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 100)          443500    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 100)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 186, 500)          750500    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 500)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 400)               200400    
_________________________________________________________________
dropout_3 (Dropout)          (None, 400)               0         
__________

In [29]:
# Train the basic CNN for 10 epochs, validating on the held-out validation
# split after each epoch; `history` keeps the per-epoch metrics.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=16,
    epochs=10,
    verbose=1,
    validation_data=(x_val, y_val),
)

Train on 119171 samples, validate on 14896 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# Persist the trained basic CNN to an HDF5 file so it can be reloaded without
# retraining. NOTE(review): presumably the `model/` directory must already
# exist -- confirm.
model.save("model/basic_model_char_embedding.h5")

## Load model here 

In [13]:
# Re-create the custom top-k metrics so the saved model can be loaded without
# running the training cells. Keras labels a metric by its __name__, which a
# functools.partial object lacks, so it is assigned explicitly.
top1_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=1)
top1_acc.__name__ = 'top1_acc'
top5_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=5)
top5_acc.__name__ = 'top5_acc'

def auc(y_true, y_pred):
    """Keras metric wrapper around the TF1 streaming AUC.

    Calls ``tf.metrics.auc`` and returns the second element of its result
    (the update op, per the TensorFlow 1.x API). The local variables created
    by that call are initialised in the current Keras session before
    returning.

    Args:
        y_true: ground-truth label tensor.
        y_pred: predicted score tensor.

    Returns:
        The second element of ``tf.metrics.auc(y_true, y_pred)``.
    """
    _value, update_op = tf.metrics.auc(y_true, y_pred)
    active_session = keras.backend.get_session()
    active_session.run(tf.local_variables_initializer())
    return update_op

In [14]:
# The saved model references its custom metrics by name; map those names back
# to the callables defined in this notebook so Keras can rebuild it.
custom_metrics = {'top1_acc': top1_acc, 'top5_acc': top5_acc, 'auc': auc}
model = keras.models.load_model(
    "model/basic_model_char_embedding.h5",
    custom_objects=custom_metrics,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.


In [15]:
# Evaluate the reloaded basic CNN on the held-out test split; `score` is
# indexed in parallel with `model.metrics_names` when printed below.
score = model.evaluate(x_test, y_test)



In [16]:
# Print each test metric next to its name.
for metric_name, metric_value in zip(model.metrics_names, score):
    print("Test {}: {}".format(metric_name, metric_value))

Test loss: 0.017663229754882433
Test acc: 0.9956090466771469
Test top_k_categorical_accuracy: 0.7256494596507507
Test top1_acc: 0.4310263811565689
Test top5_acc: 0.7256494596507507
Test auc: 0.9213675592316471


# Deep CNN Model

In [23]:
# Deep CNN Model
# `filter_length` is passed as Conv1D's first argument, i.e. the number of
# convolution filters used by every convolution layer below.
filter_length = 500

# Embedding sizes to build; with several entries only the model built for the
# last one survives the loop.
output_dims = [100]
for output_dim in output_dims:
    model = Sequential([
        Embedding(vocab_size, output_dim, input_length=x_train.shape[1]),
        Dropout(0.1),
        # Four stacked convolutions, with max-pooling after the 2nd and 3rd.
        Conv1D(filter_length, 15, padding='valid', activation='relu', strides=1),
        Conv1D(filter_length, 8, activation='relu'),
        MaxPool1D(),
        Conv1D(filter_length, 8, activation='relu'),
        MaxPool1D(),
        Conv1D(filter_length, 8, activation='relu'),
        GlobalMaxPool1D(),
        Dropout(0.1),
        Dense(400, activation='relu'),
        Dropout(0.1),
        # One sigmoid unit per tag: independent per-label probabilities.
        Dense(y_train.shape[1], activation='sigmoid'),
    ])

# Top-k accuracy metrics. Keras labels a metric by its __name__, which a
# functools.partial object lacks, so it is assigned explicitly.
top1_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=1)
top1_acc.__name__ = 'top1_acc'
top5_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=5)
top5_acc.__name__ = 'top5_acc'

def auc(y_true, y_pred):
    """Keras metric wrapper around the TF1 streaming AUC.

    Calls ``tf.metrics.auc`` and returns the second element of its result
    (the update op, per the TensorFlow 1.x API). The local variables created
    by that call are initialised in the current Keras session before
    returning.

    Args:
        y_true: ground-truth label tensor.
        y_pred: predicted score tensor.

    Returns:
        The second element of ``tf.metrics.auc(y_true, y_pred)``.
    """
    _value, update_op = tf.metrics.auc(y_true, y_pred)
    active_session = keras.backend.get_session()
    active_session.run(tf.local_variables_initializer())
    return update_op

# Metrics reported during training and evaluation.
# NOTE(review): 'top_k_categorical_accuracy' and top5_acc report identical
# values in the recorded test output, so one of them is redundant.
# NOTE(review): with a per-label sigmoid output, 'accuracy' is presumably
# computed element-wise over all labels, which inflates it -- confirm.
tracked_metrics = ['accuracy', 'top_k_categorical_accuracy', top1_acc, top5_acc, auc]

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=tracked_metrics)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 100)          443500    
_________________________________________________________________
dropout_7 (Dropout)          (None, 200, 100)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 186, 500)          750500    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 179, 500)          2000500   
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 89, 500)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 82, 500)           2000500   
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 41, 500)           0         
__________

In [24]:
# Train the deep CNN for 10 epochs, validating on the held-out validation
# split after each epoch; `history2` keeps the per-epoch metrics.
history2 = model.fit(
    x=x_train,
    y=y_train,
    batch_size=16,
    epochs=10,
    verbose=1,
    validation_data=(x_val, y_val),
)

Train on 119171 samples, validate on 14896 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Persist the trained deep CNN to an HDF5 file so it can be reloaded without
# retraining. NOTE(review): presumably the `model/` directory must already
# exist -- confirm.
model.save("model/deep_model_char_embedding.h5")

## Load model here

In [None]:
# Re-create the custom top-k metrics so the saved model can be loaded without
# running the training cells. Keras labels a metric by its __name__, which a
# functools.partial object lacks, so it is assigned explicitly.
top1_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=1)
top1_acc.__name__ = 'top1_acc'
top5_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=5)
top5_acc.__name__ = 'top5_acc'

def auc(y_true, y_pred):
    """Keras metric wrapper around the TF1 streaming AUC.

    Calls ``tf.metrics.auc`` and returns the second element of its result
    (the update op, per the TensorFlow 1.x API). The local variables created
    by that call are initialised in the current Keras session before
    returning.

    Args:
        y_true: ground-truth label tensor.
        y_pred: predicted score tensor.

    Returns:
        The second element of ``tf.metrics.auc(y_true, y_pred)``.
    """
    _value, update_op = tf.metrics.auc(y_true, y_pred)
    active_session = keras.backend.get_session()
    active_session.run(tf.local_variables_initializer())
    return update_op

In [22]:
# The saved model references its custom metrics by name; map those names back
# to the callables defined in this notebook so Keras can rebuild it.
custom_metrics = {'top1_acc': top1_acc, 'top5_acc': top5_acc, 'auc': auc}
model = keras.models.load_model(
    "model/deep_model_char_embedding.h5",
    custom_objects=custom_metrics,
)

In [23]:
# Evaluate the reloaded deep CNN on the held-out test split; `score` is
# indexed in parallel with `model.metrics_names` when printed below.
score = model.evaluate(x_test, y_test)



In [24]:
# Print each test metric next to its name.
for metric_name, metric_value in zip(model.metrics_names, score):
    print("Test {}: {}".format(metric_name, metric_value))

Test loss: 0.020423251952214026
Test acc: 0.9952376871658922
Test top_k_categorical_accuracy: 0.6231455998172006
Test top1_acc: 0.3645700476706056
Test top5_acc: 0.6231455998172006
Test auc: 0.9035350808787124
