In [1]:
import tensorflow as tf
import pandas as pd
import functools
import ast
import keras
import pickle
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D,MaxPool1D,GlobalAveragePooling1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from keras.metrics import top_k_categorical_accuracy 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
keras.backend.set_session(session)
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")

Default GPU Device: /device:GPU:0


In [3]:
generic = lambda x: ast.literal_eval(x)
conv = {'Tags': generic}
df = pd.read_csv('data/data_final.csv', converters=conv)

In [4]:
sample = df.sample(frac=0.2, random_state=2020)
x1 = sample['Body']
y1 = sample['Tags']
max_words = 200000

In [5]:
tokenizer = Tokenizer(num_words=max_words,  lower = True)
tokenizer.fit_on_texts(x1)
sequences = tokenizer.texts_to_sequences(x1)

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
    
# pad sequences
max_length=min(200,max([len(s.split()) for s in x1]))
x = pad_sequences(sequences, maxlen=max_length, padding='post') #previously was pre

In [6]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit_transform(y1)

# saving
with open('binarizer.pickle', 'wb') as handle:
    pickle.dump(multilabel_binarizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('binarizer.pickle', 'rb') as handle:
    multilabel_binarizer = pickle.load(handle)

y = multilabel_binarizer.transform(y1)

In [7]:
vocab_size=len(tokenizer.word_index) + 1
print(y.shape)
print(x.shape)
print(vocab_size)
print(max_length)

(148964, 342)
(148964, 200)
502544
200


In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

In [9]:
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=9000)

In [10]:
print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(119171, 200)
(14896, 200)
(14897, 200)


In [11]:
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

# Creating texts 
my_texts = list(map(sequence_to_text, [x_train[51]]))
print(my_texts)

[['states', 'df', 'lt', 'map', 'data', 'state', 'xa', 'states', 'df', 'subset', 'states', 'df', 'group', '8', 'get', 'rid', 'of', 'dc', 'xa', 'states', 'df', 'st', 'lt', 'state', 'abb', 'match', 'states', 'df', 'region', 'tolower', 'state', 'name', 'attach', 'state', 'abbreviations', 'xa', 'xa', 'states', 'df', 'value', 'value', 'states', 'df', 'st', 'xa', 'xa', 'p', 'qplot', 'long', 'lat', 'data', 'states', 'df', 'group', 'group', 'fill', 'value', 'geom', 'polygon', 'xlab', 'ylab', 'main', 'main', 'opts', 'axis', 'text', 'y', 'theme', 'blank', 'axis', 'text', 'x', 'theme', 'blank', 'axis', 'ticks', 'theme', 'blank', 'scale', 'fill', 'continuous', 'name', 'xa', 'p2', 'p', 'geom', 'path', 'data', 'states', 'df', 'color', 'white', 'alpha', '0', '4', 'fill', 'na', 'coord', 'map', 'project', 'xa', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, Non

# Basic CNN model

In [12]:
filter_length = 500

output_dims = [100]
for output_dim in output_dims:
    model = Sequential()
    model.add(Embedding(vocab_size, output_dim, input_length=x_train.shape[1]))
    model.add(Dropout(0.1))
    model.add(Conv1D(filter_length, 15, padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPool1D())   
    model.add(Dropout(0.1))
    model.add(Dense(400, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(y_train.shape[1], activation='sigmoid'))

top1_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=1)
top5_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k = 5)
top1_acc.__name__ = 'top1_acc'
top5_acc.__name__ = 'top5_acc'

def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    keras.backend.get_session().run(tf.local_variables_initializer())
    return auc 

model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy', 'top_k_categorical_accuracy', top1_acc, top5_acc, auc]
)

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 100)          50254400  
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 100)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 186, 500)          750500    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 500)               0         
_________________________________________________________________
dropout_2 (Dropout)        

In [12]:
history = model.fit(x_train, y_train,
                  epochs=10,
                  batch_size=16,
                  verbose=1,
                  validation_data=(x_val, y_val))

Instructions for updating:
Use tf.cast instead.
Train on 119171 samples, validate on 14896 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
model.save("model/basic_model_word_embedding.h5")

## Load model here

In [12]:
top1_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=1)
top5_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k = 5)
top1_acc.__name__ = 'top1_acc'
top5_acc.__name__ = 'top5_acc'

def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    keras.backend.get_session().run(tf.local_variables_initializer())
    return auc 

In [13]:
model = keras.models.load_model("model/basic_model_word_embedding.h5", 
                                custom_objects={'top1_acc':top1_acc, 'top5_acc':top5_acc, 'auc':auc})

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.cast instead.


In [14]:
score = model.evaluate(x_test, y_test)



In [15]:
for i in range (len(model.metrics_names)):
    print("Test {}: {}".format(model.metrics_names[i],score[i]) )

Test loss: 0.022464561751333028
Test acc: 0.9953319010554028
Test top_k_categorical_accuracy: 0.7014835201958534
Test top1_acc: 0.4132375646163259
Test top5_acc: 0.7014835201958534
Test auc: 0.8888032306854682


# Deep CNN model

In [12]:
filter_length = 500
output_dims = [100]
for output_dim in output_dims:
    model = Sequential()
    model.add(Embedding(vocab_size, output_dim, input_length=x_train.shape[1]))
    model.add(Dropout(0.1))
    model.add(Conv1D(filter_length, 15, padding='valid', activation='relu', strides=1))
    model.add(Conv1D(filter_length, 8,activation='relu'))
    model.add(MaxPool1D())
    model.add(Conv1D(filter_length, 8,activation='relu'))
    model.add(MaxPool1D())
    model.add(Conv1D(filter_length, 8, activation='relu'))
    model.add(GlobalMaxPool1D())   
    model.add(Dropout(0.1))
    model.add(Dense(400, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(y_train.shape[1], activation='sigmoid'))

top1_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=1)
top5_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k = 5)
top1_acc.__name__ = 'top1_acc'
top5_acc.__name__ = 'top5_acc'

def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    keras.backend.get_session().run(tf.local_variables_initializer())
    return auc 

model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy', 'top_k_categorical_accuracy', top1_acc, top5_acc, auc])

model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 100)          50254400  
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 100)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 186, 500)          750500    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 179, 500)          2000500   
_________________________________________________________________
max_pooling1d_1 (MaxPooling

In [13]:
history2 = model.fit(x_train, y_train,
                  epochs=10,
                  batch_size=16,
                  verbose=1,
                  validation_data=(x_val, y_val))

Instructions for updating:
Use tf.cast instead.
Train on 119171 samples, validate on 14896 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
model.save("model/deep_model_word_embedding.h5")

## Load model here

In [15]:
top1_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=1)
top5_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k = 5)
top1_acc.__name__ = 'top1_acc'
top5_acc.__name__ = 'top5_acc'

def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    keras.backend.get_session().run(tf.local_variables_initializer())
    return auc 

In [16]:
model = keras.models.load_model("model/deep_model_word_embedding.h5", 
                                custom_objects={'top1_acc':top1_acc, 'top5_acc':top5_acc, 'auc':auc})

In [17]:
score = model.evaluate(x_test, y_test)



In [18]:
for i in range (len(model.metrics_names)):
    print("Test {}: {}".format(model.metrics_names[i],score[i]) )

Test loss: 0.019864329154100354
Test acc: 0.9953191429366599
Test top_k_categorical_accuracy: 0.6477143049176094
Test top1_acc: 0.39014566692680513
Test top5_acc: 0.6477143049176094
Test auc: 0.9089150693926018
