In [1]:
import tensorflow as tf
import pandas as pd
import functools
import ast
import keras
import pickle
from keras.models import Sequential
from keras.layers import Input,Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D,MaxPool1D,GlobalAveragePooling1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from keras.metrics import top_k_categorical_accuracy 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Conv1D, Input, Concatenate, BatchNormalization, MaxPooling1D, Embedding

Using TensorFlow backend.


In [2]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
keras.backend.set_session(session)
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")
graph = tf.get_default_graph()

init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
session = keras.backend.get_session()
session.run(init)

Default GPU Device: /device:GPU:0


In [3]:
generic = lambda x: ast.literal_eval(x)
conv = {'Tags': generic}
df = pd.read_csv('data/data_final.csv', converters=conv)

In [4]:
sample = df.sample(frac=0.2, random_state=2020)
x1 = sample['Body']
y1 = sample['Tags']
max_words = 200000

In [5]:
tokenizer = Tokenizer(num_words=max_words,  lower = True)
tokenizer.fit_on_texts(x1)
sequences = tokenizer.texts_to_sequences(x1)

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
    
# pad sequences
max_length=min(200,max([len(s.split()) for s in x1]))
x = pad_sequences(sequences, maxlen=max_length, padding='post') #previously was pre

In [6]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit_transform(y1)


# saving
with open('binarizer', 'wb') as handle:
    pickle.dump(multilabel_binarizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('binarizer', 'rb') as handle:
    multilabel_binarizer = pickle.load(handle)
    
y = multilabel_binarizer.transform(y1)


In [7]:
vocab_size=len(tokenizer.word_index) + 1
print(y.shape)
print(x.shape)
print(vocab_size)
print(max_length)

(148964, 342)
(148964, 200)
502544
200


In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

In [9]:
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=9000)

In [10]:
print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(119171, 200)
(14896, 200)
(14897, 200)


In [11]:
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    # Looking up words in dictionary
    words = [reverse_word_map.get(letter) for letter in list_of_indices]
    return(words)

# Creating texts 
my_texts = list(map(sequence_to_text, [x_train[51]]))
print(my_texts)

[['states', 'df', 'lt', 'map', 'data', 'state', 'xa', 'states', 'df', 'subset', 'states', 'df', 'group', '8', 'get', 'rid', 'of', 'dc', 'xa', 'states', 'df', 'st', 'lt', 'state', 'abb', 'match', 'states', 'df', 'region', 'tolower', 'state', 'name', 'attach', 'state', 'abbreviations', 'xa', 'xa', 'states', 'df', 'value', 'value', 'states', 'df', 'st', 'xa', 'xa', 'p', 'qplot', 'long', 'lat', 'data', 'states', 'df', 'group', 'group', 'fill', 'value', 'geom', 'polygon', 'xlab', 'ylab', 'main', 'main', 'opts', 'axis', 'text', 'y', 'theme', 'blank', 'axis', 'text', 'x', 'theme', 'blank', 'axis', 'ticks', 'theme', 'blank', 'scale', 'fill', 'continuous', 'name', 'xa', 'p2', 'p', 'geom', 'path', 'data', 'states', 'df', 'color', 'white', 'alpha', '0', '4', 'fill', 'na', 'coord', 'map', 'project', 'xa', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, Non

In [24]:
# Replicated model
inputs = Input(shape=(max_length, ))
embedding = Embedding(vocab_size, 16, input_length=max_length, trainable=False)(inputs)
convs = []
for n, fsz in [(128, 2), (192, 3), (256, 4), (512, 5)]:
    model = Conv1D(n, fsz, activation='relu', padding='same')(embedding)
    model = BatchNormalization()(model)
    model = Conv1D(n, fsz, activation='relu', padding='same')(model)
    model = MaxPooling1D()(model)
    convs.append(model)

model = Concatenate(axis=-1)(convs)
model = Flatten()(model)
## Fully connected layers
model = BatchNormalization()(model)
model = Dense(7, activation='relu')(model)
model = BatchNormalization()(model)
model = Dense(7, activation='relu')(model)
model = BatchNormalization()(model)

model = Dense(y_train.shape[1], activation='sigmoid')(model)
model = Model(inputs, model)

1.13.1


In [25]:
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 16)      8040704     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 200, 128)     4224        embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 200, 192)     9408        embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_4 (

In [26]:
top1_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=1)
top5_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k = 5)
top1_acc.__name__ = 'top1_acc'
top5_acc.__name__ = 'top5_acc'

def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    keras.backend.get_session().run(tf.local_variables_initializer())
    return auc 

model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy', 'top_k_categorical_accuracy', top1_acc, top5_acc, auc]
)

In [14]:
from tensorflow.python.keras.backend import set_session
from tensorflow.python.keras.models import load_model
global session
global graph
with graph.as_default():
    set_session(session)
    history = model.fit(x_train, y_train,
                          epochs=10,
                          batch_size=16,
                          verbose=1,
                          validation_data=(x_val, y_val))

Train on 119171 samples, validate on 14896 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
model.save(("model/replicated_model_word_embedding.h5")) 

## Load model here 

In [11]:
top1_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k=1)
top5_acc = functools.partial(tf.keras.metrics.top_k_categorical_accuracy, k = 5)
top1_acc.__name__ = 'top1_acc'
top5_acc.__name__ = 'top5_acc'

def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    keras.backend.get_session().run(tf.local_variables_initializer())
    return auc 

In [28]:
from tensorflow.python.keras.backend import set_session
from tensorflow.python.keras.models import load_model
global session
global graph
with graph.as_default():
    set_session(session)
    model = tf.keras.models.load_model("model/replicated_model_word_embedding.h5", 
                                custom_objects={'top1_acc':top1_acc, 'top5_acc':top5_acc, 'auc':auc})

In [30]:
score = model.evaluate(x_test, y_test)



In [31]:
for i in range (len(model.metrics_names)):
    print("Test {}: {}".format(model.metrics_names[i],score[i]) )

Test loss: 0.02638202022283223
Test acc: 0.9942439794540405
Test top_k_categorical_accuracy: 0.4711010456085205
Test top1_acc: 0.2543431222438812
Test top5_acc: 0.4711010456085205
Test auc: 0.847448468208313
