# 04 - model2 - neural net

### 1. Set up

In [28]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers, callbacks
from sklearn.metrics import classification_report
import pickle
import time

In [2]:
# Raw data: one row per entity with its class id and its name.
df = pd.read_csv('/home/jupyter/sb-entity-classification/data/data.csv')
df.columns = ['class', 'name']
# Shift class ids down by one (per the brief) -- presumably they are 1-based in
# the file, which makes them line up with the 0-based row index used below.
df['class'] -= 1

# classes.txt has no header; the row position of each class name is its id.
classes_list = pd.read_csv('/home/jupyter/sb-entity-classification/data/classes.txt', header=None)
classes_list.columns = ['class_name']
classes_list['class'] = classes_list.index
class_names = classes_list['class_name'].tolist()

# Attach the readable class name to every row (left join keeps all rows of df).
df = pd.merge(df, classes_list, on='class', how='left')

### 2. Split train, validation, and test set

In [3]:
# Separate the data into a train(+validation) portion and a ~2% held-out test set.
# The mask is drawn from a dedicated, seeded generator so that the split -- and
# therefore the test set pickled in the next cell -- is identical after every
# kernel restart instead of changing with the global NumPy random state.
split_rng = np.random.RandomState(42)
msk = split_rng.rand(len(df)) < 0.98
train = df[msk]
test = df[~msk]

In [4]:
# Persist the held-out rows to disk.
# NOTE(review): presumably read back by the separate scoring script/notebook -- confirm.
test.to_pickle('/home/jupyter/sb-entity-classification/data/test_nn.pkl')

In [5]:
train.shape

(537831, 3)

In [6]:
test.shape

(10956, 3)

In [7]:
seed = 42
samples = train['name'].tolist()
labels = train['class'].tolist()

# Shuffle the data. Each list is shuffled in place by its own generator, both
# created from the same seed; because the lists have equal length they receive
# the same permutation, so every name stays paired with its label.
for values in (samples, labels):
    rng = np.random.RandomState(seed)
    rng.shuffle(values)

In [8]:
# Extract a training & validation split: the last `validation_split` fraction
# of the (already shuffled) samples becomes the validation set.
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))

# Slice from an explicit boundary index rather than with a negative offset.
# `samples[:-0]` is empty and `samples[-0:]` is the whole list, so the negative
# form would hand *all* data to validation whenever the count rounds down to 0.
split_index = len(samples) - num_validation_samples
train_samples = samples[:split_index]
val_samples = samples[split_index:]
train_labels = labels[:split_index]
val_labels = labels[split_index:]

### 3. Use pre-trained GloVe embeddings

In [9]:
# Text -> integer-id layer: vocabulary capped at 20000 entries, every name
# padded or truncated to 8 token ids.
vectorizer = TextVectorization(output_sequence_length=8, max_tokens=20000)

# Learn the vocabulary from the training names only, fed in batches of 128.
text_ds = tf.data.Dataset.from_tensor_slices(train_samples)
text_ds = text_ds.batch(128)
vectorizer.adapt(text_ds)

In [10]:
# Map every vocabulary token to its position in the vectorizer's vocabulary.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [11]:
path_to_glove_file = os.path.join(
    os.path.expanduser("~"), "sb-entity-classification/data/glove.6B/glove.6B.100d.txt"
)

# Parse the GloVe text file into {word: vector}. Each line is
# "<word> <float> <float> ...", so split off the first field and parse the rest.
# The encoding is given explicitly: without it open() falls back to the locale's
# default, which can fail or mis-decode the UTF-8 file on non-UTF-8 systems.
embeddings_index = {}
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [12]:
embedding_dim = 100
num_tokens = len(voc) + 2
hits, misses = 0, 0

# Prepare embedding matrix: row i holds the GloVe vector of vocabulary token i.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 19028 words (972 misses)


In [13]:
def _to_token_ids(texts):
    """Vectorize a list of strings, passed as a column of one string per row."""
    text_column = np.array([[text] for text in texts])
    return vectorizer(text_column).numpy()


x_train = _to_token_ids(train_samples)
x_val = _to_token_ids(val_samples)

y_train = np.array(train_labels)
y_val = np.array(val_labels)

### 4. Build NN Model

In [14]:
# Embedding lookup initialised from the pre-computed GloVe matrix; the weights
# are frozen (trainable=False), so training does not update them.
glove_initializer = keras.initializers.Constant(embedding_matrix)
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    trainable=False,
    embeddings_initializer=glove_initializer,
)

In [15]:
# Input: variable-length sequences of integer token ids.
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Two Conv1D -> MaxPooling1D pairs followed by a third Conv1D.
# NOTE(review): with kernel size 1 and pool size 1 each convolution acts on one
# token position at a time and the pooling leaves the sequence unchanged --
# confirm this is intended rather than a wider window.
x = embedded_sequences
for block_number in range(2):
    x = layers.Conv1D(128, 1, activation="relu")(x)
    x = layers.MaxPooling1D(1)(x)
x = layers.Conv1D(128, 1, activation="relu")(x)

# Collapse the sequence axis, then classify.
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(len(class_names), activation="softmax")(x)

model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         2000200   
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         12928     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 128)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         16512     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)        

In [16]:
# Stop training once validation accuracy has gone 2 epochs without improving,
# and roll the model back to the weights of its best epoch.
early_stopping_options = dict(
    monitor='val_acc',
    min_delta=0,
    patience=2,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)
early_stopping_cb = callbacks.EarlyStopping(**early_stopping_options)

cbs = [early_stopping_cb]

### 5. Train Model

In [17]:
# Labels are integer class ids, hence the sparse variant of cross-entropy.
# The metric is named "acc" so that the early-stopping monitor 'val_acc' exists.
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)

# Train for at most 20 epochs; the callbacks in `cbs` may end training earlier.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=20,
    batch_size=128,
    shuffle=True,
    callbacks=cbs,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


<tensorflow.python.keras.callbacks.History at 0x7fd4afd78390>

In [18]:
# Wrap the vectorizer and the trained classifier in a single model that takes
# raw strings (one string per row) and returns the class probabilities.
string_input = keras.Input(shape=(1,), dtype="string")
# Raw text -> integer token ids, using the vocabulary adapted earlier.
x = vectorizer(string_input)
# Token ids -> softmax output of the trained model.
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

In [41]:
### Save models and artifacts (disabled: uncomment the lines below to persist them)
# tf.keras.models.save_model(model, '/home/jupyter/sb-entity-classification/models/nn_embedding.h5')
# tf.keras.models.save_model(end_to_end_model, '/home/jupyter/sb-entity-classification/models/end_to_end_nn_embedding')
# with open('/home/jupyter/sb-entity-classification/models/class_names.txt', "wb") as fp:   
#     pickle.dump(class_names, fp)

### 6. Predict using trained model

In [19]:
def get_prediction(string_input):
    """Return the predicted class name for a single entity name.

    The string is wrapped as a batch of one row, run through
    ``end_to_end_model``, and the position of the highest score in the first
    (only) output row is looked up in ``class_names``.
    """
    single_item_batch = [[string_input]]
    class_scores = end_to_end_model.predict(single_item_batch)[0]
    best_class = np.argmax(class_scores)
    return class_names[best_class]

#### 6.1 try it on a few examples

In [20]:
test.sample(5)

Unnamed: 0,class,name,class_name
407878,10,Diplosphaera,Plant
139140,3,Anthony Ashley-Cooper (cricketer),Athlete
236690,5,Jeep Grand Cherokee,MeanOfTransportation
499752,12,Wombling Free,Film
413550,10,Ulmus 'Folia Variegata Pendula',Plant


In [21]:
get_prediction("Diplosphaera")

'Animal'

In [22]:
get_prediction("Anthony Ashley-Cooper (cricketer)")

'Athlete'

In [23]:
get_prediction("Jeep Grand Cherokee")

'MeanOfTransportation'

In [24]:
get_prediction("Wombling Free")

'Album'

In [25]:
get_prediction("Ulmus 'Folia Variegata Pendula'")

'Plant'

_Observations_:

From the sample examples, the model is not able to differentiate titles of different media of artworks (e.g. a film vs. an album) when no distinguishing keyword is present. This can probably only be resolved or improved with additional data sources that provide context.

#### 6.2 try it on a sample of test set to estimate run time

In [29]:
sample_size = 100
# Fixed random_state so the same rows are timed here and scored in the
# classification report below on every run.
test_sample = test.sample(sample_size, random_state=42)
names_to_score = test_sample['name'].tolist()

# Time only the predictions (one call per name). perf_counter() is a monotonic
# high-resolution clock, so the measured interval cannot be distorted by
# system clock adjustments the way time.time() can.
start = time.perf_counter()
predictions = [get_prediction(name) for name in names_to_score]
time_taken = time.perf_counter() - start

avg_time_p = time_taken / sample_size
print('Avg time per prediction {:.2f}s'.format(avg_time_p))
print('Total time for all {} forecasts: {:.2f}h'.format(sample_size, time_taken / 3600))

Avg time per prediction 0.19s
Total time for all 100 forecasts: 0.01h


In [30]:
# Per-class precision / recall / F1 of the sampled predictions against the
# true class names of the same rows.
true_class_names = test_sample['class_name'].tolist()[:sample_size]
print(classification_report(true_class_names, predictions, digits=3))

                        precision    recall  f1-score   support

                 Album      0.700     0.778     0.737         9
                Animal      0.360     1.000     0.529         9
                Artist      0.400     0.400     0.400         5
               Athlete      0.286     0.286     0.286         7
              Building      1.000     0.833     0.909         6
               Company      1.000     0.500     0.667         8
EducationalInstitution      0.833     1.000     0.909         5
                  Film      0.833     0.714     0.769         7
  MeanOfTransportation      1.000     0.857     0.923         7
          NaturalPlace      1.000     1.000     1.000         6
          OfficeHolder      1.000     0.200     0.333         5
                 Plant      1.000     0.667     0.800        15
               Village      1.000     0.400     0.571         5
           WrittenWork      0.857     1.000     0.923         6

              accuracy                

#### 6.3 to score on the entire test set:
Please see scripts/score_with_nn.py, which was developed using notebooks/05-model2-nn-scoring.ipynb