# Training of the LSTM "v2" model

## Data retrieval

In [84]:
import os
import datetime
import json

import pandas
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.layers import Dense

from feast import FeatureStore

In [96]:
# Project directories, all resolved two directory levels above the working directory.
root_dir = os.path.join(os.getcwd(), '..', '..')
# Feature-store repository and raw input data live directly under the root
store_dir, raw_data_dir = (
    os.path.join(root_dir, dirname) for dirname in ('sms_feature_store', 'raw_data')
)
# Destination for the trained classifier and its metadata
model_save_dir = os.path.join(root_dir, 'models', 'model2_2020', 'classifier')

In [75]:
# Load the settings saved alongside the tokenizer; MAX_NUM_WORDS is reused
# below as the vocabulary size of the Embedding layer.
base_dir = os.getcwd()
input_dir = os.path.join(base_dir, '..', '..',  'models', 'model2_2020', 'tokenizer')
input_metadata_file = os.path.join(input_dir, 'settings.json')
# Context manager so the file handle is closed deterministically
with open(input_metadata_file) as settings_file:
    tokenizer_settings = json.load(settings_file)
MAX_NUM_WORDS = tokenizer_settings['MAX_NUM_WORDS']

In [81]:
# --- Model architecture ---
# Output dimension of the Embedding layer
EMBEDDING_DIM = 128
# Number of units of the LSTM layer
LSTM_OUT_DIM = 196
# Dropout fractions: after the embedding, and on the LSTM inputs / recurrent state
SPATIAL_DROPOUT_FRACTION = 0.4
LSTM_DROPOUT_FRACTION = 0.3
LSTM_RECURRENT_DROPOUT_FRACTION = 0.3

# --- Data split and training loop ---
# Fraction of the dataset held out by train_test_split
SPLIT_TEST_SIZE = 0.25
TRAIN_BATCH_SIZE = 32
TRAIN_EPOCHS = 5

In [3]:
# Fixed timestamp attached to every entity row when querying historical features
training_timefreeze = datetime.datetime(year=2020, month=7, day=2)
print(f"Freezing time to {training_timefreeze.strftime('%Y-%m-%d %H:%M:%S')} for training")

Freezing time to 2020-07-02 00:00:00 for training


In [4]:
# Open the Feast feature store whose repository is located at store_dir
store = FeatureStore(repo_path=store_dir)

In [5]:
# Read the IDs of the SMS messages used for training: one integer per line,
# blank lines are skipped. The context manager closes the file once parsed.
with open(os.path.join(raw_data_dir, 'training_sms_ids.txt')) as ids_file:
    training_sms_ids = [
        int(sms_id)
        for sms_id in (line.strip() for line in ids_file)
        if sms_id
    ]

In [6]:
# Entity frame: one row per training SMS, all stamped with the frozen training time
entities_df = pandas.DataFrame({'sms_id': training_sms_ids}).assign(
    event_timestamp=training_timefreeze,
)

# Retrieve the label and the feature vector of every entity from the feature store
requested_features = ['sms_labels:label', 'sms_features2:features']
retrieval_job = store.get_historical_features(
    entity_df=entities_df,
    features=requested_features,
)
historical_df = retrieval_job.to_df()

In [7]:
# Inspect the retrieved training set (sms_id, event_timestamp, label, features)
historical_df

Unnamed: 0,sms_id,event_timestamp,label,features
0,10001,2020-07-02 00:00:00+00:00,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,10028,2020-07-02 00:00:00+00:00,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 147, 3, 5..."
2,10027,2020-07-02 00:00:00+00:00,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,10002,2020-07-02 00:00:00+00:00,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,10003,2020-07-02 00:00:00+00:00,spam,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
7523,17522,2020-07-02 00:00:00+00:00,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7524,17523,2020-07-02 00:00:00+00:00,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7525,17524,2020-07-02 00:00:00+00:00,ham,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7526,17512,2020-07-02 00:00:00+00:00,spam,"[0, 0, 0, 0, 0, 34, 5, 27, 32, 122, 122, 122, ..."


## Training

### Data transformation

In [22]:
# Distinct label names found in the training data, in sorted order.
# `labels` is defined here so that the cell runs on a fresh kernel.
labels = np.unique(historical_df['label']).tolist()
# label name -> integer class index
labelLegend = {
    lb: idx
    for idx, lb in enumerate(labels)
}
# class index (as a string) -> label name
labelLegendInverted = {'%i' % v: k for k,v in labelLegend.items()}
#
print(f'labels: {labels}')
print(f'labelLegend: {labelLegend}')
print(f'labelLegendInverted: {labelLegendInverted}')

labels: ['ham', 'spam']
labelLegend: {'ham': 0, 'spam': 1}
labelLegendInverted: {'0': 'ham', '1': 'spam'}


In [25]:
# Translate each label name into its integer class index, then one-hot encode
labelIndices = [labelLegend[lb] for lb in historical_df['label']]
oneHotEncodedLabels = to_categorical(labelIndices)
print(oneHotEncodedLabels)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]


In [86]:
# Model inputs: one row per SMS, built from the per-message feature lists
featureRows = historical_df['features'].tolist()
X = pandas.DataFrame(featureRows).to_numpy()
# Model targets: the one-hot encoded labels
Y = oneHotEncodedLabels

print(f'X = {X}')
print(f'\nY = {Y}')

X = [[  0   0   0 ... 169  77  68]
 [  0   0   0 ...   3   7  45]
 [  0   0   0 ...   0  18  21]
 ...
 [  0   0   0 ...  12  16   5]
 [  0   0   0 ... 122 122  19]
 [  0   0   0 ...  53   5  19]]

Y = [[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]


#### Training/testing split

In [87]:
# Hold out SPLIT_TEST_SIZE of the samples; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=SPLIT_TEST_SIZE,
    random_state=2022,
)

# Report the shape of each resulting array
for split_name, split_array in (
    ('X_train', X_train),
    ('Y_train', Y_train),
    ('X_test', X_test),
    ('Y_test', Y_test),
):
    print(f'{split_name}.shape = {split_array.shape}')

X_train.shape = (5646, 30)
Y_train.shape = (5646, 2)
X_test.shape = (1882, 30)
Y_test.shape = (1882, 2)


### Model training

#### Model architecture

In [88]:
# Architecture: Embedding -> SpatialDropout1D -> LSTM -> 2-way softmax
model = Sequential(name='spam_v2_2020')
# Vocabulary size comes from the tokenizer settings; sequence length from the training data
model.add(Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(SPATIAL_DROPOUT_FRACTION))
model.add(LSTM(LSTM_OUT_DIM, dropout=LSTM_DROPOUT_FRACTION, recurrent_dropout=LSTM_RECURRENT_DROPOUT_FRACTION))
# One output per class, matching the one-hot encoded targets
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#
# summary() prints the table itself and returns None, so it must not be wrapped in print()
model.summary()

Model: "spam_v2_2020"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 30, 128)           23040     
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 30, 128)          0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 196)               254800    
                                                                 
 dense_2 (Dense)             (None, 2)                 394       
                                                                 
Total params: 278,234
Trainable params: 278,234
Non-trainable params: 0
_________________________________________________________________
None


#### Training

In [89]:
print('** Training starts...\n')
# Fit on the training split; the held-out split is used as validation data.
# The targets are `Y_train` (capital Y), the name produced by the split cell.
model.fit(X_train, Y_train,
          validation_data=(X_test, Y_test),
          batch_size=TRAIN_BATCH_SIZE, verbose=1,
          epochs=TRAIN_EPOCHS)
print('\n** Training completed')

** Training starts...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

** Training completed


#### Evaluate model metrics

In [95]:
# NOTE(review): metric evaluation is currently disabled; everything below is commented out.
# Presumably it was parked because Y_test is one-hot encoded and model.predict returns
# per-class probabilities, whereas these sklearn scorers are normally fed class labels.
# TODO confirm, and convert both with np.argmax(..., axis=1) before re-enabling.

# from sklearn.metrics import accuracy_score, recall_score, precision_score

# Y_predict = model.predict(X_test)
# # accuracy = accuracy_score(Y_test, Y_predict)
# precision = precision_score(Y_test, Y_predict)
# recall = recall_score(Y_test, Y_predict)

# # print('Accuracy:  %.4f' % accuracy)
# print('Precision: %.4f' % precision)
# print('Recall:    %.4f' % recall)

### Storing the model

In [98]:
# Persist the trained classifier together with the label metadata needed to
# decode its output; both files go under model_save_dir.
# Make sure the target directory exists first: open() does not create it.
os.makedirs(model_save_dir, exist_ok=True)

print('Saving the trained model ...', end='')
model_out_file = os.path.join(model_save_dir, 'model2.h5')
model.save(model_out_file)
print(f'done [{model_out_file}]')

print('Saving model metadata ...', end='')
metadata_out_file = os.path.join(model_save_dir, 'model2_metadata.json')
#
model_metadata = {
    'label_legend_inverted': labelLegendInverted,
    'label_legend': labelLegend,
#     'max_words': MAX_NUM_WORDS,
}
#
# The context manager flushes and closes the file, so a later read sees complete JSON
with open(metadata_out_file, 'w') as metadata_file:
    json.dump(model_metadata, metadata_file, indent=2)
print(f'done [{metadata_out_file}]')

Saving the trained model ...done [/home/stefano/personal/WORK_Datastax/mlops-speedrun-spamclassifier/training/model2_2020/../../models/model2_2020/classifier/model2.h5]
Saving model metadata ...done [/home/stefano/personal/WORK_Datastax/mlops-speedrun-spamclassifier/training/model2_2020/../../models/model2_2020/classifier/model2_metadata.json]


#### Test load-and-apply

In [117]:
from tensorflow.keras import models

from analysis.features2.feature2_extractor import Feature2Extractor

# Round-trip check: reload the saved model and metadata from disk and classify
# a sample message with them.
feature2_extractor = Feature2Extractor()
loaded_model = models.load_model(model_out_file)
# Context manager so the metadata file handle is closed after reading
with open(metadata_out_file) as metadata_file:
    loaded_metadata = json.load(metadata_file)

# prediction
input_text = 'hi guys download this shady thing if you like free cash and a prize'
feats = feature2_extractor.get_features_list(input_text)
# Single-sample batch; keep the first (only) row of predicted probabilities
probabilities = loaded_model.predict(np.array([feats]))[0].tolist()
# label name -> predicted probability, using the class indices stored in the metadata
prediction = {
    lb: probabilities[lbi]
    for lb, lbi in loaded_metadata['label_legend'].items()
}
#
prediction



{'ham': 0.1282256543636322, 'spam': 0.8717743754386902}