# Training of the LSTM "v2" model

## Data retrieval

In [None]:
import os
import datetime
import json

import pandas
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.layers import Dense

from sklearn.metrics import classification_report

from feast import FeatureStore

In [None]:
# The repository root sits two levels above the notebook's working directory.
root_dir = os.path.join(os.getcwd(), os.pardir, os.pardir)

# Locations derived from the root: the feature-store repo, the raw input
# files, and the directory the trained classifier is written to.
raw_data_dir = os.path.join(root_dir, 'raw_data')
store_dir = os.path.join(root_dir, 'sms_feature_store')
model_save_dir = os.path.join(
    root_dir,
    'models',
    'model2_2020',
    'classifier',
)

In [None]:
# Locate the settings file stored next to the tokenizer of this model version.
base_dir = os.getcwd()
input_dir = os.path.join(base_dir, '..', '..',  'models', 'model2_2020', 'tokenizer')
input_metadata_file = os.path.join(input_dir, 'settings.json')

# Read the settings through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(input_metadata_file, encoding='utf-8') as settings_file:
    tokenizer_settings = json.load(settings_file)

# Vocabulary size; used below as the input dimension of the Embedding layer.
MAX_NUM_WORDS = tokenizer_settings['MAX_NUM_WORDS']

In [None]:
# --- Network dimensions ---
EMBEDDING_DIM = 6   # second argument of the Embedding layer
LSTM_OUT_DIM = 6    # first argument (units) of the LSTM layer

# --- Dropout fractions ---
SPATIAL_DROPOUT_FRACTION = 0.05          # passed to SpatialDropout1D
LSTM_DROPOUT_FRACTION = 0.05             # LSTM `dropout`
LSTM_RECURRENT_DROPOUT_FRACTION = 0.05   # LSTM `recurrent_dropout`

# --- Data split and fitting ---
SPLIT_TEST_SIZE = 0.25   # `test_size` handed to train_test_split
TRAIN_EPOCHS = 3
TRAIN_BATCH_SIZE = 32

In [None]:
# Fixed point in time (naive datetime, midnight) used as the event timestamp
# of every entity row when historical features are requested.
training_timefreeze = datetime.datetime(year=2020, month=7, day=2)
print(f"Freezing time to {training_timefreeze.strftime('%Y-%m-%d %H:%M:%S')} for training")

In [None]:
# Handle on the Feast feature store whose repository is located at store_dir.
store = FeatureStore(repo_path=store_dir)

In [None]:
# Read the ids of the SMS messages selected for training: one id per line,
# surrounding whitespace stripped, blank lines dropped.
# The file is opened in a `with` block so it is closed as soon as it is read,
# and it is iterated lazily rather than materialized with readlines().
with open(os.path.join(raw_data_dir, 'training_sms_ids.txt')) as ids_file:
    training_sms_ids = [
        sms_id
        for sms_id in (line.strip() for line in ids_file)
        if sms_id
    ]

In [None]:
# Entity dataframe: one row per training SMS id, each stamped with the
# frozen training time.
entities_df = pandas.DataFrame({'sms_id': training_sms_ids})
entities_df['event_timestamp'] = training_timefreeze

# Feature references to retrieve for those entities.
requested_features = [
    'sms_labels:label',
    'sms_features2:features',
]

# Ask the store for the historical values and materialize them as a dataframe.
retrieval_job = store.get_historical_features(
    entity_df=entities_df,
    features=requested_features,
)
historical_df = retrieval_job.to_df()

In [None]:
# Bare expression as the cell's last statement -- presumably here so the
# notebook displays the retrieved dataframe for a visual check.
historical_df

## Training

### Data transformation

In [None]:
# Assign every distinct label an integer index, in the order np.unique
# returns them.
unique_labels = np.unique(historical_df['label'])
labelLegend = dict(zip(unique_labels, range(len(unique_labels))))

# Reverse lookup, keyed by the index rendered as a string.
labelLegendInverted = {
    str(index): label
    for label, index in labelLegend.items()
}

print(f'labelLegend: {labelLegend}')
print(f'labelLegendInverted: {labelLegendInverted}')

In [None]:
# Translate each row's label into its integer index, then one-hot encode.
label_indices = [labelLegend[lb] for lb in historical_df['label']]
oneHotEncodedLabels = to_categorical(label_indices)
print(oneHotEncodedLabels)

In [None]:
# Expand the per-row feature lists into a 2-D array (one row per SMS);
# the targets are the one-hot encoded labels.
feature_rows = historical_df['features'].tolist()
X = pandas.DataFrame(feature_rows).to_numpy()
Y = oneHotEncodedLabels

print(f'X = {X}')
print(f'\nY = {Y}')

#### Training/testing split

In [None]:
# Split into training and test portions; the fixed random_state keeps the
# partition reproducible between runs.
splits = train_test_split(
    X,
    Y,
    test_size=SPLIT_TEST_SIZE,
    random_state=2022,
)
X_train, X_test, Y_train, Y_test = splits

# Report the resulting shapes.
print(f'X_train.shape = {X_train.shape}')
print(f'Y_train.shape = {Y_train.shape}')
print(f'X_test.shape = {X_test.shape}')
print(f'Y_test.shape = {Y_test.shape}')

### Model training

#### Model architecture

In [None]:
# The output layer must have one unit per label. Take that count from the
# width of the one-hot targets instead of hard-coding 2, so the network stays
# consistent with the label legend derived from the data.
num_classes = Y_train.shape[1]

# Embedding -> spatial dropout -> LSTM -> softmax classifier.
model = Sequential(name='spam_v2_2020')
model.add(Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(SPATIAL_DROPOUT_FRACTION))
model.add(LSTM(LSTM_OUT_DIM, dropout=LSTM_DROPOUT_FRACTION, recurrent_dropout=LSTM_RECURRENT_DROPOUT_FRACTION))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

#### Training

In [None]:
# Fit the network on the training split; the test split is passed as
# validation data so its metrics are reported after each epoch.
print('** Training starts...\n')
model.fit(
    x=X_train,
    y=Y_train,
    epochs=TRAIN_EPOCHS,
    batch_size=TRAIN_BATCH_SIZE,
    validation_data=(X_test, Y_test),
    verbose=1,
)
print('\n** Training completed')

#### Evaluate model metrics

In [None]:
# Predict on the test split and collapse both the predicted probabilities and
# the one-hot targets to class indices before computing per-class metrics.
Y_predict = model.predict(X_test)
predicted = Y_predict.argmax(axis=1)
expected = Y_test.argmax(axis=1)

report = classification_report(y_true=expected, y_pred=predicted)
print(report)

### Storing the model

In [None]:
# Make sure the output directory exists before anything is written into it.
os.makedirs(model_save_dir, exist_ok=True)

# Persist the trained network.
print('Saving the trained model ...', end='')
model_out_file = os.path.join(model_save_dir, 'model2.h5')
model.save(model_out_file)
print(f'done [{model_out_file}]')

# Persist the label <-> index mappings next to the model, so that whoever
# loads the model can translate output positions back into labels.
print('Saving model metadata ...', end='')
metadata_out_file = os.path.join(model_save_dir, 'model2_metadata.json')
model_metadata = {
    'label_legend_inverted': labelLegendInverted,
    'label_legend': labelLegend,
}
# Write through a context manager so the file is flushed and closed before it
# is read back, instead of relying on garbage collection of the handle.
with open(metadata_out_file, 'w', encoding='utf-8') as metadata_file:
    json.dump(model_metadata, metadata_file, indent=2)
print(f'done [{metadata_out_file}]')

#### Test load-and-apply

In [None]:
from tensorflow.keras import models

from analysis.features2.feature2_extractor import Feature2Extractor

#
# Reload the artifacts from disk to check that what was saved is usable.
feature2_extractor = Feature2Extractor()
loaded_model = models.load_model(model_out_file)
# Context manager: the metadata file handle is closed right after parsing.
with open(metadata_out_file, encoding='utf-8') as metadata_file:
    loaded_metadata = json.load(metadata_file)

# prediction
input_text = 'hi guys download this shady thing if you like free cash and a prize'
feats = feature2_extractor.get_features_list(input_text)
# The model is given a batch containing just this one sample; [0] takes its
# row of outputs back out.
probabilities = loaded_model.predict(np.array([feats]))[0].tolist()
# Pair each label with the model output found at that label's index.
prediction = {
    lb: probabilities[lbi]
    for lb, lbi in loaded_metadata['label_legend'].items()
}
#
prediction