# Data retrieval

In [None]:
import os
# import sys
import datetime
import pandas

from feast import FeatureStore

In [None]:
# Resolve the project layout relative to the current working directory:
# the root is two levels above it.
root_dir = os.path.join(os.getcwd(), os.pardir, os.pardir)
# The feature-store repository and the raw input data both sit directly
# under the root.
store_dir, raw_data_dir = (
    os.path.join(root_dir, subdir)
    for subdir in ('sms_feature_store', 'raw_data')
)

In [None]:
# Point in time used as the event timestamp for every training entity
# (midnight, 5 May 2019; naive datetime).
training_timefreeze = datetime.datetime(year=2019, month=5, day=5)
print(f"Freezing time to {training_timefreeze.strftime('%Y-%m-%d %H:%M:%S')} for training")

In [None]:
# Open the Feast feature store whose repository path is store_dir.
store = FeatureStore(repo_path=store_dir)

In [None]:
# Load the ids of the SMS messages used for training: one integer per line,
# surrounding whitespace stripped, blank lines skipped.
# The context manager guarantees the file handle is closed once the ids are
# read (the bare open(...).readlines() left it to the garbage collector).
with open(os.path.join(raw_data_dir, 'training_sms_ids.txt')) as ids_file:
    training_sms_ids = [
        int(sms_id)
        for sms_id in (line.strip() for line in ids_file)
        if sms_id
    ]

In [None]:
# Entity dataframe: one row per training SMS id, each stamped with the
# frozen training time as its event timestamp.
entities_df = pandas.DataFrame({'sms_id': training_sms_ids})
entities_df['event_timestamp'] = training_timefreeze

# Feature references to fetch: the label, two named features and the
# seven cw_scores_0 .. cw_scores_6 columns.
feature_refs = [
    'sms_labels:label',
    'sms_features1:cap_r',
    'sms_features1:nal_r',
]
feature_refs.extend('sms_features1:cw_scores_%i' % idx for idx in range(7))

# Ask the store for the feature values of those entities and materialise
# the result as a pandas dataframe.
retrieval_job = store.get_historical_features(
    entity_df=entities_df,
    features=feature_refs,
)
historical_df = retrieval_job.to_df()

In [None]:
# Show the retrieved features as the cell output for a quick visual check.
historical_df

# Training

## Data transformation

In [None]:
# Model inputs: the two named features followed by cw_scores_0 .. cw_scores_6.
x_columns = ['cap_r', 'nal_r'] + ['cw_scores_%i' % idx for idx in range(7)]
# Raw (string) label column as retrieved, and the integer-encoded target
# column derived from it.
y_raw_columns = ['label']
y_columns = ['label_int']

In [None]:
# Keep only the feature columns and the raw label. Take an explicit copy so
# that adding columns to training_df afterwards operates on an independent
# frame instead of a selection of historical_df (which makes pandas emit
# SettingWithCopyWarning).
training_df = historical_df[x_columns + y_raw_columns].copy()

In [None]:
# Encode the textual label as an integer target: 'spam' -> 1, 'ham' -> 0.
# Any other label value raises KeyError from the lookup.
label_codes = {'spam': 1, 'ham': 0}
training_df['label_int'] = training_df['label'].map(lambda lb: label_codes[lb])

## Training

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [None]:
# Hyperparameters of the random forest; random_state is pinned to 0.
forest_params = {
    'n_estimators': 10,
    'max_depth': 10,
    'min_samples_split': 2,
    'random_state': 0,
}
model = RandomForestClassifier(**forest_params)

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded with the same
# random_state as the model so that the reported metrics and the persisted
# model are reproducible from run to run.
XY_train, XY_test = train_test_split(training_df, test_size=.2, random_state=0)

# Separate features from the (single) integer target column; the target is
# taken as a Series named after y_columns[0].
X_train = XY_train[x_columns]
Y_train = XY_train[y_columns[0]]
X_test = XY_test[x_columns]
Y_test = XY_test[y_columns[0]]

In [None]:
# Fit the classifier on the training split (features X_train, targets Y_train).
model.fit(X_train, Y_train)

### Evaluate model metrics

In [None]:
# Score the held-out split and compare predictions against the true targets.
Y_predict = model.predict(X_test)
accuracy, precision, recall = (
    metric(Y_test, Y_predict)
    for metric in (accuracy_score, precision_score, recall_score)
)

# Report each metric with four decimals.
print('Accuracy:  %.4f' % accuracy)
print('Precision: %.4f' % precision)
print('Recall:    %.4f' % recall)

## Storing the model

In [None]:
import joblib

In [None]:
# Destination of the serialised model under <root>/models/model1_2019.
o_filename = os.path.join(root_dir, 'models', 'model1_2019', 'model1.pkl')
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists (no-op when it is already there).
os.makedirs(os.path.dirname(o_filename), exist_ok=True)
# Persist the fitted model with compression level 9.
joblib.dump(model, o_filename, compress=9)

### Test load-and-apply

In [None]:
# Round-trip check: reload the persisted model and score two hand-made rows
# that use the same feature columns as the training data.
loaded = joblib.load(o_filename)
probe_features = {
    'cap_r': [0.2, 0.05],
    'nal_r': [0.05, 0.02],
    'cw_scores_0': [0, 0],
    'cw_scores_1': [0, 0],
    'cw_scores_2': [1, 0],
    'cw_scores_3': [0, 1],
    'cw_scores_4': [1, 0],
    'cw_scores_5': [1, 0],
    'cw_scores_6': [0, 0],
}
loaded.predict(pandas.DataFrame(probe_features))