In [1]:
import json
import logging
import pickle
from io import BytesIO

import boto3
import joblib
import numpy as np

from healthinsurance.HealthInsurance import HealthInsurance

In [2]:
# Configure the root logger (getLogger() called without a name) so that
# records at INFO level and above are emitted
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def DownloadFromS3(bucket:str, key:str):
    """Fetch an object from an S3 bucket into memory and deserialize it with joblib.

    Args:
        bucket: Name of the S3 bucket that holds the object.
        key: Key of the object inside the bucket.

    Returns:
        Whatever object ``joblib.load`` reconstructs from the downloaded bytes.

    Note:
        joblib deserialization is pickle-based, so only objects from a trusted
        bucket should be loaded this way.
    """
    client = boto3.client('s3')

    # The object is streamed into an in-memory buffer; nothing is written to disk.
    with BytesIO() as buffer:
        client.download_fileobj(Bucket=bucket, Key=key, Fileobj=buffer)
        buffer.seek(0)  # rewind so joblib reads from the first byte
        return joblib.load(buffer)

In [3]:
# Load model into memory
logger.info('Loading model from file...')

# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so this file must come from a trusted source.
# The context manager closes the file handle as soon as the model has been
# read, instead of leaving it open until the garbage collector reclaims it.
with open('./model_random_forest.pkl', 'rb') as model_file:  # where the model is stored
    model = pickle.load(model_file)

logger.info('Model loaded from file.')

In [76]:
# Lambda Function

# S3 location of the test dataset and the fraction of the dataset to keep.
# NOTE(review): the values are hardcoded in this cell; presumably they are meant
# to come from the JSON data packet sent to the Lambda -- confirm.
bucket = 'joaomj-lambda-buckets-2022'
key = 'validation/df_test.joblib'
percentage = 0.4

# load test data from S3
logger.info(f'Loading data from {bucket}/{key}')
df_test_raw = DownloadFromS3(bucket, key)
# Log the type of the object that was actually loaded (not the type of the key string)
logger.info(f'Loaded {type(df_test_raw)} from S3...')

# ========================================
# code from 'handler.py'
#
# Runs the loaded test data through the HealthInsurance pipeline stages in
# order (cleaning -> feature engineering -> preparation -> prediction); each
# stage consumes the previous stage's output.


# instantiate HealthInsurance class
logger.info('Instantiating HealthInsurance class...')
pipeline = HealthInsurance()
logger.info('HealthInsurance class instantiated...')

# data cleaning
logger.info('Data cleaning...')
# Work on a copy so that any in-place changes made by the pipeline do not
# touch df_test_raw, which receives the predictions at the end of this cell.
df_test = df_test_raw.copy()
df1 = pipeline.data_cleaning(df_test)

# feature engineering
logger.info('Feature engineering...')
df2 = pipeline.feature_engineering(df1)

# data preparation
logger.info('Data preparation...')
df3 = pipeline.data_preparation(df2)

# prediction
logger.info('Making prediction...')
# `model` is the object unpickled from './model_random_forest.pkl' earlier in the notebook
prediction = pipeline.get_prediction(model, df3)

# join prediction with original test data
logger.info('Joining prediction with test data...')
# If a 'response' column exists it is renamed to 'score' and its values are then
# overwritten by the predictions; otherwise 'score' is added as a new column.
# Both statements mutate df_test_raw in place.
# NOTE(review): assumes `prediction` holds one value per row of df_test_raw, in
# the same row order -- confirm against HealthInsurance.get_prediction.
df_test_raw.rename(columns = {'response':'score'}, inplace = True)
df_test_raw['score'] = prediction

In [81]:
# use only the specified percentage of dataset

# A fraction outside [0, 1] would produce meaningless slice bounds below.
if not 0 <= percentage <= 1:
    raise ValueError(f'percentage must be between 0 and 1, got {percentage}')

# Rank rows from highest to lowest score and renumber the index 0..n-1.
df_test_raw = df_test_raw.sort_values(by = 'score', ascending = False, ignore_index = True)

# Cut the ranked frame at two positions:
#   a -> rows before top_end (the highest-scored `percentage` of the rows)
#   b -> rows from top_end up to bottom_start (empty when percentage > 0.5)
#   c -> rows from bottom_start to the end
# Positional slicing is used instead of np.split because np.split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
n_rows = len(df_test_raw)
top_end = int(percentage * n_rows)
bottom_start = int((1 - percentage) * n_rows)
a = df_test_raw.iloc[:top_end]
b = df_test_raw.iloc[top_end:bottom_start]
c = df_test_raw.iloc[bottom_start:]

In [78]:
# return json to be used by API
logger.info('Preparing response as JSON file...')

# Serialize the rows of `a` as a compact JSON array of arrays
# (values only; column names are not part of the payload).
payload_rows = a.values.tolist()
response = json.dumps(payload_rows, separators=(',', ':'))

# Report the size of the UTF-8 encoded payload in megabytes.
BYTES_PER_MB = 1024 * 1024
size_obj1 = len(response.encode('utf-8'))
print('The size is: {} MB'.format(size_obj1 / BYTES_PER_MB))

The size is: 4.216194152832031 MB
