In [1]:
import json
import logging
import pickle
from io import BytesIO
from io import StringIO

import boto3
import joblib
import numpy as np
import pandas as pd

from healthinsurance.HealthInsurance import HealthInsurance

In [2]:
# Configure the root logger so INFO-level progress messages are actually emitted.
# basicConfig() attaches a stderr handler only when the root logger has none, so
# it is a no-op wherever a handler is already installed (e.g. by a hosting runtime).
# Without any handler, Python falls back to its last-resort handler, which drops
# everything below WARNING - the logger.info() calls would print nothing.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Helper function: fetch an object stored in an S3 bucket and deserialize it
def DownloadFromS3(bucket:str, key:str):
    """Download ``key`` from ``bucket`` into memory and load it with joblib.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket that holds the object.
    key : str
        Key (path) of the object inside the bucket.

    Returns
    -------
    object
        Whatever ``joblib.load`` reconstructs from the downloaded bytes.

    Notes
    -----
    ``joblib.load`` is pickle-based and can execute arbitrary code while
    loading, so only point this helper at buckets you trust.
    """
    client = boto3.client('s3')

    # The payload is kept in an in-memory buffer; nothing is written to disk.
    with BytesIO() as buffer:
        client.download_fileobj(Bucket = bucket, Key = key, Fileobj = buffer)
        # Rewind so joblib reads from the start of the downloaded bytes.
        buffer.seek(0)
        return joblib.load(buffer)

In [3]:
# Load model into memory
logger.info('Loading model from file...')

# NOTE: pickle can execute arbitrary code while loading - only unpickle model
# files that come from a trusted source.
# The context manager closes the file handle as soon as the model has been read
# (the previous bare open() left it open until garbage collection).
with open('./model_random_forest.pkl', 'rb') as model_file: # where the model is stored
    model = pickle.load(model_file)

logger.info('Model loaded from file.')

In [76]:
# Lambda Function (prototype): load the test dataset from S3, run it through the
# HealthInsurance pipeline and attach the model prediction as a 'score' column.

# S3 location of the test dataset and the fraction of rows to keep later on.
# These values are hard-coded in this notebook.
# NOTE(review): presumably they arrive in the request's JSON packet once deployed - confirm.
bucket = 'joaomj-lambda-buckets-2022'
key = 'validation/df_test.joblib'
percentage = 0.4

# load test data from S3
logger.info(f'Loading data from {bucket}/{key}')
df_test_raw = DownloadFromS3(bucket, key)
# log the type of the object that was loaded (not the type of the key string)
logger.info(f'Loaded {type(df_test_raw)} from S3...')

# ========================================
# code from 'handler.py'


# instantiate HealthInsurance class
logger.info('Instantiating HealthInsurance class...')
pipeline = HealthInsurance()
logger.info('HealthInsurance class instantiated...')

# data cleaning (works on a copy so df_test_raw is not passed to the pipeline)
logger.info('Data cleaning...')
df_test = df_test_raw.copy()
df1 = pipeline.data_cleaning(df_test)

# feature engineering
logger.info('Feature engineering...')
df2 = pipeline.feature_engineering(df1)

# data preparation
logger.info('Data preparation...')
df3 = pipeline.data_preparation(df2)

# prediction
logger.info('Making prediction...')
prediction = pipeline.get_prediction(model, df3)

# join prediction with original test data:
# an existing 'response' column is renamed to 'score' and then overwritten with
# the prediction; if there is no such column, 'score' is simply added.
logger.info('Joining prediction with test data...')
df_test_raw.rename(columns = {'response':'score'}, inplace = True)
df_test_raw['score'] = prediction

In [81]:
# use only the specified percentage of dataset

# Rank rows from the highest to the lowest score. A stable sort keeps rows with
# tied scores in their current relative order, so re-running this cell on an
# already sorted frame does not reshuffle them.
df_test_raw = df_test_raw.sort_values(by = 'score', ascending = False, ignore_index = True, kind = 'stable')

# Boundaries are computed on the frame that is actually being sliced.
# Clamping bottom_start keeps the three chunks contiguous and non-overlapping
# even when percentage > 0.5 (the unclamped boundaries would cross).
n_rows = len(df_test_raw)
top_end = int(percentage * n_rows)
bottom_start = max(top_end, int((1 - percentage) * n_rows))

# a: top `percentage` of rows, b: rows in between, c: remaining bottom rows.
# Plain positional slicing replaces np.split, which on a DataFrame goes through
# DataFrame.swapaxes - deprecated in recent pandas releases.
a = df_test_raw.iloc[:top_end]
b = df_test_raw.iloc[top_end:bottom_start]
c = df_test_raw.iloc[bottom_start:]

In [78]:
# Serialize the selected slice as column-oriented JSON for the API
logger.info('Preparing response as JSON file...')
response = a.to_json(
    orient = 'columns',
    date_format = 'iso',
    double_precision = 4,
)

# Payload size is measured on the UTF-8 encoded bytes, not on the character count
size_obj = len(bytes(response, 'utf-8'))
bytes_per_mb = 1024*1024

print('The size is: {} MB'.format(size_obj/bytes_per_mb)) # size in megabytes

The size is: 4.216194152832031 MB


In [82]:
# Is a bare list-of-rows payload smaller than the DataFrame JSON?

# alternative kept for reference:
# response = df_test.to_json(orient = 'records', date_format = 'iso')

# Keep only the row values as nested Python lists (column labels are dropped)
rows_as_lists = a.values.tolist()
# Compact separators: no whitespace after ',' or ':'
response1 = json.dumps(rows_as_lists, separators=(',', ':'))
size_obj1 = len(bytes(response1, 'utf-8'))
print('The size is: {} MB'.format(size_obj1/(1024*1024))) # size in megabytes

The size is: 1.6101608276367188 MB


In [79]:
# Preview the first rows of `a`, the slice that is serialized for the response
a.head()

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,driving_license,vehicle_age,vehicle_damage,previously_insured,annual_premium,vintage,score
0,43790,Female,25,50,152,1,New,0,0,31559.0,116,1.0
1,227205,Female,24,18,152,1,New,0,1,27473.0,66,1.0
2,14595,Male,51,47,113,1,Average,0,1,32497.0,216,1.0
3,35188,Female,31,18,152,1,New,0,1,24021.0,205,1.0
4,11138,Male,27,46,152,1,New,0,1,27043.0,163,1.0


In [87]:
# viewing the response

# The list-based payload carries no column labels, so they are restored by position.
response_columns = [
    'id',
    'gender',
    'age',
    'region_code',
    'policy_sales_channel',
    'driving_license',
    'vehicle_age',
    'vehicle_damage',
    'previously_insured',
    'annual_premium',
    'vintage',
    'score',
]

# Wrap the payload in StringIO: passing a literal JSON string straight to
# pd.read_json is deprecated in recent pandas releases.
resp = pd.read_json(StringIO(response1))

# read_json labels the columns 0..N-1; map each position to its name
# (rename returns a new frame, so re-running the cell is safe).
resp = resp.rename(columns = dict(enumerate(response_columns)))
resp.head()

Unnamed: 0,id,gender,age,region_code,policy_sales_channel,driving_license,vehicle_age,vehicle_damage,previously_insured,annual_premium,vintage,score
0,43790,Female,25,50,152,1,New,0,0,31559,116,1
1,150717,Male,41,28,124,1,Average,0,0,27124,227,1
2,361307,Male,71,28,26,1,Average,0,1,54077,294,1
3,89594,Male,23,8,152,1,New,0,1,199154,191,1
4,268871,Female,20,45,160,1,New,0,1,28137,143,1


In [None]:
#     # # make predictions and return them as JSON
#     # logger.info(f'Performing predictions...')
#     # predictions = model.predict(df_test)
#     # response = json.dumps(predictions.tolist())

#     # return {
#     #     'statusCode': 200,
#     #     'headers':{
#     #         'Content-type':'application/json'
#     #     },
#     #     'body':response 
#     # }