In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Location of the source dataset in S3.
bucket = 'jasonwang-insurance-bucket'  # replace with your bucket name
data_key = 'data'  # replace with your data key
data_location = f's3://{bucket}/{data_key}'

# Load the parquet data at that S3 location into a DataFrame.
df = pd.read_parquet(data_location)

In [10]:
# Preview the first ten rows of the raw data.
df.head(n=10)

Unnamed: 0,recordid,customerid,generalhealth,checkup,exercise,heartdisease,skincancer,othercancer,depression,diabetes,...,sex,agecategory,height,weight,bmi,smokinghistory,alcoholconsumption,fruitconsumption,greenvegetablesconsumption,friedpotatoconsumption
0,271475,,Poor,Within the past year,False,False,True,False,False,False,...,F,80+,140,39.46,20.22,True,0,4,20,5
1,20678,,Good,Within the past year,False,False,False,False,False,False,...,M,60-64,178,88.0,27.84,False,6,30,30,8
2,274280,,Fair,Within the past year,True,False,False,False,False,True,...,M,60-64,178,113.4,35.87,False,20,12,8,8
3,5348,,Good,5 or more years ago,True,False,False,False,False,False,...,M,65-69,170,77.11,26.63,True,28,3,24,16
4,108221,,Fair,Within the past year,True,False,True,True,True,False,...,M,65-69,170,68.04,23.49,True,16,3,16,4
5,147978,,Excellent,Within the past year,True,False,False,False,True,False,...,F,65-69,163,61.23,23.17,True,0,60,30,4
6,89857,,Excellent,Within the past year,False,False,False,False,False,False,...,M,55-59,170,88.45,30.54,True,4,60,16,4
7,243693,,Very Good,Within the past year,True,False,False,False,False,False,...,M,75-79,191,81.65,22.5,False,0,30,4,8
8,211681,,Excellent,Within the past 2 years,True,False,False,False,False,False,...,F,30-34,160,49.9,19.49,False,0,60,4,4
9,154936,,Excellent,Within the past year,True,False,False,True,False,False,...,F,60-64,163,67.59,25.58,True,8,30,20,10


In [11]:
# Show the dtype of every column to tell categorical (object) columns from
# boolean and numeric ones before encoding.
df.dtypes

recordid                        int32
customerid                    float64
generalhealth                  object
checkup                        object
exercise                         bool
heartdisease                     bool
skincancer                       bool
othercancer                      bool
depression                       bool
diabetes                         bool
arthritis                        bool
sex                            object
agecategory                    object
height                          int32
weight                        float64
bmi                           float64
smokinghistory                   bool
alcoholconsumption              int32
fruitconsumption                int32
greenvegetablesconsumption      int32
friedpotatoconsumption          int32
dtype: object

In [17]:
# String-valued columns that get expanded into one-hot indicator columns.
categorical_columns = ['generalhealth', 'checkup', 'sex', 'agecategory']

# Drop the identifier columns, then one-hot encode the categoricals.
# Built as a single expression from `df`, so re-running the cell is idempotent
# and `df` itself is never modified.
df_encoded = pd.get_dummies(
    df.drop(columns=['recordid', 'customerid']),
    columns=categorical_columns,
)

In [18]:
# Preview the first ten rows after one-hot encoding.
df_encoded.head(n=10)

Unnamed: 0,exercise,heartdisease,skincancer,othercancer,depression,diabetes,arthritis,height,weight,bmi,...,agecategory_35-39,agecategory_40-44,agecategory_45-49,agecategory_50-54,agecategory_55-59,agecategory_60-64,agecategory_65-69,agecategory_70-74,agecategory_75-79,agecategory_80+
0,False,False,True,False,False,False,True,140,39.46,20.22,...,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,178,88.0,27.84,...,False,False,False,False,False,True,False,False,False,False
2,True,False,False,False,False,True,True,178,113.4,35.87,...,False,False,False,False,False,True,False,False,False,False
3,True,False,False,False,False,False,False,170,77.11,26.63,...,False,False,False,False,False,False,True,False,False,False
4,True,False,True,True,True,False,True,170,68.04,23.49,...,False,False,False,False,False,False,True,False,False,False
5,True,False,False,False,True,False,False,163,61.23,23.17,...,False,False,False,False,False,False,True,False,False,False
6,False,False,False,False,False,False,False,170,88.45,30.54,...,False,False,False,False,True,False,False,False,False,False
7,True,False,False,False,False,False,False,191,81.65,22.5,...,False,False,False,False,False,False,False,False,True,False
8,True,False,False,False,False,False,False,160,49.9,19.49,...,False,False,False,False,False,False,False,False,False,False
9,True,False,False,True,False,False,False,163,67.59,25.58,...,False,False,False,False,False,True,False,False,False,False


In [19]:
# Feature matrix: every encoded column except the two prediction targets.
X = df_encoded.drop(columns=['heartdisease', 'diabetes'])
y_heart = df_encoded['heartdisease']
y_diabetes = df_encoded['diabetes']

# 80/20 train/test split per target. Both calls receive the same X, test_size
# and random_state.
X_train_heart, X_test_heart, y_train_heart, y_test_heart = train_test_split(
    X, y_heart, test_size=0.2, random_state=42
)
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(
    X, y_diabetes, test_size=0.2, random_state=42
)

# Standardise every feature column. The scaler is fitted on the training
# portion only and then applied to the matching test portion.
ct = ColumnTransformer(
    [('standardize', StandardScaler(), X.columns)],
    remainder='passthrough',
)
X_train_heart, X_test_heart = ct.fit_transform(X_train_heart), ct.transform(X_test_heart)
# The same transformer object is re-fitted here, so after this cell `ct` holds
# the statistics of the diabetes training split.
X_train_diabetes, X_test_diabetes = ct.fit_transform(X_train_diabetes), ct.transform(X_test_diabetes)

In [29]:
# Fit a logistic-regression classifier for the heart-disease target and print
# per-class precision/recall/F1 on the held-out split.
model_heart = LogisticRegression()
model_heart.fit(X_train_heart, y_train_heart)
# Predict with the estimator fitted in this cell. The previous code called
# `logreg.predict`, but `logreg` is not defined in any visible cell: on a fresh
# kernel that raises NameError, and on a stale kernel it scores whatever
# estimator happened to be bound to that name.
y_pred_heart = model_heart.predict(X_test_heart)
print(classification_report(y_test_heart, y_pred_heart))

              precision    recall  f1-score   support

       False       0.92      1.00      0.96     56781
        True       0.52      0.06      0.11      4990

    accuracy                           0.92     61771
   macro avg       0.72      0.53      0.53     61771
weighted avg       0.89      0.92      0.89     61771



In [26]:
# Fit a logistic-regression classifier for the diabetes target and print
# per-class precision/recall/F1 on the held-out split.
model_diabetes = LogisticRegression()
model_diabetes.fit(X_train_diabetes, y_train_diabetes)
# Predict with the estimator fitted in this cell. The previous code called
# `logreg.predict`, but `logreg` is not defined in any visible cell: on a fresh
# kernel that raises NameError, and on a stale kernel it scores whatever
# estimator happened to be bound to that name.
y_pred_diabetes = model_diabetes.predict(X_test_diabetes)
print(classification_report(y_test_diabetes, y_pred_diabetes))

              precision    recall  f1-score   support

       False       0.87      0.99      0.93     53741
        True       0.42      0.03      0.06      8030

    accuracy                           0.87     61771
   macro avg       0.65      0.51      0.49     61771
weighted avg       0.81      0.87      0.82     61771



In [30]:
import joblib

# Serialise both fitted classifiers to files in the working directory.
# NOTE(review): the fitted ColumnTransformer (`ct`) is not saved here, while the
# models were trained on its standardised output. Confirm that whoever loads
# these files can reproduce the same scaling.
joblib.dump(value=model_heart, filename='model_heart.joblib')
joblib.dump(value=model_diabetes, filename='model_diabetes.joblib')

['model_diabetes.joblib']

In [32]:
import boto3
import os  # no longer used in this cell; kept in case later cells rely on it

# Key prefix ("folder") under which the model artifacts are stored in the bucket.
prefix = 'model'

session = boto3.Session()
s3_client = session.client('s3')

# Upload the models to S3.
# S3 object keys always use '/' as the separator, so the keys are built
# explicitly rather than with os.path.join, which would insert a backslash
# when the notebook runs on Windows.
s3_client.upload_file('model_heart.joblib', bucket, f'{prefix}/model_heart.joblib')
s3_client.upload_file('model_diabetes.joblib', bucket, f'{prefix}/model_diabetes.joblib')