In [1]:
import pandas as pd
import os
import yaml
import psycopg2
import boto3
from sqlalchemy.engine import create_engine

# Parse the YAML settings file; later cells read the PostgreSQL URL and the
# AWS credentials/bucket name from the resulting `config` mapping.
with open("../config.yaml") as config_file:
    config = yaml.safe_load(config_file)

### Split Datasets

In [2]:
# Read the source CSV that the following cells split column-wise.
heart_csv_path = "../../level_2/heart.csv"
df = pd.read_csv(heart_csv_path)

In [3]:
# Column lists for the three output subsets. Every subset starts with
# 'patient_id' (presumably so the split files can be re-joined later;
# confirm with downstream consumers).
id_column = ['patient_id']

categorical_set = id_column + [
    'age',
    'sex',
    'cp',
    'exang',
    'fbs',
    'slope',
    'thal',
]
continuous_set = id_column + [
    'trestbps',
    'chol',
    'restecg',
    'thalach',
    'oldpeak',
    'ca',
]
target_set = id_column + ['target']

In [4]:
# Slice the full frame into one sub-frame per column list; the order of the
# targets on the left matches the order of the lists on the right.
df_categorical, df_continuous, df_target = [
    df[columns] for columns in (categorical_set, continuous_set, target_set)
]

In [6]:
# Write each subset to a CSV in the working directory, without the index.
csv_exports = {
    "heart_disease_categorical.csv": df_categorical,
    "heart_disease_continuous.csv": df_continuous,
    "heart_disease_target.csv": df_target,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)

# The target subset is additionally written as Parquet.
df_target.to_parquet("heart_disease_target.parquet", index=False)

### Upload to PostgreSQL

In [6]:
# Build the SQLAlchemy engine from the database URL stored in the config.
postgres_url = config["POSTGRESQL_DB"]
engine = create_engine(postgres_url)

In [7]:
# Load the continuous subset into the "heart_disease_continuous" table.
# if_exists="replace" keeps this cell re-runnable: the pandas default
# ("fail") raises ValueError once the table exists, which breaks
# Restart & Run All. Replacing matches the overwrite behaviour of the
# CSV/Parquet writes and the S3 upload elsewhere in this notebook.
df_continuous.to_sql(
    "heart_disease_continuous",
    engine,
    if_exists="replace",
    index=False,
)

### Upload to S3

In [7]:
# Copy the AWS credentials and region from the config into the process
# environment, where boto3 looks for them by default.
for aws_setting in (
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "AWS_DEFAULT_REGION",
):
    os.environ[aws_setting] = config[aws_setting]

In [8]:
# Open the bucket named in the config.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket(config["AWS_BUCKET"])

# Upload the categorical CSV under the "heart/" prefix.
my_bucket.upload_file(
    Filename="heart_disease_categorical.csv",
    Key="heart/heart_disease_categorical.csv",
)

# Collect every key under the prefix so the cell output shows what is stored.
objects = [summary.key for summary in my_bucket.objects.filter(Prefix="heart/")]

objects

['heart/heart.csv',
 'heart/heart_disease_categorial.csv',
 'heart/heart_disease_categorical.csv',
 'heart/heart_ingested.csv']