# Database sample

## Store the database in the cloud

To support a collaborative preprocessing environment, a chunk of the original database is stored in an AWS service: RDS.

In [1]:
# Imports
import os
import pandas as pd
import random
import time
import string
from sqlalchemy import create_engine
from config import db_password, db_scp_password
import warnings

In [2]:
# Notebook-wide settings
# Render every column of a DataFrame instead of truncating wide tables.
pd.options.display.max_columns = None
# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

In [3]:
# extract only a chunk of 40000 rows from the local file
def readData(*argv, nrows=40000):
    """Load the first rows of a CSV file into a DataFrame.

    Parameters
    ----------
    *argv : str
        Path components of the CSV file, joined in order with
        ``os.path.join`` (for example ``'Resources', 'RawData', 'file.csv'``).
        Any number of components is accepted.
    nrows : int, optional
        Maximum number of data rows to read from the top of the file.
        Defaults to 40000.

    Returns
    -------
    pandas.DataFrame
        At most ``nrows`` rows of the file, in file order.
    """
    relative_path = os.path.join(*argv)
    # Reading with ``nrows`` lets pandas stop after the requested rows and
    # close the file itself, so no open chunk reader is left behind.
    return pd.read_csv(relative_path, nrows=nrows)

# store data using AWS-RDS
def storeDatabase(df,table_name, password,location,port,database):
    """Append a DataFrame to a table in a PostgreSQL database.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to store.
    table_name : str
        Name of the destination table; rows are appended if it already exists.
    password : str
        Password of the ``postgres`` user used for the connection.
    location : str
        Host name of the database server.
    port : int or str
        Port the database server listens on.
    database : str
        Name of the database to connect to.
    """
    # Authenticate with the caller-supplied password rather than a
    # module-level credential, so the function works for any database.
    db_string = f"postgresql://postgres:{password}@{location}:{port}/{database}"
    engine = create_engine(db_string)
    try:
        df.to_sql(name=table_name, con=engine, if_exists='append')
    finally:
        # Release the pooled connections held by this short-lived engine.
        engine.dispose()

In [4]:
# Read the first chunk of the original dataset from the raw CSV file
data_df = readData(
    'Resources',
    'RawData',
    '220616COVID19MEXICO.csv',
)


In [5]:
# Translate the original Spanish column headers into the English names used
# by the database schema (the frame is modified in place).
data_df.rename(
    columns={
        # record metadata
        'FECHA_ACTUALIZACION': 'data_file_updated',
        'ID_REGISTRO': 'id_patient',
        'ORIGEN': 'resp_monitoring',
        'SECTOR': 'type_institution',
        'ENTIDAD_UM': 'state_medical_unit',
        # patient demographics
        'SEXO': 'gender',
        'ENTIDAD_NAC': 'state_patient_birth',
        'ENTIDAD_RES': 'state_residence',
        # NOTE(review): MUNICIPIO_RES looks like a residence field but is
        # mapped to a "birth" column name - confirm against the data dictionary.
        'MUNICIPIO_RES': 'city_patient_birth',
        'TIPO_PACIENTE': 'type_patient',
        # dates
        'FECHA_INGRESO': 'date_admitted',
        'FECHA_SINTOMAS': 'date_patient_symp',
        'FECHA_DEF': 'date_patient_death',
        # clinical status
        'INTUBADO': 'intubated',
        'NEUMONIA': 'pneumonia',
        'EDAD': 'age',
        'NACIONALIDAD': 'resident',
        'EMBARAZO': 'pregnant',
        'HABLA_LENGUA_INDIG': 'indigenous_lang',
        'INDIGENA': 'indigenous',
        # pre-existing conditions
        'DIABETES': 'diabetes',
        'EPOC': 'copd',
        'ASMA': 'asthma',
        'INMUSUPR': 'immunosup',
        'HIPERTENSION': 'hypertension',
        'OTRA_COM': 'another_complication',
        'CARDIOVASCULAR': 'cardiovascular',
        'OBESIDAD': 'obesity',
        'RENAL_CRONICA': 'renal_chronic',
        'TABAQUISMO': 'tobacco',
        # contact, tests and classification
        'OTRO_CASO': 'closed_contanct',
        'TOMA_MUESTRA_LAB': 'lab_sample',
        'RESULTADO_LAB': 'lab_result',
        'TOMA_MUESTRA_ANTIGENO': 'antigen_sample',
        'RESULTADO_ANTIGENO': 'antigen_result',
        'CLASIFICACION_FINAL': 'final_class',
        # origin and intensive care
        'MIGRANTE': 'migrant',
        'PAIS_NACIONALIDAD': 'country_nationality',
        'PAIS_ORIGEN': 'country_patient_birth',
        'UCI': 'icu',
    },
    inplace=True,
)

In [6]:
# Create the engine for the sample_covid_patients database, which holds a
# chunk of the original dataset
db_string = (
    "postgresql://postgres:"
    f"{db_scp_password}"
    "@prodSampleCovidPatients.cqbgcjbaetrj.us-west-1.rds.amazonaws.com"
    ":5432/sample_covid_patients"
)
engine = create_engine(db_string)

In [7]:
# run sql to store data into AWS-RDS
# data is already loaded, so this step is not required
# uncomment the line below in case you need it

# data_df.to_sql(name='patient', con=engine, if_exists='append')

In [8]:
## Load the patient table into a DataFrame
# The query text is assembled from adjacent string literals; the pieces join
# without any added whitespace.
sql_query = (
    'SELECT index, id_patient, state_medical_unit, state_residence, data_file_updated,'
    'resp_monitoring, type_institution, gender, state_patient_birth, city_patient_birth, type_patient,'
    'date_admitted, date_patient_symp, date_patient_death, intubated, pneumonia, age, resident, pregnant,'
    'indigenous_lang, indigenous, diabetes, copd, asthma, immunosup, hypertension, another_complication,'
    'cardiovascular, obesity, renal_chronic, tobacco, closed_contanct, lab_sample, lab_result, antigen_sample,'
    'antigen_result, final_class, migrant, country_nationality, country_patient_birth, icu '
    'FROM public.patient;'
)

# Run the query and preview the first 15 rows
chunck_patients_df = pd.read_sql_query(sql=sql_query, con=engine)
chunck_patients_df.head(n=15)

Unnamed: 0,index,id_patient,state_medical_unit,state_residence,data_file_updated,resp_monitoring,type_institution,gender,state_patient_birth,city_patient_birth,type_patient,date_admitted,date_patient_symp,date_patient_death,intubated,pneumonia,age,resident,pregnant,indigenous_lang,indigenous,diabetes,copd,asthma,immunosup,hypertension,another_complication,cardiovascular,obesity,renal_chronic,tobacco,closed_contanct,lab_sample,lab_result,antigen_sample,antigen_result,final_class,migrant,country_nationality,country_patient_birth,icu
0,0,z3bf80,8,8,2022-06-16,2,12,2,8,37,1,2020-07-28,2020-07-20,9999-99-99,97,2,35,1,97,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,2,97,3,99,México,97,97
1,1,z1e370,14,14,2022-06-16,1,12,1,14,85,1,2020-04-22,2020-04-18,9999-99-99,97,2,42,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,1,2,2,97,7,99,México,97,97
2,2,zze974,24,24,2022-06-16,1,6,1,24,35,1,2021-02-28,2021-02-20,9999-99-99,97,99,34,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,2,2,97,7,99,México,97,97
3,3,zz7067,9,9,2022-06-16,1,12,2,9,7,1,2020-08-18,2020-08-17,9999-99-99,97,2,51,1,97,2,2,2,2,2,2,1,2,2,2,2,2,2,1,2,2,97,7,99,México,97,97
4,4,z1da1e,1,1,2022-06-16,1,12,2,1,1,1,2020-03-09,2020-03-05,9999-99-99,97,99,30,1,97,1,2,2,2,2,2,2,2,2,2,2,2,1,1,2,2,97,7,99,México,97,97
5,5,z393a3,9,9,2022-06-16,1,12,1,9,17,1,2020-12-28,2020-12-28,9999-99-99,97,2,47,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,97,1,2,7,99,México,97,97
6,6,z59dea,7,7,2022-06-16,1,12,2,7,78,1,2020-06-28,2020-06-24,9999-99-99,97,2,47,1,97,2,2,2,2,2,2,1,2,2,2,2,2,2,2,97,2,97,6,99,México,97,97
7,7,z5ba5b,8,8,2022-06-16,2,12,2,10,37,1,2020-07-31,2020-07-28,9999-99-99,97,2,38,1,97,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,97,7,99,México,97,97
8,8,z2eace,15,15,2022-06-16,1,3,1,15,106,2,2020-09-23,2020-09-20,9999-99-99,2,2,7,1,2,2,2,2,2,2,2,2,2,2,1,2,2,2,1,4,2,97,6,99,México,97,2
9,9,z38de4,7,7,2022-06-16,1,12,1,7,101,1,2020-05-23,2020-05-20,9999-99-99,97,2,7,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,97,2,97,6,99,México,97,97


# First Segment

* ✓ Sample data that mimics the expected final database structure or schema 
* ✓ Draft machine learning module is connected to the provisional database

In [9]:
# Functions
    
# function to sample data from original field sets    
def sampleData(chunk_size=40000):
    """Build a DataFrame of randomly generated patient records.

    Every coded column is drawn uniformly, with replacement, from a small
    set of integer codes, and every date column holds random dates produced
    by ``randomDate``.

    Parameters
    ----------
    chunk_size : int, optional
        Number of rows (patients) to generate. Defaults to 40000.

    Returns
    -------
    pandas.DataFrame
        ``chunk_size`` rows, with ``id_patient`` running from 0 to
        ``chunk_size - 1`` and one column per patient attribute.
    """
    def pick(options):
        # One random code per row.
        return random.choices(options, k=chunk_size)

    def flag():
        # Binary 0/1 indicator column.
        return pick([0, 1])

    def dates(start, end):
        # One random 'YYYY-MM-DD' date per row between start and end.
        return [randomDate(start, end, random.random()) for _ in range(chunk_size)]

    ten_codes = range(10)  # codes 0-9 standing in for the 10 most important values

    # The columns are generated in this exact order so that a seeded
    # ``random`` stream keeps yielding the same values for every column.
    generated = {
        'id_patient': list(range(chunk_size)),
        'state_medical_unit': pick(ten_codes),
        'state_residence': pick(ten_codes),
        'data_file_updated': dates('2020-04-19', '2020-06-03'),
        'resp_monitoring': flag(),
        'type_institution': pick(range(11)),  # institution codes 0-10
        'gender': flag(),
        'state_patient_birth': pick(ten_codes),
        'city_patient_birth': pick(ten_codes),
        'type_patient': flag(),
        'date_admitted': dates('2020-01-01', '2020-06-03'),
        'date_patient_symp': dates('2020-01-01', '2020-06-02'),
        # a generated date of 2020-06-04 is reported as '9999-99-99'
        'date_patient_death': dates('2020-01-11', '2020-06-04'),
        'intubated': flag(),
        'pneumonia': flag(),
        'age': pick(range(5)),  # age interval index 0-4
        'resident': flag(),
        'pregnant': flag(),
        'indigenous_lang': flag(),
        'diabetes': flag(),
        'copd': flag(),
        'asthma': flag(),
        'immunosup': flag(),
        'hypertension': flag(),
        'cardiovascular': flag(),
        'another_complication': flag(),
        'obesity': flag(),
        'renal_chronic': flag(),
        'tobacco': flag(),
        'closed_contanct': flag(),
        'indigenous': flag(),
        'lab_sample': flag(),
        'lab_result': flag(),  # pending results are not represented
        'antigen_sample': flag(),
        'antigen_result': flag(),
        'final_class': pick([0, 1, 2]),  # three classification codes
        'migrant': flag(),
        'country_nationality': pick(ten_codes),
        'country_patient_birth': pick(ten_codes),
        'icu': flag(),
    }

    # Column order of the returned frame (differs slightly from the
    # generation order above).
    column_order = (
        'id_patient',
        'state_medical_unit',
        'state_residence',
        'data_file_updated',
        'resp_monitoring',
        'type_institution',
        'gender',
        'state_patient_birth',
        'city_patient_birth',
        'type_patient',
        'date_admitted',
        'date_patient_symp',
        'date_patient_death',
        'intubated',
        'pneumonia',
        'age',
        'resident',
        'pregnant',
        'indigenous_lang',
        'indigenous',
        'diabetes',
        'copd',
        'asthma',
        'immunosup',
        'hypertension',
        'another_complication',
        'cardiovascular',
        'obesity',
        'renal_chronic',
        'tobacco',
        'closed_contanct',
        'lab_sample',
        'lab_result',
        'antigen_sample',
        'antigen_result',
        'final_class',
        'migrant',
        'country_nationality',
        'country_patient_birth',
        'icu',
    )
    data = {name: generated[name] for name in column_order}
    return pd.DataFrame(data=data)

# function to convert columns to the required data types
def transforDataTypes(data_df):
    """Cast the identifier, date and country columns to their target types.

    ``id_patient`` and the two country columns become pandas strings, while
    ``data_file_updated``, ``date_admitted`` and ``date_patient_symp`` become
    datetimes. All other columns are left untouched. The frame is modified
    in place and also returned.
    """
    conversion_plan = (
        ('id_patient', 'string'),
        ('data_file_updated', 'datetime'),
        ('date_admitted', 'datetime'),
        ('date_patient_symp', 'datetime'),
        ('country_nationality', 'string'),
        ('country_patient_birth', 'string'),
    )
    for column, target in conversion_plan:
        values = data_df[column]
        if target == 'datetime':
            data_df[column] = pd.to_datetime(values)
        else:
            data_df[column] = values.astype(target)
    return data_df

# function adapted from Stack Overflow that returns the date located a given
# proportion of the way between two dates; kept for future use in preprocessing

def strTimeProp(start, end, time_format, prop):
    """Return the date lying a fraction ``prop`` of the way from start to end.

    ``start`` and ``end`` are date strings written in ``time_format``; the
    result is formatted with the same ``time_format``. Timestamps are
    interpreted in local time. A formatted result equal to '2020-06-04' is
    replaced by the placeholder '9999-99-99'.
    """
    def to_epoch(text):
        # Seconds since the epoch for a date string in time_format.
        return time.mktime(time.strptime(text, time_format))

    begin = to_epoch(start)
    span = to_epoch(end) - begin
    picked = time.localtime(begin + prop * span)
    formatted = time.strftime(time_format, picked)
    return '9999-99-99' if formatted == '2020-06-04' else formatted

def randomDate(start, end, prop):
    """Return the 'YYYY-MM-DD' date a fraction ``prop`` between start and end.

    Thin wrapper around ``strTimeProp`` that fixes the date format.
    """
    iso_day_format = '%Y-%m-%d'
    return strTimeProp(start, end, iso_day_format, prop)


In [10]:
# Main flow: generate the random sample, then cast its columns to the
# required data types
sample_df = transforDataTypes(sampleData())

In [11]:
# Create the engine for the covid_patients database, which holds the
# generated sample data
db_string = (
    "postgresql://postgres:"
    f"{db_password}"
    "@covidpatients.cqbgcjbaetrj.us-west-1.rds.amazonaws.com"
    ":5432/covid_patients"
)
engine = create_engine(db_string)

In [12]:
# run sql to store data into AWS-RDS
# sample_df.to_sql(name='patient', con=engine, if_exists='append')

In [13]:
## Load the patient table into a DataFrame
# The query text is assembled from adjacent string literals; the pieces join
# without any added whitespace.
sql_query = (
    'SELECT index, id_patient, state_medical_unit, state_residence, data_file_updated,'
    'resp_monitoring, type_institution, gender, state_patient_birth, city_patient_birth, type_patient,'
    'date_admitted, date_patient_symp, date_patient_death, intubated, pneumonia, age, resident, pregnant,'
    'indigenous_lang, indigenous, diabetes, copd, asthma, immunosup, hypertension, another_complication,'
    'cardiovascular, obesity, renal_chronic, tobacco, closed_contanct, lab_sample, lab_result, antigen_sample,'
    'antigen_result, final_class, migrant, country_nationality, country_patient_birth, icu '
    'FROM public.patient;'
)

# Run the query and preview the first 15 rows
patients_df = pd.read_sql_query(sql=sql_query, con=engine)
patients_df.head(n=15)

Unnamed: 0,index,id_patient,state_medical_unit,state_residence,data_file_updated,resp_monitoring,type_institution,gender,state_patient_birth,city_patient_birth,type_patient,date_admitted,date_patient_symp,date_patient_death,intubated,pneumonia,age,resident,pregnant,indigenous_lang,indigenous,diabetes,copd,asthma,immunosup,hypertension,another_complication,cardiovascular,obesity,renal_chronic,tobacco,closed_contanct,lab_sample,lab_result,antigen_sample,antigen_result,final_class,migrant,country_nationality,country_patient_birth,icu
0,0,0,5,7,2020-05-11,0,7,1,7,2,1,2020-04-16,2020-03-26,2020-05-30,0,0,2,0,1,0,1,1,0,0,1,0,1,0,1,1,1,0,0,0,1,1,1,0,1,3,1
1,1,1,9,2,2020-05-29,1,4,0,8,4,0,2020-03-19,2020-04-02,2020-04-09,1,1,2,0,0,0,0,1,0,1,1,0,1,0,1,1,0,1,1,1,1,1,2,0,7,9,0
2,2,2,9,4,2020-05-18,0,7,0,2,6,1,2020-02-06,2020-05-12,2020-04-09,1,1,2,0,1,0,1,0,1,0,1,1,1,0,1,0,0,0,1,1,1,0,0,0,1,2,0
3,3,3,5,9,2020-05-05,1,6,0,3,1,0,2020-03-24,2020-04-27,2020-04-06,0,1,1,1,1,1,0,1,0,0,1,0,0,0,0,1,1,1,0,0,1,0,1,0,2,7,1
4,4,4,4,7,2020-05-30,0,3,1,7,3,1,2020-05-19,2020-05-14,2020-03-25,0,1,1,1,0,1,0,0,1,1,1,0,0,1,0,1,0,0,1,1,0,0,0,1,9,6,0
5,5,5,3,4,2020-05-14,1,9,1,5,0,1,2020-01-11,2020-03-07,2020-05-01,0,1,4,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0
6,6,6,5,6,2020-05-09,1,5,0,8,1,1,2020-01-09,2020-05-08,2020-02-11,1,0,0,0,1,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,2,1,3,1,0
7,7,7,2,7,2020-05-25,1,2,1,7,1,0,2020-01-06,2020-04-24,2020-05-13,0,0,3,1,0,0,1,0,0,0,0,1,0,0,1,1,0,1,0,1,1,1,0,0,6,7,1
8,8,8,3,7,2020-04-27,1,2,0,4,3,1,2020-05-23,2020-05-21,2020-03-25,1,0,2,1,1,1,0,0,0,0,1,1,0,1,0,0,1,0,1,1,1,1,0,0,1,2,1
9,9,9,1,3,2020-04-20,0,0,1,4,3,1,2020-03-17,2020-05-02,2020-03-08,0,1,2,1,0,0,1,0,1,0,0,0,1,1,1,0,0,1,0,1,0,0,0,0,5,6,1


In [14]:
## Data is ready for final Preprocessing