# Importing the Cleaned Data (csv)


## Import Dependencies

In [1]:
# basic stuff
import psycopg2
import pandas as pd
import psycopg2.extras
import os
import numpy
from config import (census_key, gkey)
import gmaps
import requests
from ipywidgets.embed import embed_minimal_html
from pprint import pprint

# Imports the method used to connect to DBs
from sqlalchemy import create_engine

# function to establish a session with a connected database
from sqlalchemy.orm import Session

# database compliant datatypes
from sqlalchemy import Column, Integer, String, Float

## Setup the PostgreSQL engine

In [2]:
# Build the SQLAlchemy engine for the medical_no_show_db database.
# The connection URL can be supplied through the MEDICAL_NO_SHOW_DB_URL
# environment variable so real credentials stay out of the notebook; when it
# is unset we fall back to the original local-development URL (user and
# password both "postgres").
engine = create_engine(
    os.environ.get(
        'MEDICAL_NO_SHOW_DB_URL',
        'postgresql://postgres:postgres@localhost:5432/medical_no_show_db',
    )
)


## Clear out data first
### Start with the fact (dependent) tables first, then drop foreign keys, truncate rest of tables and then re-add keys

In [6]:
# truncate the staging table first
# NOTE(review): the load cell further down writes to 'staging_data', while this
# statement and the appointments INSERT use 'staging_table' — confirm which
# table name is intended.
engine.execute('TRUNCATE TABLE staging_table')

# truncate neighborhood with CASCADE; per the original note this also clears
# the appointments table that depends on it
engine.execute('TRUNCATE TABLE neighborhood CASCADE;')

<sqlalchemy.engine.result.ResultProxy at 0x2330df9f4c8>

In [7]:
# sanity check: appointments should come back empty after the cascade truncate
# (only this one table is checked here)
engine.execute("SELECT * FROM appointments;").fetchall()

[]

## Importing the metadata first

### appointments (staging data)

#### Import, preview

In [16]:
# path to the cleaned appointments CSV (medical no-show staging data),
# relative to this notebook
raw_file = os.path.join("..","data","cleanData","appointments.csv")

# read with utf-8-sig; an earlier attempt used ISO-8859-1 (latin) instead
raw_df = pd.read_csv(raw_file, encoding="utf-8-sig")

# preview the raw data
raw_df.head()

Unnamed: 0,PatientID,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighborhood,Welfare_Assistance,Hypertension,Diabetes,Alcoholism,Handicap,SMS_received,No-show,AppointmentDayofWeek,AdvanceBookingDays,SameDayAppt
0,29872499824296,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No,Friday,0,1
1,558997776694438,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No,Friday,0,1
2,4262962299951,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No,Friday,0,1
3,867951213174,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,Friday,0,1
4,8841186448183,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No,Friday,0,1


#### Rename columns to match database

In [18]:
# Map the CSV's original headers onto the snake_case column names used by the
# database, so the dataframe can be written without a per-column mapping.
db_column_names = {
    'PatientID': 'patient_id',
    'AppointmentID': 'appointment_id',
    'Gender': 'gender',
    'ScheduledDay': 'scheduled_day',
    'AppointmentDay': 'appointment_day',
    'Age': 'age',
    'Neighborhood': 'neighborhood',
    'Welfare_Assistance': 'welfare_assistance',
    'Hypertension': 'hypertension',
    'Diabetes': 'diabetes',
    'Alcoholism': 'alcoholism',
    'Handicap': 'handicap',
    'SMS_received': 'sms_received',
    'No-show': 'no_show',
    'AppointmentDayofWeek': 'appointment_day_of_week',
    'AdvanceBookingDays': 'advance_booking_days',
    'SameDayAppt': 'same_day_appt',
}
raw_df = raw_df.rename(columns=db_column_names)

# preview the renamed frame
raw_df.head()


Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighborhood,welfare_assistance,hypertension,diabetes,alcoholism,handicap,sms_received,no_show,appointment_day_of_week,advance_booking_days,same_day_appt
0,29872499824296,5642903,F,2016-04-29,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,No,Friday,0,1
1,558997776694438,5642503,M,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,No,Friday,0,1
2,4262962299951,5642549,F,2016-04-29,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,No,Friday,0,1
3,867951213174,5642828,F,2016-04-29,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No,Friday,0,1
4,8841186448183,5642494,F,2016-04-29,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,No,Friday,0,1


#### Write to PostgreSQL, return rows to verify
##### Caution, to re-run, you have to run the truncate table code above first

In [21]:
# write dataframe to the staging_data table; if_exists='append' ADDS rows to an
# existing table (it does not replace them), so re-running without truncating
# first will duplicate data
# NOTE(review): the truncate cell and the appointments INSERT reference
# 'staging_table', not 'staging_data' — confirm which name is intended.
raw_df.to_sql('staging_data', con=engine, if_exists='append', index=False)

# return the first 10 rows to make sure the data was appended correctly
engine.execute("SELECT * FROM staging_data LIMIT 10").fetchall()


[(29872499824296, 5642903, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 62, 'JARDIM DA PENHA', 0, 1, 0, 0, 0, 0, 'No', 'Friday', 0, 1),
 (558997776694438, 5642503, 'M', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 56, 'JARDIM DA PENHA', 0, 0, 0, 0, 0, 0, 'No', 'Friday', 0, 1),
 (4262962299951, 5642549, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 62, 'MATA DA PRAIA', 0, 0, 0, 0, 0, 0, 'No', 'Friday', 0, 1),
 (867951213174, 5642828, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 8, 'PONTAL DE CAMBURI', 0, 0, 0, 0, 0, 0, 'No', 'Friday', 0, 1),
 (8841186448183, 5642494, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 56, 'JARDIM DA PENHA', 0, 1, 1, 0, 0, 0, 'No', 'Friday', 0, 1),
 (95985133231274, 5626772, 'F', datetime.date(2016, 4, 27), datetime.date(2016, 4, 29), 76, 'REPÚBLICA', 0, 1, 0, 0, 0, 0, 'No', 'Friday', 2, 0),
 (733688164476661, 5630279, 'F', datetime.date(2016, 4, 27), datetime.date(2016, 4, 29), 23, 'GOIA

#### create a neighborhood df
Create a df of unique neighborhoods

In [17]:
# Count appointments per neighborhood and expose the count as AppointmentCount.
# NOTE(review): this uses the ORIGINAL column names ('Neighborhood',
# 'AppointmentID'), so it must run before the rename cell above; executed
# top-to-bottom it would raise KeyError.
neighbor_df = (
    raw_df.groupby('Neighborhood')['AppointmentID']
    .count()
    .rename('AppointmentCount')
    .reset_index()
)

neighbor_df.head()



Unnamed: 0,Neighborhood,AppointmentCount
0,AEROPORTO,8
1,ANDORINHAS,2262
2,ANTÔNIO HONÓRIO,271
3,ARIOVALDO FAVALESSA,282
4,BARRO VERMELHO,423


In [22]:
# path to the cleaned median monthly nominal income by neighborhood CSV
income_file = os.path.join("..","data","cleanData","median_monthly_nominal_income_by_neighborhood.csv")

# read with utf-8-sig, the same encoding used for the appointments file
income_df = pd.read_csv(income_file, encoding="utf-8-sig")

# preview the raw data
income_df.head()

Unnamed: 0,"Mesorregiões, microrregiões, municípios, distritos, subdistritos e bairros",Valor do rendimento nominal mediano mensal das pessoas de 10 anos ou mais de idade (R$)
0,Andorinhas,510.0
1,Antonio Honório,755.0
2,Ariovaldo Favalessa,510.0
3,Barro Vermelho,2000.0
4,Bela Vista,510.0


In [23]:
# Replace the long Portuguese source headers with short database column names.
income_column_names = {
    # "Mesoregions, microregions, municipalities, districts, subdistricts and neighborhoods"
    'Mesorregiões, microrregiões, municípios, distritos, subdistritos e bairros': 'neighborhood',
    # "Median monthly nominal income of persons aged 10 or older (R$)"
    'Valor do rendimento nominal mediano mensal das pessoas de 10 anos ou mais de idade (R$)': 'median_income',
}
income_df = income_df.rename(columns=income_column_names)

# preview the renamed frame
income_df.head()

Unnamed: 0,neighborhood,median_income
0,Andorinhas,510.0
1,Antonio Honório,755.0
2,Ariovaldo Favalessa,510.0
3,Barro Vermelho,2000.0
4,Bela Vista,510.0


In [27]:
# write dataframe to the neighborhood_income_staging table; if_exists='append'
# ADDS rows to an existing table (it does not replace them), so re-running this
# cell will duplicate data unless the table is emptied first
income_df.to_sql('neighborhood_income_staging', con=engine, if_exists='append', index=False)

# return the first 10 rows to make sure the data was appended correctly
engine.execute("SELECT * FROM neighborhood_income_staging LIMIT 10").fetchall()

[('Andorinhas', 510.0),
 ('Antonio Honório', 755.0),
 ('Ariovaldo Favalessa', 510.0),
 ('Barro Vermelho', 2000.0),
 ('Bela Vista', 510.0),
 ('Bento Ferreira', 1500.0),
 ('Boa Vista', 590.0),
 ('Bonfim', 510.0),
 ('Carapina I', 700.0),
 ('Caratoíra', 510.0)]

In [5]:
# pull every row of medical_noshow_data into a dataframe
# (its definition is not shown in this notebook — could be a table or a view)
med_df = pd.read_sql_query('select * from "medical_noshow_data"',con=engine)

# preview
med_df.head()

Unnamed: 0,appointment_id,patient_id,repeat_patient_yn,gender_yn,time_between_sch_appt,same_day_appt_yn,within_week_appt_yn,advanced_appt_yn,monday_yn,tuesday_yn,...,young_adult_yn,adult_yn,senior_yn,welfare_assistance,hypertension,diabetes,alcoholism,handicap_yn,sms_received,no_show_yn
0,5698125,678814354693913,1,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,5698246,54593736353128,0,0,0,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0
2,5699393,4369164743113,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
3,5694371,54523365344664,0,1,3,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,5698279,62917816238835,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0


In [6]:

# export med_df to ../data/cleanData/appointment_dataset.csv, with a header row
# and without the dataframe index
full_file = os.path.join("..", "data", "cleanData", "appointment_dataset.csv")
med_df.to_csv(full_file, index=False, header=True)

### populating the PostgreSQL neighborhood table, we will update geo stuff later

In [21]:
# return the first 10 neighborhood rows to verify the table was populated
# NOTE(review): the statement that actually populates `neighborhood` is not in
# this notebook — presumably it was run elsewhere; confirm.
engine.execute("SELECT * FROM neighborhood LIMIT 10").fetchall()

[(1, 'AEROPORTO', None, None),
 (2, 'ANDORINHAS', None, None),
 (3, 'ANTÔNIO HONÓRIO', None, None),
 (4, 'ARIOVALDO FAVALESSA', None, None),
 (5, 'BARRO VERMELHO', None, None),
 (6, 'BELA VISTA', None, None),
 (7, 'BENTO FERREIRA', None, None),
 (8, 'BOA VISTA', None, None),
 (9, 'BONFIM', None, None),
 (10, 'CARATOÍRA', None, None)]

In [7]:
# make dataframe out of neighborhood table; this rebinds neighbor_df, replacing
# the per-neighborhood appointment counts built earlier from raw_df
neighbor_df = pd.read_sql_query('select * from "neighborhood"',con=engine)

# preview
neighbor_df.head()

Unnamed: 0,neighborhood_id,neighborhood,longitude,latitude,median_income
0,1,AEROPORTO,,,510.0
1,3,ANTÔNIO HONÓRIO,,,755.0
2,39,JARDIM CAMBURI,,,1500.0
3,42,JOANA D´ARC,,,510.0
4,49,MORADA DE CAMBURI,,,1100.0


### loop through dataframe, updating sql as we go with lat/long, if they exist!

In [14]:
# Geocode every neighborhood in neighbor_df with the Google Geocoding API and
# write the coordinates back to the neighborhood table. Names that return no
# usable result are collected in missing_hoods instead of being updated.
from sqlalchemy import text

GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

# Parameterized UPDATE: neighborhood names containing quotes or accents can no
# longer break (or inject into) the statement. Column names are lowercase to
# match the table; the original's ""Latitude"" was Python adjacent-literal
# concatenation, so it produced an unquoted identifier that only worked because
# PostgreSQL folds unquoted names to lowercase.
update_stmt = text(
    "UPDATE neighborhood SET latitude = :lat, longitude = :lng "
    "WHERE neighborhood = :hood"
)

# neighborhoods we can't match
missing_hoods = []
# parallel lists of successfully geocoded neighborhoods and their coordinates
hoods = []
lats = []
longs = []

# loop through the neighborhood names
for target_address in neighbor_df['neighborhood']:

    # Let requests URL-encode the query (spaces, accents, the '|' separator);
    # the timeout keeps a stalled connection from hanging the notebook.
    # NOTE(review): the filter only restricts matches to country BR, so a
    # same-named locality elsewhere in Brazil may be returned — consider
    # validating coordinates against the expected city.
    response = requests.get(
        GEOCODE_URL,
        params={
            'components': f'locality:{target_address}|country:BR',
            'key': gkey,
        },
        timeout=10,
    )
    geo_data = response.json()

    # An empty or malformed result means "no match". Only that case is treated
    # as best-effort; database errors below are allowed to surface.
    try:
        location = geo_data["results"][0]["geometry"]["location"]
        lat = location["lat"]
        lng = location["lng"]
    except (KeyError, IndexError):
        missing_hoods.append(target_address)
        continue

    # record the match
    hoods.append(target_address)
    lats.append(lat)
    longs.append(lng)

    # update the database
    engine.execute(update_stmt, {'lat': lat, 'lng': lng, 'hood': target_address})

geo_df = pd.DataFrame({
    'neighborhood': hoods,
    'latitude': lats,
    'longitude': longs
})

# missing_hoods holds the names that could not be geocoded
geo_df.head()
    

Unnamed: 0,neighborhood,latitude,longitude
0,AEROPORTO,-23.430573,-46.473043
1,ANTÔNIO HONÓRIO,-20.263121,-40.298013
2,JARDIM CAMBURI,-20.259219,-40.265453
3,JOANA D´ARC,-8.088779,-34.88565
4,MORADA DE CAMBURI,-20.272049,-40.296737


In [15]:
# read back up to 10 neighborhoods that now have a latitude, to confirm the
# geocoding updates reached the database
engine.execute("SELECT * FROM neighborhood WHERE latitude IS NOT NULL LIMIT 10").fetchall()

[(40, 'JARDIM DA PENHA', -40.2960984, -20.2850907, 1500.0),
 (41, 'JESUS DE NAZARETH', -40.3024808, -20.3188643, 510.0),
 (43, 'JUCUTUQUARA', -40.31971, -20.3085919, 800.0),
 (44, 'MARIA ORTIZ', -40.2999279, -20.258305, 510.0),
 (45, 'MÁRIO CYPRESTE', -40.3554284, -20.3195399, 510.0),
 (46, 'MARUÍPE', -40.31971, -20.296341, 800.0),
 (47, 'MATA DA PRAIA', -40.2922686, -20.2751185, 2000.0),
 (48, 'MONTE BELO', -46.3688171, -21.3221066, 511.0),
 (1, 'AEROPORTO', -46.473043, -23.4305731, 510.0),
 (3, 'ANTÔNIO HONÓRIO', -40.2980132, -20.2631207, 755.0)]

### Load appointment table

In [35]:
# Build the INSERT ... SELECT that copies staged rows into appointments,
# swapping each neighborhood name for its neighborhood_id via the inner join
# (staged rows whose neighborhood has no match are therefore not inserted).
# NOTE(review): this reads from 'staging_table' and selects s.patient_age and
# s.scholarship, whereas the dataframe staged above was written to
# 'staging_data' with columns 'age' and 'welfare_assistance' — confirm the
# staging schema this statement expects.
sql = "".join((
    "  INSERT INTO appointments ",
    "SELECT s.appointment_id, ",
    "s.patient_id, ",
    "s.gender, ",
    "s.scheduled_day, ",
    "s.appointment_day, ",
    "s.patient_age, ",
    "n.neighborhood_id, ",
    "s.scholarship, ",
    "s.hypertension, ",
    "s.diabetes, ",
    "s.alcoholism, ",
    "s.handicap, ",
    "s.no_show ",
    "FROM staging_table s INNER JOIN neighborhood n ",
    "ON s.neighborhood = n.neighborhood;",
))

# run the load
engine.execute(sql)

# read a few rows back from the database to confirm the load
engine.execute("SELECT * FROM appointments LIMIT 10").fetchall()

[(5642903, 29872499824296, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 62, 155, 0, 1, 0, 0, 0, 'No'),
 (5642503, 558997776694438, 'M', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 56, 155, 0, 0, 0, 0, 0, 'No'),
 (5642549, 4262962299951, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 62, 93, 0, 0, 0, 0, 0, 'No'),
 (5642828, 867951213174, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 8, 88, 0, 0, 0, 0, 0, 'No'),
 (5642494, 8841186448183, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 56, 155, 0, 1, 1, 0, 0, 'No'),
 (5626772, 95985133231274, 'F', datetime.date(2016, 4, 27), datetime.date(2016, 4, 29), 76, 122, 0, 1, 0, 0, 0, 'No'),
 (5630279, 733688164476661, 'F', datetime.date(2016, 4, 27), datetime.date(2016, 4, 29), 23, 85, 0, 0, 0, 0, 0, 'Yes'),
 (5630575, 3449833394123, 'F', datetime.date(2016, 4, 27), datetime.date(2016, 4, 29), 39, 85, 0, 0, 0, 0, 0, 'Yes'),
 (5638447, 56394729949972, 'F', datetime.date(2016, 4,

In [16]:
# pull every row of medical_noshow_tableau into a dataframe
# (its definition is not shown in this notebook — could be a table or a view)
tab_df = pd.read_sql_query('select * from "medical_noshow_tableau"',con=engine)

# preview
tab_df.head()

Unnamed: 0,appointment_id,patient_id,repeat_patient_yn,neighborhood_id,neighborhood,longitude,latitude,median_income,gender,scheduled_day,appointment_day,time_between_sch_appt,age,welfare_assistance,hypertension,diabetes,alcoholism,handicap,no_show
0,5577268,981473726894859,0,44,MARIA ORTIZ,-40.299928,-20.258305,510.0,F,2016-04-13,2016-05-16,33,78,0,0,0,0,0,No
1,5699869,761922811367,1,44,MARIA ORTIZ,-40.299928,-20.258305,510.0,F,2016-05-16,2016-05-16,0,37,0,0,0,0,0,No
2,5573760,2115284169585,0,44,MARIA ORTIZ,-40.299928,-20.258305,510.0,F,2016-04-12,2016-05-16,34,56,0,0,0,0,0,Yes
3,5575768,773687299474153,0,44,MARIA ORTIZ,-40.299928,-20.258305,510.0,F,2016-04-13,2016-05-16,33,69,0,0,0,0,0,Yes
4,5573964,849415271411923,0,44,MARIA ORTIZ,-40.299928,-20.258305,510.0,M,2016-04-12,2016-05-16,34,20,0,0,0,0,0,No


In [17]:
# export tab_df to ../data/cleanData/appointment_tableau_dataset.csv, with a
# header row and without the dataframe index
full_file = os.path.join("..", "data", "cleanData", "appointment_tableau_dataset.csv")
tab_df.to_csv(full_file, index=False, header=True)