# Importing the Cleaned Data (csv)


## Import Dependencies

In [2]:
# basic stuff
import psycopg2
import pandas as pd
import psycopg2.extras
import os
import numpy
from config import (census_key, gkey)
import gmaps
import requests
from ipywidgets.embed import embed_minimal_html
from pprint import pprint

# Imports the method used to connect to DBs
from sqlalchemy import create_engine

# function to establish a session with a connected database
from sqlalchemy.orm import Session

# database compliant datatypes
from sqlalchemy import Column, Integer, String, Float

## Setup the PostgreSQL engine

In [3]:
# password is hard-coded in the connection string as "postgres"
engine = create_engine('postgresql://postgres:postgres008@localhost:5432/medical_no_show_db')

## Clear out data first
### Start with the fact (dependent) tables first, then drop foreign keys, truncate rest of tables and then re-add keys

In [37]:
# truncate non-dependent tables first
engine.execute('TRUNCATE TABLE staging_table')

# truncate the rest of the tables, this will get neighborhood and appointments
engine.execute('TRUNCATE TABLE neighborhood CASCADE;')

<sqlalchemy.engine.result.ResultProxy at 0x19ca93588c8>

In [25]:
# just making sure tables are empty
engine.execute("SELECT * FROM appointments;").fetchall()

[]

## Importing the metadata first

### state

#### Import, preview

In [26]:
# medical no show staging data
raw_file = os.path.join("..","data","rawData","Brasil_medical_noshow_V2-May-2016.csv")

# trying a latin encoding
# raw_df = pd.read_csv(raw_file, encoding="ISO-8859-1")
raw_df = pd.read_csv(raw_file, encoding="latin1")

# preview the raw data
raw_df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


#### Rename columns to match database

In [27]:
# rename columns to match database column names, due to laziness
raw_df = raw_df.rename(columns={
    'PatientId':'patient_id', 
    'AppointmentID': 'appointment_id',
    'Gender': 'gender',
    'ScheduledDay': 'scheduled_day',
    'AppointmentDay': 'appointment_day',
    'Age': 'patient_age',
    # i had to change this, and some other spellings, to prevent future errors! :) 
    'Neighbourhood': 'neighborhood',
    'Scholarship': 'scholarship',
    'Hipertension': 'hypertension',
    'Diabetes': 'diabetes',
    'Alcoholism': 'alcoholism',
    'Handcap': 'handicap',
    'SMS_received': 'sms_received',
    'No-show': 'no_show'
})
raw_df.head()

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,patient_age,neighborhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


#### Write to PostgreSQL, return rows to verify
##### Caution, to re-run, you have to run the truncate table code above first

In [28]:
# write dataframe to table, replace the rows if they exist
raw_df.to_sql('staging_table', con=engine, if_exists='append', index=False)

# return the data to make sure it was appended correctly
engine.execute("SELECT * FROM staging_table LIMIT 10").fetchall()


[(29872499824296, 5642903, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 62, 'JARDIM DA PENHA', 0, 1, 0, 0, 0, 0, 'No'),
 (558997776694438, 5642503, 'M', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 56, 'JARDIM DA PENHA', 0, 0, 0, 0, 0, 0, 'No'),
 (4262962299951, 5642549, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 62, 'MATA DA PRAIA', 0, 0, 0, 0, 0, 0, 'No'),
 (867951213174, 5642828, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 8, 'PONTAL DE CAMBURI', 0, 0, 0, 0, 0, 0, 'No'),
 (8841186448183, 5642494, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 56, 'JARDIM DA PENHA', 0, 1, 1, 0, 0, 0, 'No'),
 (95985133231274, 5626772, 'F', datetime.date(2016, 4, 27), datetime.date(2016, 4, 29), 76, 'REPÃ\x9aBLICA', 0, 1, 0, 0, 0, 0, 'No'),
 (733688164476661, 5630279, 'F', datetime.date(2016, 4, 27), datetime.date(2016, 4, 29), 23, 'GOIABEIRAS', 0, 0, 0, 0, 0, 0, 'Yes'),
 (3449833394123, 5630575, 'F', datetime.date(2016, 4, 27)

#### create a neighborhood df
Create a df of unique neighborhoods

In [29]:
# group the neighborhoods
neighbor_df = pd.DataFrame(raw_df.groupby('neighborhood')['appointment_id'].count().reset_index())

# rename column
neighbor_df = neighbor_df.rename(columns={'appointment_id':'total_appointments'})

neighbor_df.head()



Unnamed: 0,neighborhood,total_appointments
0,AEROPORTO,8
1,ANDORINHAS,2262
2,ANTÃNIO HONÃRIO,271
3,ARIOVALDO FAVALESSA,282
4,BARRO VERMELHO,423


### populating the PostgreSQL neighborhood table, we will update geo stuff later

In [30]:
# neighborhood table first
sql = "INSERT INTO neighborhood (neighborhood) SELECT neighborhood FROM staging_table GROUP BY neighborhood"
engine.execute(sql)

<sqlalchemy.engine.result.ResultProxy at 0x19caea6c188>

In [31]:
# return the data to make sure it was appended correctly
engine.execute("SELECT * FROM neighborhood LIMIT 10").fetchall()

[(82, 'CRUZAMENTO', None, None),
 (83, 'SANTOS DUMONT', None, None),
 (84, 'BARRO VERMELHO', None, None),
 (85, 'GOIABEIRAS', None, None),
 (86, 'RESISTÃ\x8aNCIA', None, None),
 (87, 'FRADINHOS', None, None),
 (88, 'PONTAL DE CAMBURI', None, None),
 (89, 'DO MOSCOSO', None, None),
 (90, 'PARQUE MOSCOSO', None, None),
 (91, 'JESUS DE NAZARETH', None, None)]

### loop through dataframe, updating sql as we go with lat/long, if they exist!

In [32]:
# function to remove special characters from neighborhood
# can't get this to work, or remove those special chars
def stripChars(input_string): 
    output_string = ""
    for character in input_string:
        if character.isalnum():
            output_string += character
    return output_string

# array to store neighborhoods we can't match
missing_hoods = []

# loop through neighborhood df
for index, row in neighbor_df.iterrows():
    
    # target address is the row's neighborhood
    target_address = stripChars(row['neighborhood'])

    # build the endpoint URL
    target_url = ('https://maps.googleapis.com/maps/api/geocode/json?address={0}&key={1}').format(target_address, gkey)

    # run a request to endpoint and convert result to json
    geo_data = requests.get(target_url).json()    
    
    # try to extract the lat/long
    try:
      
        # Extract latitude and longitude
        lat = geo_data["results"][0]["geometry"]["location"]["lat"]
        lng = geo_data["results"][0]["geometry"]["location"]["lng"] 
        
        # update the data base
        sql = "UPDATE neighborhood SET latitude = " + str(lat) + ", longitude = " + str(lng) + " WHERE neighborhood = '" + target_address + "';"
        engine.execute(sql)
    
    except:
        
        missing_hoods.append(target_address)

# look at the ones we missed
print(f'Missing hoods: {missing_hoods}')
    

Missing hoods: ['CARATOÃRA', 'CONSOLAÃÃO', 'DAPENHA', 'DOMOSCOSO', 'ESTRELINHA', 'FONTEGRANDE', 'GRANDEVITÃRIA', 'ILHADOPRÃNCIPE', 'ILHASOCEÃNICASDETRINDADE', 'JOANADÂARC', 'MARUÃPE', 'MÃRIOCYPRESTE', 'REDENÃÃO', 'REPÃBLICA', 'SANTACECÃLIA', 'SANTALUÃZA', 'SANTALÃCIA', 'SANTOANDRÃ', 'SANTOANTÃNIO', 'SEGURANÃADOLAR', 'SÃOJOSÃ', 'UNIVERSITÃRIO']


#### 50 out of 81, not bad!
Looks like those special characters need to be removed.  I'll circle back

In [34]:
# checking it out from the database
engine.execute("SELECT * FROM neighborhood WHERE latitude IS NOT NULL LIMIT 10").fetchall()

[(159, 'AEROPORTO', -95.88386589999999, 36.198778),
 (131, 'ANDORINHAS', -39.8392279, -10.3491989),
 (125, 'BONFIM', -8.5940618, 41.1510153),
 (109, 'CENTRO', -95.712891, 37.09024),
 (111, 'COMDUSA', -40.3352805, -20.2855012),
 (148, 'CONQUISTA', -4.4902774, 38.4097283),
 (82, 'CRUZAMENTO', -8.5611565, 38.2077692),
 (87, 'FRADINHOS', -40.3280037, -20.3069131),
 (85, 'GOIABEIRAS', -95.712891, 37.09024),
 (157, 'GURIGICA', -40.3050336, -20.3059062)]

### Load appointment table

In [35]:
# sql 
sql = "  INSERT INTO appointments " \
    "SELECT s.appointment_id, " \
    "s.patient_id, " \
    "s.gender, " \
    "s.scheduled_day, " \
    "s.appointment_day, " \
    "s.patient_age, " \
    "n.neighborhood_id, " \
    "s.scholarship, " \
    "s.hypertension, " \
    "s.diabetes, " \
    "s.alcoholism, " \
    "s.handicap, " \
    "s.no_show " \
    "FROM staging_table s INNER JOIN neighborhood n " \
    "ON s.neighborhood = n.neighborhood;"

engine.execute(sql)

# checking it out from the database
engine.execute("SELECT * FROM appointments LIMIT 10").fetchall()

[(5642903, 29872499824296, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 62, 155, 0, 1, 0, 0, 0, 'No'),
 (5642503, 558997776694438, 'M', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 56, 155, 0, 0, 0, 0, 0, 'No'),
 (5642549, 4262962299951, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 62, 93, 0, 0, 0, 0, 0, 'No'),
 (5642828, 867951213174, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 8, 88, 0, 0, 0, 0, 0, 'No'),
 (5642494, 8841186448183, 'F', datetime.date(2016, 4, 29), datetime.date(2016, 4, 29), 56, 155, 0, 1, 1, 0, 0, 'No'),
 (5626772, 95985133231274, 'F', datetime.date(2016, 4, 27), datetime.date(2016, 4, 29), 76, 122, 0, 1, 0, 0, 0, 'No'),
 (5630279, 733688164476661, 'F', datetime.date(2016, 4, 27), datetime.date(2016, 4, 29), 23, 85, 0, 0, 0, 0, 0, 'Yes'),
 (5630575, 3449833394123, 'F', datetime.date(2016, 4, 27), datetime.date(2016, 4, 29), 39, 85, 0, 0, 0, 0, 0, 'Yes'),
 (5638447, 56394729949972, 'F', datetime.date(2016, 4,