In [13]:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from gmplot import gmplot
import googlemaps
import random
import folium
from folium.plugins import HeatMap
import os

In [2]:
# Load the CSV; its first column becomes the DataFrame index.
data = pd.read_csv("./data/Jan19_SH.csv", index_col=0)

# Columns to discard before working with the addresses.
unwanted = [
    'opdhVbp', 'vstId', 'vstDate', 'opdhVpulse',
    'opdhVtemp', 'opdhDhtn', 'opdhDihd', 'opdhDhyper',
    'pntAge', 'opdhVbmi', 'opdhVheight', 'opdhVweight',
    'opdhDdm', 'vstAmount', 'vsrcId',
]
# Drop them from `data` itself (inplace) instead of building a new frame.
data.drop(columns=unwanted, inplace=True)

# Report the missing values per column, substitute the default 'Pakistan'
# for every blank address, then report again to confirm none remain.
null_counts = data.isna().sum()
print('number of empty cells', null_counts)
data['pntAddress'] = data['pntAddress'].fillna('Pakistan')
null_counts = data.isna().sum()
print('number of empty cells after fillna', null_counts, '\n')

# Show the column dtypes and the overall (rows, columns) shape.
print('type of data in the df ', data.dtypes)
print('shape of dataFrame ', data.shape, '\n')

number of empty cells pntAddress    10
dtype: int64
number of empty cells after fillna pntAddress    0
dtype: int64 

type of data in the df  pntAddress    object
dtype: object
shape of dataFrame  (6185, 1) 



In [28]:
# Use the Google Maps geocoding API to find the latitude and longitude of a
# random sample of patient addresses and store the results in `data`.

# Read the API key from a local file so that no credential is embedded in the
# notebook. readline() keeps the trailing newline, hence the strip().
# NOTE(review): a literal key was previously passed to Client(...) here;
# treat that key as exposed and rotate it.
with open('api.txt') as f:
    myAPIkey = f.readline().strip()
gmaps_key = googlemaps.Client(key=myAPIkey)

# Result columns, pre-filled with None so that rows that were never geocoded
# (or could not be resolved) can be filtered out afterwards.
for new_col in ("LAT", "LON", "GMAPS_NAME", "GEOCODE_OBJECT", "PARTIAL_RESULT"):
    data[new_col] = None
# TODO: PARTIAL_RESULT is never populated; the original loop only had a
# commented-out checkPartialMatch(geocode_result) call for it.

# Resolve the positional column numbers once instead of on every iteration.
address_col = data.columns.get_loc("pntAddress")
lat_col = data.columns.get_loc("LAT")
lon_col = data.columns.get_loc("LON")
gname_col = data.columns.get_loc("GMAPS_NAME")
gdata_col = data.columns.get_loc("GEOCODE_OBJECT")

# Number of addresses to geocode; capped at the frame length because
# random.sample raises ValueError when asked for more items than exist.
N_SAMPLES = 50
chosen = random.sample(range(len(data)), min(N_SAMPLES, len(data)))

for i in chosen:
    # `i` is a row *position*, so the address is read with .iat as well.
    # Label-based data['pntAddress'][i] only matches when the index happens
    # to be 0..n-1, which index_col=0 does not guarantee.
    address = data.iat[i, address_col]
    geocode_result = gmaps_key.geocode(address + ', Pakistan')
    try:
        gdata = geocode_result[0]
        lat = gdata["geometry"]["location"]["lat"]
        lon = gdata["geometry"]["location"]["lng"]
        gname = gdata["formatted_address"]
    except (IndexError, KeyError, TypeError):
        # Empty result list, or a payload missing the expected fields.
        print("couldn't find address", i, address)
        continue

    # Write only after every field was extracted, so a row is never left
    # half-filled.
    data.iat[i, lat_col] = lat
    data.iat[i, lon_col] = lon
    data.iat[i, gname_col] = gname
    data.iat[i, gdata_col] = gdata

# Isolate the coordinates of the rows that were geocoded successfully.
# Filtering both columns together keeps latList and lonList aligned.
located = data.dropna(subset=["LAT", "LON"])
latList = located["LAT"].values
lonList = located["LON"].values

In [38]:
# Switch to the full patient export (roughly 15,000 addresses) so that a CSV
# can be produced for the GitHub script to run on.
# NOTE: this rebinds `data`, replacing the frame used by the earlier cells.
data = pd.read_csv(
    filepath_or_buffer="./data/PatientData.csv",
    index_col=0,
)

In [41]:
# Preview the last ten rows as a quick sanity check of the loaded frame.
data.tail(n=10)

Unnamed: 0_level_0,pntAge,pntGender,pntAddress,pntRegDate,survey,srvyId
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15148,41.0,M,jamroad,07/01/2019 11:11,0,5
15149,5.0,M,"hayatabad,Peshawar",07/01/2019 11:57,0,4
15150,40.0,M,"Regi Lalma,Peshawar",07/01/2019 13:07,0,13
15151,57.0,M,"Landi Kotal,Khyber Agency",07/01/2019 13:30,0,4
15152,55.0,F,Nagoman,07/01/2019 15:34,0,8
15153,25.0,M,Landi Kotal Khyber Agency,08/01/2019 10:40,0,4
15154,27.0,M,"Sui Gas Office,Peshawaar",08/01/2019 10:41,0,13
15155,40.0,F,"Swat,kpk",08/01/2019 10:45,0,4
15156,50.0,F,Afghanistan(Dr.Naeem Khattak),08/01/2019 12:25,0,1
15157,50.0,M,bannu house,08/01/2019 12:34,0,4
