# getGSVImagesForMTurkv2 #
<br>

**Summary:** downloadGSVImages, select a subset of images to meet target ranges for a combination of categories, and remove images that were flagged during visual inspection <br>
**Author:** Andrew Larkin <br>
**Date Created:** Dec 21, 2020 <br>
**Affiliation:** Oregon State University, College of Health

In [1]:
import pandas as ps
import os
import streetview
import numpy as np
import time
import random
import wget

In [2]:
PARENT_FOLDER = "insert filepath where project files should be stored"

# super sample of GSV grid points that intersect roads
INPUT_FILES = [
    PARENT_FOLDER + "mjRdsPoints.csv",
    PARENT_FOLDER + "miRdsPoints.csv",
    PARENT_FOLDER + "resRdsPoints.csv"
]

# SV panoids within 10m of sampled grid points
OUTPUT_FILES = [
    PARENT_FOLDER + "mjRdsSVImages.csv",
    PARENT_FOLDER + "miRdsSVImages.csv",
    PARENT_FOLDER + "resRdsSVImages.csv"
]

# SV panoids with adjustments to compass direction so straight images are looking directly down the center of the road
COMPASS_FILES = [
    PARENT_FOLDER + "mjRdsSVImagesCompass.csv",
    PARENT_FOLDER + "miRdsSVImagesCompass.csv",
    PARENT_FOLDER + "resRdsSVImagesCompass.csv"
]
IMAGE_FOLDER = PARENT_FOLDER + "SV_Images/"
SCREENED_IMAGE_FOLDER = PARENT_FOLDER + "SV_Images_Screened/"
IMGS_TO_REMOVE = PARENT_FOLDER + "imgsToRemove.csv"
DISTANCE_THRESHOLD = 10 # maximum allowable difference between images for same location, in meters
API_KEY = 'inesrt api key here'

# sample sizes for each ABCD classification level. 
# A: Census division (1: new england, 2: middle atlantic, 3: east north central, 4: west north central,
#                     5: south atlantic, 6: east south central, 7: west south central, 8: mountain, 9: pacific)
# B: Urbanization level (1: urban center, 2: urban cluster: 3: rural)
# C: Road type (1: primary, 2: secondary/tertiary, 3: residential)
# D: Viewing angle (1: straight, 2: side)
sampleCodeSizes = {
    
    # urban centers
    '1130':1300,
    '2130':1300,
    '3130':1300,
    '4130':1300,
    '5130':1300,
    '6130':1300,
    '7130':1300,
    '8130':1300,
    '9130':1300,
    '1120':1300,
    '2120':1300,
    '3120':1300,
    '4120':1300,
    '5120':1300,
    '6120':1300,
    '7120':1300,
    '8120':1300,
    '9120':1300,
    '1110':350,
    '2110':350,
    '3110':350,
    '4110':350,
    '5110':350,
    '6110':350,
    '7110':350,
    '8110':350,
    '9110':350,
    # urban clusters
    '1230':750,
    '2230':750,
    '3230':750,
    '4230':750,
    '5230':750,
    '6230':750,
    '7230':750,
    '8230':750,
    '9230':750,
    '1220':750,
    '2220':750,
    '3220':750,
    '4220':750,
    '5220':750,
    '6220':750,
    '7220':750,
    '8220':750,
    '9220':750,
    '1210':300,
    '2210':300,
    '3210':300,
    '4210':300,
    '5210':300,
    '6210':300,
    '7210':300,
    '8210':300,
    '9210':300,
    # rural
    '1330':300,
    '2330':300,
    '3330':300,
    '4330':300,
    '5330':300,
    '6330':300,
    '7330':300,
    '8330':300,
    '9330':300,
    '1320':300,
    '2320':300,
    '3320':300,
    '4320':300,
    '5320':300,
    '6320':300,
    '7320':300,
    '8320':300,
    '9320':300,
    '1310':150,
    '2310':150,
    '3310':150,
    '4310':150,
    '5310':150,
    '6310':150,
    '7310':150,
    '8310':150,
    '9310':150   
}

## Part 1: Get GSV Metadata and Determine Sample ###

### calculate distance in meters beween a set of coordinates and a reference location using the haversine formula ###
#### for more details see https://en.wikipedia.org/wiki/Haversine_formula ####
**Inputs:** <br>
- **latVector** (float array) - latitude coordinates of sampled GSV panoids
- **lonVector** (float array) - longitude coordinates of sampled GSV panoids
- **latStd** (float array) - latitude coordinates of reference locations
- **lonStd** (float array) - longitude coordinates of reference locations <br>

**Outputs:** <br>
- array of distances from panoids and reference locations

In [3]:
def calcDistance(latVector,lonVector,latStd,lonStd):
    R = 6371000 # radius of the earth
    phi1 = np.radians(latVector)
    phi2 = np.radians(latStd)
    deltaPhi = np.radians(latStd-latVector)
    deltaLambda = np.radians(lonStd-lonVector)
    a = np.sin(deltaPhi/2)*np.sin(deltaPhi/2) + np.cos(phi1) * np.cos(phi2) * np.sin(deltaLambda/2) * np.sin(deltaLambda/2);
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = R * c
    return(d)

### for a single location, identify closest image within distance threshold for each year ###
**Inputs:**
- **pandDict** (dictionary) - panoids and metadata returned when querying the GSV API for a single location
- **locId** (string) - unique identifier for the location sent to the GSV API
- **distThreshold** (int) - maximum acceptable distance between a GSV panoids and the location send to the GSV API
**Outputs:**
- **tempDict** (dictionary) - metadata for the nearest panoid that is also within the distThreshold

In [1]:

def getClosestImgForSingleLoc(pandDict,locId,distThreshold=10):
    pandDF = ps.DataFrame(pandDict)
    closestImgIndex = pandDF['dist'].argmin()
    #uniqueYears = set(pandDict['year'])
    imgLat,imgLon,imgPan,impageLon,imgMonth,imgYear,imgDist,locAr = [[] for i in range(8)]
    curRecord = pandDF.iloc[closestImgIndex]
    if(curRecord['dist'] <= distThreshold):
        imgLat.append(curRecord['lat'])
        imgLon.append(curRecord['lon'])
        imgPan.append(curRecord['panId'])
        imgYear.append(curRecord['year'])
        imgMonth.append(curRecord['month'])
        imgDist.append(curRecord['dist'])
        locAr.append(locId)
    tempDict = {
        'locId':locId,
        'imgLat':imgLat,
        'imgLon':imgLon,
        'imgYear':imgYear,
        'imgMonth':imgMonth,
        'panId':imgPan,
        'dist':imgDist
    }
    return(tempDict)

### for a single reference location, identify the nearest panoid and return the panoid metadata ###
**Inputs:**
- **locId** (string) - unique identifier for the reference location
- **panoids** (string array) - unique identifiers for panoids returned by GSV API
- **inputLat** (float array) - latitude coordinates for panoids returned by GSV API
- **inputLong** (float array) - longitude coordinates for panoids returned by GSV API
- **distThreshold** (int) - maximum allowable distance between panoids and the reference location <br>

**Outputs:**
- **tempDict** (dictionary) - contains metadata of the nearest panoid within the distThreshold distance

In [5]:
def extractYearAndDiffs(locId,panoids,inputLat,inputLon,distThreshold):
    index, diffTime,lat,lon,year,month,panId,dist= [[] for i in range(8)]
    numWithoutYear = 0
    for panIndex in range(len(panoids)):
        try:
            year.append(panoids[panIndex]['year'])
            month.append(panoids[panIndex]['month'])
            lat.append(panoids[panIndex]['lat'])
            lon.append(panoids[panIndex]['lon'])
            panId.append(panoids[panIndex]['panoid'])
        # this exception is almost always because there is no meatadata for the year being analyzed
        except Exception as e: 
            numWithoutYear +=1
            #print("couldn't create df: %s" %(str(e)))
    pandDict = {
        'lat':np.array(lat),
        'lon':np.array(lon),
        'year':np.array(year),
        'panId':np.array(panId),
        'month':np.array(month)
    }
    # calculate distance between reference coordinates and image coordinates
    pandDict['dist'] = calcDistance(pandDict['lat'],pandDict['lon'],inputLat,inputLon)
    tempDict = getClosestImgForSingleLoc(
        pandDict,
        locId,
        distThreshold
    )
    return(tempDict)

### for all reference locations, identify images close enough to reference coodinates to be considered the 'same location', and collect metadata for the nerest image for each year ###
**Inputs:** <br>
- **coords** (pandas dataframe) - latitude and longitude coordinates for reference locations
- **distThreshold** (int) - maximum allowable distance between reference location and a representative panoid
- **nToSample** (int) - number of panoids to sample.  Return results once this value is reached
- **sampleCode** (int) - unique 4 digit code for each sample classification type (see cell 2 above) <br>

**Outputs:** <br>
- **outputDF** (pandas dataframe) - contains panoids, metadata, and corresponding refrence location identifier for all sampled panoids



In [6]:

def selectBestImages(coords,distThreshold,nToSample,sampleCode):
    outputDF = ps.DataFrame({})
    nSampled,coordIndex = 0,0
    viewingDigit = 1
    while(nSampled < nToSample and coordIndex < coords.count()[0]):
        downloadedFirstImg = True
        latImgLocations, longImgLocations = [],[]
        numTries,numLocations = 0,0
        curRecord = coords.iloc[coordIndex]
        panoids = streetview.panoids(lat=curRecord['Lat'], lon=curRecord['Lon'])
        try:
            tempDF = ps.DataFrame(extractYearAndDiffs(
                curRecord['OID_'],
                panoids,
                curRecord['Lat'],
                curRecord['Lon'],
                distThreshold
            ))
            if(tempDF.count()[0] >0):
                tempDF['sampleCode'] = sampleCode + viewingDigit%2 +1
                nSampled +=1
                viewingDigit +=1
                outputDF = tempDF if len(outputDF.keys()) == 0 else outputDF.append(tempDF)
        except Exception as e:
            a = 1
        coordIndex +=1
        time.sleep(0.0001) # metadata, can be small
    return(outputDF)

### given a single 4 digit sample classification identifier, query the GSV API and sample the alloted number of panoids 
**Inputs:**
- **GIS_data** (pandas dataframe) - contains latitude, longitude, and metadata of reference locations
- **sampleCode** (int) - unique 4 digit code for each sample classification type (see cell 2 above)
- **roadType** (int) - unique 1 digit code for each road type (see cell 2 above) <br>

**Outputs:**
- **imageMeta** (pandas dataframe) - contains panoids, metadata, and corresponding reference ids for sampled locations

In [7]:
def sampleSingleCategory(GIS_data,sampleCode,roadType):
    GIS_data = GIS_data.sample(frac=1)
    sampleData = categoryData = GIS_data[GIS_data['sampleCode']==sampleCode]
    nToSample = nToSample = sampleCodeSizes[str(sampleCode)]
    imageMeta = selectBestImages(sampleData,DISTANCE_THRESHOLD,nToSample,sampleCode)
    imageMeta['urban'] = np.ones((imageMeta.count()[0],1),dtype=np.int16)*list(sampleData['urban'])[0]
    imageMeta['division'] = np.ones((imageMeta.count()[0],1),dtype=np.int16)*list(sampleData['DIVISION'])[0]
    imageMeta['roadType'] = np.ones((imageMeta.count()[0],1),dtype=np.int16)*roadType
    print("completed sample code %i" %(sampleCode))
    return(imageMeta)

### sample all 4 digit sample classifications for a single road type ###
**Inputs:**
- **GIS_data** (pandas dataframe) - contains latitude, longitude, and metadata of reference locations
- **roadType** (int) - road type to sample (primary, secondary/tertiary, or residential) <br>

**Outputs:**
- **SV_Images** (pandas dataframe) - contains panoids, metadata, and corresponding reference ids for sampled locations

In [8]:
def sampleAllCategories(GIS_data,roadType):
    uniqueCodes = list(set(GIS_data['sampleCode']))
    SV_Images = sampleSingleCategory(GIS_data,uniqueCodes[0],roadType)
    for code in uniqueCodes[1:]:
        SV_Images = SV_Images.append(sampleSingleCategory(GIS_data,code,roadType))
    return(SV_Images)

### given a single file with all reference locations of a given road type, load the file into memory and sample all panoids for the given road type ###
**Inputs:**
- **inputFile** (string) - absolute filepath to csv file containing metadata of reference locations
- **outputFile** (string) - absolute filepath were sampled panoids and metadata should be stored
- **roadType** (int) - road type that the inputFile is associated with (primary, secondary/tertiary, or residential)

In [9]:
def sampleSingleFile(inputFile,outputFile,roadType):
    rawData = ps.read_csv(inputFile)
    screenedData = rawData[rawData['sampleCode'] >1000]
    print("road type %i" %(roadType))
    SV_Images = sampleAllCategories(screenedData,roadType)
    SV_Images.to_csv(outputFile,index=False)

In [None]:
for index in range(3):
    sampleSingleFile(INPUT_FILES[index],OUTPUT_FILES[index],index+1)

## Part 2: Adjust for compass heading and download GSV images ##

### load differences between north and road heading at panoids, calculated in ArcPro ###
**Outputs:**
- **allCompassData** (pandas dataframe) - contains panoid metadata and difference between north and road heading at panoid location

In [10]:
def loadCompassData():
    allCompassData = ps.read_csv(COMPASS_FILES[0])
    allCompassData = allCompassData.append(ps.read_csv(COMPASS_FILES[1]))
    allCompassData = allCompassData.append(ps.read_csv(COMPASS_FILES[2]))
    return(allCompassData)

In [11]:
compassData = loadCompassData()
print(compassData.head())
print(compassData.count())

   OID_  Join_Count  TARGET_FID     locId     imgLat     imgLon  imgYear  \
0     1           1           1   98124.0  40.172046 -75.873372     2019   
1     2           1           2  606663.0  44.866249 -73.449893     2019   
2     3           1           3  737073.0  40.779632 -78.857581     2009   
3     4           1           4  255543.0  41.234240 -75.790555     2012   
4     5           1           5  188685.0  43.778818 -73.799070     2019   

   imgMonth                   panId      dist  sampleCode  urban  division  \
0        10  wKku7JX9oLkYwqUh__KSKQ  1.007087        2312      3         2   
1         7  2Yx-Q0olEsjx65z-xE8Wog  2.497234        2311      3         2   
2         7  ZH1fSO2nyQbRXznFoXaVKw  2.434273        2312      3         2   
3         4  da_ZMkAiRJF7CgGj4Z-qTQ  5.491468        2311      3         2   
4         7  wksq-PLNXyuua30xVXy2SQ  1.109623        2312      3         2   

   roadType  compassHeading  
0         1      351.517670  
1         1   

### for a single panoid, randomly 1 of 2 viewing angles, and return the compass heading needed to download an image directly straight or looking to the side of the road ###
**Inputs:**
- **origHeading** - original viewing angle for downloading an image that looks directly ahead on the road 
- **sampleCode** - 1 if downloaded image should look straight ahead or behind, 2 if the downloaded image should look direclty to the left or right of the road <br>

**Outpus:**
- **cat** (string) - the viewing angle, straight or side
- **downloadHeading** (float) - the heading to send to the GSV API for downloading the image at the desired viewing angle

In [12]:
def adjustOneHeading(origHeading,sampleCode):
    cat = "straight" if sampleCode %2 == 1 else "side"
    downloadHeading = int((origHeading + (sampleCode%2 +1)*90)%360)
    return([cat,downloadHeading])

### for all panoids, adjust the headings to download the desired viewing angle ###
**Inputs:**
- **compassData** (pandas dataframe) - contains panoid metadata and difference between north and road heading at panoid location 

**Outputs:**
- the input dataframe with two additional fields, the downloaded viewing angle and the heading that should be sent to the GSV API

In [13]:
def adjustAllHeadings(compassData):
    viewAngle,adjHeading = [],[]
    for rowIndex in range(compassData.count()[0]):
        curRecord = compassData.iloc[rowIndex]
        tempAngle,tempHeading = adjustOneHeading(curRecord['compassHeading'],curRecord['sampleCode'])
        viewAngle.append(tempAngle)
        adjHeading.append(tempHeading)
    compassData['viewingAngle'] = viewAngle
    compassData['adjHeading'] = adjHeading
    return(compassData)

### using predefined naming conventions, determine the filename for a downloaded GSV image.  Filename should include the panoid and compass heading
**Inputs:**
- **compassData** (pandas dataframe) - contains panoid metadata and compass heading to send to the GSV API

**Outputs:** 
- the input dataframe with an additional field, the filename for the downloaded GSV image.

In [14]:
def calcOutputFilename(compassData):
    outputFilenames = []
    for rowIndex in range(compassData.count()[0]):
        curRecord = compassData.iloc[rowIndex]
        outputFilenames.append(curRecord['panId'] + "_" + str(curRecord['adjHeading']) + ".jpg")
    compassData['filename'] = outputFilenames
    return(compassData)

### given download parameters, create a string for a RESTful Street View API query ###
**Inputs:**
- **panid** (string) - unique identifier for the panorama to sample and download from
- **heading** (int) - compass heading that corresponds to the desired viewing angle
**Outputs:**
- **combinedString** (string) - the full query to send to the Street View API


In [25]:
def createDownloadString(panid,heading):
    baseString = "https://maps.googleapis.com/maps/api/streetview?size=640x640"
    locString = "&pano=" + str(panid)
    headingString = "&heading=" + str(heading)
    apiString = "&key=" + str(API_KEY)
    combinedString = baseString + locString + headingString + apiString
    return(combinedString)

### determine the absolute filepath to store a downloaded GSV image ###
**Inputs:**
- **panid** (string) - unique identifier of the panoid
- **heading** (int) - compass heading that corresponds to the desired viewing angle 

**Outputs:**
- the abolute filepath where the downloaded image should be stored

In [None]:
def createDownloadFilepath(panid,heading):
    return(OUTPUT_FOLDER + panid + "_" + str(heading) + ".jpg")

### query the GSV API and download images ###
**Inputs:**
- **compassData** (pandas dataframe) - contains panoids and compass headings for GSV RESTful queries
- **numToDownload** (int) - number of images to try to download.  Useful to stay within account/budget limits

In [23]:
def downloadData(compassData,numToDownload):
    numDownloaded=0
    for rowIndex in range(compassData.count()[0]):
        curRecord = compassData.iloc[rowIndex]
        outputFilepath = IMAGE_FOLDER + curRecord['filename']
        if not(os.path.exists(outputFilepath)) and numDownloaded < numToDownload:
            try:
                downloadUrl = createDownloadString(curRecord['panId'],curRecord['adjHeading'])
                wget.download(downloadUrl,outputFilepath)
                numDownloaded+=1
            except Exception as e:
                print("couldn't download image: " + outputFilepath + str(e))  

## Part 3: Remove images from dataset that were flagged during visual inspection ##

In [28]:
compassData = adjustAllHeadings(compassData)
compassData = calcOutputFilename(compassData)
compassData = ps.read_csv(PARENT_FOLDER + "GSV_Img_Meta.csv")
compassData.to_csv(PARENT_FOLDER + "GSV_Img_Meta.csv")
downloadData(compassData,20000)
compassData.head()

In [None]:
imgsToRemove = list(ps.read_csv(IMGS_TO_REMOVE)['dud imgids'])
for img in imgsToRemove:
    try:
        os.remove(SCREENED_IMAGE_FOLDER + img + ".jpg")
    except Exception as e:
        print(str(e))

In [22]:
compassData = ps.read_csv(PARENT_FOLDER + "GSV_Img_Meta.csv")
compassData.head()
imgsToRemove = ps.read_csv(IMGS_TO_REMOVE)
imgsToRemove['filename'] = imgsToRemove['dud imgids'] + ".jpg"
print(imgsToRemove.head())
compassData2 = compassData[~compassData['filename'].isin(list(imgsToRemove['filename']))]
compassData2.to_csv(PARENT_FOLDER + "/GSV_screened_images.csv")

                   dud imgids                        filename
0  q-fTZKEn8dQBjrPuIoB1Tg_195  q-fTZKEn8dQBjrPuIoB1Tg_195.jpg
1    InMWxv0047x8PJCB1uV__g_0    InMWxv0047x8PJCB1uV__g_0.jpg
2  JauYOD1VQv1vYbm9a4LRug_245  JauYOD1VQv1vYbm9a4LRug_245.jpg
3  rkq1KGRSFX91O34DGgnS0Q_240  rkq1KGRSFX91O34DGgnS0Q_240.jpg
4  AHAIO4GprjBJu8PvygLB6A_258  AHAIO4GprjBJu8PvygLB6A_258.jpg


In [24]:
compassData2.head()

Unnamed: 0.1,Unnamed: 0,OID_,Join_Count,TARGET_FID,locId,imgLat,imgLon,imgYear,imgMonth,panId,dist,sampleCode,urban,division,roadType,compassHeading,viewingAngle,adjHeading,filename
0,0,1,1,1,98124.0,40.172046,-75.873372,2019,10,wKku7JX9oLkYwqUh__KSKQ,1.007087,2312,3,2,1,351.51767,side,81,wKku7JX9oLkYwqUh__KSKQ_81.jpg
1,1,2,1,2,606663.0,44.866249,-73.449893,2019,7,2Yx-Q0olEsjx65z-xE8Wog,2.497234,2311,3,2,1,343.643219,straight,163,2Yx-Q0olEsjx65z-xE8Wog_163.jpg
2,2,3,1,3,737073.0,40.779632,-78.857581,2009,7,ZH1fSO2nyQbRXznFoXaVKw,2.434273,2312,3,2,1,23.60438,side,113,ZH1fSO2nyQbRXznFoXaVKw_113.jpg
3,3,4,1,4,255543.0,41.23424,-75.790555,2012,4,da_ZMkAiRJF7CgGj4Z-qTQ,5.491468,2311,3,2,1,221.217789,straight,41,da_ZMkAiRJF7CgGj4Z-qTQ_41.jpg
4,4,5,1,5,188685.0,43.778818,-73.79907,2019,7,wksq-PLNXyuua30xVXy2SQ,1.109623,2312,3,2,1,20.991724,side,110,wksq-PLNXyuua30xVXy2SQ_110.jpg
