# Calculate and graph time variance in image segmentation #

**Author:** Andrew Larkin, Oregon State University College of Public Health and Human Sciences <br>
**Date created:** December 5th, 2018

### Summary ###
Google updates Street View imagery on a regular basis.  This script estimates temporal variation in image segmentation estimates by calculating, at each Place Pulse location, the relative standard deviation of the segmentation measures across the images taken over time.

### setup ###

In [6]:
# import packages

import pandas as ps
import csv
import os
import numpy as np

In [7]:
# define global constants and filepaths

# base folder; every other path below is built from it
parentFolder = "C:/users/larkinan/desktop/tmpdlltr/"
# NOTE(review): comparisonFile is not referenced by any code visible in this file - confirm it is still needed
comparisonFile = parentFolder + "PulsePlaceComparisons.csv"
# folder of csv files that main passes on to processSingleImageSet
imageFolder = parentFolder + "imageData/allImages/"
# csv file that main writes the combined statistics to
statsFile = parentFolder + "imageDeviationStats.csv"

In [1]:
def loadSingleImageFile(folder,index):
    """Load the image-based estimates stored in one csv file of a folder.

    There is one csv file for each perception of interest (safety, lively,
    and beauty).

    Inputs:
        folder (string) - path of the folder that holds the csv files
        index (int) - position of the file to load within the
            alphabetically sorted folder listing
    Outputs:
        imageData (pandas dataframe) - contents of the selected csv file
    """
    # os.listdir returns entries in arbitrary order; sort the listing so that
    # a given index always selects the same file
    files = sorted(os.listdir(folder))
    # os.path.join works whether or not folder ends with a path separator
    imageData = ps.read_csv(os.path.join(folder, files[index]))
    return(imageData)

In [2]:
def createCategory(inData,categories,categoryName):
    """Create one latent construct by summing a set of variables.

    The new column is added to inData in place; nothing is returned.

    Inputs:
        inData (pandas dataframe) - contains the variables to sum
        categories (string array) - names of the columns to sum
        categoryName (string) - name of the column that receives the sum
    """
    # size the accumulator from the dataframe itself instead of a hard-coded
    # 'wall' column, so the function works on any dataframe
    total = np.zeros(len(inData))
    for category in categories:
        total = np.add(total, np.asarray(inData[category]))
    inData[categoryName] = total

### create latent constructs by summing variables ###
Adds the latent constructs to the dataframe in place; nothing is returned from the function. <br>
**Inputs**<br>
- **inData** (pandas dataframe) - contains all variables and records of interest <br>

In [3]:
def createCategories(inData):
    """Add every latent construct column to inData in place.

    Each construct is the sum of the member variables listed below and is
    created by a call to createCategory.  Nothing is returned.

    Inputs:
        inData (pandas dataframe) - contains all variables and records of interest
    """
    # (construct name, member variables), listed in the order of creation
    constructs = (
        # built environment
        ('builtEnv',
         ['wall','building','road','windowpane','sidewalk','hovel','house','fence','railing',
          'signboard','skyscraper','path','stairs','runway','screen door','stairway','bridge',
          'bench','booth','awning','streetlight','television receiver','pole','bannister','escalator',
          'fountain','swimming pool','step','sculpture','traffic light','pier']),
        # accessibility
        ('accessibility',
         ['sidewalk','escalator','path','stairs','stairway','bench','step']),
        # all nature
        ('allNature',
         ['tree','grass','plant','field','land','flower','water','sea','waterfall','lake','earth',
          'mountain','rock','sky','sand','hill','dirt track']),
        # greenspace
        ('greenspace',
         ['tree','grass','plant','field','flower']),
        # bluespace
        ('bluespace',
         ['water','sea','waterfall','lake']),
        # other nature
        ('otherNature',
         ['earth','mountain','rock','sky','sand','hill','dirt track','land']),
        # animate objects
        ('animate',
         ['person','boat','car','bus','truck','airplane','van','ship','minibike','animal','bicycle']),
    )
    for constructName, members in constructs:
        createCategory(inData,members,constructName)

### calculate relative standard deviation across time for each location ###
**Inputs**<br>
- **imageData** (pandas dataframe) - contains location key, image key, and image measures
- **prevIds** (string array) - ids of locations that have already been processed

**Outputs** <br>
- **allStats** (np array) - relative std dev scores (in percent) for all locations and image measures.  Rows correspond to unique measures, cols correspond to unique locations.  

In [4]:
def calcStats(imageData,prevIds):
    """Calculate the relative standard deviation of each measure per location.

    For every unique value of imageData['id'] that is not in prevIds, the
    population standard deviation (ddof=0) of each numeric column is divided
    by the column mean and multiplied by 100.

    Inputs:
        imageData (pandas dataframe) - contains the 'id' location key and
            the image measures
        prevIds (string array) - ids that were already processed and must be
            skipped; a list or a pandas Series of id values
    Outputs:
        allStats (np array) - shape (number of measures, number of processed
            ids, 1); rows correspond to the numeric columns of imageData,
            columns to the processed ids
    """
    # iterating prevIds yields its values for lists and pandas Series alike,
    # whereas `in` on a Series would test the index labels instead
    skipIds = set(prevIds)
    # NOTE: cleanRelStdDev rebuilds this same list(set(...)) to label the
    # locations, so the id order must not be changed here
    uniqueIds = list(set(imageData['id']))
    statsArray = []
    for idVal in uniqueIds:
        if idVal in skipIds:
            continue
        subsetData = imageData.loc[imageData['id'] == idVal]
        # numeric_only=True skips non-numeric columns (e.g. string ids)
        # explicitly instead of relying on pandas dropping them silently
        stdArr = np.asarray(subsetData.std(axis=0, ddof=0, numeric_only=True), dtype=float)*100.0
        meanArr = np.asarray(subsetData.mean(axis=0, numeric_only=True), dtype=float)
        # a zero mean yields inf/NaN; cleanRelStdDev filters non-finite 'wall' values
        with np.errstate(divide='ignore', invalid='ignore'):
            subsetStats = np.divide(stdArr, meanArr)
        statsArray.append(subsetStats.reshape((subsetStats.shape[0],1)))
        # progress report every 1000 processed locations
        if(len(statsArray) % 1000 == 0):
            print(len(statsArray))
    if not statsArray:
        # np.stack cannot stack an empty list; return an array with zero locations
        nMeasures = len(imageData.mean(axis=0, numeric_only=True))
        return(np.empty((nMeasures, 0, 1)))
    allStats = np.stack(statsArray,axis=1)
    return(allStats)

### cleanup relative standard deviation dataset ###
**Inputs** <br>
- **allStats** (np array) - relative standard deviation scores returned by calcStats <br>
- **imageData** (pandas dataframe) - contains variable names and location ids <br>

**Outputs** <br>
- **cleanedRelStdDev** (pandas dataframe) - relative standard deviation with column names added, and locations with missing data removed

In [11]:
def cleanRelStdDev(allStats,imageData):
    """Turn the raw relative standard deviation array into a labelled dataframe.

    Inputs:
        allStats (np array) - relative standard deviation scores; the first
            axis holds the measures, the second axis the locations
        imageData (pandas dataframe) - supplies the measure names (its
            numeric columns) and the location ids (its 'id' column)
    Outputs:
        cleanedRelStdDev (pandas dataframe) - one row per location, one
            column per measure plus an 'id' column; rows whose 'wall' value
            is not finite are removed
    """
    # collapse to 2 dimensions, then transpose so locations are on the rows
    relStdDev = ps.DataFrame(allStats.reshape(allStats.shape[0],allStats.shape[1])).T
    # measure names are the numeric columns; taking them from a positional
    # one-row slice avoids a label lookup that fails when the index has no
    # label 0
    relStdDev.columns = imageData.iloc[:1].mean(axis=0, numeric_only=True).index
    # same list(set(...)) construction as calcStats, so the ids line up with
    # the order in which the statistics were computed
    relStdDev['id'] = list(set(imageData['id']))
    cleanedRelStdDev = relStdDev[np.isfinite(relStdDev['wall'])]
    return(cleanedRelStdDev)

### calculate relative standard deviation of image based measures for each image dataset (safety, beauty, and lively) ###
**Inputs** <br>
- **imageFolder** (string) - filepath to folder containing datasets to process <br>
- **imageIndex** (int) - image dataset number <br>
- **prevIDs** (string array) - ids of previously processed images <br>

**Outputs** <br>
- **cleanedRelStdDev** (pandas dataframe) - relative standard deviation estimates for the input dataset <br>

In [10]:
def processSingleImageSet(imageFolder,imageIndex,prevIDs):
    """Calculate relative standard deviation estimates for one image dataset.

    Inputs:
        imageFolder (string) - filepath to folder containing datasets to process
        imageIndex (int) - image dataset number
        prevIDs (string array) - ids that were already processed; records
            with these ids are left out.  A list or a pandas Series of ids.
    Outputs:
        cleanedRelStdDev (pandas dataframe) - relative standard deviation
            estimates for the input dataset; has only column headers and no
            rows when every id was already processed
    """
    keepVars = ['lat','lng','id',
                'wall','building','road','windowpane','sidewalk','hovel','house','fence','railing',
                'signboard','skyscraper','path','stairs','runway','screen door','stairway','bridge',
                'bench','booth','awning','streetlight','television receiver','pole','bannister','escalator',
                'fountain','swimming pool','step','sculpture','traffic light','pier',
                'tree','grass','plant','field','land','flower','water','sea','waterfall','lake','earth',
                'mountain','rock','sky','sand','hill','dirt track',
                'person','boat','car','bus','truck','airplane','van','ship','minibike','animal','bicycle',
                'builtEnv','accessibility','allNature','greenspace','bluespace','otherNature','animate']

    # remove duplicate names while keeping the column order stable between runs
    keepVars = list(dict.fromkeys(keepVars))

    imageData = loadSingleImageFile(imageFolder,imageIndex)
    createCategories(imageData)

    # restrict the statistics to the variables of interest
    imageData = imageData[keepVars]

    # leave out ids that were already processed; the index is reset so that
    # it starts at 0 again after rows have been removed
    alreadyDone = imageData['id'].isin(list(prevIDs))
    imageData = imageData.loc[~alreadyDone].reset_index(drop=True)
    if imageData.empty:
        # nothing new in this dataset; keep an 'id' column for the caller
        return(ps.DataFrame(columns=keepVars))

    # previously processed ids are already removed, so nothing is left to skip
    allStats = calcStats(imageData,[])

    cleanedRelStdDev = cleanRelStdDev(allStats,imageData)

    return(cleanedRelStdDev)
    

### main function ###

In [9]:
def main():
    """Process the three image datasets and write the combined statistics.

    Each dataset in imageFolder is processed by processSingleImageSet, which
    receives the ids collected from the datasets processed before it.  The
    results are concatenated and written to statsFile as csv.
    """
    # one csv file for each perception of interest (safety, lively, and beauty)
    numDatasets = 3
    datasets = []
    collectedIDs = []
    for imageIndex in range(numDatasets):
        dataset = processSingleImageSet(imageFolder,imageIndex,collectedIDs)
        datasets.append(dataset)
        # build a plain list of ids; `+` on two pandas Series would add them
        # element-wise instead of concatenating them
        collectedIDs = collectedIDs + list(dataset['id'])
    combinedData = ps.concat(datasets)
    combinedData.to_csv(statsFile)

In [None]:
# run the analysis only when executed as a script or notebook cell, not when
# the module is imported
if __name__ == "__main__":
    main()