# Calculate and graph performance summary statistics #

**Author:** Andrew Larkin, Oregon State University College of Public Health and Human Sciences <br>
**Date created:** November 28, 2018

### Summary ###
Summary statistics for image and remote sensing measures at PlacePulse image locations.  Operations include calculating summary statistics, plotting histograms and boxplots, and creating correlation matrices and correlation plots.

### setup ###

In [1]:
import pandas as ps
import csv
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)

In [15]:
# define filepaths
# parentFolder is the directory prefix (with trailing slash) shared by both CSV paths below
# (alternate location kept for reference: "G:/dropbox/PlacePulse/")
parentFolder = "C:/users/larkinan/desktop/PlacePulse/"
measuresCSV = "{}{}".format(parentFolder, "PlacePulseMergedMeasures_Nov28_18.csv")
addedConstructsCSV = "{}{}".format(parentFolder, "PlacePulseMeasuresConstructs_Nov29_18.csv")

### calculate statistics for one level in a stratification ###
The function does not return statistics, but rather appends to already existing variable arrays.  One array corresponds to a single column in a denormalized database.
**Inputs** <br>
- **inputDataset** (pandas dataframe) - contains a large set of variables, including all variables of interest <br>
- **stratifyVal** (string) - stratification level value <br>
- **valueNames** (string array) - variables to calculate summary statistics for <br>
- **stratifyName** (string array) - array designating the stratify value for an index in all arrays <br>
- **value** (string array) - array designating the variable category for an index in all arrays <br>
- **meanVals** (float array) - mean value for the variable in value and stratification in stratifyName <br>
- **stdDev** (float array) - std dev for the variable in value and stratification in stratifyName <br>
- **minVal** (float array) - min value for the variable in value and stratification in stratifyName <br>
- **maxVal** (float array) - max value for the variable in value and stratification in stratifyName <br>
- **iqr** (float array) - interquartile range for the variable in value and stratification in stratifyName <br>

In [5]:
def calcStatsOneStratify(inputDataset,stratifyVal,valueNames,
                        stratifyName,value,meanVals,stdDev,minVal,maxVal,iqr):
    """Append summary statistics for one stratification level to the result arrays.

    Nothing is returned.  For every variable in valueNames exactly one entry is
    appended to each of the seven result lists, so the lists stay aligned by index.

    Inputs:
        inputDataset (pandas dataframe) - records belonging to this stratification level
        stratifyVal (string) - label of the stratification level
        valueNames (string array) - variables to calculate summary statistics for
        stratifyName (list) - receives stratifyVal once per variable
        value (list) - receives the variable name
        meanVals (list) - receives the mean
        stdDev (list) - receives the standard deviation (np.std defaults)
        minVal (list) - receives the minimum
        maxVal (list) - receives the maximum
        iqr (list) - receives the interquartile range (75th - 25th percentile)
    """

    # for each variable of interest, calculate summary statistics and append results to arrays
    for valueName in valueNames:
        npArr = np.array(inputDataset[valueName])

        # missing values would turn every statistic into NaN, so they are removed first
        # (calcLowHighQuartiles drops missing values as well)
        if npArr.dtype.kind == 'f':
            npArr = npArr[~np.isnan(npArr)]

        stratifyName.append(stratifyVal)
        value.append(valueName)

        # no observations left: np.min/np.max raise on empty input, so record NaN
        # for every statistic to keep the result arrays aligned
        if npArr.size == 0:
            for statList in (meanVals,stdDev,minVal,maxVal,iqr):
                statList.append(np.nan)
            continue

        q75, q25 = np.percentile(npArr, [75 ,25])
        meanVals.append(np.mean(npArr))
        stdDev.append(np.std(npArr))
        minVal.append(np.min(npArr))
        maxVal.append(np.max(npArr))
        iqr.append(q75 - q25)

### calculate statistics, stratified by a specific variable ###
**Inputs** <br>
- **inputDataset** (pandas dataframe) - contains all variables and records of interest <br>
- **valueNames** (string array) - names of the variables to calculate summary statistics for <br>
- **stratify** (string) - name of the variable to stratify by <br>

**Outputs** <br>
- **df (pandas dataframe)** - contains derived summary statistics <br>

In [104]:
def calcSummaryStats(inputDataset,valueNames,stratify):
    """Calculate summary statistics for each stratification level and for the whole dataset.

    Inputs:
        inputDataset (pandas dataframe) - contains all variables and records of interest
        valueNames (string array) - names of the variables to calculate summary statistics for
        stratify (string) - name of the variable to stratify by

    Outputs:
        df (pandas dataframe) - one row per (stratification level, variable) pair with the
        columns <stratify>, 'meanVals', 'stdDev', 'IQR', 'value', 'min' and 'max'.  Rows
        labelled "none" in the <stratify> column hold the unstratified statistics.
    """

    # stratification levels in order of first appearance, so the row order of the output
    # is reproducible between runs.  Missing levels are skipped: comparing with NaN never
    # matches a record, so they would only yield an empty partition
    stratifyLevels = inputDataset[stratify].dropna().unique().tolist()
    stratifyName, value, meanVals, stdDev, minVal, maxVal, iqr  = ([] for _ in range(7))

    # for each stratification level, get the subset partition of data
    # and calculate summary statistics for the partition
    for stratifyLevel in stratifyLevels:
        subsetData = inputDataset.loc[inputDataset[stratify] == stratifyLevel]
        calcStatsOneStratify(subsetData,stratifyLevel,valueNames,
                            stratifyName,value,meanVals,stdDev,minVal,maxVal,iqr)

    # calculate summary statistics for the entire dataset, without stratification
    calcStatsOneStratify(inputDataset,"none",valueNames,
                        stratifyName,value,meanVals,stdDev,minVal,maxVal,iqr)

    # combine summary statistic arrays into a dataframe, one column per array
    statsDict = {stratify:stratifyName,'meanVals':meanVals,'stdDev':stdDev,'IQR':iqr,'value':value,
                'min':minVal,'max':maxVal}
    return ps.DataFrame(statsDict)

### calculate 25th and 75th percentile for a subset of variables of interest ###
**Inputs** <br>
- **inputDataset** (pandas dataframe) - contains all variables of interest <br>
- **categories** (string array) - names of the variables to calculate percentiles for <br>

**Outputs** <br>
- **outputDict** (dict) - dictionary containing 25th and 75th percentile for each variable of interest <br>

In [105]:
def calcLowHighQuartiles(inputDataset,categories):
    """Return the 25th and 75th percentile of each requested variable.

    Missing values are dropped before the percentiles are calculated.

    Inputs:
        inputDataset (pandas dataframe) - contains all variables of interest
        categories (string array) - names of the variables to calculate percentiles for

    Outputs:
        dict with two entries per variable: <name> + "low" holds the 25th percentile
        and <name> + "high" holds the 75th percentile
    """
    quartiles = {}

    for name in categories:
        observed = inputDataset[name].dropna()
        lowerQuartile, upperQuartile = np.percentile(observed, [25, 75])
        quartiles[name + "low"] = lowerQuartile
        quartiles[name + "high"] = upperQuartile

    return quartiles

### calculate summary statistics for a specific subset of variables of interest.  For completing descriptive statistics in manuscript ###
**Inputs** <br>
- **inputDataset** (pandas dataframe) - contains all variables and records of interest <br>
- **outputFilename** (string) - output filepath to write results to <br>

In [106]:
def subsetStatsForTable(inputDataset,outputFilename):
    """Write summary statistics for the manuscript table variables to a csv file.

    Statistics are stratified by CITY_NAME.  The output path is outputFilename followed
    by "_", the number of records in inputDataset, and ".csv".

    Inputs:
        inputDataset (pandas dataframe) - contains all variables and records of interest
        outputFilename (string) - output filepath prefix to write results to
    """

    # variable to stratify by; kept separate from the variables that are summarized
    stratifyVar = 'CITY_NAME'

    # variables to calculate summary statistics for.  Each name is listed once: a repeated
    # name selects the column twice and duplicates its rows in the output table
    tableCategories = ['accessibility','allNature','animate','bluespace','building','builtEnv','car','grass','greenspace',
                       'otherNature','person','plant','road','sidewalk','sky','tree','bench',
                       'PM251000m','mjRds100m','mjRds250m','NO2100m','NO2250m','tr100m','tr250m','imp1000m','pop1000m',
                       'NDVI250m',
                       'mu_beautiful','mu_lively','mu_safety','count_beautiful','win_beautiful','lose_beautiful',
                       'count_lively','win_lively','lose_lively','count_safety','win_safety','lose_safety']

    # subset dataset and calculate summary statistics
    tableSubset = inputDataset[tableCategories + [stratifyVar]]
    subsetStats = calcSummaryStats(tableSubset,tableCategories,stratifyVar)

    # the record count is part of the filename
    subsetStats.to_csv(outputFilename + "_" + str(len(tableSubset)) + ".csv")

### calculate summary statistics for the bottom and top quartile of perception variables ###
**Inputs** <br>
- **inputDataset** (pandas dataframe) - contains all variables and records of interest <br>
- **parentFolder** (string) - name of the folder to write results to <br>
- **valueNames** (string array) - names of variables; currently unused, because the variables summarized are fixed inside subsetStatsForTable <br>

Note: there is no stratify parameter. Statistics are always stratified by CITY_NAME inside subsetStatsForTable.

In [107]:
def calcLowHighSummaryStats(inputDataset,parentFolder,valueNames):
    """Write summary statistics for the bottom and top quartile of each perception variable.

    For each of mu_beautiful, mu_lively and mu_safety, the records strictly below the
    25th percentile and strictly above the 75th percentile are passed to
    subsetStatsForTable, which writes one csv file per subset.  The quartile cut points
    are printed at the end.

    Inputs:
        inputDataset (pandas dataframe) - contains all variables and records of interest
        parentFolder (string) - folder prefix for the output files
        valueNames (string array) - not used by this function; kept so existing callers
        keep working
    """

    # perception variables to calculate bottom and top percentile and partition by
    percentileVars = ['mu_beautiful','mu_lively','mu_safety']

    # quartile cut points come from the dataset that was passed in, not from a global
    # variable, so the cut points always match the records being partitioned
    lowHighDict = calcLowHighQuartiles(inputDataset,percentileVars)

    # for each perception variable, subset the bottom and top quartiles and calculate summary stats
    for percentileVar in percentileVars:
        lowCutoff = lowHighDict[percentileVar + "low"]
        highCutoff = lowHighDict[percentileVar + "high"]

        lowSubset = inputDataset.loc[inputDataset[percentileVar] < lowCutoff]
        subsetStatsForTable(lowSubset,parentFolder + percentileVar + "low")

        highSubset = inputDataset.loc[inputDataset[percentileVar] > highCutoff]
        subsetStatsForTable(highSubset,parentFolder + percentileVar + "high")
    print(lowHighDict)

### create a histogram for a single variable of interest.  Part of a larger plot of multiple variables ###
**Inputs** <br>
- **xMax** (int) - max value for the x axis <br>
- **xVals** (float array) - values used to create histograms <br>
- **subplotIndex** (int) - index of the larger plot where the subplot will be placed <br>
- **yLabel** (string) - name of the y axis <br>
- **xLabel** (string) - name of the x axis <br>

In [8]:
def createHistogram(xMax,xVals,subplotIndex,yLabel,xLabel):
    """Draw a 50-bin histogram in one cell of a 5 x 2 subplot grid on the current figure.

    Inputs:
        xMax (int) - max value for the x axis
        xVals (float array) - values used to create the histogram; the smallest value
        is used as the lower x axis limit
        subplotIndex (int) - position of the subplot within the 5 x 2 grid
        yLabel (string) - name of the y axis
        xLabel (string) - name of the x axis
    """
    panel = plt.subplot(5,2,subplotIndex)
    panel.set_xlim([min(xVals),xMax])
    panel.set_xlabel(xLabel)
    panel.set_ylabel(yLabel)
    panel.hist(xVals, 50,facecolor='g', alpha=0.75)

### plot histograms for multiple variables, stratified by a specific variable ### 
**Inputs** <br>
- **inData** (pandas dataframe) - contains all variables and records of interest <br>
- **outFolder** (string) - where histogram plots are saved <br>
- **stratify** (string) - name of the variable containing stratification levels <br>
- **valuesOfInterest** (string array) - name of the variables to create histograms for <br>

In [108]:
def createHistograms(inData,outFolder,stratify,valuesOfInterest):
    """Save one figure of histograms per stratification level, plus one for all records.

    Stratified figures are written to outFolder + "RemSensHist_" + <level> + ".png" and
    are skipped when that file already exists.  The unstratified figure is always
    written to outFolder + "RemSensHist_All.png".

    NOTE: panels are drawn by createHistogram, which uses a 5 x 2 subplot grid, so one
    figure holds at most 10 variables.

    Inputs:
        inData (pandas dataframe) - contains all variables and records of interest
        outFolder (string) - where histogram plots are saved
        stratify (string) - name of the variable containing stratification levels
        valuesOfInterest (string array) - names of the variables to create histograms for
    """

    # names of the stratification levels
    stratifyLevels = set(inData[stratify])

    # for each stratification level, subset data by stratification and plot its histograms
    for stratifyLevel in stratifyLevels:
        outputFile = outFolder + "RemSensHist_" + str(stratifyLevel) + ".png"

        # plots that already exist are not regenerated
        if os.path.exists(outputFile):
            continue
        stratifySubset = inData.loc[inData[stratify] == stratifyLevel]

        # str() keeps the title working when the level is not a string
        _saveHistogramFigure(stratifySubset,valuesOfInterest,
                             "Remote Sensing " + str(stratifyLevel),outputFile)

    # create histograms for all data without stratification
    _saveHistogramFigure(inData,valuesOfInterest,"Remote Sensing " + "AllLevels",
                         outFolder + "RemSensHist_All.png")


def _saveHistogramFigure(plotData,valuesOfInterest,title,outputFile):
    """Draw one histogram panel per variable on a new figure, save it once, and close it."""
    fig = plt.figure(num=None, figsize=(12, 15), dpi=160, facecolor='w', edgecolor='k')
    fig.suptitle(title, fontsize=16)

    # for each value of interest, create a histogram subplot (subplot indices start at 1)
    for index, value in enumerate(valuesOfInterest, start=1):
        subsetData = plotData[value]
        createHistogram(max(subsetData),subsetData,index,"frequency",value)

    # save after all panels are drawn, then release the figure so that figures
    # do not accumulate in memory across stratification levels
    fig.savefig(outputFile, bbox_inches="tight")
    plt.close(fig)

### createBoxplots for multiple variables, stratified by a specific variable ### 
**Inputs** <br>
- **inData** (pandas dataframe) - contains all variables in the records of interest <br>
- **outFolder** (string) - where boxplots are saved <br>
- **stratify** (string) - name of the variable containing stratification levels <br>
- **valuesOfInterest** (string array) - names of the variables to create boxplots for <br>

In [109]:
def createBoxplots(inData,outFolder,stratify,valuesOfInterest):
    """Save one boxplot figure per variable, with one box per stratification level.

    Boxes are ordered from lowest to highest median.  Each figure is written to
    outFolder + "MeasBoxPlot_" + <variable> + ".png" and is skipped when that file
    already exists.

    Inputs:
        inData (pandas dataframe) - contains all variables in the records of interest
        outFolder (string) - where boxplots are saved
        stratify (string) - name of the variable containing stratification levels
        valuesOfInterest (string array) - names of the variables to create boxplots for
    """

    # names of the stratification levels
    stratifyLevels = set(inData[stratify])

    # partition the records once per level; the partitions do not depend on the variable plotted
    # NOTE(review): dropna() removes records with a missing value in ANY column, not only in
    # the variable being plotted -- confirm this complete-case behavior is intended
    levelSubsets = [(stratifyLevel, inData.loc[inData[stratify] == stratifyLevel].dropna())
                    for stratifyLevel in stratifyLevels]

    # for each variable of interest, draw one box per stratification level
    for value in valuesOfInterest:
        outputFile = outFolder + "MeasBoxPlot_" + value + ".png"

        # plots that already exist are not regenerated
        if os.path.exists(outputFile):
            continue

        fig = plt.figure(num=None, figsize=(48, 15), dpi=160, facecolor='w', edgecolor='k')
        fig.suptitle("Measure " + value, fontsize=16)

        # (median, level name, values) for every stratification level
        boxes = [(np.median(levelData[value]), stratifyLevel, levelData[value])
                 for stratifyLevel, levelData in levelSubsets]

        # order the stratified data from low to high median
        boxes.sort(key=lambda box: box[0])
        boxDataSorted = [box[2] for box in boxes]
        cityNamesSorted = [box[1] for box in boxes]

        plt.boxplot(boxDataSorted)
        plt.xticks(range(1,len(cityNamesSorted)+1),cityNamesSorted,rotation=70)
        plt.savefig(outputFile)
        plt.close(fig)

### sum multiple variables to create latent constructs ###
**Inputs** <br>
- **inData** (pandas dataframe) - contains all variables and records of interest <br>
- **categories** (string array) - names of the variables to sum <br>
- **categoryName** (string) - name to designate for the latent construct <br>

In [11]:
def createCategory(inData,categories,categoryName):
    """Add a latent construct column holding the row-wise sum of several variables.

    The new column is added to inData in place; nothing is returned.  An empty
    categories list produces a column of zeros.

    Inputs:
        inData (pandas dataframe) - contains all variables and records of interest
        categories (string array) - names of the variables to sum
        categoryName (string) - name to designate for the latent construct
    """
    # the number of records comes from the dataframe itself, so the function does not
    # depend on a particular column (such as 'wall') being present
    constructSum = np.zeros(len(inData))
    for category in categories:
        constructSum = constructSum + np.asarray(inData[category])
    inData[categoryName] = constructSum

### create latent constructs by summing variables ###
Adds latent constructs in place, nothing is returned from the function <br>
**Inputs**<br>
- **inData** (pandas dataframe) - contains all variables and records of interest <br>

In [110]:
def createCategories(inData):
    """Add every latent construct column to inData by summing its member variables.

    Constructs are added in place through createCategory; nothing is returned.

    Inputs:
        inData (pandas dataframe) - contains all variables and records of interest
    """

    # (construct name, variables summed to form it), listed in the order the columns are added
    constructs = [
        # built environment
        ('builtEnv', ['wall','building','road','windowpane','sidewalk','hovel','house','fence','railing',
                      'signboard','skyscraper','path','stairs','runway','screen door','stairway','bridge',
                      'bench','booth','awning','streetlight','television receiver','pole','bannister','escalator',
                      'fountain','swimming pool','step','sculpture','traffic light','pier']),
        # accessibility
        ('accessibility', ['sidewalk','escalator','path','stairs','stairway','bench','step']),
        # all nature
        ('allNature', ['tree','grass','plant','field','land','flower','water','sea','waterfall','lake','earth',
                       'mountain','rock','sky','sand','hill','dirt track']),
        # greenspace
        ('greenspace', ['tree','grass','plant','field','flower']),
        # bluespace
        ('bluespace', ['water','sea','waterfall','lake']),
        # other nature
        ('otherNature', ['earth','mountain','rock','sky','sand','hill','dirt track','land']),
        # animate objects
        ('animate', ['person','boat','car','bus','truck','airplane','van','ship','minibike','animal','bicycle']),
    ]

    for constructName, memberVars in constructs:
        createCategory(inData,memberVars,constructName)
    

### calculate correlations and create a correlation plot ###
**Inputs** <br>
- **outFolder** (string) - name of the folder to store correlation matrix and correlation plot

In [111]:
def createCorrPlot(outFolder,inData=None):
    """Calculate a correlation matrix and save it with a clustered correlation plot.

    The plot is written to outFolder + "CorrPlot.png" and the matrix to
    outFolder + "corrMatrix.csv".

    Inputs:
        outFolder (string) - name of the folder to store correlation matrix and correlation plot
        inData (pandas dataframe, optional) - contains all variables and records of
        interest.  When omitted, the module-level variable screenedData is used, which
        must then exist
    """

    # backward compatible fallback for callers that only pass outFolder
    if inData is None:
        inData = screenedData

    # variables to include in correlation matrix and correlation plot
    correlationVars = ['accessibility','allNature','animate','bluespace','building','builtEnv','car','grass','greenspace',
                       'otherNature','person','plant','road','sidewalk','sky','tree','bench',
                       'PM251000m','mjRds100m','mjRds250m','NO2100m','NO2250m','tr100m','tr250m','imp1000m','pop1000m',
                       'NDVI250m',
                       'mu_beautiful','mu_lively','mu_safety']

    # calculate correlations
    corr = inData[correlationVars].corr()

    # create correlation plot.  clustermap opens its own figure, which becomes the current
    # figure, so no figure is created beforehand (it would stay empty and never be closed)
    sns.clustermap(corr,cmap="coolwarm")
    plt.savefig(outFolder + "CorrPlot.png")
    plt.close()

    corr.to_csv(outFolder + "corrMatrix.csv")

### main function ###

In [112]:
def main():
    """Load the merged measures, add latent constructs, and write all statistics and plots.

    Reads measuresCSV, keeps records with a value for 'wall', adds the latent construct
    columns, writes the resulting dataset to addedConstructsCSV, and then writes summary
    statistics, histograms, boxplots and the correlation plot to parentFolder.
    """

    # published at module scope because helper functions in this file may look up
    # screenedData as a global variable instead of receiving it as an argument
    global screenedData

    # load data and clean
    measuresData = ps.read_csv(measuresCSV)

    # identifier, coordinate and city/country attribute columns that are not summarized
    removeVals = ['id','latit','longit','latJoin','longJoin','latitude','longitude','CITY_NAME',
             'GMI_ADMIN','ADMIN_NAME','FIPS_CNTRY','CNTRY_NAME','STATUS','POP','POP_RANK','POP_CLASS']

    # explicit copy: construct columns are added to this dataframe in place below
    screenedData = measuresData.dropna(subset=['wall']).copy()

    # create latent constructs, then save the dataset so the csv includes the constructs
    createCategories(screenedData)
    screenedData.to_csv(addedConstructsCSV)
    valuesOfInterest = [val for val in list(screenedData) if val not in removeVals]

    # calculate summary stats
    calcLowHighSummaryStats(screenedData,parentFolder,valuesOfInterest)
    allSumStats = calcSummaryStats(screenedData,valuesOfInterest,'CITY_NAME')
    allSumStats.to_csv(parentFolder + "remoteSensingsummaryStats.csv")

    # create histograms, boxplots and correlation plots
    createHistograms(screenedData,parentFolder,'CITY_NAME',valuesOfInterest)
    createBoxplots(screenedData,parentFolder,'CITY_NAME',valuesOfInterest)
    createCorrPlot(parentFolder)

In [None]:
# run the analysis only when executed directly (notebook cell or script), not on import
if __name__ == "__main__":
    main()