In [1]:
# sys, file and nav packages:
import datetime as dt
import json

# math packages:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.distributions.empirical_distribution import ECDF

# charting:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import ticker
from matplotlib import colors
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.gridspec import GridSpec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

import seaborn as sns

import IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display

import time

# --- analysis window, kept as ISO date strings ---
start_date, end_date = '2020-03-01', '2021-05-31'

# --- selection of the "abundant" codes ---
# a code qualifies when its quantity exceeds a_qty at a survey ...
a_qty = 20

# ... or, when use_fail is True, when its fail rate exceeds a_fail_rate
a_fail_rate = 0.5
use_fail = False

# label used for the reporting unit
unit_label = 'p/100m'

# survey data:
dfx = pd.read_csv('resources/checked_sdata_eos_2020_21.csv')

# beach (location) data, indexed by location slug
dfBeaches = pd.read_csv("resources/beaches_with_land_use_rates.csv").set_index('slug')

# object codes, indexed by code
dfCodes = pd.read_csv("resources/codes_with_group_names_2015.csv").set_index("code")

# code -> description
code_d_map = dfCodes["description"]

# code -> material
code_m_map = dfCodes["material"]

In [2]:
# preview the first rows of the beach data (indexed by location slug)
dfBeaches.head()

Unnamed: 0_level_0,location,latitude,longitude,post,country,water,water_name,city_slug,water_name_slug,is_2020,...,% to buildings,% to trans,% to recreation,% to agg,% to woods,% to water,% to unproductive,is_slr,is_mcbp,both
slug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aabach,Aabach,47.220989,8.940365,8716,CH,l,Zurichsee,schmerikon,zurichsee,True,...,0.170626,0.045356,0.025918,0.25486,0.384449,0.520518,0.11879,False,False,False
aare-limmatspitz,Aare Limmatspitz,47.50106,8.237371,5300,CH,r,Aare,gebenstorf,aare,True,...,0.224684,0.120253,0.012658,0.316456,0.322785,0.118671,0.003165,False,False,False
aare-port,Aare Port,47.11617,7.26955,2558,CH,r,Aare|Nidau-Büren-Kanal,port,aarenidau-buren-kanal,True,...,0.296623,0.07489,0.063142,0.185022,0.378855,0.041116,0.001468,False,False,False
aare-solothurn-lido-strand,Aare Solothurn Lido Strand,47.196949,7.521643,4500,CH,r,Aare,solothurn,aare,True,...,0.157576,0.124242,0.027273,0.4,0.289394,0.069697,0.001515,False,False,False
aarezufluss_bern_scheurerk,aarezufluss_bern_scheurerk,46.97101,7.45279,3004,CH,r,Aare,bern,aare,False,...,0.40458,0.20458,0.065649,0.071756,0.251908,0.076336,0.001527,True,False,False


In [3]:
# list the columns available in the beach data
dfBeaches.columns

Index(['location', 'latitude', 'longitude', 'post', 'country', 'water',
       'water_name', 'city_slug', 'water_name_slug', 'is_2020', 'city',
       'bfsnum', 'population', 'streets', 'intersects', 'river_bassin',
       'industrial', 'residential', 'government', 'agg_buildings',
       'unk_building', 'roads', 'railways', 'airports', 'special',
       'recreational', 'orchards', 'vineyards', 'horticulture', 'arable',
       'meadows', 'farmpastures', 'alpinemeadows', 'aplinepasteurs',
       'closed_forest', 'open_forest', 'brush_forest', 'woods', 'lakes',
       'rivers', 'unproductive', 'bareland', 'glaciers', 'luse_total',
       'water_value', 'adjusted_land_use', 'part_industrial',
       'part_residential', 'part_government', 'part_agg_buildings',
       'part_unk_building', 'part_roads', 'part_railways', 'part_airports',
       'part_special', 'part_recreational', 'part_orchards', 'part_vineyards',
       'part_horticulture', 'part_arable', 'part_meadows', 'part_farmpastures'

In [4]:
# the "roads" value of every location
# NOTE(review): presumably the number of land-use cells classed as roads - confirm
dfBeaches.roads

slug
aabach                                       21.0
aare-limmatspitz                             63.0
aare-port                                    47.0
aare-solothurn-lido-strand                   68.0
aarezufluss_bern_scheurerk                  107.0
                                            ...  
zurcher-strand                              148.0
zurichsee-feldeggstr-banningersand          105.0
zurichsee_kusnachterhorn_thirkell-whitej     31.0
zurichsee_wollishofen_langendorfm            77.0
clean-up-event-test                          77.0
Name: roads, Length: 262, dtype: float64

In [5]:
# the "intersects" value recorded for the location "vidy"
dfBeaches.loc["vidy"].intersects

2.0

# Identifying accumulation and leakage with Spearman's Rho

__Notes:__ Louise and Roger
__Notes:__ Revised Roger

To better control (eliminate) trash in the environment it is essential that the limited resources that are attributed to this domain be used as efficiently as possible. Correctly identifying zones of accumulation and or the objects that are accumulating in the watershed would enable more coordinated and precise actions between stakeholders. The process of identification needs to fulfill certain operating requirements to be effective:

* accurate
* repeatable
* scale-able (up and down)

Furthermore, beach-litter data is `count` data and highly variable. Any test dependent on a linear relationship or continuous data can be ruled out. Spearman's $\rho$ does not require that the two variables be continuous. Included in most standard computing libraries and spreadsheets, the process is easy to automate and integrate in most processes.  Spearman's $\rho$, or Spearman's rank correlation coefficient, is a non-parametric test of rank correlation between two variables. Spearman's $\rho$ defines the magnitude (how closely the ranks of the two variables follow a straight line) and the direction of a monotonic relationship. When $\rho$ is 0 there is no evidence of a monotonic relationship. Values of 1 and -1 signify a perfect monotonic relationship between two variables.

__The hypothesis of $\rho$__

$H_{0}$: There is no monotonic association between the two variables

Statistical significance is defined as a very unlikely outcome under the null hypothesis; for this report, unlikely is defined as less than a 5% chance, or p < 0.05. Spearman's $\rho$ is implemented using SciPy {cite}`spearmansscipy` on Jupyter Lab {cite}`jupyter`.

The data is beach litter data collected during a national survey of lakes and rivers in Switzerland, [IQAASL](https://hammerdirt-analyst.github.io/IQAASL-End-0f-Sampling-2021/titlepage_de.html). Standard MSFD/OSPAR protocols were followed within geographic limitations.

__Using $\rho$__

Land use data for each survey location was collected by creating buffer-zones around each survey location at radii from 1.5 - 5km and 10km and sectioning layers of the _Swiss Statistical Survey_ {cite}`lusestats`. The complete process is defined here: [_The land use profile_](https://hammerdirt-analyst.github.io/IQAASL-End-0f-Sampling-2021/land_use_correlation.html).

The land use is designated for every 100m x 100m square (one hectare) of the country using satellite images. There are 27 different possible categories for each square. A buffer zone with a radius of 1500m has an area ( $\pi r^{2}$ ) of 7,068,583m² or $\approx$ 700 squares of 100m x 100m. In Switzerland the width of the lake shore is fairly small; a survey of 100m x 50m would be considered fairly large. For the data in this report the median survey length was 52 meters.

However, there are over three hundred samples in the data set. Each sample has between zero and 100 different objects, and each object has its own count. In total there were over 50,000 items collected, identified and counted in the 12 month period. Each sample is recorded as pieces of trash per meter and quantity. __The cumulative sum of the different land-use categories is evaluated with the survey results of the most abundant codes.__

*__Below:__ The land use categories and the corresponding groups. Notice that `ind` and `recreation` are the only groups that have a single land use component.*
```python
    group_parts = {'buildings':[2,3,4,5,9],
                   'ind':[1],              
                   'trans':[6,7,8],
                   'recreation':[10],
                   'agg':[11, 12, 13, 14, 15, 16, 18],
                   'woods':[17,19,20,21,22],
                   'water':[23,24],
                   'unproductive':[25,26,27]
                  }
```

The land use values were calculated at the different radii. The total number of associations and positive associations was compared per buffer zone to identify the buffer zone most appropriate for surveys in Switzerland.

In [6]:
# Categorical variables, aggregate groups.
# The land-use descriptors are integers (1 - 27): key them to a readable descriptor.
# Source: https://www.bfs.admin.ch/bfs/fr/home/statistiques/espace-environnement/enquetes/area.html
# using column AS1827
# NOTE: "aplinepasteurs" is spelled exactly like the column of the beach data
# (see the output of dfBeaches.columns), do not "correct" it here alone.
label_keys = dict(enumerate(
    (
        "industrial",
        "residential",
        "government",
        "agg_buildings",
        "unk_building",
        "roads",
        "railways",
        "airports",
        "special",
        "recreational",
        "orchards",
        "vineyards",
        "horticulture",
        "arable",
        "meadows",
        "farmpastures",
        "alpinemeadows",
        "aplinepasteurs",
        "closed_forest",
        "open_forest",
        "brush_forest",
        "woods",
        "lakes",
        "rivers",
        "unproductive",
        "bareland",
        "glaciers",
    ),
    start=1,
))

# Maps every land-use code (1 - 27) to the label of the aggregated group it is
# counted under. This is the default mapping of combineLandUseFeatures.
# NOTE(review): codes 6, 7 and 8 keep their own labels ("roads", "railways",
# "airports") while group_parts collects them under 'trans' - confirm which
# grouping is intended.
# NOTE(review): 17 (alpinemeadows) is labeled "woods" and 18 (aplinepasteurs)
# is labeled "agg"; this agrees with group_parts - confirm it is intended.
new_label_keys = {
    1:"ind",
    2:"buildings",
    3:"buildings",
    4:"buildings",
    5:"buildings",
    6:"roads",
    7:"railways",
    8:"airports",
    9:"buildings",
    10:"recreation",
    11:"agg",
    12:"agg",
    13:"agg",
    14:"agg",
    15:"agg",
    16:"agg",
    17:"woods",
    18:"agg",
    19:"woods",
    20:"woods",
    21:"woods",
    22:"woods",
    23:"water",
    24:"water",
    25:"unproductive",
    26:"unproductive",
    27:"unproductive"
}

# reverse lookup: readable descriptor -> integer land-use code
key_labels = dict(zip(label_keys.values(), label_keys.keys()))

# Group the land use into functional groups.
# A land use category that is to be tested alone
# goes in its own group (a one element list).
group_parts = dict(
    buildings=[2, 3, 4, 5, 9],
    ind=[1],
    trans=[6, 7, 8],
    recreation=[10],
    agg=[11, 12, 13, 14, 15, 16, 18],
    woods=[17, 19, 20, 21, 22],
    water=[23, 24],
    unproductive=[25, 26, 27],
)

# column names based on the descriptors: code -> "part_<descriptor>"
as_1827_part = {code: f"part_{descriptor}" for code, descriptor in label_keys.items()}

# These are the final values that are tested against
# the survey results. Map survey results to these
# aggregated land use features.
som_cols = [
    '% to buildings', '% to ind', '% to trans', '% to recreation',
    '% to agg', '% to woods', '% to water', '% to unproductive',
]

# the three summary columns are added to the reverse lookup as well
key_labels.update(
    luse_total='total',
    water_value='water_value',
    adjusted_land_use='adjusted_land_use',
)

# The 27 individual land-use descriptors (the same names as the values of
# label_keys, in a different order) followed by the three summary columns.
cols_lu = [
    'recreational', 'farmpastures', 'closed_forest', 'rivers', 'residential',
    'roads', 'unproductive', 'industrial', 'railways', 'woods',
    'arable', 'special', 'government', 'unk_building', 'lakes',
    'meadows', 'horticulture', 'orchards', 'agg_buildings', 'bareland',
    'open_forest', 'vineyards', 'airports', 'brush_forest', 'aplinepasteurs',
    'alpinemeadows', 'glaciers',
] + ['luse_total', 'water_value', 'adjusted_land_use']

# Shorter, display friendly descriptions keyed by the original description.
# The key 'Food wrappers; candy, snacks' was listed twice with the same value;
# the duplicate entry is removed (a dict keeps only one of them anyway).
new_descriptions = { 'Industrial pellets (nurdles)': 'Pellets',
                    'Glass drink bottles, pieces': 'Glass bottles and pieces', 
                    'Food wrappers; candy, snacks': 'Food wrappers', 
                    'Foam packaging/insulation/polyurethane': 'Foam packaging for insulation', 
                    'Packaging films nonfood or unknown': 'Packaging films (not food-related)', 
                    'Lids unidentified': 'Other lids', 
                    'Tobacco; plastic packaging, containers': 'Plastic packaging for tobacco', 
                    'Fireworks paper tubes and fragments': 'Fireworks (paper parts)', 
                    'Diapers - wipes': 'Diapers and wipes', 
                    'Foamed items & pieces (non packaging/insulation) foamed sponge material': 'Foam items and pieces (not insulation)', 
                    'Labels, bar codes': 'Labels and bar codes', 
                    'Sanitary pads /panty liners/tampons and applicators': 'Sanitary pads, tampons and applicators', 
                    'Pellet mass from injection molding': 'Pellet mass', 
                    'Bags; plastic shopping/carrier/grocery and pieces': 'Plastic bags and pieces', 
                    'Lids for chemicals, detergents (non-food)': 'Lids for chemicals and detergents', 
                    'Coverings; plastic packaging, sheeting for protecting large cargo items': 'Plastic sheeting for large cargo items', 
                    'Construction material; bricks, pipes, cement': 'Construction materials (bricks, pipes, cement)', 
                    'Fireworks; rocket caps, exploded parts & packaging': 'Fireworks (rocket caps, exploded parts and packaging)', 
                    'Foamed EVA (flexible plastic) for crafts & watersports': 'Foamed EVA', 
                    'Metal bottle caps, lids & pull tabs from cans': 'Metal bottle caps, lids and pull tabs from cans', 
                    'Lid/cap rings from plastic bottles/containers': 'Lids and cap rings from plastic bottles', 
                    'Tissue, toilet paper, napkins, paper towels': 'Tissues, toilet paper and paper towels', 
                    'Cotton bud/swab sticks': 'Cotton buds and swab sticks', 
                    'Foil wrappers, aluminum foil': 'Aluminum foil', 
                    'Clothing, towels & rags': 'Clothing and rags', 
                    'Food containers single use foamed or plastic': 'Plastic food containers', 
                    'Plastic fragments angular <5mm': 'Angular plastic fragments (< 0.5 cm)', 
                    'Plastic fragments rounded <5mm': 'Round plastic fragments (< 0.5 cm)', 
                    'Foamed  plastic <5mm': 'Foam plastic (< 0.5 cm)', 
                    'Styrofoam < 5mm': 'Styrofoam fragments (< 0.5 cm)', 
                    'Glass or ceramic fragments > 2.5 cm': 'Glass and ceramic fragments (> 2.5 cm)',
                   }

# the original descriptions that have a replacement (a live view of the keys)
to_replace = new_descriptions.keys()

### Definition of abundant_codes

Currently the `< abundant_codes >` are defined as any object that had a value greater than `< a_qty >` at any survey. That is one possible way to identify objects that accumulate, the fail rate may be another.

*__Below:__ Using the fail rate or quantity to define the `< abundant codes >`.*

```python
# decide which method to use
agg_this = {"quantity":"sum", "p/100m":"sum", "fail":"sum"}
c_totals = fd.groupby("code", as_index=False).agg(agg_this)
c_totals["fr"] = c_totals["fail"]/fd.loc_date.nunique()

if use_fail:
    abundant_codes = c_totals[c_totals.fr > a_fail_rate]
else:
    abundant_codes = fd[fd.quantity > a_qty].code.unique()
```

## 1500 meters: codes with an association to a land use feature

Where p <= 0.05 for Spearman's test for association.

In [7]:

# short aliases used in the type hints and isinstance checks below
# (the public names refer to the same classes as pd.core.frame / pd.core.series)
pdtype = pd.DataFrame
pstype = pd.Series

def combineLandUseFeatures(buffer_data: pdtype = None, a_col: str = "AS18_27", new_col: str = "label",
                           new_label_keys: dict = new_label_keys) -> pdtype:
    """Replaces the land-use codes of a buffer with the labels of their groups.

    The labels, when aggregated, create groups of land-use values that are
    similar. For example, all the different types of buildings are labeled
    "buildings".

    The data frame is changed in place: the column "slug" is renamed to
    "location", <a_col> is renamed to <new_col> and its values are replaced.
    Calling the function a second time on the same data frame therefore fails.

    Args:
        buffer_data: The land use values at a given radius
        a_col: The original column name that holds the labels for the land use values
        new_col: The new name of the column with the new labels
        new_label_keys: The mapping from a land-use code to its group label

    Returns:
        The same data frame, with the renamed columns and the new labels

    Raises:
        KeyError: If a value of <a_col> is not a key of <new_label_keys>
    """

    def to_group_label(code):
        # plain lookup, an unknown code is an error rather than a missing value
        return new_label_keys[code]

    renamed_columns = {"slug": "location", a_col: new_col}
    buffer_data.rename(columns=renamed_columns, inplace=True)
    buffer_data[new_col] = buffer_data[new_col].apply(to_group_label)

    return buffer_data

def adjustLandUse(buffer_data: pdtype = None, exclude_these: list = ["water", "roads"]) -> pdtype:
    """Removes the rows of the excluded labels from the land-use data.

    The surface area of the water feature is removed from the land-use
    calculation. This can be bypassed. However, the study considers the surface
    area of the water as a fixed feature that exchanges with the different
    land-use features (which are not fixed).

    Args:
        buffer_data: The land use values at a given radius, with a "label" column
        exclude_these: The labels to be excluded from the land use total.
            An empty list (or None) excludes nothing.

    Returns:
        A new data frame without the rows of the excluded labels
    """
    # the previous version tested against an undefined name (`roads`) and
    # ignored <exclude_these>. The default list is only read, never mutated.
    keep = ~buffer_data.label.isin(exclude_these or [])

    return buffer_data[keep]

def addRoadLengthToBuffer(buffer_data: pdtype = None, location: str = None, 
                          road_lengths: pstype = None, scale: float = 1000.0):
    """Adds the length of the road network to the % land use values.

    Args:
        buffer_data: The land use values of one location, changed in place
        location: The survey location, a label of the index of <road_lengths>
        road_lengths: The length of the road network, indexed by location
        scale: The road length is divided by this value and rounded to one
            decimal place. A scale of 1 leaves the value untouched.

    Returns:
        <buffer_data> with the (scaled) road length stored under "roads"
    """
    length = road_lengths.loc[location]

    buffer_data["roads"] = length if scale == 1 else round(length/scale, 1)

    return buffer_data

def addIntersectsToBuffer(buffer_data: pdtype = None, location: str = None, 
                          intersects: pstype = None, scale: float = 100.0):
    """Adds the number of intersects to the buffer.

    Args:
        buffer_data: The land use values of one location, changed in place
        location: The survey location, a label of the index of <intersects>
        intersects: The number of intersects, indexed by location
        scale: Not used, the number of intersects is stored as is

    Returns:
        <buffer_data> with the number of intersects stored under "intersects"
    """
    buffer_data["intersects"] = intersects.loc[location]

    return buffer_data

def calculatePercentLandUse(buffer_data: pdtype = None, location: str = None, label: str = "label",
                           add_intersects_roads: bool = True,  road_lengths: pstype = None, intersects: pstype = None) -> pd.Series:
    """Figures the % of total of each land-use feature for one location.

    Args:
        buffer_data: The land use values at a given radius, one row per
            land-use cell with a "location" column and a <label> column
        location: The survey location of interest
        label: The name of the column that holds the land-use labels
        add_intersects_roads: Add the number of intersects and the length of
            the road network to the result
        road_lengths: The length of the road network, indexed by location.
            Skipped when None.
        intersects: The number of intersects, indexed by location.
            Skipped when None.

    Returns:
        A pandas series, named after the location, of the % of total for each
        land-use feature in the index. When requested and available, the
        entries "intersects" and "roads" are appended; they are not fractions.
    """

    try:
        location_data = buffer_data[buffer_data.location == location][label].value_counts()
    except ValueError:
        print("The location data could not retrieved")
        raise

    # the share of each label in the cells of this location
    # (an unfinished `no_water =` statement made this function a SyntaxError)
    results = location_data/location_data.sum()
    results.name = location

    if add_intersects_roads:
        # a missing series is skipped instead of failing on None.loc
        if intersects is not None:
            results = addIntersectsToBuffer(buffer_data=results, location=location, intersects=intersects)
        if road_lengths is not None:
            results = addRoadLengthToBuffer(buffer_data=results, location=location, road_lengths=road_lengths)

    return results

In [8]:
# assign_luse_stat_to_survey_results
# code = "G67"
# land_use = "roads"
# exlcude_these = ["water"]

def assignLanduseValue(sample, land_use) -> float:
    """Returns the value of <land_use> in <sample>, or 0 when it is absent.

    Args:
        sample: The land-use values of one location, indexed by land-use label
        land_use: The land-use label to look up
    """
    try:
        return sample.loc[land_use]
    except KeyError:
        # the feature does not occur in this sample
        return 0

def calculateAndAssignLanduseValues(code_data, land_use, locations, buffer_data=None,
                                    road_lengths=None, intersects=None):
    """Assigns the value of one land-use feature to the survey results of a code.

    The value of <land_use> is computed for every location in <locations> and
    written to the column <land_use> of <code_data> (changed in place).

    Args:
        code_data: The survey results of one code, with the columns "location"
            and "pcs_m"
        land_use: The land-use label of interest
        locations: The locations to assign a value to
        buffer_data: The land use values at a given radius. When None the
            notebook-level variable ``dxt`` is used, as in earlier versions.
        road_lengths: The length of the road network, indexed by location
        intersects: The number of intersects, indexed by location

    Returns:
        Two arrays: the survey results for the code and the corresponding
        land use values
    """
    if buffer_data is None:
        # backward compatible: the buffer used to be read from a global that
        # is not defined in this notebook, so it raises NameError if missing
        buffer_data = dxt

    # roads and intersects are only added when both series are supplied, the
    # previous call always requested them without providing the data
    add_network = road_lengths is not None and intersects is not None

    for location in locations:
        sample = calculatePercentLandUse(
            buffer_data=buffer_data,
            location=location,
            add_intersects_roads=add_network,
            road_lengths=road_lengths,
            intersects=intersects,
        )
        code_data.loc[code_data.location == location, land_use] = assignLanduseValue(sample, land_use)

    return code_data.pcs_m.values, code_data[land_use].values

def getRhoForALAndUseCategory(data: pdtype, land_use: str=None, values: str = "pcs_m", method: callable = stats.spearmanr) -> ():
    """Tests the association between the survey values and one land-use column.

    Args:
        data: The survey results, with the columns <values> and <land_use>
        land_use: The name of the land-use column
        values: The name of the column with the survey values
        method: The test to apply, it must return a (coefficient, p-value) pair

    Returns:
        The coefficient and the p-value (asymptotic for the default method)
    """
    survey_values = data[values].values
    land_use_values = data[land_use].values

    coefficient, p_value = method(survey_values, land_use_values)

    return coefficient, p_value

def permutateX(x, y=None):
    """Spearman's rho between <x> and <y>.

    Args:
        x: The (permuted) sample
        y: The sample that <x> is paired with. When None the notebook-level
            variable ``y`` is used, as in earlier versions of this function.

    Returns:
        The result of scipy.stats.spearmanr(x, y)

    Raises:
        NameError: If <y> is not given and no notebook-level ``y`` exists
    """
    if y is None:
        try:
            y = globals()["y"]
        except KeyError:
            raise NameError("permutateX: pass y or define a notebook-level variable named y") from None

    # the statements that followed the return could never run and were removed
    return stats.spearmanr(x, y)
    
def cleanSurveyResults(data):
    """Prepares the raw survey data for the analysis.

    The input is changed in place (new column "loc_date", "date" converted to
    datetime, code G207 replaced by G208) before the rows are filtered.

    Args:
        data: The survey data, with the columns "location", "date",
            "groupname", "code" and "water_name_slug"

    Returns:
        The survey data without the micro plastics codes and without the
        records from walensee
    """
    # the identifier of a sample is built from the date as it was read,
    # the column is converted to datetime afterwards
    data['loc_date'] = [*zip(data.location, data["date"])]
    data['date'] = pd.to_datetime(data["date"])

    # the codes of the micro plastics group, collected before any replacement
    is_micro_group = data.groupname == "micro plastics (< 5mm)"
    micro_codes = data.loc[is_micro_group, "code"].unique()

    # replace the bad code
    data.code = data.code.replace('G207', 'G208')

    # get rid of microplastics
    data = data.loc[~data.code.isin(micro_codes)]

    # walensee has no landuse values
    print("removing walensee")
    data = data.loc[data.water_name_slug.ne('walensee')]
    remaining_water_names = data.water_name_slug.unique()
    print('walensee' in remaining_water_names)

    return data

class SurveyResults:
    """Loads the survey data and gives access to the results used in the tests.

    The results are computed once, on the first call to surveyResults, and
    cached on the instance together with the codes they contain.

    Attributes:
        dfx: The survey data as read from the file
        df_results: The cleaned / filtered results, None until surveyResults is called
        locations: The survey locations, None until surveyLocations is called
        valid_codes: The codes in df_results, None until surveyResults is called
    """

    # the default source of the survey data
    file_name = 'resources/checked_sdata_eos_2020_21.csv'
    # the columns of the results that are needed by the analysis
    columns_to_keep=[
        'loc_date',
        'location', 
        'river_bassin',
        'water_name_slug', 
        'w_t', 
        'intersects', 
        'code', 
        'pcs_m'
    ]

    def __init__(self, data: str = 'resources/checked_sdata_eos_2020_21.csv', clean_data: bool = True, columns: list = columns_to_keep, w_t: str = None):
        """Reads the survey data.

        Args:
            data: The path of the csv file with the survey data
            clean_data: Apply cleanSurveyResults before filtering
            columns: The columns to keep, a falsy value keeps them all
            w_t: Keep only the records with this value in the column "w_t",
                None keeps them all
        """
        self.dfx = pd.read_csv(data)
        self.df_results = None
        self.locations = None
        self.valid_codes = None
        self.clean_data = clean_data
        self.columns = columns
        self.w_t = w_t

    def validCodes(self):
        """Stores the unique codes of the results in valid_codes.

        Nothing happens when the results are not available yet or have no
        "code" column.
        """
        # the type is checked first: building a list of both conditions read
        # .columns from None when the results were not loaded
        if isinstance(self.df_results, pdtype) and "code" in self.df_results.columns:

            try:
                valid_codes = self.df_results.code.unique()
            except ValueError:
                print("There was an error retrieving the unique code names, self.df.code.unique() failed.")
                raise
            else:
                self.valid_codes = valid_codes

    def surveyResults(self):
        """Returns the survey results, they are computed on the first call only."""

        # if this method has been called already
        # return the result
        if self.df_results is not None:
            return self.df_results

        # for the default data cleanSurveyResults must be called,
        # if the data is clean then it can be used directly
        if self.clean_data is True:
            fd = cleanSurveyResults(self.dfx)
        else:
            fd = self.dfx

        # filter the data by the variable w_t
        if self.w_t is not None:
            fd = fd[fd.w_t == self.w_t]

        # keep only the required columns
        if self.columns:
            fd = fd[self.columns]

        # assign the survey results to the instance attribute
        self.df_results = fd

        # define the list of codes in this df
        self.validCodes()

        return self.df_results

    def surveyLocations(self):
        """Returns the unique survey locations, or None if no results are loaded."""
        if self.locations is not None:
            return self.locations
        if self.df_results is not None:
            # this used to read self.dfResults, an attribute that does not exist
            self.locations = self.df_results.location.unique()
            return self.locations
        else:
            print("There is no survey data loaded")
            return None

class BufferData:
    """The land use around the survey locations at one buffer radius.

    The file is read, the land-use codes are replaced by the labels of their
    groups and the excluded labels are removed when the object is created.
    The % of land use per location is computed on request and cached.

    Attributes:
        buffer: The data as read from the file
        buffer_data: The data with the group labels (the same object as
            buffer, combineLandUseFeatures works in place)
        adjusted_buffer: The labeled data without the excluded labels
        pctLandUse: The % land use per location, None until percentLandUse is called
    """
    # the column of the file with the land-use codes and its new name
    a_col="AS18_27"
    new_col = "label"
    # labels that are left out of the land use total
    exclude_these = ["water", "roads"]
    # land-use code -> group label
    label_keys = new_label_keys
    # source of the road lengths ("streets") and the number of intersects
    beach_data = dfBeaches

    def __init__(self, file_name: str = None, location: str = None):
        """Reads and prepares the buffer data.

        Args:
            file_name: The path of the csv file with the land use values
            location: Not used, kept for backward compatibility
        """
        self.buffer = pd.read_csv(file_name)
        # label_keys is passed explicitly so that overriding the class
        # attribute has an effect (it was silently ignored before)
        self.buffer_data = combineLandUseFeatures(
            buffer_data=self.buffer,
            a_col=self.a_col,
            new_col=self.new_col,
            new_label_keys=self.label_keys,
        )
        self.adjusted_buffer = adjustLandUse(buffer_data=self.buffer_data, exclude_these=self.exclude_these)
        self.pctLandUse = None

    def percentLandUse(self):
        """Returns the % land use of every location, one column per location.

        The result is computed on the first call and cached.

        Raises:
            TypeError: If the adjusted buffer is not a data frame
        """

        if isinstance(self.pctLandUse, pdtype):
            return self.pctLandUse

        if not isinstance(self.adjusted_buffer, pdtype):
            raise TypeError(
                "adjusted_buffer must be a pandas DataFrame, got {}".format(type(self.adjusted_buffer).__name__)
            )

        road_lengths = self.beach_data.streets
        intersects = self.beach_data.intersects

        results = [
            calculatePercentLandUse(
                buffer_data=self.adjusted_buffer,
                location=location,
                road_lengths=road_lengths,
                intersects=intersects,
            )
            for location in self.adjusted_buffer.location.unique()
        ]

        # each result is a series named after its location
        self.pctLandUse = pd.concat(results, axis=1)

        return self.pctLandUse
    
class CodeData:
    """The survey results of one code."""

    def __init__(self, data: pdtype = None, code: str = None, **kwargs):
        """Stores the survey results and the code of interest.

        Args:
            data: The survey results, with a "code" column
            code: The code of interest
            **kwargs: Accepted and ignored
        """
        self.data, self.code = data, code
        self.code_data = None

    def makeCodeData(self):
        """Returns the records of the code, computed on the first call only.

        Returns:
            The records of <data> where the column "code" equals <code>, or
            None when <data> is not a data frame
        """
        # already computed
        if isinstance(self.code_data, pdtype):
            return self.code_data

        # nothing to select from
        if not isinstance(self.data, pdtype):
            return None

        is_this_code = self.data.code == self.code
        self.code_data = self.data[is_this_code]
        return self.code_data
    
    
class CodeResults:
    """Tests the association between the survey results of one code and a land-use feature.

    Attributes:
        code_data: The survey results of the code, with the columns
            "location" and "pcs_m"
        buffer: The land-use values, one column per location and one row per
            land-use feature
        method: The test of association, it returns a (coefficient, p-value) pair
        x: The survey results, set by landuseValueForOneCondition
        y: The land-use value of each survey, set by landuseValueForOneCondition
    """

    def __init__(self, code_data: pdtype = None, buffer: pdtype = None, code: str = None, 
                 method: callable = stats.spearmanr, **kwargs):
        """Stores the data and the test method; **kwargs are accepted and ignored."""

        self.code_data = code_data
        self.buffer = buffer
        self.code = code
        self.method = method
        self.y = None
        self.x = None
        super().__init__()

    def landuseValueForOneCondition(self, land_use: str = None, locations: list = None):
        """Pairs each survey result with the land-use value of its location.

        Args:
            land_use: The land-use feature, a label of the index of <buffer>
            locations: Not used, the locations are taken from <code_data>

        Returns:
            Two arrays of the same length: the survey results (x) and the
            land-use values (y). Both are stored on the instance.
        """

        x = self.code_data.pcs_m.values
        y = [self.buffer[location].loc[land_use] for location in self.code_data.location.values]
        self.x, self.y = x, np.array(y)

        return self.x, self.y

    def rhoForALAndUseCategory(self, x: np.ndarray = None, y: np.ndarray = None) -> ():
        """Returns the coefficient and the p-value of <method> for x and y.

        The p-value is the asymptotic result if a ranking based method is used.
        """
        c, p = self.method(x, y)
        return c, p

    def getRho(self, x: np.array = None):
        """Returns the coefficient of <method> between x and the stored y.

        This is the statistic handed to the permutation test, only x is
        permuted.
        """
        # the coefficient is the first element of the result, this also works
        # for a method that returns a plain tuple
        return self.method(x, self.y)[0]

    def exactPValueForRho(self, n_resamples: int = 1000, random_state=None):
        """Performs a permutation test instead of relying on the asymptotic p-value.

        Only one of the two inputs needs to be shuffled.

        Args:
            n_resamples: The number of permutations of x
            random_state: Seed or generator handed to scipy for reproducible
                results, None leaves the choice to scipy

        Returns:
            The result of scipy.stats.permutation_test (statistic, pvalue,
            null_distribution)

        Raises:
            ValueError: If landuseValueForOneCondition has not been called yet
        """
        if self.x is None or self.y is None:
            raise ValueError("x and y are not set, call landuseValueForOneCondition() first")

        options = {"permutation_type": "pairings", "n_resamples": n_resamples}
        if random_state is not None:
            options["random_state"] = random_state

        return stats.permutation_test((self.x,), self.getRho, **options)

In [9]:
# load the survey data and keep only the records where the column w_t is "l"
# NOTE(review): "l" presumably stands for lakes - confirm
fdx = SurveyResults(w_t="l")
# the cleaned results, reduced to the columns needed below (prints the walensee check)
df = fdx.surveyResults()
import functools
import time

def timer(func):
    """Decorator that prints the run time of each call to <func>.

    The return value of <func> is passed through unchanged. Nothing is printed
    when <func> raises.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        began = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - began
        print("Finished {} in {} secs".format(repr(func.__name__), round(elapsed, 3)))
        return outcome

    return wrapper

# one code and one land-use feature, handy for trying a single case by hand
# NOTE: the functions defined below receive the code and the land use as
# arguments, they do not read these two names; `land_use` is rebound to a
# list in the last cell.
code = "G27"
land_use = "buildings"

def makeBufferObject(file_name: str = "resources/buffer_output/luse_1500.csv"):
    """Builds the buffer object of one file and its % land use values.

    Args:
        file_name: The path of the csv file with the land use values

    Returns:
        The BufferData object and the % land use per location, missing values
        replaced by zero
    """
    land_use_buffer = BufferData(file_name=file_name)
    percentages = land_use_buffer.percentLandUse()

    # in place: percentLandUse returns the frame cached on the object,
    # the cached frame is filled as well
    percentages.fillna(0, inplace=True)

    return land_use_buffer, percentages

# the buffer object and the % land use per location for the file luse_1500.csv
abuf, bufvals = makeBufferObject(file_name="resources/buffer_output/luse_1500.csv")
    

def asymptoticAndExactPvalues(data: pdtype = None, buffer: pdtype = None, code: 'str'=None, land_use: 'str'=None):
    """Tests one code against one land-use feature, asymptotic and by permutation.

    Args:
        data: The survey results
        buffer: The % land use values, one column per location
        code: The code of interest
        land_use: The land-use feature of interest

    Returns:
        A dict with the keys "code", "landuse", "a_symp" (asymptotic p-value
        rounded to 3 places, coefficient) and "exact" (permutation p-value
        rounded to 3 places, statistic)
    """
    survey_rows = CodeData(data=data, code=code).makeCodeData()
    tester = CodeResults(code_data=survey_rows, buffer=buffer)

    x, y = tester.landuseValueForOneCondition(land_use=land_use)
    rho_asymptotic, p_asymptotic = tester.rhoForALAndUseCategory(x, y)
    permutation = tester.exactPValueForRho()

    summary = {"code": code, "landuse": land_use}
    summary["a_symp"] = (round(p_asymptotic, 3), rho_asymptotic)
    summary["exact"] = (round(permutation.pvalue, 3), permutation.statistic,)

    return summary

@timer
def rhoForOneBuffer(data: pdtype = None, buffer_file: str = "resources/buffer_output/luse_1500.csv", codes: list=None, land_use: list=None):
    """Tests every code against every land-use feature of one buffer file.

    Args:
        data: The survey results
        buffer_file: The path of the csv file with the land use values
        codes: The codes to test
        land_use: The land-use features to test

    Returns:
        A list with one result of asymptoticAndExactPvalues per (code, land
        use) pair, the land-use features of a code are consecutive
    """
    # only the % land use values are needed, not the buffer object
    _, buffer_vals = makeBufferObject(file_name=buffer_file)

    return [
        asymptoticAndExactPvalues(data=data, buffer=buffer_vals, code=code, land_use=use)
        for code in codes
        for use in land_use
    ]

def resultsDf(rhovals, pvals, alpha: float = 0.05):
    """Set rho to 0 wherever its p-value is above ``alpha``.

    Rows of ``rhovals`` and ``pvals`` are paired by position; inside a row the
    values are paired by column label.

    :param rhovals: DataFrame of rho values. It is NOT modified.
    :param pvals: DataFrame of p-values with the same layout as ``rhovals``.
    :param alpha: significance threshold; a rho is kept when ``p <= alpha``.
    :return: list with one Series per row of ``pvals``; callers wrap it in
        ``pd.DataFrame(...)`` to get the table back.
    """
    results_df = []
    for position in range(len(pvals.index)):
        arow_of_ps = pvals.iloc[position]
        p_fail = arow_of_ps[arow_of_ps > alpha]

        # Work on a copy: iloc may hand back a view of the caller's frame, and
        # writing zeros into it would silently change rhovals (or trigger a
        # SettingWithCopyWarning).
        arow_of_rhos = rhovals.iloc[position].copy()

        for label in p_fail.index:
            arow_of_rhos[label] = 0
        results_df.append(arow_of_rhos)

    return results_df
          

removing walensee
False


In [10]:
# CSS rules for the notebook table displays; the list is handed to
# Styler.set_table_styles in the per-buffer cells.
header_row = dict(selector='th:nth-child(1)', props='background-color: #FFF;')
even_rows = dict(selector='tr:nth-child(even)', props='background-color: rgba(139, 69, 19, 0.08);')
odd_rows = dict(selector='tr:nth-child(odd)', props='background: #FFF;')
table_font = dict(selector='tr', props='font-size: 12px;')
table_css_styles = [even_rows, odd_rows, table_font, header_row]

# mcd = mc_comp.style.format(aformatter).set_table_styles(table_css_styles)
# mcd = mcd.background_gradient(axis=None, vmin=mc_comp.min().min(), vmax=mc_comp.max().max(), cmap="YlOrBr")

In [11]:
# Spearman's rho for every (code, land use) pair, buffer file luse_1500.csv.
# "Gfoam" used to be listed twice: the pivots below keep only the first value per
# (code, use), so the duplicate changed nothing in the table and only added a
# redundant round of tests to the run time.
codes = ["G27", "Gfrags", "G95", "Gfoam", "G30", "G67", "G89", "G21", "G23"]
land_use = ["woods", "buildings", "agg", "recreation", "unproductive", "roads", "intersects"]
buffer_file = "resources/buffer_output/luse_1500.csv"

f = rhoForOneBuffer(data=df, buffer_file=buffer_file , codes=codes, land_use=land_use)
buffer_results = [{"code":x["code"], "use": x["landuse"], "exact_p": x["exact"][0], "p": x["a_symp"][0], "rho": x["exact"][1]} for x in f]
rho_at_buffer = pd.DataFrame(buffer_results)

# one row per code, one column per land use
# NOTE(review): the mask is built from "p" (the a_symp value); "exact_p" is collected but not read here — confirm which one is intended.
pvals = rho_at_buffer.pivot_table(index="code", columns="use", values="p", aggfunc='first')
rhovals = rho_at_buffer.pivot_table(index="code", columns="use", values="rho", aggfunc='first').round(3)

# rho is set to 0 where p > 0.05 (resultsDf), then shown as a colour-graded table
buffer_results = pd.DataFrame(resultsDf(rhovals, pvals)).round(2)
bfr = buffer_results.style.format(precision=3).set_table_styles(table_css_styles)
bfr.background_gradient(axis=None, vmin=buffer_results.min().min(), vmax=buffer_results.max().max(), cmap="coolwarm")

Finished 'rhoForOneBuffer' in 26.0 secs


use,agg,buildings,intersects,recreation,roads,unproductive,woods
G21,-0.12,0.0,0.0,0.0,0.0,0.0,0.0
G23,0.0,0.0,0.0,0.11,0.0,0.0,0.0
G27,-0.3,0.37,-0.28,0.3,0.35,-0.34,-0.18
G30,-0.21,0.21,0.0,0.25,0.27,-0.16,-0.19
G67,0.0,0.0,0.17,0.0,0.0,0.0,0.0
G89,-0.13,0.0,0.0,0.0,0.0,-0.12,0.0
G95,0.0,0.0,-0.13,0.16,0.17,-0.24,0.0
Gfoam,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gfrags,0.0,0.0,0.0,0.14,0.17,0.0,0.0


In [12]:
# Spearman's rho for every (code, land use) pair, buffer file luse_2000.csv.
# "Gfoam" used to be listed twice: the pivots below keep only the first value per
# (code, use), so the duplicate changed nothing in the table and only added a
# redundant round of tests to the run time.
codes = ["G27", "Gfrags", "G95", "Gfoam", "G30", "G67", "G89", "G21", "G23"]
land_use = ["woods", "buildings", "agg", "recreation", "unproductive", "roads", "intersects"]
buffer_file = "resources/buffer_output/luse_2000.csv"

f = rhoForOneBuffer(data=df, buffer_file=buffer_file , codes=codes, land_use=land_use)
buffer_results = [{"code":x["code"], "use": x["landuse"], "exact_p": x["exact"][0], "p": x["a_symp"][0], "rho": x["exact"][1]} for x in f]
rho_at_buffer = pd.DataFrame(buffer_results)

# one row per code, one column per land use
# NOTE(review): the mask is built from "p" (the a_symp value); "exact_p" is collected but not read here — confirm which one is intended.
pvals = rho_at_buffer.pivot_table(index="code", columns="use", values="p", aggfunc='first')
rhovals = rho_at_buffer.pivot_table(index="code", columns="use", values="rho", aggfunc='first').round(3)

# rho is set to 0 where p > 0.05 (resultsDf), then shown as a colour-graded table
buffer_results = pd.DataFrame(resultsDf(rhovals, pvals)).round(2)
bfr = buffer_results.style.format(precision=3).set_table_styles(table_css_styles)
bfr.background_gradient(axis=None, vmin=buffer_results.min().min(), vmax=buffer_results.max().max(), cmap="coolwarm")

Finished 'rhoForOneBuffer' in 26.209 secs


use,agg,buildings,intersects,recreation,roads,unproductive,woods
G21,-0.16,0.0,0.0,0.0,0.0,0.0,0.0
G23,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G27,-0.31,0.34,-0.28,0.33,0.35,-0.35,-0.14
G30,-0.24,0.19,0.0,0.26,0.27,-0.15,-0.14
G67,0.0,-0.17,0.17,0.0,0.0,0.0,0.0
G89,-0.13,0.0,0.0,0.0,0.0,-0.13,0.0
G95,-0.16,0.0,-0.13,0.17,0.17,-0.24,0.0
Gfoam,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gfrags,-0.12,0.0,0.0,0.13,0.17,0.0,0.0


In [13]:
# Spearman's rho for every (code, land use) pair, buffer file luse_2500.csv.
# "Gfoam" used to be listed twice: the pivots below keep only the first value per
# (code, use), so the duplicate changed nothing in the table and only added a
# redundant round of tests to the run time.
codes = ["G27", "Gfrags", "G95", "Gfoam", "G30", "G67", "G89", "G21", "G23"]
land_use = ["woods", "buildings", "agg", "recreation", "unproductive", "roads", "intersects"]
buffer_file = "resources/buffer_output/luse_2500.csv"

f = rhoForOneBuffer(data=df, buffer_file=buffer_file , codes=codes, land_use=land_use)
buffer_results = [{"code":x["code"], "use": x["landuse"], "exact_p": x["exact"][0], "p": x["a_symp"][0], "rho": x["exact"][1]} for x in f]
rho_at_buffer = pd.DataFrame(buffer_results)

# one row per code, one column per land use
# NOTE(review): the mask is built from "p" (the a_symp value); "exact_p" is collected but not read here — confirm which one is intended.
pvals = rho_at_buffer.pivot_table(index="code", columns="use", values="p", aggfunc='first')
rhovals = rho_at_buffer.pivot_table(index="code", columns="use", values="rho", aggfunc='first').round(3)

# rho is set to 0 where p > 0.05 (resultsDf), then shown as a colour-graded table
buffer_results = pd.DataFrame(resultsDf(rhovals, pvals)).round(2)
bfr = buffer_results.style.format(precision=3).set_table_styles(table_css_styles)
bfr.background_gradient(axis=None, vmin=buffer_results.min().min(), vmax=buffer_results.max().max(), cmap="coolwarm")

Finished 'rhoForOneBuffer' in 26.402 secs


use,agg,buildings,intersects,recreation,roads,unproductive,woods
G21,-0.13,0.0,0.0,0.0,0.0,0.0,0.0
G23,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G27,-0.32,0.36,-0.28,0.32,0.35,-0.36,-0.14
G30,-0.23,0.21,0.0,0.25,0.27,-0.14,-0.13
G67,0.0,-0.14,0.17,0.0,0.0,0.0,0.14
G89,-0.13,0.0,0.0,0.0,0.0,0.0,0.0
G95,-0.14,0.12,-0.13,0.14,0.17,-0.26,0.0
Gfoam,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gfrags,0.0,0.0,0.0,0.0,0.17,0.0,0.0


In [14]:
# Spearman's rho for every (code, land use) pair, buffer file luse_3000.csv.
# "Gfoam" used to be listed twice: the pivots below keep only the first value per
# (code, use), so the duplicate changed nothing in the table and only added a
# redundant round of tests to the run time.
codes = ["G27", "Gfrags", "G95", "Gfoam", "G30", "G67", "G89", "G21", "G23"]
land_use = ["woods", "buildings", "agg", "recreation", "unproductive", "roads", "intersects"]
buffer_file = "resources/buffer_output/luse_3000.csv"

f = rhoForOneBuffer(data=df, buffer_file=buffer_file , codes=codes, land_use=land_use)
buffer_results = [{"code":x["code"], "use": x["landuse"], "exact_p": x["exact"][0], "p": x["a_symp"][0], "rho": x["exact"][1]} for x in f]
rho_at_buffer = pd.DataFrame(buffer_results)

# one row per code, one column per land use
# NOTE(review): the mask is built from "p" (the a_symp value); "exact_p" is collected but not read here — confirm which one is intended.
pvals = rho_at_buffer.pivot_table(index="code", columns="use", values="p", aggfunc='first')
rhovals = rho_at_buffer.pivot_table(index="code", columns="use", values="rho", aggfunc='first').round(3)

# rho is set to 0 where p > 0.05 (resultsDf), then shown as a colour-graded table
buffer_results = pd.DataFrame(resultsDf(rhovals, pvals)).round(2)
bfr = buffer_results.style.format(precision=3).set_table_styles(table_css_styles)
bfr.background_gradient(axis=None, vmin=buffer_results.min().min(), vmax=buffer_results.max().max(), cmap="coolwarm")

Finished 'rhoForOneBuffer' in 26.861 secs


use,agg,buildings,intersects,recreation,roads,unproductive,woods
G21,-0.12,0.0,0.0,0.0,0.0,0.0,0.0
G23,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G27,-0.32,0.37,-0.28,0.32,0.35,-0.4,0.0
G30,-0.22,0.2,0.0,0.18,0.27,-0.16,0.0
G67,0.0,-0.13,0.17,0.0,0.0,0.0,0.16
G89,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G95,-0.14,0.12,-0.13,0.12,0.17,-0.26,0.0
Gfoam,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gfrags,0.0,0.0,0.0,0.0,0.17,0.0,0.0


In [15]:
# Spearman's rho for every (code, land use) pair, buffer file luse_3500.csv.
# "Gfoam" used to be listed twice: the pivots below keep only the first value per
# (code, use), so the duplicate changed nothing in the table and only added a
# redundant round of tests to the run time.
codes = ["G27", "Gfrags", "G95", "Gfoam", "G30", "G67", "G89", "G21", "G23"]
land_use = ["woods", "buildings", "agg", "recreation", "unproductive", "roads", "intersects"]
buffer_file = "resources/buffer_output/luse_3500.csv"

f = rhoForOneBuffer(data=df, buffer_file=buffer_file , codes=codes, land_use=land_use)
buffer_results = [{"code":x["code"], "use": x["landuse"], "exact_p": x["exact"][0], "p": x["a_symp"][0], "rho": x["exact"][1]} for x in f]
rho_at_buffer = pd.DataFrame(buffer_results)

# one row per code, one column per land use
# NOTE(review): the mask is built from "p" (the a_symp value); "exact_p" is collected but not read here — confirm which one is intended.
pvals = rho_at_buffer.pivot_table(index="code", columns="use", values="p", aggfunc='first')
rhovals = rho_at_buffer.pivot_table(index="code", columns="use", values="rho", aggfunc='first').round(3)

# rho is set to 0 where p > 0.05 (resultsDf), then shown as a colour-graded table
buffer_results = pd.DataFrame(resultsDf(rhovals, pvals)).round(2)
bfr = buffer_results.style.format(precision=3).set_table_styles(table_css_styles)
bfr.background_gradient(axis=None, vmin=buffer_results.min().min(), vmax=buffer_results.max().max(), cmap="coolwarm")

Finished 'rhoForOneBuffer' in 27.286 secs


use,agg,buildings,intersects,recreation,roads,unproductive,woods
G21,-0.12,0.0,0.0,0.0,0.0,0.0,0.0
G23,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G27,-0.29,0.35,-0.28,0.35,0.35,-0.39,0.0
G30,-0.22,0.18,0.0,0.17,0.27,-0.16,0.0
G67,0.0,-0.14,0.17,-0.12,0.0,0.0,0.17
G89,-0.12,0.0,0.0,0.0,0.0,0.0,0.12
G95,-0.12,0.0,-0.13,0.14,0.17,-0.25,0.0
Gfoam,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gfrags,-0.12,0.0,0.0,0.0,0.17,0.0,0.0


In [16]:
# Spearman's rho for every (code, land use) pair, buffer file luse_4000.csv.
# "Gfoam" used to be listed twice: the pivots below keep only the first value per
# (code, use), so the duplicate changed nothing in the table and only added a
# redundant round of tests to the run time.
codes = ["G27", "Gfrags", "G95", "Gfoam", "G30", "G67", "G89", "G21", "G23"]
land_use = ["woods", "buildings", "agg", "recreation", "unproductive", "roads", "intersects"]
buffer_file = "resources/buffer_output/luse_4000.csv"

f = rhoForOneBuffer(data=df, buffer_file=buffer_file , codes=codes, land_use=land_use)
buffer_results = [{"code":x["code"], "use": x["landuse"], "exact_p": x["exact"][0], "p": x["a_symp"][0], "rho": x["exact"][1]} for x in f]
rho_at_buffer = pd.DataFrame(buffer_results)

# one row per code, one column per land use
# NOTE(review): the mask is built from "p" (the a_symp value); "exact_p" is collected but not read here — confirm which one is intended.
pvals = rho_at_buffer.pivot_table(index="code", columns="use", values="p", aggfunc='first')
rhovals = rho_at_buffer.pivot_table(index="code", columns="use", values="rho", aggfunc='first').round(3)

# rho is set to 0 where p > 0.05 (resultsDf), then shown as a colour-graded table
buffer_results = pd.DataFrame(resultsDf(rhovals, pvals)).round(2)
bfr = buffer_results.style.format(precision=3).set_table_styles(table_css_styles)
bfr.background_gradient(axis=None, vmin=buffer_results.min().min(), vmax=buffer_results.max().max(), cmap="coolwarm")

Finished 'rhoForOneBuffer' in 32.134 secs


use,agg,buildings,intersects,recreation,roads,unproductive,woods
G21,-0.13,0.0,0.0,0.0,0.0,0.0,0.0
G23,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G27,-0.29,0.34,-0.28,0.37,0.35,-0.39,0.0
G30,-0.21,0.17,0.0,0.15,0.27,-0.15,0.0
G67,0.0,-0.16,0.17,-0.14,0.0,0.0,0.19
G89,0.0,0.0,0.0,0.0,0.0,0.0,0.12
G95,0.0,0.0,-0.13,0.13,0.17,-0.24,0.0
Gfoam,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gfrags,0.0,0.0,0.0,0.0,0.17,0.0,0.0


In [17]:
# Spearman's rho for every (code, land use) pair, buffer file luse_4500.csv.
# "Gfoam" used to be listed twice: the pivots below keep only the first value per
# (code, use), so the duplicate changed nothing in the table and only added a
# redundant round of tests to the run time.
codes = ["G27", "Gfrags", "G95", "Gfoam", "G30", "G67", "G89", "G21", "G23"]
land_use = ["woods", "buildings", "agg", "recreation", "unproductive", "roads", "intersects"]
buffer_file = "resources/buffer_output/luse_4500.csv"

f = rhoForOneBuffer(data=df, buffer_file=buffer_file , codes=codes, land_use=land_use)
buffer_results = [{"code":x["code"], "use": x["landuse"], "exact_p": x["exact"][0], "p": x["a_symp"][0], "rho": x["exact"][1]} for x in f]
rho_at_buffer = pd.DataFrame(buffer_results)

# one row per code, one column per land use
# NOTE(review): the mask is built from "p" (the a_symp value); "exact_p" is collected but not read here — confirm which one is intended.
pvals = rho_at_buffer.pivot_table(index="code", columns="use", values="p", aggfunc='first')
rhovals = rho_at_buffer.pivot_table(index="code", columns="use", values="rho", aggfunc='first').round(3)

# rho is set to 0 where p > 0.05 (resultsDf), then shown as a colour-graded table
buffer_results = pd.DataFrame(resultsDf(rhovals, pvals)).round(2)
bfr = buffer_results.style.format(precision=3).set_table_styles(table_css_styles)
bfr.background_gradient(axis=None, vmin=buffer_results.min().min(), vmax=buffer_results.max().max(), cmap="coolwarm")

Finished 'rhoForOneBuffer' in 31.594 secs


use,agg,buildings,intersects,recreation,roads,unproductive,woods
G21,-0.13,0.0,0.0,0.0,0.0,0.0,0.0
G23,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G27,-0.28,0.35,-0.28,0.36,0.35,-0.38,0.0
G30,-0.2,0.16,0.0,0.16,0.27,-0.17,0.0
G67,0.0,-0.17,0.17,-0.13,0.0,0.0,0.19
G89,0.0,0.0,0.0,0.0,0.0,0.0,0.12
G95,0.0,0.0,-0.13,0.11,0.17,-0.26,0.0
Gfoam,0.0,0.0,0.0,0.0,0.0,0.0,0.11
Gfrags,0.0,0.0,0.0,0.0,0.17,-0.12,0.0


In [18]:
# Spearman's rho for every (code, land use) pair, buffer file luse_5k.csv.
# "Gfoam" used to be listed twice: the pivots below keep only the first value per
# (code, use), so the duplicate changed nothing in the table and only added a
# redundant round of tests to the run time.
codes = ["G27", "Gfrags", "G95", "Gfoam", "G30", "G67", "G89", "G21", "G23"]
land_use = ["woods", "buildings", "agg", "recreation", "unproductive", "roads", "intersects"]
buffer_file = "resources/buffer_output/luse_5k.csv"

f = rhoForOneBuffer(data=df, buffer_file=buffer_file , codes=codes, land_use=land_use)
buffer_results = [{"code":x["code"], "use": x["landuse"], "exact_p": x["exact"][0], "p": x["a_symp"][0], "rho": x["exact"][1]} for x in f]
rho_at_buffer = pd.DataFrame(buffer_results)

# one row per code, one column per land use
# NOTE(review): the mask is built from "p" (the a_symp value); "exact_p" is collected but not read here — confirm which one is intended.
pvals = rho_at_buffer.pivot_table(index="code", columns="use", values="p", aggfunc='first')
rhovals = rho_at_buffer.pivot_table(index="code", columns="use", values="rho", aggfunc='first').round(3)

# rho is set to 0 where p > 0.05 (resultsDf), then shown as a colour-graded table
buffer_results = pd.DataFrame(resultsDf(rhovals, pvals)).round(2)
bfr = buffer_results.style.format(precision=3).set_table_styles(table_css_styles)
bfr.background_gradient(axis=None, vmin=buffer_results.min().min(), vmax=buffer_results.max().max(), cmap="coolwarm")

Finished 'rhoForOneBuffer' in 30.905 secs


use,agg,buildings,intersects,recreation,roads,unproductive,woods
G21,-0.13,0.0,0.0,0.0,0.0,0.0,0.0
G23,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G27,-0.27,0.35,-0.28,0.35,0.35,-0.35,0.0
G30,-0.19,0.15,0.0,0.16,0.27,-0.16,0.0
G67,0.0,-0.18,0.17,-0.13,0.0,0.0,0.18
G89,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G95,0.0,0.0,-0.13,0.0,0.17,-0.26,0.0
Gfoam,-0.12,0.0,0.0,0.0,0.0,0.0,0.12
Gfrags,0.0,0.0,0.0,0.0,0.17,-0.12,0.0


In [19]:
# Rho per code and land-use group for the 1500 radius, using the check_rho_for_this_radius workflow.
# NOTE(review): the recorded run of this cell stopped with
# "NameError: name 'make_bufferdata' is not defined". make_bufferdata, check_rho_for_this_radius,
# landuse_per_radius, label_keys, group_parts, som_cols, as_1827_part, code_totals and code_order
# are not defined in the part of the notebook reviewed here — TODO confirm where they come from.

# the locations that need land use data
fd = df.copy()
# `codes` is whatever the last per-buffer cell left behind
abundant_codes = codes
data = pd.DataFrame(index = fd.location.unique())

# the bufferdata for this radius
bufferdata = make_bufferdata("resources/buffer_output/luse_1500.csv", a_col="AS18_27", label_keys=label_keys)

radius = 1500

# the functional land use groups:
these_groups = list(group_parts.keys())

add_these_cols = bufferdata.label.unique()

a1500, s, this_data_1500 = check_rho_for_this_radius(data, fd, add_these_cols, bufferdata, radius, label_keys=label_keys, group_parts=group_parts,
                              som_cols=som_cols, as_1827_part=as_1827_part, abundant_codes=abundant_codes)

# look up quantity and material for every code in the index, then order by quantity (ascending)
a1500["quantity"] = a1500.index.map(lambda x: code_totals.loc[x])
a1500["material"] = a1500.index.map(lambda x: code_m_map.loc[x])
a1500.sort_values(by="quantity", inplace=True)

# use intersects
# Spearman's rho between pcs_m and intersects for each code; rho is stored only when p <= 0.05, otherwise 0
water_int = fd[fd.code.isin(abundant_codes)][["loc_date", "code", "pcs_m", "quantity", "intersects"]]
for each_code in abundant_codes:
    new_data = water_int[water_int.code == each_code][["intersects", "pcs_m"]]
    c, p = stats.spearmanr(new_data.pcs_m.values, new_data.intersects.values)
    if p <= 0.05:
        a1500.loc[each_code, "intersects"] = c
    else:
        a1500.loc[each_code, "intersects"] = 0

# accumulators that the cells for the larger radii append to
totals=[]
lnd_u = []
totals.append(landuse_per_radius(s))
lnd_u.append(s)

a1500["description"] = a1500.index.map(lambda x: code_d_map.loc[x])

# move the last column (description, added just above) to the front for display
col_order = [a1500.columns[-1], *a1500.columns[:-1]]
a1500 = a1500.reindex(code_order)
a1500[col_order].head()

NameError: name 'make_bufferdata' is not defined

### Results: Spearman's rho

In [None]:
# [Correlation matrix plot]
# Heatmap of the rho values in a1500: one row per object description, one column per land-use category.
new_column_names = {
    "% to buildings":"Buildings",
    "% to ind":"Industries",
    "% to trans":"Roads",
    "% to recreation":"Recreational",
    "% to agg":"Agriculture",
    "% to woods":"Forests",
    "% to water":"Rivers/canals",
    "% to unproductive":"Unproductive"
}
# a1500.set_index('description', inplace=True)
# the rename is done in place: later cells address a1500 by the new column names
a1500.rename(columns=new_column_names, inplace=True)
use_these = list(new_column_names.values())
aplot = a1500[[*use_these, "description"]].round(2)
aplot.set_index("description", inplace=True, drop=True)
fig, ax= plt.subplots(figsize=(17,17))
sns.set(font_scale=1.2)

# mask=aplot==0 leaves blank every cell whose value is exactly 0;
# cbar=False because the colorbar is drawn by hand in the inset axes below
kwargs = dict(annot=True, fmt="g", cmap='coolwarm',
              annot_kws={"size":9.5},center =0, mask=aplot==0,
              square = True, linewidths=0.1,linecolor="lightgrey",
              cbar = False)

ax = sns.heatmap(aplot, ax=ax, **kwargs)

ax.set_ylabel('')
ax.yaxis.tick_left()

# column labels go above the matrix
ax.xaxis.tick_top() 
ax.xaxis.set_label_position('top')
ax.set_xticklabels(use_these,rotation=90)
       
# NOTE(review): ax_divider is not read again in this cell — confirm the call is still needed
ax_divider = make_axes_locatable(ax)
axins = inset_axes(ax,
                   width="60%",  # 60% of the anchor box width (anchor box is in axes coordinates)
                   height="1.5%",  # 1.5% of the anchor box height
                   loc='lower center',
                   bbox_to_anchor=(0.03, -0.03, 1, 1),
                   bbox_transform=ax.transAxes, 
                   borderpad=0
                   )

# NOTE(review): ax.get_children()[0] is presumably the heatmap's mesh — verify
fig.colorbar(ax.get_children()[0], cax = axins, orientation = 'horizontal', extendfrac='auto')
# plt.savefig('output/corr_matrix_1500.png', bbox_inches='tight')

plt.show()

### Covariance of explanatory variables

In [None]:
# Spearman correlation between the columns of aplot, with significance stars appended.
rho = aplot.corr(method='spearman')

# p-value for every pair of columns; a callable method leaves 1 on the diagonal,
# so the identity matrix is subtracted to bring the diagonal back to 0
p_of_rho = aplot.corr(method=lambda a, b: stats.spearmanr(a, b)[1])
pval = p_of_rho - np.eye(*rho.shape)

# one star for every threshold the p-value stays under
thresholds = [0.01,0.05,0.1]
p = pval.applymap(lambda v: ''.join('*' for t in thresholds if v<=t))
rho.round(2).astype(str) + p

#### Sum of the total number of objects with a correlation (positive or negative) collected under the different land use categories

In [None]:
# For each land use category: the summed quantity of the objects whose
# rho for that category is not 0.
cum_sums_objs = []
for a_use in use_these:
    has_corr = a1500[a_use] != 0
    cum_sums_objs.append(a1500.loc[has_corr, "quantity"].sum())

a_df = pd.DataFrame({"total": cum_sums_objs}, index=use_these)
a_df

#### The % total of materials of the objects of interest with respect to the total number of objects collected.

In [None]:
# Share of each material in the summed quantity of the objects of interest
# (a fraction rounded to three decimals).
a_total = a1500.quantity.sum()
material_totals = a1500.groupby("material").quantity.sum()
material_df = (material_totals / a_total).to_frame().round(3)
material_df

#### The % total of the top 36 objects with respect to all objects collected.

In [None]:
# Ratio of t['qMostCommon'] to t['quantity'] as a whole percentage, next to the summed
# quantity of the objects in the analysis.
# NOTE(review): `t` is defined outside the cells reviewed here — confirm its keys.
pct_most_common = round((t['qMostCommon'] / t['quantity']) * 100)

index = ['ratio of top 36 items over all items:','number of top 36 items:']
data = [f"{pct_most_common}%", f"{round(a_total):,}"]
q_and_p = pd.DataFrame({"value": data}, index=index)
q_and_p

#### The % total of the top 20 objects with respect to all objects collected

In [None]:
# The first 20 rows of a1500 (description and quantity) and their share of
# all objects collected (t['quantity']).
# .copy() makes dT20 an independent frame: the next cell assigns a "Type" column to it,
# which on a chained slice of a1500 raises SettingWithCopyWarning and may not stick.
dT20 = a1500.iloc[:20][['description', 'quantity']].copy()
top20_quantity = dT20.quantity.sum()

index = ['ratio of top 20 items over all items:','number of top 20 items:']
data = [f"{round((top20_quantity / t['quantity']) * 100)}%", f"{round(top20_quantity):,}"]

q_and_p20 = pd.DataFrame(data=data, index=index, columns=["value"])
q_and_p20

### The cumulative totals of the top 20 objects, grouped by economic source

In [None]:
# define the broad categories:
# each list holds the source / description labels that map an object to one category
tobaco = ["Tobacco", "Smoking related"]
industry = ['Industry','Construction', 'Industrial', 'Manufacturing']
sanitary = ["Sanitary", "Personal hygiene", "Water treatment"]
packaging = ['Packaging (non-food)','Packaging films nonfood or unknown', 'Paper packaging']
food = ['Food and drinks','Foil wrappers, aluminum foil', 'Food and drinks', 'Food and drink']
fragments = ['Plastic fragments and pieces',
             'Plastic fragments angular <5mm',
             'Styrofoam < 5mm',
             'Plastic fragments rounded <5mm',
             'Foamed  plastic <5mm',
             'Fragmented plastics']

conditions = [tobaco, industry, sanitary, packaging, food, fragments]

# NOTE(review): this rebinds `codes` (a list in the per-buffer cells) to the index of dT20
codes = dT20.index

# assign a category ("Type") to every code from its source columns and description
# NOTE(review): check_condition is not defined in the cells reviewed here — confirm its contract
for each_code in codes:
    srcs = dfCodes.loc[each_code][["source", "source_two", "source_three", "description"]]

    a = check_condition(srcs.values, conditions, len(conditions)-1)
    dT20.loc[each_code, "Type"] = a

fig, ax = plt.subplots(figsize=(7,7))
# NOTE(review): this name shadows `from matplotlib import colors` from the import cell — confirm nothing below needs the module
colors = {'Industry': 'firebrick', 'Tobacco': 'darkslategrey', 'Food and drinks': 'navy', 'Plastic fragments and pieces':'lightgrey',
         'Others':'linen','Sanitary':'plum','Packaging (non-food)':'saddlebrown'}
# N = 3
# ind = np.arange(N)
width = 0.6

# one legend patch per category, in the order of the colour map
labels = list(colors.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels]

ax.barh(dT20.description, dT20.quantity, color=[colors[i] for i in dT20.Type], edgecolor='darkgrey')
ax.invert_yaxis()
ax.set_ylabel('')
ax.set_xticks([0,1000,2000,3000,4000,5000,6000,7000,8000])
ax.set_xticklabels([0,'1,000','2,000','3,000','4,000','5,000','6,000','7,000','8,000'])

ax.set_xlabel('Total item count', fontsize=16, labelpad =15)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.tick_params(labelcolor='k', labelsize=14, width=1)
ax.yaxis.grid(color='lightgray')
ax.xaxis.grid(color='lightgray')
ax.set_facecolor('white')

legend = plt.legend(handles, labels, fontsize=13,facecolor='white')
# Legend.legendHandles was renamed legend_handles in Matplotlib 3.7 and the old
# name was later removed; use the new one when it exists, the old one otherwise.
legend_patches = getattr(legend, "legend_handles", None)
if legend_patches is None:
    legend_patches = legend.legendHandles
for ha in legend_patches:
    ha.set_edgecolor("darkgrey")

plt.grid(True)
ax.spines['top'].set_color('0.5')
ax.spines['right'].set_color('0.5')
ax.spines['bottom'].set_color('0.5')
ax.spines['left'].set_color('0.5')

# plt.savefig('C:/Users/schre086/figures/land_use_ch/top_20items.png', bbox_inches='tight')

plt.show()

#### Litter items of local origin

Items that have four or more positive associations with a land use category.

In [None]:
# Objects with four or more non-zero correlations at the 1500 radius.
cols_corr = ['Buildings', 'Industries', 'Roads', 'Recreational', 'Agriculture', 'Forests', 'Unproductive']

c_count = a1500.copy()

# count the total number of correlations rho < 0 or rho > 0 and p < .05
c_count['corr_count'] = c_count[cols_corr].apply(lambda x: (x!=0).sum(), axis=1)

# count the total number of positive correlations
c_count['pos_corr_count'] = c_count[cols_corr].apply(lambda x: (x>0).sum(), axis=1)

# limit only to items that have four or more correlations
DfLocalItems = c_count[c_count.corr_count >= 4]

# get the local total
local_total = DfLocalItems.quantity.sum()

# pretty print this to jupyter
bullets = [f"* {x}" for x in DfLocalItems.description.unique()]

b = "\n".join(bullets)

# The wording now matches the filter above (>= 4 means "four or more"), and the stray
# apostrophe that used to be printed after the bullet list is gone.
a_string = f"""
* The total quantity of items that have four or more correlations, likely to be locally littered: {local_total:,}
* Proportion of items likely to be of local origin among top items: {int((local_total/t["qMostCommon"])*100)}%

There are {len(bullets)} categories in that group:\n\n {b}
"""
md(a_string)

#### Ubiquitous items

Items that have three or fewer positive associations with a land use category and are positively associated with river or stream intersections.

In [None]:
# Objects with three or fewer correlations and a positive correlation to intersects.
DfUbiItems = c_count[(c_count.corr_count <= 3)&(c_count.intersects > 0)]
ub_total = DfUbiItems.quantity.sum()

bullets = [f"* {x}" for x in DfUbiItems.description.unique()]
b = "\n".join(bullets)

# The wording now matches the filter above (<= 3, intersects > 0). The proportion line no
# longer carries the label copied from the local-items cell, and the stray apostrophe
# after the bullet list is gone.
a_string = f"""
* The total quantity of items with three or fewer correlations and a positive correlation to intersects: {ub_total:,}
* Proportion of ubiquitous items among top items: {int((ub_total/t["qMostCommon"])*100)}%

There are {len(bullets)} categories in that group:\n\n {b}
"""
md(a_string)

#### Other items

Items that have three or fewer positive associations with a land use category and are NOT positively associated with river or stream intersections.

In [None]:
# Objects with three or fewer correlations and no positive correlation to intersects.
DfOthers = c_count[(c_count.corr_count <= 3)&(c_count.intersects <= 0)]
other_total = DfOthers.quantity.sum()
bullets = [f"* {x}" for x in DfOthers.description.unique()]

b = "\n".join(bullets)

# The wording now matches the filter above (<= 3, intersects <= 0). The proportion line no
# longer carries the label copied from the local-items cell, and the stray apostrophe
# after the bullet list is gone.
a_string = f"""
* The total quantity of items with three or fewer correlations and no positive correlation to intersects: {other_total:,}
* Proportion of other items among top items: {int((other_total/t["qMostCommon"])*100)}%

There are {len(bullets)} categories in that group:\n\n {b}
"""
md(a_string)

In [None]:
# Sanity check: the three groups (others, ubiquitous, local) should add up to 100%.
pct_others = (DfOthers.quantity.sum()/t["qMostCommon"])*100
pct_ubiquitous = (DfUbiItems.quantity.sum()/t["qMostCommon"])*100
pct_local = (DfLocalItems.quantity.sum()/t["qMostCommon"])*100
print('total proportion of groups:', pct_others + pct_ubiquitous + pct_local)

## 2000 meters codes with an association to a landuse feature

Where p <= 0.05 for Spearman's test for association.

In [None]:
# Rho per code and land-use group for the 2000 radius (same workflow as the 1500 cell).
# NOTE(review): depends on make_bufferdata, check_rho_for_this_radius, landuse_per_radius and several
# lookups that are not defined in the part of the notebook reviewed here — TODO confirm.

# the locations that need land use data
data = pd.DataFrame(index = fd.location.unique())
radius = 2000

# the functional land use groups:
these_groups = list(group_parts.keys())

# the bufferdata for this radius
bufferdata = make_bufferdata("resources/buffer_output/luse_2000.csv", a_col="AS18_27", label_keys=label_keys)

add_these_cols = bufferdata.label.unique()

a2000, s, this_data = check_rho_for_this_radius(data, fd, add_these_cols, bufferdata, radius, label_keys=label_keys, group_parts=group_parts,
                              som_cols=som_cols, as_1827_part=as_1827_part, abundant_codes=abundant_codes)

# appends to the lists created in the 1500 cell: running this cell twice adds the entry twice
totals.append(landuse_per_radius(s))
lnd_u.append(s)

# look up quantity and material for every code in the index, then order by quantity (ascending)
a2000["quantity"] = a2000.index.map(lambda x: code_totals.loc[x])
a2000["material"] = a2000.index.map(lambda x: code_m_map.loc[x])
a2000.sort_values(by="quantity", inplace=True)

a2000["description"] = a2000.index.map(lambda x: code_d_map.loc[x])

# move the last column (description, added just above) to the front for display
col_order = [a2000.columns[-1], *a2000.columns[:-1]]
a2000 = a2000.reindex(code_order)
a2000[col_order].sort_values(by=['% to buildings', '% to woods', '% to ind', '% to recreation', '% to agg'], ascending=True).head()

## 2500 meters codes with an association to a landuse feature

Where p <= 0.05 for Spearman's test for association.

In [None]:
# Rho per code and land-use group for the 2500 radius (same workflow as the 1500 cell).
# NOTE(review): depends on make_bufferdata, check_rho_for_this_radius, landuse_per_radius and several
# lookups that are not defined in the part of the notebook reviewed here — TODO confirm.

# the locations that need land use data
data = pd.DataFrame(index = fd.location.unique())
radius = 2500

# the functional land use groups:
these_groups = list(group_parts.keys())

# the bufferdata for this radius
bufferdata = make_bufferdata("resources/buffer_output/luse_2500.csv", a_col="AS18_27", label_keys=label_keys)

add_these_cols = bufferdata.label.unique()

a2500, s, this_data = check_rho_for_this_radius(data, fd, add_these_cols, bufferdata, radius, label_keys=label_keys, group_parts=group_parts,
                              som_cols=som_cols, as_1827_part=as_1827_part, abundant_codes=abundant_codes)

# appends to the lists created in the 1500 cell: running this cell twice adds the entry twice
totals.append(landuse_per_radius(s))
lnd_u.append(s)

# look up quantity and material for every code in the index, then order by quantity (ascending)
a2500["quantity"] = a2500.index.map(lambda x: code_totals.loc[x])
a2500["material"] = a2500.index.map(lambda x: code_m_map.loc[x])
a2500.sort_values(by="quantity", inplace=True)

a2500["description"] = a2500.index.map(lambda x: code_d_map.loc[x])

# move the last column (description, added just above) to the front for display
col_order = [a2500.columns[-1], *a2500.columns[:-1]]
a2500 = a2500.reindex(code_order)
a2500[col_order].sort_values(by=['% to buildings', '% to woods', '% to ind', '% to recreation', '% to agg'], ascending=True).head()

## 3000 meters codes with an association to a landuse feature

Where p <= 0.05 for Spearman's test for association.

In [None]:
# Rho per code and land-use group for the 3000 radius (same workflow as the 1500 cell).
# NOTE(review): depends on make_bufferdata, check_rho_for_this_radius, landuse_per_radius and several
# lookups that are not defined in the part of the notebook reviewed here — TODO confirm.

# the locations that need land use data
data = pd.DataFrame(index = fd.location.unique())
radius = 3000

# the functional land use groups:
these_groups = list(group_parts.keys())

# the bufferdata for this radius
bufferdata = make_bufferdata("resources/buffer_output/luse_3000.csv", a_col="AS18_27", label_keys=label_keys)

add_these_cols = bufferdata.label.unique()

a3000, s, this_data = check_rho_for_this_radius(data, fd, add_these_cols, bufferdata, radius, label_keys=label_keys, group_parts=group_parts,
                              som_cols=som_cols, as_1827_part=as_1827_part, abundant_codes=abundant_codes)
# appends to the lists created in the 1500 cell: running this cell twice adds the entry twice
totals.append(landuse_per_radius(s))
lnd_u.append(s)

# look up quantity and material for every code in the index, then order by quantity (ascending)
a3000["quantity"] = a3000.index.map(lambda x: code_totals.loc[x])
a3000["material"] = a3000.index.map(lambda x: code_m_map.loc[x])
a3000.sort_values(by="quantity", inplace=True)

a3000["description"] = a3000.index.map(lambda x: code_d_map.loc[x])

# move the last column (description, added just above) to the front for display
col_order = [a3000.columns[-1], *a3000.columns[:-1]]
a3000 = a3000.reindex(code_order)
a3000[col_order].sort_values(by=['% to buildings', '% to woods', '% to ind', '% to recreation', '% to agg'], ascending=True).head()

## 3500 meters codes with an association to a landuse feature

Where p <= 0.05 for Spearman's test for association.

In [None]:
# Rho per code and land-use group for the 3500 radius (same workflow as the 1500 cell,
# including the separate intersects test).
# NOTE(review): depends on make_bufferdata, check_rho_for_this_radius, landuse_per_radius and several
# lookups that are not defined in the part of the notebook reviewed here — TODO confirm.

# the locations that need land use data
data = pd.DataFrame(index = fd.location.unique())
radius = 3500

# the functional land use groups:
these_groups = list(group_parts.keys())

# the bufferdata for this radius
bufferdata = make_bufferdata("resources/buffer_output/luse_3500.csv", a_col="AS18_27", label_keys=label_keys)

add_these_cols = bufferdata.label.unique()

a3500, s, this_data = check_rho_for_this_radius(data, fd, add_these_cols, bufferdata, radius, label_keys=label_keys, group_parts=group_parts,
                              som_cols=som_cols, as_1827_part=as_1827_part, abundant_codes=abundant_codes)

# appends to the lists created in the 1500 cell: running this cell twice adds the entry twice
totals.append(landuse_per_radius(s))
lnd_u.append(s)

# look up quantity and material for every code in the index, then order by quantity (ascending)
a3500["quantity"] = a3500.index.map(lambda x: code_totals.loc[x])
a3500["material"] = a3500.index.map(lambda x: code_m_map.loc[x])
a3500.sort_values(by="quantity", inplace=True)

# use intersects
# Spearman's rho between pcs_m and intersects for each code; rho is stored only when p <= 0.05, otherwise 0
water_int = fd[fd.code.isin(abundant_codes)][["loc_date", "code", "pcs_m", "quantity", "intersects"]]
for each_code in abundant_codes:
    new_data = water_int[water_int.code == each_code][["intersects", "pcs_m"]]
    c, p = stats.spearmanr(new_data.pcs_m.values, new_data.intersects.values)
    if p <= 0.05:
        a3500.loc[each_code, "intersects"] = c
    else:
        a3500.loc[each_code, "intersects"] = 0

a3500["description"] = a3500.index.map(lambda x: code_d_map.loc[x])

# move the last column (description, added just above) to the front for display
col_order = [a3500.columns[-1], *a3500.columns[:-1]]
a3500 = a3500.reindex(code_order)
a3500[col_order].sort_values(by=['% to buildings', '% to woods', '% to ind', '% to recreation', '% to agg'], ascending=True).head()

### Results: Spearman's rho

In [None]:
# [Correlation matrix plot]
# Heatmap of the rho values in a3500: one row per object description, one column per land-use category.
new_column_names = {
    "% to buildings":"Buildings",
    "% to ind":"Industries",
    "% to trans":"Roads",
    "% to recreation":"Recreational",
    "% to agg":"Agriculture",
    "% to woods":"Forests",
    "% to water":"Rivers/canals",
    "% to unproductive":"Unproductive"
}
# a1500.set_index('description', inplace=True)
# the rename is done in place: later cells address a3500 by the new column names
a3500.rename(columns=new_column_names, inplace=True)
use_these = list(new_column_names.values())
aplot = a3500[[*use_these, "description"]].round(2)
aplot.set_index("description", inplace=True, drop=True)

fig, ax= plt.subplots(figsize=(17,17))
sns.set(font_scale=1.2)

# mask=aplot==0 leaves blank every cell whose value is exactly 0;
# cbar=False because the colorbar is drawn by hand in the inset axes below
kwargs = dict(annot=True, fmt="g", cmap='coolwarm',
              annot_kws={"size":9.5},center =0, mask=aplot==0,
              square = True, linewidths=0.1,linecolor="lightgrey",
              cbar = False)

ax = sns.heatmap(aplot, ax = ax, **kwargs)

ax.set_ylabel('')
ax.yaxis.tick_left()

# column labels go above the matrix
ax.xaxis.tick_top() 
ax.xaxis.set_label_position('top')
ax.set_xticklabels(use_these,rotation=90)
       
# NOTE(review): ax_divider is not read again in this cell — confirm the call is still needed
ax_divider = make_axes_locatable(ax)
axins = inset_axes(ax,
                   width="60%",  # 60% of the anchor box width (anchor box is in axes coordinates)
                   height="1.5%",  # 1.5% of the anchor box height
                   loc='lower center',
                   bbox_to_anchor=(0.03, -0.03, 1, 1),
                   bbox_transform=ax.transAxes, 
                   borderpad=0
                   )

# NOTE(review): ax.get_children()[0] is presumably the heatmap's mesh — verify
fig.colorbar(ax.get_children()[0], cax = axins, orientation = 'horizontal', extendfrac='auto')
plt.show()

#### Litter items of local origin

Items that have four or more positive associations with a land use category.

In [None]:
# Objects with four or more non-zero correlations at the 3500 radius.
# cols_corr is defined here so the cell does not depend on the 1500 section having been run.
cols_corr = ['Buildings', 'Industries', 'Roads', 'Recreational', 'Agriculture', 'Forests', 'Unproductive']

c_count = a3500.copy()

# count the total number of correlations rho < 0 or rho > 0 and p < .05
c_count['corr_count'] = c_count[cols_corr].apply(lambda x: (x!=0).sum(), axis=1)

# count the total number of positive correlations
c_count['pos_corr_count'] = c_count[cols_corr].apply(lambda x: (x>0).sum(), axis=1)

# limit only to items that have four or more correlations
DfLocalItems = c_count[c_count.corr_count >= 4]

# get the local total
local_total = DfLocalItems.quantity.sum()

# pretty print this to jupyter
bullets = [f"* {x}" for x in DfLocalItems.description.unique()]

b = "\n".join(bullets)

# The wording now matches the filter above (>= 4 means "four or more"), and the stray
# apostrophe that used to be printed after the bullet list is gone.
a_string = f"""
* The total quantity of items that have four or more correlations, likely to be locally littered: {local_total:,}
* Proportion of items likely to be of local origin among top items: {int((local_total/t["qMostCommon"])*100)}%

There are {len(bullets)} categories in that group:\n\n {b}
"""
md(a_string)

#### Ubiquitous items

Items that have three or fewer positive associations with a land use category and are positively associated with river or stream intersections.

In [None]:
# Objects with three or fewer correlations and a positive correlation to intersects.
DfUbiItems = c_count[(c_count.corr_count <= 3)&(c_count.intersects > 0)]
ub_total = DfUbiItems.quantity.sum()

bullets = [f"* {x}" for x in DfUbiItems.description.unique()]
b = "\n".join(bullets)

# The wording now matches the filter above (<= 3, intersects > 0). The proportion line no
# longer carries the label copied from the local-items cell, and the stray apostrophe
# after the bullet list is gone.
a_string = f"""
* The total quantity of items with three or fewer correlations and a positive correlation to intersects: {ub_total:,}
* Proportion of ubiquitous items among top items: {int((ub_total/t["qMostCommon"])*100)}%

There are {len(bullets)} categories in that group:\n\n {b}
"""
md(a_string)

#### Other items

Items that have three or fewer positive associations with a land use category and are NOT positively associated with river or stream intersections.

In [None]:
# Other items: three or fewer land use correlations and no positive coefficient
# for river or stream intersections. The threshold is the same as for the
# ubiquitous group (<= 3) so that local, ubiquitous and other items together
# cover every item; with <= 2 the items that have exactly three correlations
# and no intersects correlation belonged to no group.
DfOthers = c_count[(c_count.corr_count <= 3)&(c_count.intersects <= 0)]

# the group total and the share of the most common items it represents
other_total = DfOthers.quantity.sum()
other_share = int((other_total/t["qMostCommon"])*100)

# one markdown bullet per item description
other_descriptions = DfOthers.description.unique()
bullets = [f"* {x}" for x in other_descriptions]
b = "\n".join(bullets)

# the wording matches the filter above and the bullets are not followed by a
# stray quote that would be rendered in the output
a_string = f"""
* The total quantity of items with three or fewer correlations and no positive correlation to intersects: {other_total:,}
* Proportion of other items among top items: {other_share}%

There are {len(other_descriptions)} categories in that group:

{b}
"""
md(a_string)

In [None]:
# sanity check: the three groups together should account for 100% of the most common items
print(
    'total proportion of groups:',
    sum(
        (a_group.quantity.sum()/t["qMostCommon"])*100
        for a_group in (DfOthers, DfUbiItems, DfLocalItems)
    ),
)

### 4000 meters codes with an association to a landuse feature

Where p <= 0.05 for Spearman's test for association.

In [None]:
# an empty frame with one row per survey location, passed to the correlation routine
data = pd.DataFrame(index=fd.location.unique())

# the buffer radius in meters for this pass
radius = 4000

# the names of the functional land use groups
these_groups = list(group_parts)

# land use measured inside the 4000 m buffer and the labels it contains
bufferdata = make_bufferdata(
    "resources/buffer_output/luse_4000.csv",
    a_col="AS18_27",
    label_keys=label_keys,
)
add_these_cols = bufferdata.label.unique()

# rho per code and land use feature, plus the land use summary for this radius
a4000, s, this_data = check_rho_for_this_radius(
    data,
    fd,
    add_these_cols,
    bufferdata,
    radius,
    label_keys=label_keys,
    group_parts=group_parts,
    som_cols=som_cols,
    as_1827_part=as_1827_part,
    abundant_codes=abundant_codes,
)

# keep the land use totals and the summary for the comparison between radii
totals.append(landuse_per_radius(s))
lnd_u.append(s)

# attach the quantity and the material of each code
a4000["quantity"] = [code_totals.loc[a_code] for a_code in a4000.index]
a4000["material"] = [code_m_map.loc[a_code] for a_code in a4000.index]
a4000.sort_values("quantity", inplace=True)

# the description goes in last so that it can be moved to the front for display
a4000["description"] = [code_d_map.loc[a_code] for a_code in a4000.index]
col_order = list(a4000.columns[-1:]) + list(a4000.columns[:-1])

# put the codes in the shared order and show the lowest values
a4000 = a4000.reindex(code_order)
a4000.loc[:, col_order].sort_values(by=['% to buildings', '% to woods', '% to ind', '% to recreation', '% to agg']).head()

### 4500 meters codes with an association to a landuse feature

Where p <= 0.05 for Spearman's test for association.

In [None]:
# an empty frame with one row per survey location, passed to the correlation routine
data = pd.DataFrame(index=fd.location.unique())

# the buffer radius in meters for this pass
radius = 4500

# the names of the functional land use groups
these_groups = list(group_parts)

# land use measured inside the 4500 m buffer and the labels it contains
bufferdata = make_bufferdata(
    "resources/buffer_output/luse_4500.csv",
    a_col="AS18_27",
    label_keys=label_keys,
)
add_these_cols = bufferdata.label.unique()

# rho per code and land use feature, plus the land use summary for this radius
a4500, s, this_data = check_rho_for_this_radius(
    data,
    fd,
    add_these_cols,
    bufferdata,
    radius,
    label_keys=label_keys,
    group_parts=group_parts,
    som_cols=som_cols,
    as_1827_part=as_1827_part,
    abundant_codes=abundant_codes,
)

# keep the land use totals and the summary for the comparison between radii
totals.append(landuse_per_radius(s))
lnd_u.append(s)

# attach the quantity and the material of each code
a4500["quantity"] = [code_totals.loc[a_code] for a_code in a4500.index]
a4500["material"] = [code_m_map.loc[a_code] for a_code in a4500.index]
a4500.sort_values("quantity", inplace=True)

# the description goes in last so that it can be moved to the front for display
a4500["description"] = [code_d_map.loc[a_code] for a_code in a4500.index]
col_order = list(a4500.columns[-1:]) + list(a4500.columns[:-1])

# put the codes in the shared order and show the lowest values
a4500 = a4500.reindex(code_order)
a4500.loc[:, col_order].sort_values(by=['% to buildings', '% to woods', '% to ind', '% to recreation', '% to agg']).head()

### 5000 meters codes with an association to a landuse feature

Where p <= 0.05 for Spearman's test for association.

In [None]:
# an empty frame with one row per survey location, passed to the correlation routine
data = pd.DataFrame(index=fd.location.unique())

# the buffer radius in meters for this pass
radius = 5000

# the names of the functional land use groups
these_groups = list(group_parts)

# land use measured inside the 5000 m buffer and the labels it contains
bufferdata = make_bufferdata(
    "resources/buffer_output/luse_5k.csv",
    a_col="AS18_27",
    label_keys=label_keys,
)
add_these_cols = bufferdata.label.unique()

# rho per code and land use feature, plus the land use summary for this radius
a5000, s, this_data = check_rho_for_this_radius(
    data,
    fd,
    add_these_cols,
    bufferdata,
    radius,
    label_keys=label_keys,
    group_parts=group_parts,
    som_cols=som_cols,
    as_1827_part=as_1827_part,
    abundant_codes=abundant_codes,
)

# keep the land use totals and the summary for the comparison between radii
totals.append(landuse_per_radius(s))
lnd_u.append(s)

# attach the quantity and the material of each code
a5000["quantity"] = [code_totals.loc[a_code] for a_code in a5000.index]
a5000["material"] = [code_m_map.loc[a_code] for a_code in a5000.index]
a5000.sort_values("quantity", inplace=True)

# the description goes in last so that it can be moved to the front for display
a5000["description"] = [code_d_map.loc[a_code] for a_code in a5000.index]
col_order = list(a5000.columns[-1:]) + list(a5000.columns[:-1])

# put the codes in the shared order and show the lowest values
a5000 = a5000.reindex(code_order)
a5000.loc[:, col_order].sort_values(by=['% to buildings', '% to woods', '% to ind', '% to recreation', '% to agg']).head()

### 10000 meters codes with an association to a landuse feature

Where p <= 0.05 for Spearman's test for association.

In [None]:
# an empty frame with one row per survey location, passed to the correlation routine
data = pd.DataFrame(index=fd.location.unique())

# the buffer radius in meters for this pass
radius = 10000

# the names of the functional land use groups
these_groups = list(group_parts)

# land use measured inside the 10000 m buffer and the labels it contains
bufferdata = make_bufferdata(
    "resources/buffer_output/luse_10k.csv",
    a_col="AS18_27",
    label_keys=label_keys,
)
add_these_cols = bufferdata.label.unique()

# rho per code and land use feature, plus the land use summary for this radius
a10k, s, this_data = check_rho_for_this_radius(
    data,
    fd,
    add_these_cols,
    bufferdata,
    radius,
    label_keys=label_keys,
    group_parts=group_parts,
    som_cols=som_cols,
    as_1827_part=as_1827_part,
    abundant_codes=abundant_codes,
)

# keep the land use totals and the summary for the comparison between radii
totals.append(landuse_per_radius(s))
lnd_u.append(s)

# attach the quantity and the material of each code
a10k["quantity"] = [code_totals.loc[a_code] for a_code in a10k.index]
a10k["material"] = [code_m_map.loc[a_code] for a_code in a10k.index]
a10k.sort_values("quantity", inplace=True)

# the description goes in last so that it can be moved to the front for display
a10k["description"] = [code_d_map.loc[a_code] for a_code in a10k.index]
col_order = list(a10k.columns[-1:]) + list(a10k.columns[:-1])

# put the codes in the shared order and show the lowest values
a10k = a10k.reindex(code_order)
a10k.loc[:, col_order].sort_values(by=['% to buildings', '% to woods', '% to ind', '% to recreation', '% to agg']).head()

### Total correlations, total positive correlations and weight for each buffer radius

In [None]:
# land use totals of every buffer zone side by side, one column per radius
t_lu_by_r = pd.concat(totals, axis=1)

# the renamed land use columns that are counted below
cols = list(new_column_names.values())

# the results for these radii still carry the old column names
dfs_oldnames = [a2000, a2500, a3000, a4000, a4500, a5000, a10k]
dfs_newnames = [a_df.rename(columns=new_column_names) for a_df in dfs_oldnames]

# every radius in ascending order, a1500 and a3500 are used as they are
dfs = [a1500, *dfs_newnames[:3], a3500, *dfs_newnames[3:]]

# per item: the number of non-zero coefficients and the number of positive ones
for data in dfs:
    data["count"] = (data[cols] != 0).sum(axis=1)
    data["positive"] = (data[cols] > 0).sum(axis=1)

# the radius of each column of t_lu_by_r, meters divided by 1000
con_to_km = [a_col/1000 for a_col in t_lu_by_r.columns]

# the results are matched to the radii by position
total_correlation_count = {a_km: dfs[i]["count"].sum() for i, a_km in enumerate(con_to_km)}
pos_correlations = {a_km: dfs[i]["positive"].sum() for i, a_km in enumerate(con_to_km)}

# one row per radius: all correlations first, then the positive ones
t_corrs = pd.DataFrame.from_dict(total_correlation_count, orient="index", columns=["# of correlations"])
t_corrs["positive correlations"] = pd.Series(pos_correlations)

# the weight of a radius: positive correlations multiplied by the item quantity, summed
for i, a_radius in enumerate(con_to_km):
    data = dfs[i]
    data["weight"] = data["positive"] * data["quantity"]
    t_corrs.loc[a_radius, "weight"] = data["weight"].sum()

# whole numbers for display
t_corrs = t_corrs.astype({"positive correlations": "int", "weight": "int"})
t_corrs

## Changes in $\rho$ at different buffer sizes

*__Below:__ Spearman's $\rho$ evaluated for the top 20 items and the land use categories at the different buffer zone radii. Code definitions below chart.*

In [None]:
# stack the results of every radius except the last one in dfs and
# expose the code, which is the index of each frame, as a regular column
comps = (
    pd.concat(dfs[:-1])
    .reset_index()
    .rename(columns={'index':'code'})
)

In [None]:
# the land use category that is charted
aval = "Buildings"
variables = [aval]

# the first 20 abundant codes, rho rounded to two places and ordered by value
data = comps[comps.code.isin(abundant_codes[:20])].copy()
data[variables] = data[variables].astype('float').round(2)
data = data.sort_values(by=aval)

fig, ax = plt.subplots(figsize=(8,6))

# markers and connecting lines for each code, drawn on the same axes
sns.scatterplot(data=data, x='dist', y=aval, hue='code', ax=ax)
sns.lineplot(data=data, x='dist', y=aval, hue='code', ax=ax)

ax.set_ylabel('Rho', fontsize=14)
ax.set_xlabel('buffer radius meters', fontsize=14)

# both plot calls add legend entries, only the first 20 are kept and shown above the chart
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[:20][::-1], labels[:20][::-1], bbox_to_anchor=(.5,1.01), loc='lower center', ncol=4)

plt.show()

In [None]:
# the descriptions of the codes used in the chart
dfCodes.loc[abundant_codes[:20], "description"]

## Changes of land use profile for different buffer zones

As the radius of the buffer zone changes, the land use mix changes. The choice of buffer zone radius is determined by the scale at which the reporting is being done. For the report to Switzerland the target administrative level was the municipality. Therefore a radius of 1500m was appropriate, given the geographic size of a municipality in Switzerland.

*__Below:__ The percent total of land attributed to each category at each buffer radius* 

In [None]:
# the row of t_lu_by_r that holds the combined land use of each radius
total = 'adjusted_land_use'
totaldf = t_lu_by_r.loc[total]

# the individual land use categories
landuse = ['agg', 'buildings', 'ind', 'recreation', 'trans', 'unproductive', 'woods']

# each category as a whole-number percent of the combined land use of its radius,
# the totals are aligned on the column (radius) labels
adf = (t_lu_by_r.loc[landuse].div(totaldf, axis="columns")*100).astype("int")

# pretty print
pprint = adf.astype(str) + "%"
pprint

In [None]:
# Stacked bar chart of the land use profile: one bar per buffer radius,
# one segment per land use category.
data = adf.values
labels = adf.index
xlabels = [str(x) for x in adf.columns]

# one color per row of adf; the list is not called `colors` because that name
# is the matplotlib.colors module imported at the top of the notebook
bar_colors = ['bisque','lightcoral','k','orchid','lightgrey','saddlebrown', 'forestgreen']

# the running top of each bar; this has to be an array because `+=` on a python
# list appends the new values instead of adding them element by element
bottom = np.zeros(len(adf.columns))

width = 0.8      # the width of the bars: can also be len(x) sequence

fig, ax = plt.subplots(figsize=(8,5))

for a_label, a_color, group in zip(labels, bar_colors, data):
    ax.bar(xlabels, group, width, bottom=bottom, label=a_label, color=a_color)
    # the next category starts where this one ends
    bottom = bottom + group

ax.set_ylabel('Land-use profile [%]', fontsize=16)
ax.set_xlabel("Buffer zone radius [m]", labelpad =15, fontsize=16)
ax.set_facecolor('white')

# grey frame on all four sides
for a_side in ('top', 'right', 'bottom', 'left'):
    ax.spines[a_side].set_color('0.5')

ax.set_ylim(0,100)

ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.tick_params(labelcolor='k', labelsize=14, width=1)

ax.legend(bbox_to_anchor=(1,1), facecolor = 'white', fontsize=14)

plt.show()

*__Above:__ The percent total of land use at the different buffer radii. As the radius increases the % total attributed to forest and agriculture gets larger.*

*__Below:__ The land use category as a percent of total adjusted land use*

In [None]:
survey_data = dfx

# display names of the explanatory variables that are being considered
luse_exp = [
    'Buildings [%]',
    'Industrial [%]',
    'Recreational [%]',
    'Agricultural [%]',
    'Forests [%]',
    'Unproductive [%]',
    'Road network \nlength [km]',
    'Rivers/canals [#]',
]

# the columns that are needed; the last two (groupname, code) identify the object,
# the others describe the survey
use_these_cols = [
    'loc_date', 'date',
    '% to buildings', '% to trans', '% to recreation', '% to agg',
    '% to woods', '% to ind', '% to unproductive',
    'population', 'water_name_slug', 'streets km', 'intersects',
    'groupname', 'code',
]

# municipalities for which land use data was not available
no_land_use = ['Walenstadt', 'Weesen', 'Glarus Nord', 'Quarten']

# surveys between start and end date (both included) from locations with land use data
in_date_range = survey_data.date.between(start_date, end_date)
has_land_use = ~survey_data.city.isin(no_land_use)
use_these_args = in_date_range & has_land_use
survey_data = survey_data[use_these_args].copy()

# the identifiers of the surveys that remain
use_these_samples = survey_data.loc_date.unique()

# the 1500 m data limited to those surveys
utd = this_data_1500[this_data_1500.loc_date.isin(use_these_samples)]

In [None]:
# one row per survey: the grouping uses every column except groupname and code,
# so the objects of a survey are added together
survey_keys = use_these_cols[:-2]
dfdt = utd.groupby(survey_keys, as_index=False).agg(dict.fromkeys([unit_label, 'quantity'], 'sum'))

# axis labels for the explanatory variables; the order matters because the
# charting cells select the variables by position
pretty_names = dict([
    ("% to buildings", "Built-up environment [%]"),
    ('% to ind', 'Industries [%]'),
    ('% to recreation', 'Recreational [%]'),
    ('% to agg', 'Agriculture [%]'),
    ('% to woods', 'Forests [%]'),
    ('% to unproductive', 'Unproductive land [%]'),
    ('streets km', 'Road network \nlength [km]'),
    ('intersects', 'Rivers/canals [#]'),
    ('population', 'Population'),
])

def make_plot_with_spearmans(data, ax, n):
    """Scatter the survey results against one variable and test the association.

    Column ``n`` of ``data`` is drawn on the x axis of ``ax`` and the
    ``unit_label`` column on the y axis. Spearman's ranked correlation is
    then computed for the same two columns.

    :param data: a data frame with the columns ``n`` and ``unit_label``
    :param ax: the matplotlib axes that receives the scatter plot
    :param n: the name of the explanatory variable
    :return: the axes, the correlation coefficient and its p-value
    """
    marker_style = dict(color='black', s=30, edgecolor='white', alpha=0.6)
    sns.scatterplot(data=data, x=n, y=unit_label, ax=ax, **marker_style)

    rho, p_value = stats.spearmanr(data[n], data[unit_label])

    return ax, rho, p_value

# ECDF of the first four explanatory variables with the median marked.
sns.set_style("whitegrid")
fig, axs = plt.subplots(1,4, figsize=(10,3.2), sharey=True)

data = dfdt.copy()

# scale every land use share that is labelled [%] in pretty_names. The columns are
# selected by name: a positional slice of the first five keys left out
# '% to unproductive', which is charted from this same frame in the next cell.
# presumably the shares are stored as fractions of one -- TODO confirm
perc_cols = [a_col for a_col in pretty_names if a_col.startswith("% to")]
data[perc_cols] = data[perc_cols] * 100

# the first four variables, one per axes
cols1 = list(pretty_names.keys())[:4]
for i, n in enumerate(cols1):
    ax = axs[i]

    # the ECDF of the land use variable, the cumulative share is expressed in percent
    the_data = ECDF(data[n].values)
    sns.lineplot(x=the_data.x, y=(the_data.y)*100, ax=ax, color='dodgerblue', label="% of surface area")

    # the median of the variable over all the surveys
    the_median = data[n].median()

    # mark the median and drop horizontal and vertical lines to the axes
    ax.scatter([the_median], 50, color='red', s=50, linewidth=2, zorder=100, label="the median")
    ax.vlines(x=the_median, ymin=0, ymax=50, color='red', linewidth=2)
    ax.hlines(xmax=the_median, xmin=0, y=50, color='red', linewidth=2)

    # the legend is not displayed
    ax.get_legend().remove()

    # the y axis is shared, only the first axes gets the label
    if i == 0:
        ax.set_ylabel("Share of \nsurveys [%]", labelpad = 15)

    # the median value goes in the title
    ax.set_title(f"median: {(round(the_median, 2))}", fontsize=12, loc='left')
    ax.set_xlabel(pretty_names[n], fontsize=14, labelpad=15)

plt.tight_layout()

plt.show()

*__Below:__ The land use category as a percent of total adjusted land use*

In [None]:
sns.set_style("whitegrid")
fig, axs = plt.subplots(1, 4, figsize=(10, 3.2), sharey=True)

# keyword arguments shared by the median marker and its guide lines
median_style = dict(color='red', linewidth=2)

# the remaining variables except the last key of pretty_names
cols2 = list(pretty_names)[4:-1]
for i, n in enumerate(cols2):
    ax = axs[i]

    # the ECDF of the variable, the cumulative share is expressed in percent
    the_data = ECDF(data[n].values)
    sns.lineplot(x=the_data.x, y=the_data.y*100, ax=ax, color='dodgerblue', label="% of surface area")

    # the median of the variable over all the surveys
    the_median = data[n].median()

    # mark the median and connect it to both axes
    ax.scatter([the_median], 50, s=50, zorder=100, label="the median", **median_style)
    ax.vlines(x=the_median, ymin=0, ymax=50, **median_style)
    ax.hlines(y=50, xmin=0, xmax=the_median, **median_style)

    # the legend is not displayed
    ax.get_legend().remove()

    # the y axis is shared, only the first axes gets the label
    if i == 0:
        ax.set_ylabel("Share of \nsurveys [%]", labelpad=15)

    # the median value goes in the title
    ax.set_title(f"median: {round(the_median, 2)}", fontsize=12, loc='left')
    ax.set_xlabel(pretty_names[n], fontsize=14, labelpad=15)

plt.tight_layout()

plt.show()

In [None]:
# sign-off: the date of the current run and the place it was made
where = "Biel, CH"
today = dt.date.today().strftime("%d/%m/%Y")

my_block = f"""

This script updated {today} in {where}

> \u2764\ufe0f what you do everyday

_ANALYSTATHAMMERDIRT_
"""

md(my_block)