In [1]:
# sys, file and nav packages:
import datetime as dt
import json

# math packages:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.distributions.empirical_distribution import ECDF

# charting:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import ticker
from matplotlib import colors
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.gridspec import GridSpec
import seaborn as sns

import base64, io, IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display, Math, Latex


# set some parameters:
today = dt.datetime.now().date().strftime("%Y-%m-%d")
start_date = '2020-03-01'
end_date ='2021-05-31'

a_fail_rate = 100

unit_label = 'p/100m'
reporting_unit = 100

# name of the output folder:
name_of_project = 'dist_map'

a_color = 'dodgerblue'

# colors for gradients
colors = ['beige', 'navajowhite', 'sandybrown', 'salmon', 'sienna']
nodes = [0.0, 0.2, 0.6, 0.8, 1.0]
cmap2 = LinearSegmentedColormap.from_list("mycmap", list(zip(nodes, colors)))

bassin_pallette = {'rhone':'dimgray', 'aare':'salmon', 'linth':'tan', 'ticino':'steelblue', 'reuss':'purple'}

# define the feature level and components
this_level = 'river_bassin'
comps = ['linth', 'rhone', 'aare', 'ticino']
comp_labels = {"linth":"Linth/Limmat", "rhone":"Rhône", 'aare':"Aare", "ticino":"Ticino/Cerisio", "reuss":"Reuss"}
comp_palette = {"Linth/Limmat":"dimgray", "Rhône":"tan", "Aare":"salmon", "Ticino/Cerisio":"steelblue", "Reuss":"purple"}

# explanatory variables:
luse_exp = ['% to buildings', '% to recreation', '% to agg', '% to woods', 'streets km', 'intersects']

# columns needed
use_these_cols = ['loc_date' ,
                  '% to buildings',
                  '% to trans', 
                  '% to recreation',
                  '% to agg',
                  '% to woods',
                  'population',
                  'river_bassin',
                  'water_name_slug',
                  'streets km',
                  'intersects',
                  'length',
                  'groupname',
                  'code'
                 ]

# these are default
top_name = ["All survey areas"]

# get your data:
df= pd.read_csv('checked_sdata_eos_2020_21.csv')

with open("river_basins.json", "r") as infile:
    river_bassins = json.load(infile)

dfBeaches = pd.read_csv("beaches_with_land_use_rates.csv")
dfCodes = pd.read_csv("codes_with_group_names_2015.csv")
dfDims = pd.read_csv("corrected_dims.csv")

# set the index of the beach data to location slug
dfBeaches.set_index('slug', inplace=True)

city_map = dfBeaches['city']

# map locations to feature names
location_wname_key = dfBeaches.water_name_slug

# map water_name_slug to water_name
wname_wname = dfBeaches[['water_name_slug','water_name']].reset_index(drop=True).drop_duplicates()
wname_wname.set_index('water_name_slug', inplace=True)
        
dfCodes.set_index("code", inplace=True)

# these descriptions need to be shortened for display
def shorten_the_value(an_array, a_df):
    """Change the value in a data frame column with an array of three values.

    Args:
    an_array: array: the index, column and the new value
    a_df: dataframe: the dataframe to change:

    Returns:
    The changed dataframe
    """
    a_df.loc[an_array[0], an_array[1]] = an_array[2]
    return a_df

dfCodes = shorten_the_value(["G74", "description", "Insulation: includes spray foams"], dfCodes)
dfCodes = shorten_the_value(["G940", "description", "Foamed EVA for crafts and sports"], dfCodes)
dfCodes = shorten_the_value(["G96", "description", "Sanitary-pads/tampons, applicators"], dfCodes)
dfCodes = shorten_the_value(["G178", "description", "Metal bottle caps and lids"], dfCodes)
dfCodes = shorten_the_value(["G82", "description", "Expanded foams 2.5cm - 50cm"], dfCodes)
dfCodes = shorten_the_value(["G81", "description", "Expanded foams .5cm - 2.5cm"], dfCodes)
dfCodes = shorten_the_value(["G117", "description", "Expanded foams < 5mm"], dfCodes)
dfCodes = shorten_the_value(["G75", "description", "Plastic/foamed polystyrene 0 - 2.5cm"], dfCodes)
dfCodes = shorten_the_value(["G76", "description", "Plastic/foamed polystyrene 2.5cm - 50cm"], dfCodes)
dfCodes = shorten_the_value(["G24", "description", "Plastic lid rings"], dfCodes)
dfCodes = shorten_the_value(["G33", "description", "Lids for togo drinks plastic"], dfCodes)
dfCodes = shorten_the_value(["G3", "description", "Plastic bags, carier bags"], dfCodes)
dfCodes = shorten_the_value(["G204", "description", "Bricks, pipes not plastic"], dfCodes)

# make a map to the code descriptions
code_description_map = dfCodes.description

# make a map to the code descriptions
code_material_map = dfCodes.material

In [2]:
# this is the data before the expanded fragmented plastics and foams are aggregated to Gfrags and Gfoams


# this is the aggregated survey data that is being used
temporal = (df.date >= start_date)&(df.date <= end_date)
fd = df.copy()
fd['loc_dates'] = list(zip(fd.location.values, fd.date.values))
fd['date'] = pd.to_datetime(fd.date)


dfb = dfBeaches.loc[fd.location.unique()].copy()

lu_2000 = pd.read_csv('luse_2000.csv')
lu_2000.rename(columns={'location':'slug'}, inplace=True)


In [3]:
# the landuse descriptors are integers, key them to a readable descriptor
# no27 is the results of intersecting buffer to Land use

no27_keys = {
    1:"industrial",
    2:"residential",
    3:"government",
    4:"agg_buildings",
    5:"unk_building",
    6:"roads",
    7:"railways",
    8:"airports",
    9:"special",
    10:"recreational",
    11:"orchards",
    12:"vineyards",
    13:"horticulture",
    14:"arable",
    15:"meadows",
    16:"farmpastures",
    17:"alpinemeadows",
    18:"aplinepasteurs",
    19:"closed_forest",
    20:"open_forest",
    21:"brush_forest",
    22:"woods",
    23:"lakes",
    24:"rivers",
    25:"unproductive",
    26:"bareland",
    27:"glaciers"
}

# group the land use into functional groups
no27_groups = {'buildings':[1,2,3,4,5,9],
               'trans':[6,7,8],
               'recreation':[10],
               'agg':[11, 12, 13, 14, 15, 16, 18],
               'woods':[17,19,20,21,22],
               'water':[23,24],
               'unproductive':[25,26,27]
              }

# make column names based on the key values:
no27_part ={k:F"part_{v}" for k,v in no27_keys.items()}

# make a df with location slug, landuse label and land use value:
for a_num in lu_2000.AS18_27.unique():
    lu_2000.loc[lu_2000.AS18_27==a_num, "label"]=no27_keys[a_num]

buffer_totals = lu_2000.groupby(['slug','label'], as_index=False).AS18_27.count()

# all locations may not have land use data
# identify those locations and give the land use criteria a value of zero for each label
no_lu_data = [x for x in dfb.index if x not in buffer_totals.slug.unique()]

add_these_cols = lu_2000.label.unique()

for a_beach in no_lu_data:
    for a_label in buffer_totals.label.unique():
        new_row={'slug':a_beach, 'label':a_label, 'AS18_27':0}
        buffer_totals.append(new_row, ignore_index=True)

locs_2000 = pd.DataFrame(index = fd.location.unique())


for acol in add_these_cols:
    locs_2000[acol]= 0



for beach in lu_2000.slug.unique():
    for label in list(no27_keys.values()):        
        try:
            new_data = buffer_totals[(buffer_totals.slug == beach)&(buffer_totals.label == label)].AS18_27.values[0]
        except:
            new_data = 0
        
        locs_2000.loc[beach, label] = new_data
        
# total land use
locs_2000['luse_total'] = locs_2000.loc[:,list(no27_keys.values())].sum(axis=1)

# amount attributed to water
locs_2000['water_value'] = locs_2000.loc[:, ['lakes','rivers']].sum(axis=1)

# the adjsuted land ues
locs_2000['adjusted_land_use'] = locs_2000.luse_total - locs_2000.water_value

# divide the total amount for each feature by the adjusted land use
for label in list(no27_keys.values()):
    a_label = F"part_{label}"
    locs_2000[a_label] = locs_2000[label]/locs_2000['adjusted_land_use']    

these_groups = list(no27_groups.keys())

# aggregate the groups
for a_group in these_groups:
        part_groups = [no27_part[x] for x in no27_groups[a_group]]
        new_group = F"% to {a_group}"
        locs_2000[new_group] = locs_2000.loc[:,part_groups].sum(axis=1)

# map land use values to survey results
som_cols = ['% to buildings',
            '% to trans',
            '% to recreation',
            '% to agg',
            '% to woods',
            '% to water',
            '% to unproductive',
          
]

# iterate through the beach names and assign land use data
for a_beach in fd.location.unique():
    for element in som_cols:
        fd.loc[fd.location == a_beach, element] = locs_2000.loc[a_beach, element]
        

this_data = fd[[unit_label,*som_cols, 'code']]

abundant_codes = fd[fd.quantity > 30].code.unique()
myresults = {}
for i,code in enumerate(abundant_codes):
    data = this_data[this_data.code == code]
    code_results ={code:{}}
    for j, n in enumerate(som_cols):
        corr, a_p = stats.spearmanr(data[n], data[unit_label])
        if a_p <= 0.05:
            code_results[code].update({n:corr})
    myresults.update(code_results)

srho_results_2000 = pd.DataFrame.from_dict(myresults, orient='index')
srho_results_2000.fillna("X", inplace=True)
      

### 2000 meters Codes with an association to a landuse feature

Where p <= 0.05 for Spearmans test for association.

In [4]:
srho_results_2000

Unnamed: 0,% to buildings,% to trans,% to recreation,% to agg,% to woods,% to water,% to unproductive
G27,0.333677,0.359557,0.302469,-0.284461,-0.164146,0.123172,-0.298779
G95,0.133874,0.187697,0.195286,-0.151915,X,0.124518,-0.169153
G30,0.259606,0.277116,0.275198,-0.202422,-0.133838,X,X
G67,-0.128185,X,X,X,X,X,X
G112,0.102842,0.105651,0.156375,X,X,X,X
G200,0.191516,X,0.11617,-0.21987,X,0.241411,X
G178,0.361203,0.35621,0.257235,-0.310423,-0.126493,X,-0.326207
G25,0.134717,0.110049,0.134953,-0.10919,X,0.130293,X
G98,0.103159,0.181116,0.215513,-0.172881,X,X,X
G73,0.129071,0.103307,X,X,X,X,X


In [5]:
# make a df with location slug, landuse label and land use value:
# Do it again for 2500 meters

lu_2000 = pd.read_csv("luse_2500.csv")
lu_2000.rename(columns={'location':'slug'}, inplace=True)
lu_2000.head()
for a_num in lu_2000.AS18_27.unique():
    lu_2000.loc[lu_2000.AS18_27==a_num, "label"]=no27_keys[a_num]

buffer_totals = lu_2000.groupby(['slug','label'], as_index=False).AS18_27.count()

for a_beach in no_lu_data:
    for a_label in buffer_totals.label.unique():
        new_row={'slug':a_beach, 'label':a_label, 'AS18_27':0}
        buffer_totals.append(new_row, ignore_index=True)
        
locs_2000 = pd.DataFrame(index = fd.location.unique())


for acol in add_these_cols:
    locs_2000[acol]= 0



for beach in lu_2000.slug.unique():
    for label in list(no27_keys.values()):        
        try:
            new_data = buffer_totals[(buffer_totals.slug == beach)&(buffer_totals.label == label)].AS18_27.values[0]
        except:
            new_data = 0
        
        locs_2000.loc[beach, label] = new_data
        
# total land use
locs_2000['luse_total'] = locs_2000.loc[:,list(no27_keys.values())].sum(axis=1)

# amount attributed to water
locs_2000['water_value'] = locs_2000.loc[:, ['lakes','rivers']].sum(axis=1)

# the adjsuted land ues
locs_2000['adjusted_land_use'] = locs_2000.luse_total - locs_2000.water_value

# divide the total amount for each feature by the adjusted land use
for label in list(no27_keys.values()):
    a_label = F"part_{label}"
    locs_2000[a_label] = locs_2000[label]/locs_2000['adjusted_land_use']    

these_groups = list(no27_groups.keys())

# aggregate the groups
for a_group in these_groups:
        part_groups = [no27_part[x] for x in no27_groups[a_group]]
        new_group = F"% to {a_group}"
        locs_2000[new_group] = locs_2000.loc[:,part_groups].sum(axis=1)
        

myresults = {}
for i,code in enumerate(abundant_codes):
    data = this_data[this_data.code == code]
    code_results ={code:{}}
    for j, n in enumerate(som_cols):
        corr, a_p = stats.spearmanr(data[n], data[unit_label])
        if a_p <= 0.05:
            code_results[code].update({n:corr})
    myresults.update(code_results)

### 2500 meters codes with an association to a landuse feature

Where p <= 0.05 for Spearmans test for association.

In [6]:
srho_results_2500 = pd.DataFrame.from_dict(myresults, orient='index')
srho_results_2500.fillna("X", inplace=True)
srho_results_2500

Unnamed: 0,% to buildings,% to trans,% to recreation,% to agg,% to woods,% to water,% to unproductive
G27,0.333677,0.359557,0.302469,-0.284461,-0.164146,0.123172,-0.298779
G95,0.133874,0.187697,0.195286,-0.151915,X,0.124518,-0.169153
G30,0.259606,0.277116,0.275198,-0.202422,-0.133838,X,X
G67,-0.128185,X,X,X,X,X,X
G112,0.102842,0.105651,0.156375,X,X,X,X
G200,0.191516,X,0.11617,-0.21987,X,0.241411,X
G178,0.361203,0.35621,0.257235,-0.310423,-0.126493,X,-0.326207
G25,0.134717,0.110049,0.134953,-0.10919,X,0.130293,X
G98,0.103159,0.181116,0.215513,-0.172881,X,X,X
G73,0.129071,0.103307,X,X,X,X,X
