In [1]:
%load_ext watermark
# sys, file and nav packages:
import datetime as dt
import json
import functools
import time
from os import listdir
from os.path import isfile, join
import typing

# math packages:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.distributions.empirical_distribution import ECDF

# charting:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import ticker
from matplotlib import colors
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.gridspec import GridSpec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

import seaborn as sns

import IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display
from myst_nb import glue

import time

# display label for the survey unit
# NOTE(review): presumably "pieces per 100 meters" - confirm
unit_label = 'p/100m'

# survey data: this is the same file that SurveyResults reads by default
dfx= pd.read_csv('resources/checked_sdata_eos_2020_21.csv')


# location (beach) attributes and the object-code definitions
dfBeaches = pd.read_csv("resources/beaches_with_land_use_rates.csv")
dfCodes = pd.read_csv("resources/codes_with_group_names_2015.csv")

# set the index of the beach data to location slug
dfBeaches.set_index('slug', inplace=True)

# set the index of the code data to the code
dfCodes.set_index("code", inplace=True)

# code description map: a copy, so the edits below do not change dfCodes
code_d_map = dfCodes.description.copy()

# shorten the descriptions of two codes
code_d_map.loc["G38"] = "sheeting for protecting large cargo items"
code_d_map.loc["G73"] = "Foamed items & pieces (non packaging/insulation)"

# code material map
code_m_map = dfCodes.material

# this defines the css rules for the note-book table displays
# each rule is a dict holding a css selector and the properties to apply
header_row = {'selector': 'th:nth-child(1)', 'props': f'background-color: #FFF; text-align:right'}
even_rows = {"selector": 'tr:nth-child(even)', 'props': f'background-color: rgba(139, 69, 19, 0.08);'}
odd_rows = {'selector': 'tr:nth-child(odd)', 'props': 'background: #FFF;'}
table_font = {'selector': 'tr', 'props': 'font-size: 12px;'}
# NOTE(review): table_data is defined but is not part of table_css_styles
table_data = {'selector': 'td', 'props': 'padding: 6px;'}
table_css_styles = [even_rows, odd_rows, table_font, header_row]

# aliases for the pandas DataFrame and Series classes
pdtype = pd.core.frame.DataFrame
pstype = pd.core.series.Series

# diverging seaborn palette, returned as a colormap object (as_cmap=True)
cmap = sns.diverging_palette(230, 20, as_cmap=True)

def scaleTheColumn(x):
    """Min-max scale ``x``: subtract the minimum, then divide by the range.

    The smallest value maps to 0 and the largest to 1. ``x`` must provide
    ``.min()``, ``.max()`` and element-wise arithmetic (for example a pandas
    Series). There is no guard for ``max == min``.
    """
    lowest, highest = x.min(), x.max()
    value_range = highest - lowest

    return (x - lowest) / value_range

def cleanSurveyResults(data):
    """Performs the data cleaning operations on the default survey data.

    The input frame itself receives the new ``loc_date`` column and the
    rewritten ``date`` and ``code`` columns:

    1. ``loc_date`` is a (location, date) tuple per record, built from the
       date values as they were on input.
    2. ``date`` is converted with ``pd.to_datetime``.
    3. The codes of the "micro plastics (< 5mm)" group are collected, code
       'G207' is relabeled 'G208' and the records of the collected codes
       are dropped.

    Locations are not filtered here: this does not remove Walensee.

    Returns the filtered frame.
    """
    data['loc_date'] = [*zip(data.location, data["date"])]
    data['date'] = pd.to_datetime(data["date"])

    # get rid of microplastics: the codes are identified before the relabeling
    in_micro_group = data.groupname == "micro plastics (< 5mm)"
    micro_codes = data.loc[in_micro_group, "code"].unique()

    # replace the bad code
    data["code"] = data["code"].replace('G207', 'G208')

    not_micro = ~data["code"].isin(micro_codes)

    return data[not_micro]

class SurveyResults:
    """Creates a dataframe from a valid filename. Assigns the column names and defines a list of
    codes and locations that can be used in the CodeData class.

    Instance attributes:
      dfx          the records read from ``data`` with ``pd.read_csv``
      df_results   the processed records, None until ``surveyResults`` has run
      locations    unique location values, None until ``surveyLocations`` has run
      valid_codes  unique code values of ``df_results``, set by ``validCodes``
      clean_data   when True ``cleanSurveyResults`` is applied to ``dfx``
      columns      the columns kept in ``df_results`` (falsy keeps all columns)
      w_t          optional value used to filter the ``w_t`` column
    """

    file_name = 'resources/checked_sdata_eos_2020_21.csv'
    columns_to_keep=[
        'loc_date',
        'location',
        'river_bassin',
        'water_name_slug',
        'city',
        'w_t',
        'intersects',
        'code',
        'pcs_m',
        'quantity'
    ]

    def __init__(self, data: str = file_name, clean_data: bool = True, columns: list = columns_to_keep, w_t: str = None):
        self.dfx = pd.read_csv(data)
        self.df_results = None
        self.locations = None
        self.valid_codes = None
        self.clean_data = clean_data
        self.columns = columns
        self.w_t = w_t

    def validCodes(self):
        """Stores the unique code values of ``df_results`` in ``valid_codes``.

        Does nothing when the results are not loaded yet or have no ``code``
        column. A ValueError from ``unique()`` is reported and re-raised.
        """
        results = self.df_results

        # the checks are short-circuited: asking for .columns on a missing
        # result set must not raise
        if not isinstance(results, pd.DataFrame) or "code" not in results.columns:
            return

        try:
            valid_codes = results.code.unique()
        except ValueError:
            print("There was an error retrieving the unique code names, self.df.code.unique() failed.")
            raise
        else:
            self.valid_codes = valid_codes

    def surveyResults(self):
        """Returns the processed survey records, building them on the first call.

        Processing order: optional cleaning, optional filter on ``w_t``,
        optional column selection. The result is cached in ``df_results`` and
        ``valid_codes`` is refreshed.
        """
        # if this method has been called already
        # return the result
        if self.df_results is not None:
            return self.df_results

        # for the default data cleanSurveyResults must be called
        if self.clean_data is True:
            fd = cleanSurveyResults(self.dfx)

        # if the data is clean then it can be used directly
        else:
            fd = self.dfx

        # filter the data by the variable w_t
        if self.w_t is not None:
            fd = fd[fd.w_t == self.w_t]

        # keep only the required columns
        if self.columns:
            fd = fd[self.columns]

        # assign the survey results to the class attribute
        self.df_results = fd

        # define the list of codes in this df
        self.validCodes()

        return self.df_results

    def surveyLocations(self):
        """Returns the unique locations of the processed records.

        The value is cached in ``locations``. When ``surveyResults`` has not
        been called yet a message is printed and None is returned.
        """
        if self.locations is not None:
            return self.locations

        if self.df_results is None:
            print("There is no survey data loaded")
            return None

        # the processed records live in df_results
        self.locations = self.df_results.location.unique()
        return self.locations
        

def surface_area_of_feature(data, locations, columns):
    """Totals the ``surface`` column for the requested locations.

    Records whose ``location`` is in ``locations`` are grouped by ``columns``
    and the ``surface`` values of each group are summed. The grouping columns
    are returned as regular columns (``as_index=False``).
    """
    at_locations = data.location.isin(locations)
    totals = data.loc[at_locations].groupby(columns, as_index=False).surface.sum()

    return totals
def account_for_undefined(data, var_col="OBJVAL", var_label="Undefined", metric="surface", total_metric=None):
    """Turns defined totals into the remainder that is not defined.

    Every record of ``data`` gets ``var_label`` in ``var_col`` and its
    ``metric`` value is replaced by ``total_metric`` minus the current value.
    ``data`` is changed in place and returned. ``total_metric`` has to be
    provided by the caller: the default of None cannot be subtracted from.
    """
    # relabel first, then compute what is left of the total
    data[var_col] = var_label
    remainder = total_metric - data[metric]
    data[metric] = remainder

    return data

def check_file_type(files: list = None):
    """Keeps only the names in ``files`` that end in ``.csv``.

    The comparison ignores case, so ``MAP.CSV`` is kept as well. Entries such
    as checkpoint folders or files of another type are dropped so that they
    are never handed to ``pd.read_csv``. ``None`` is passed through unchanged.

    Returns a new list in the original order.
    """
    if files is None:
        return None

    return [name for name in files if str(name).lower().endswith(".csv")]

def collect_feature_data(path_to_data: str = "resources/hex-3000m"):
    """Reads every file listed in ``path_to_data`` into a dataframe.

    The directory listing is passed through ``check_file_type`` and each
    remaining file is read with ``pd.read_csv``.

    Returns a dictionary where key = name of the map (the file name up to its
    first '.') and value = the corresponding dataframe.
    """
    data_map = {}

    for file_name in check_file_type(listdir(path_to_data)):
        map_name = file_name.split('.')[0]
        data_map[map_name] = pd.read_csv(join(path_to_data, file_name))

    return data_map

def collectAggregateValues(data: pd.DataFrame = None, locations: [] = None, columns: list = ["location", "OBJVAL"],
                           to_aggregate: str = None):
    """Sums ``to_aggregate`` per group of ``columns`` for the given locations.

    Only the records whose ``location`` is in ``locations`` are used. The
    grouping columns are returned as regular columns (``as_index=False``).
    """
    selected = data.loc[data.location.isin(locations)]
    grouped = selected.groupby(columns, as_index=False)

    return grouped[to_aggregate].sum()

def pivotValues(aggregate_values, index: str = "location", columns: str = "OBJVAL", values: str = "surface"):
    """Reshapes the aggregated records to one row per ``index`` value.

    Each distinct value of ``columns`` becomes a column filled from
    ``values``. Combinations that are absent from the data are set to 0.
    """
    wide = aggregate_values.pivot(index=index, columns=columns, values=values)

    return wide.fillna(0)
    
def collectAndPivot(data: pd.DataFrame = None, locations: [] = None, columns: list = ["location", "OBJVAL"],
                    to_aggregate: str = None, scale_data: bool= False ):
    """Aggregates the geo data per location and category and pivots it to wide form.

    collects the geo data and aggregates the categories for a 3000 m hex
    the total for each category is the total amount for that category in the
    specific 3000 m hex with the center defined by the survey location.

    Parameters:
      data          the geo data, needs a ``location`` column
      locations     the locations to keep
      columns       two names: the first becomes the row key of the pivot,
                    the second supplies the category columns
      to_aggregate  the column that is summed
      scale_data    when True every category column is min-max scaled with
                    ``scaleTheColumn``

    Returns one row per value of ``columns[0]``, with that key restored as a
    regular column.
    """
    aggregated = collectAggregateValues(data=data, locations=locations, columns=columns, to_aggregate=to_aggregate)
    pivoted = pivotValues(aggregated, index=columns[0], columns=columns[1], values=to_aggregate)

    if scale_data is True:
        # the row key is still the index at this point, so every column of
        # the pivot is a category total and all of them are scaled
        pivoted = pivoted.apply(scaleTheColumn)

    pivoted.columns.name = "None"

    return pivoted.reset_index(drop=False)

def define_the_objects_of_interest(data, param: str = None, param_value: typing.Union[float, int] = 0, param_label: str = None):
    """Returns the unique ``param_label`` values of the records whose
    ``param`` value is strictly greater than ``param_value``.
    """
    above_threshold = data[param] > param_value

    return data.loc[above_threshold, param_label].unique()

def make_merge_kwargs(on: str = "location", how: str = "outer", validate: str = "many_to_one"):
    """Bundles the merge settings into a dict with the keys ``on``, ``how``
    and ``validate``.
    """
    return {"on": on, "how": how, "validate": validate}

def merge_test_results_and_land_use_attributes(test_results, land_use_values, **kwargs):
    """Merges ``land_use_values`` onto ``test_results``.

    All keyword arguments are forwarded unchanged to ``test_results.merge``.
    """
    return test_results.merge(land_use_values, **kwargs)

def test_threshhold(data, threshold, gby_column):
    # given a data frame, a threshold and a groupby column
    # the given threshold will be tested against the aggregated
    # value produced by the groupby column    
    data["k"] = data.pcs_m > threshold
    exceeded = data.groupby([gby_column])['k'].sum()
    exceeded.name = "k"
    
    tested = data.groupby([gby_column]).loc_date.nunique()
    tested.name = 'n'
    
    passed = tested-exceeded
    passed.name = "n-k"
    
    ratio = exceeded/tested
    ratio.name = 'k/n'
    
    return exceeded, tested, passed, ratio

def test_one_object(data, threshold, gby_column):
    """Runs ``test_threshhold`` and returns its four series side by side.

    The result has one row per value of ``gby_column`` and the columns
    produced by ``test_threshhold``, in the order they are returned.
    """
    exceeded, tested, passed, ratio = test_threshhold(data, threshold, gby_column)
    results = [exceeded, tested, passed, ratio]

    return pd.concat(results, axis=1)

def group_land_use_values(collected_and_pivoted, columns, quantile, labels):
    """Replaces the magnitudes in ``columns`` by the quantile bin they fall in.

    For groups that are considered cover and not use the magnitude of the
    polygon is taken into consideration. Each listed column is binned with
    ``pd.qcut`` using ``quantile``; duplicate bin edges are dropped.

    ``labels`` is accepted but not used. The frame is changed in place and
    returned.
    """
    for name in columns:
        magnitudes = collected_and_pivoted[name]
        collected_and_pivoted[name] = pd.qcut(magnitudes, q=quantile, duplicates='drop')

    return collected_and_pivoted

def land_use_is_present(collected_and_pivoted, columns):
    """Reduces the listed columns to a presence flag: 1 when the value is
    greater than zero, otherwise 0.

    For groups that are superimposed or occur infrequently only the presence
    is noted. An example is a school, it takes up very little space but it
    does generate a lot of activity.

    Names in ``columns`` that are not in the frame are skipped. The frame is
    changed in place and returned.
    """
    available = [name for name in columns if name in collected_and_pivoted.columns]

    for name in available:
        collected_and_pivoted[name] = collected_and_pivoted[name].gt(0) * 1

    return collected_and_pivoted

def inference_for_one_attribute(data, attribute, operation):
    """Tallies the test results for each value of one attribute.

    The name of the attribute is printed, then ``data`` is grouped by
    ``attribute`` and aggregated with ``operation``. The aggregate must
    contain the columns ``k`` and ``n``. Derived columns:

      n-k        n - k
      rate       k / n
      odds       k / (n - k)
      pass_rate  (n - k) / n
      posterior  rate divided by the sum of all rates
      label      the position of the row (0, 1, 2 ...)

    The result is indexed by the attribute value, so a row can be found by
    its value through the index or by its position through ``label``.
    """
    print(attribute)

    tally = data.groupby(attribute).agg(operation)

    tally["n-k"] = tally["n"] - tally["k"]
    tally["rate"] = tally["k"] / tally["n"]
    tally["odds"] = tally["k"] / tally["n-k"]
    tally["pass_rate"] = tally["n-k"] / tally["n"]
    tally["posterior"] = tally["rate"] / tally["rate"].sum()

    # number the rows, then put the attribute values back as the index
    tally = tally.reset_index(drop=False)
    tally["label"] = tally.index

    return tally.set_index(attribute, drop=True)

class LandUseValues:
    """Aggregates one layer of map data for a set of survey locations.

    Inputs kept on the instance:
      data_map         the records of the map layer
      locations        the survey locations of interest
      region           stored, not used by the methods of this class
      columns          grouping columns for the defined surface totals
      id_vals          [row key, category column] used for the pivot
      dim_oi           the total that the defined surface is subtracted from
      to_aggregate     the column that is summed
      land_use_groups  the category columns that are added up for a total

    Results (None until the matching method has run):
      land_cover  set by ``assign_undefined``
      length_of   set by ``define_total_length``
      land_use    set by ``define_total_surface``
    """

    def __init__(self, data_map: pd.DataFrame = None, locations: list = None, region: str = None, columns: list = None,
                 id_vals: list = None, dim_oi: int = None, to_aggregate: str = None, land_use_groups: list = None):

        # the inputs
        self.data_map = data_map
        self.locations = locations
        self.region = region
        self.columns = columns
        self.id_vals = id_vals
        self.dim_oi = dim_oi
        self.to_aggregate = to_aggregate
        self.land_use_groups = land_use_groups

        # the results
        self.land_cover = None
        self.length_of = None
        self.land_use = None

    def assign_undefined(self):
        """Adds the surface that has no category to the layer and pivots it.

        from the data catalog:
        https://www.swisstopo.admin.ch/fr/geodata/landscape/tlm3d.html#dokumente

        There are areas on the map that are not defined by a category. The
        defined surface, totaled per group of ``columns``, is subtracted from
        ``dim_oi`` (the surface area of a 3000m hex = 5845672) and labeled as
        undefined. These records are appended to the layer, which is then
        aggregated per location and category. The result is stored in
        ``land_cover`` and is later merged with survey data, keyed on location.
        """
        defined = surface_area_of_feature(self.data_map, self.locations, self.columns)
        undefined = account_for_undefined(defined, total_metric=self.dim_oi)

        # add the undefined land-use values to the defined ones
        with_undefined = pd.concat([self.data_map, undefined])

        self.land_cover = collectAndPivot(
            data=with_undefined,
            locations=self.locations,
            columns=self.id_vals,
            to_aggregate=self.to_aggregate,
        )

    def _totals_by_location(self, label):
        # pivot the layer per location and add up the land_use_groups columns
        pivoted = collectAndPivot(
            data=self.data_map,
            locations=self.locations,
            columns=self.id_vals,
            to_aggregate=self.to_aggregate,
        )
        pivoted[label] = pivoted[self.land_use_groups].sum(axis=1)

        return pivoted[["location", label]]

    def define_total_length(self, label: str = "total"):
        """Stores the per-location total of ``land_use_groups`` under the
        column name ``label`` in ``length_of``.
        """
        self.length_of = self._totals_by_location(label)

    def define_total_surface(self, label: str = "total"):
        """Stores the per-location total of ``land_use_groups`` under the
        column name ``label`` in ``land_use``.
        """
        self.land_use = self._totals_by_location(label)
        
        
        
class TestResultsAndLandUse(LandUseValues):
    """Tests survey results against a threshold and joins them to the land-use values.

    Parameters:
      df              the survey records to test, needs ``pcs_m`` and ``loc_date``
      threshhold      the value that ``pcs_m`` is compared to
      merge_column    the column the test results are grouped by and merged on
      merge_method    the ``how`` argument of the merge
      merge_validate  the ``validate`` argument of the merge
      groups          columns that are binned by quantile
      presence        columns that are reduced to a 0/1 presence flag
      quantiles       the quantiles handed to the binning
      labels          handed to the binning
      **kwargs        forwarded to ``LandUseValues``
    """

    def __init__(self, df: pd.DataFrame = None, threshhold: typing.Union[float, int] = None, merge_column: str = None,
                 merge_method: str = None, merge_validate: str = None, groups: list = None, presence: list = None,
                 quantiles: list = None, labels: list = None, **kwargs):

        self.to_test = df
        self.threshhold = threshhold
        self.merge_column = merge_column
        self.merge_method = merge_method
        self.merge_validate = merge_validate
        self.groups = groups
        self.presence = presence
        self.quantiles = quantiles
        self.labels = labels

        super().__init__(**kwargs)

    def test_and_merge(self, cover: bool = True):
        """Tests the survey records and merges the tally with the land-use values.

        ``cover=True`` merges with ``land_cover``, otherwise with
        ``length_of``. The matching attribute must have been defined before
        this method is called.
        """
        tested = test_one_object(self.to_test, self.threshhold, self.merge_column)

        kwargs = dict(on=self.merge_column, how=self.merge_method, validate=self.merge_validate)

        land_use_values = self.land_cover if cover is True else self.length_of

        return merge_test_results_and_land_use_attributes(tested, land_use_values, **kwargs)

    def make_groups_test_presence(self, cover: bool = True, label: str = "total", regional_label: str = None, label_map: pd.Series = None):
        """Bins the group columns, flags the presence columns and labels the region.

        Parameters:
          cover           passed to ``test_and_merge``; when False the groups
                          are replaced by the single column ``label``
          label           the name of the total column used when cover is False
          regional_label  optional name of a column to add, filled by looking
                          up each ``merge_column`` value in ``label_map``
          label_map       series indexed by the values of ``merge_column``

        Raises ValueError when ``regional_label`` is given without ``label_map``.
        """
        results = self.test_and_merge(cover=cover)

        if self.groups is not None:
            if not cover:
                # only the total is available when the cover was not assigned
                self.groups = [label]
            results = group_land_use_values(results, self.groups, self.quantiles, self.labels)

        if self.presence is not None:
            results = land_use_is_present(results, self.presence)

        if regional_label is not None:
            if label_map is None:
                raise ValueError("label_map is required when regional_label is given")
            # the key column is the one configured on the instance, not a
            # variable of the same name in the calling namespace
            results[regional_label] = results[self.merge_column].apply(lambda x: label_map.loc[x])

        return results
    
    

class InferenceGroups:
    """Builds one inference table per land-use attribute.

    Attributes:
      results       the tested and grouped records
      column_names  the attributes to build a table for
      operation     the aggregation handed to ``inference_for_one_attribute``
      inf_groups    dict of attribute name -> inference table, None until
                    ``make_inference_for_each_attribute`` has run
    """

    def __init__(self, results: pd.DataFrame = None, column_names: list = None, operation: dict = None):

        super().__init__()

        self.results = results
        self.column_names = column_names
        self.operation = operation
        self.inf_groups = None

    def make_inference_for_each_attribute(self):
        """Fills ``inf_groups``. Names in ``column_names`` that are not a
        column of ``results`` are skipped.
        """
        self.inf_groups = {
            name: inference_for_one_attribute(self.results, name, self.operation)
            for name in self.column_names
            if name in self.results.columns
        }

    def apply_infgroup_labels_to_results(self):
        """Placeholder, does nothing."""
        return None



def attach_inference_group_labels_to_survey_data(fg: pd.DataFrame = None, inf_groups: dict = None, groups_and_presence: list = None):
    """Replaces the land-use values of each record by their inference-group label.

    Parameters:
      fg                   the tested records, needs a ``location`` column
      inf_groups           an object whose ``inf_groups`` attribute maps an
                           attribute name to its inference table
      groups_and_presence  the attribute names, in the required order

    The attribute columns present in ``fg`` are moved behind the other columns
    in the order of ``groups_and_presence``. Each value is then replaced by
    the ``label`` found for it in the matching inference table.

    Returns the relabeled copy and the list of attribute columns that were
    found. ``fg`` is not changed.
    """
    survey = fg.copy()

    # order the columns according to groups_and_presence
    cols = [name for name in groups_and_presence if name in survey.columns]
    other_columns = [name for name in survey.columns if name not in groups_and_presence]

    newfg = survey[other_columns].merge(survey[["location", *cols]], on="location")

    for name in cols:
        table = inf_groups.inf_groups[name]
        newfg[name] = newfg[name].apply(lambda value: table.loc[value, "label"])

    return newfg, cols

def tries_or_fails(df, columns, probability_tables, product=True, tries_or_fails="k"):
    """Looks up one value per attribute and combines them per record.

    For each name in ``columns`` the value of the record is used as a row key
    in ``probability_tables[name]`` and replaced by the entry found in the
    column ``tries_or_fails``. The replaced columns are combined in a new
    column ``total``: their product when ``product`` is truthy, otherwise
    their sum.

    Returns a copy, ``df`` is not changed.
    """
    data = df.copy()

    for name in columns:
        table = probability_tables[name]
        data[name] = data[name].apply(lambda key: table.loc[key, tries_or_fails])

    combine = data[columns].prod if product else data[columns].sum
    data["total"] = combine(axis=1)

    return data

class LanduseConfiguration:
    """Runs the threshold test for one land-use layer and labels each location.

    Parameters:
      land_use_kwargs   keyword arguments for the land-use part of
                        ``TestResultsAndLandUse``
      test_kwargs       keyword arguments for the test part of
                        ``TestResultsAndLandUse``; ``groups`` and ``presence``
                        name the attribute columns
      label             the name of the total column for this layer
      assign_undefined  stored only; the ``cover`` flag decides whether the
                        undefined surface is assigned
      length            when True the total of the layer is computed per location
      cover             when True the undefined surface is assigned and the
                        test results are merged with the land cover
      inf_operation     the aggregation used for the inference tables
      regional_label    optional name of a region column to add
      label_map         series that maps a location to its region
      total             when True the inference is made on ``label`` only

    Results (None until the matching method has run):
      test_results, grouped_data          set by ``groups_and_presence``
      inf_groups, p_tables,
      labeled_groups, configuration_keys  set by ``inference_groups``
    """

    def __init__(self, land_use_kwargs: dict = None, test_kwargs: dict = None, label: str = None, assign_undefined: bool = False,
                 length: bool = False, cover: bool = False, inf_operation: dict = {"k":"sum", "n":"sum"}, regional_label: str = None, 
                 label_map: pd.Series = None, total: bool = False):

        self.land_use_kwargs = land_use_kwargs
        self.test_kwargs = test_kwargs
        self.label = label
        self.label_map = label_map
        self.regional_label = regional_label
        self.assign_undefined = assign_undefined
        self.length = length
        self.cover = cover
        self.inf_operation = inf_operation
        self.total = total
        self.test_results = None
        self.grouped_data = None
        self.inf_groups = None
        self.p_tables = None
        # defined up front so they can be read before inference_groups has run
        self.labeled_groups = None
        self.configuration_keys = None

    def groups_and_presence(self):
        """Tests the survey data and groups the land-use values.

        Stores the ``TestResultsAndLandUse`` instance in ``test_results`` and
        the grouped records in ``grouped_data``.
        """
        d = TestResultsAndLandUse(**self.test_kwargs, **self.land_use_kwargs)

        if self.length is True:
            d.define_total_length(label=self.label)
        if self.cover is True:
            d.assign_undefined()

        # cover and label are always forwarded, with or without a regional
        # label; a regional_label of None leaves the region column out
        dg = d.make_groups_test_presence(
            cover=self.cover,
            label=self.label,
            regional_label=self.regional_label,
            label_map=self.label_map,
        )

        self.test_results = d
        self.grouped_data = dg

    def _inference_column_names(self):
        # the attributes that get an inference table: the layer total, or
        # the groups followed by the presence columns of this configuration
        if self.total is True:
            return [self.label]

        groups = self.test_kwargs.get("groups")
        presence = self.test_kwargs.get("presence")

        # either entry may be None, compare to None because the entries can
        # be arrays
        return [
            *(groups if groups is not None else []),
            *(presence if presence is not None else []),
        ]

    def inference_groups(self):
        """Creates the inference groups for this land use class.

        Calls ``groups_and_presence`` first when it has not been run. Stores
        the ``InferenceGroups`` instance in ``inf_groups``, its tables in
        ``p_tables``, the labeled records in ``labeled_groups`` and the
        configuration of each location (column ``conf``, indexed by location)
        in ``configuration_keys``.
        """
        # define the column names of the inference groups
        column_names = self._inference_column_names()

        if self.grouped_data is None:
            self.groups_and_presence()

        # group the land use features by magnitude
        inf_groups = InferenceGroups(results=self.grouped_data, column_names=column_names, operation=self.inf_operation)
        inf_groups.make_inference_for_each_attribute()

        # attach the inference group labels to the survey data for each feature
        labeled_groups, cols = attach_inference_group_labels_to_survey_data(self.grouped_data, inf_groups, column_names)
        labeled_groups["conf"] = list(zip(*[labeled_groups[x] for x in cols]))
        configuration_keys = labeled_groups[["location", "conf"]].set_index("location")

        self.inf_groups = inf_groups
        self.p_tables = inf_groups.inf_groups
        self.labeled_groups = labeled_groups
        self.configuration_keys = configuration_keys

def select_a_land_use_conf(conf, p_tables: dict = None, vals_to_drop: tuple=None):
    """Takes the land use configuration from a location and sums the number of trials
    and successes.

    ``conf`` is a sequence of (attribute, label) pairs. For each pair the row
    of ``p_tables[attribute]`` with that label supplies ``k`` and ``n``.

    ``vals_to_drop`` gives the option to leave out matching land use
    categories: a pair is skipped when the pair at the same position of
    ``vals_to_drop`` has the same label. Example: if a location has conf
    (0,1,2,3,4) and another location has conf (2,3,4,5,6) then land-use
    categories 2, 3, 4 are only counted once.

    returns a tuple k=success and n=trials
    """
    k = 0
    n = 0

    for position, pair in enumerate(conf):
        attribute, group_label = pair[0], pair[1]

        if vals_to_drop is not None and vals_to_drop[position][1] == group_label:
            continue

        table = p_tables[attribute]
        counts = table.loc[table.label == group_label, ["k", "n"]].values[0]

        k += counts[0]
        n += counts[1]

    return k, n

def select_a_group_of_confs(conf, p_tables, drop_vals: tuple=None):    
    """Sums the successes and trials of every attribute named in ``conf``.

    ``conf`` is a sequence of (attribute, label) pairs. For each pair the
    complete ``k`` and ``n`` columns of ``p_tables[attribute]`` are added up;
    the label of the pair is not used to select rows. A notice is printed
    when the index name of a table differs from the attribute it is stored
    under. ``drop_vals`` is accepted but not used.

    returns a tuple k=success and n = trials
    """
    failed = 0
    tried = 0

    for pair in conf:
        attribute = pair[0]
        d = p_tables[attribute]
        d_index = d.index.name

        if attribute != d_index:
            print(attribute, d_index)
            print("ouch")

        # the totals are read by label: positional access on a series that is
        # indexed by "k" and "n" is deprecated in pandas
        totals = d[["k", "n"]].sum()
        failed += totals["k"]
        tried += totals["n"]

    return failed, tried

def inferenceTableForOneLocation(name: str = None, lake: str = None, conf: tuple = None, p_tables: dict = None,
                                 tested_groups_presence: pd.DataFrame = None, tested: pd.DataFrame = None,
                                 prior: float = None, conf_names: list = None, drop_vals: tuple = None):
    """Estimates the probability that the threshold is exceeded at one location.

    Parameters:
      name       the location
      conf       (attribute, label) pairs, the land use configuration of
                 the location
      p_tables   dict of attribute -> inference table with ``k`` and ``n``
      tested     test results with the columns ``location``, ``k`` and ``n``;
                 only read when ``prior`` is None
      prior      optional prior probability
      drop_vals  passed to ``select_a_land_use_conf`` as ``vals_to_drop``

    ``lake``, ``tested_groups_presence`` and ``conf_names`` are accepted but
    not used.

    Returns ``(failed * p) / m`` where
      failed  k/n of the configuration of the location
      p       ``prior`` or, when it is None, (k + 1)/(n + 2) from the records
              of the location in ``tested``
      m       (1 - p) * passed_t + p * failed_t, with failed_t the k/n summed
              over all tables and passed_t = 1 - failed_t
    """
    # h1 the threshold was exceeded at the location given the land use values
    # likelihood of exceeding the threshold given the land use in that hex
    lk, ln = select_a_land_use_conf(conf=conf, p_tables=p_tables, vals_to_drop=drop_vals)
    failed = lk/ln

    # total probability
    # the threshold was exceeded under any land use configuration
    tk = sum(table["k"].sum() for table in p_tables.values())

    # the number of tries
    tn = sum(table["n"].sum() for table in p_tables.values())

    # the threshold was not exceeded
    passed_t = (tn-tk)/tn
    failed_t = tk/tn

    if prior is None:
        # assign a prior from the survey results
        # if there is data for the location in question
        # use it: one success and one failure are added to the tally
        pkn, pnn = tested.loc[tested.location == name, ['k', 'n']].sum().values
        p = (pkn+1)/(pnn+2)
    else:
        p = prior

    m = ((1-p)*passed_t) + (p*failed_t)

    return (failed*p)/m

def inference_for_one_location(location: str = None, lake: str = None, conf_label: str='configuration',
                              conf_columns: list = None, p_tables: dict = None, prior: float = None,
                              tested: pd.DataFrame = None, drop_vals: tuple = None, conf_keys: pd.DataFrame = None):
    """Looks up the configuration of a location and runs the inference for it.

    Parameters:
      location      the location, a row key of the configuration keys
      lake          forwarded to ``inferenceTableForOneLocation``
      conf_label    the column of the configuration keys that holds the
                    configuration tuple
      conf_columns  the attribute names, in the order of the tuple
      p_tables      dict of attribute -> inference table
      prior         optional prior probability
      tested        test results, used when no prior is given
      drop_vals     forwarded to ``inferenceTableForOneLocation``
      conf_keys     the configuration keys, indexed by location. When None
                    the module level variable ``fgl_conf_keys`` is used.

    Returns the value of ``inferenceTableForOneLocation``.
    """
    if conf_keys is None:
        # NOTE(review): fgl_conf_keys must be defined in the notebook before
        # this fallback is reached - prefer passing conf_keys
        conf_keys = fgl_conf_keys

    # pair each attribute name with the label of the location
    conf = conf_keys.loc[location, conf_label]
    conf = tuple(zip(conf_columns, conf))

    p = inferenceTableForOneLocation(
        name=location,
        lake=lake,
        conf_names=conf_columns,
        conf=conf,
        p_tables=p_tables,
        tested=tested,
        prior=prior,
        drop_vals=drop_vals
    )

    return p


In [2]:
# the SurveyResults class will collect the data in the
# resources data folder
fdx = SurveyResults()
# call the surveyResults method to get the survey data
df = fdx.surveyResults()

# the name of the lake was changed in the revision
# change it to the correct name
# the records are split in two, relabeled and put back together
df_none = df[df.water_name_slug != 'quatre-cantons']
df_with = df[df.water_name_slug == 'quatre-cantons'].copy()
df_with['water_name_slug'] = 'vierwaldstattersee'
df = pd.concat([df_none, df_with])

# the lakes of interest
# collection points would be a good
# label. This is how they are considered in the model
# these locations serve as the endpoint for many 
# small rivers
collection_points = [
    'zurichsee',
    'bielersee',
    'neuenburgersee',
    'walensee',
    'vierwaldstattersee',
    'brienzersee',
    'thunersee',
    'lac-leman',
    'lago-maggiore',
    'lago-di-lugano',
    'zugersee'
]

# the data-frame of survey results to be considered
df = df[df.water_name_slug.isin(collection_points)]

# map the location to the name of the lake
# wn_map is a series: index = location, value = water_name_slug
wn_map = df[["location", "water_name_slug"]].drop_duplicates("location").set_index("location")
wn_map = wn_map["water_name_slug"]


# The summary of the survey data
locations = df.location.unique()
samples = df.loc_date.unique()
# the number of unique loc_date values among the records where w_t == "l"
lakes = df[df.w_t == "l"].drop_duplicates("loc_date").w_t.value_counts().values[0]
codes_identified = df[df.quantity > 0].code.unique()
codes_possible = df.code.unique()
total_id = df.quantity.sum()

data_summary = {
    "n locations": len(locations),
    "n samples": len(samples),
    "n lake samples": lakes,
    "n identified object types": len(codes_identified),
    "n possible object types": len(codes_possible),
    "total number of objects": total_id
}

# display the summary as a one column table with the note-book css rules
pd.DataFrame(index = data_summary.keys(), data=data_summary.values()).style.set_table_styles(table_css_styles)

Unnamed: 0,0
n locations,93
n samples,331
n lake samples,331
n identified object types,186
n possible object types,211
total number of objects,48060


Summary data of the surveys, not including locations in the Walensee area:

## The map data

The following are the land-use categories and their relevant sub-categories. With the exception of _Land Cover_, the total of each category was considered for each hex. The covariance of each sub-category is given in the annex.

### The base: Land cover

This is how the earth is covered, independent of its use. The following categories are the base land-cover categories:

1. Orchard
2. Vineyards
3. Settlement
4. City center
5. Forest
6. Undefined
7. Wetlands

The area of each sub-category of land cover within a 3000 m hex is totaled, and the correlation of each sub-category is considered independently. The categories that follow are superimposed onto these surfaces.

In [3]:
# configuration for the land-cover layer
# the folder that holds one csv file per map layer
my_path=  "resources/hex-3000m"

# the columns used to total the defined surface before the
# undefined remainder is computed
columns = [
    "river_bass",
    "location", 
    "lat","lon",
    "city",
    "feature"
]

column_rename = {"undefined":"Undefined"}
# the surface area of a 3000 m hex, the defined surface is subtracted from it
# NOTE(review): the unit is presumably square meters - confirm
area_of_a_hex = 5845672
# id_vals: [the row key, the category column] used to pivot the layer
id_vals = ["location", "OBJVAL"]
# the column that is summed
agg_val = "surface"

data_map_name = "hex-3000-lake-locations-landcover"

# the categories that are binned by quantile
lc_groups = [
    'Siedl',
    'Undefined',
    'Wald'
]

# the categories that are reduced to a 0/1 presence flag
lc_presence = [
    'Obstanlage',
    'Reben',
    'Stadtzentr',
    'Sumpf',
    'Fels'
]

groups_and_presence = [*lc_groups, *lc_presence]

# how the test results are merged with the land-use values
merge_column = gby_column = "location"
merge_method = 'outer'
merge_validate = "many_to_one"
# the quantiles that define the bins of the group columns
quantiles =   [0,.1, .25, .5, .75, .9, 1]


# test the survey results of an object of interest
# against a threshhold
# the threshhold is the median pcs_m of the records with this code
code = "Gfoam"
to_test = df[df.code == code].copy()
threshhold = to_test.pcs_m.median()

# label the region of interest
# this creates a hierarchical group of the locations
# that are within a region. The most common would be lake
regional_label = "lake"
label_map = wn_map


# collect the data
# data_map: key = file name up to the first '.', value = dataframe
data_map = collect_feature_data(path_to_data=my_path)
land_cover_data = data_map[data_map_name]
land_cover_data.rename(columns=column_rename, inplace=True)

# limit the data to the parameters of interest
land_cover_data = land_cover_data[land_cover_data[id_vals[1]].isin(groups_and_presence)].copy()

# keyword arguments for the land-use part of the configuration
land_use_kwargs = {
    "data_map":land_cover_data,
    "locations":locations,
    "region":None,
    "columns":columns,
    "id_vals": id_vals,
    "dim_oi": area_of_a_hex,
    "to_aggregate":agg_val,
    "land_use_groups":lc_groups,
}

# keyword arguments for the test part of the configuration
test_kwargs = {
    "df":to_test,
    "threshhold":threshhold,
    "merge_column":merge_column,
    "merge_method":merge_method,
    "merge_validate":merge_validate,
    "groups":lc_groups,
    "presence":lc_presence,
    "quantiles":quantiles
}

# the inference tables sum the successes (k) and the trials (n)
inf_operation = {"k":"sum", "n":"sum"}

In [4]:
# the land-cover configuration: the undefined surface is assigned (cover)
# and each location is labeled with its lake
kwargs = {
    'land_use_kwargs': land_use_kwargs,
    'test_kwargs': test_kwargs,
    'label': None,
    'assign_undefined': True,
    'length': False,
    'cover': True,
    'inf_operation': inf_operation,
    'regional_label': regional_label,
    'label_map': label_map,
    'total': False
}


# test and group the land-cover values, then build the inference tables
# inference_groups prints the name of each attribute it processes
nx = LanduseConfiguration(**kwargs)
nx.groups_and_presence()
nx.inference_groups()
# the grouped test results and the same records with inference-group labels
nxf = nx.grouped_data
nx_conf = nx.labeled_groups

Siedl
Undefined
Wald
Obstanlage
Reben
Stadtzentr
Sumpf
Fels


In [5]:
# add the data from another land use layer
# the results will be added to the landcover data frame
# this layer holds the streets: the category column is OBJEKTART
# and the aggregated column is length
label = "strasse"
data_map_name = 'hex-3000-lake-locations-strasse'
id_vals = ["location", "OBJEKTART"]
agg_val = "length"

# identify the columns of interest
st_groups = [
    '10m Strasse',
    '1m Weg',
    '1m Wegfragment',
    '2m Weg',
    '2m Wegfragment',
    '3m Strasse',
    '4m Strasse',
    '6m Strasse',
    '8m Strasse',
    'Ausfahrt',
    'Autobahn',
    'Autostrasse',
    'Dienstzufahrt',
    'Einfahrt',
    'Faehre',
    'Markierte Spur',
    'Platz',
    'Raststaette', 'Verbindung', 'Zufahrt'
]

st_presence = []

groups_and_presence = [*st_groups, *st_presence]

# collect the data
street_lengths_data = data_map[data_map_name]
street_lengths_data.rename(columns=column_rename, inplace=True)

# update the key word arguments for the land use data
# NOTE: the dict is updated in place, objects that were given this dict
# earlier see the new values as well
land_use_update = dict(data_map=street_lengths_data, id_vals=id_vals, to_aggregate=agg_val, land_use_groups=st_groups)
land_use_kwargs.update(land_use_update)

# update the key word arguments for the test data
test_update = dict(groups=st_groups, presence=None)
test_kwargs.update(test_update)

# the street configuration: only the total per location is used
# (length and total are True, cover is False)
kwargs = {
    'land_use_kwargs': land_use_kwargs,
    'test_kwargs': test_kwargs,
    'label': 'strasse',
    'assign_undefined': False,
    'length': True,
    'cover': False,
    'inf_operation': inf_operation,
    'regional_label': None,
    'label_map': None,
    'total': True
}

strasse = LanduseConfiguration(**kwargs)

strasse.groups_and_presence()
strasse.inference_groups()
# the grouped results, the labeled records and the configuration per location
sxf = strasse.grouped_data
sx_conf = strasse.labeled_groups
strasse_config = strasse.configuration_keys

strasse


In [6]:
# add the data from another land use layer
# the results will be added to the landcover data frame
# this layer is the "freizeitareal" map: the category column is OBJEKTART
# and the aggregated column is surface
column_rename = {"undefined":"Undefined"}
data_map_name = 'hex-3000-lake-locations-freizeitareal'
id_vals = ["location", "OBJEKTART"]
agg_val = "surface"

recreation_data = data_map[data_map_name]
recreation_data.rename(columns=column_rename, inplace=True)

# identify the columns of interest
# every category found in the layer is used
rec_groups = recreation_data.OBJEKTART.unique()
rec_presence = []

groups_and_presence = [*rec_groups, *rec_presence]

# update the key word arguments for the land use data
land_use_update = dict(data_map=recreation_data, id_vals=id_vals, to_aggregate=agg_val, land_use_groups=rec_groups)
land_use_kwargs.update(land_use_update)

# update the key word arguments for the test data
test_update = dict(groups=rec_groups, presence=None)
test_kwargs.update(test_update)

# 'length': True selects define_total_length, which totals the
# to_aggregate column (here: surface) per location
kwargs = {
    'land_use_kwargs': land_use_kwargs,
    'test_kwargs': test_kwargs,
    'label': 'recreation',
    'assign_undefined': False,
    'length': True,
    'cover': False,
    'inf_operation': inf_operation,
    'regional_label': None,
    'label_map': None,
    'total': True
}

recreation = LanduseConfiguration(**kwargs)
recreation.groups_and_presence()
recreation.inference_groups()
# the configuration of each location for this layer
rec_config = recreation.configuration_keys

recreation


In [7]:
# infrastructure layer ("nutuzungsareal" key in data_map): configure it the
# same way as the previous land use layers so its results can be joined to
# the land cover frame
column_rename = {"undefined":"Undefined"}
data_map_name = 'hex-3000-lake-locations-nutuzungsareal'
id_vals = ["location", "OBJEKTART"]
agg_val = "surface"

# the rename is applied in place on the frame stored in data_map
infrastructure_data = data_map[data_map_name]
infrastructure_data.rename(columns=column_rename, inplace=True)

# every distinct OBJEKTART value is a group; no presence-only columns
inf_groups = infrastructure_data["OBJEKTART"].unique()
inf_presence = []

groups_and_presence = list(inf_groups) + list(inf_presence)

# point the shared land-use arguments at the infrastructure layer
# NOTE(review): the same land_use_kwargs / test_kwargs dict objects are reused
# by every layer -- confirm LanduseConfiguration does not keep a reference
land_use_update = {
    'data_map': infrastructure_data,
    'id_vals': id_vals,
    'to_aggregate': agg_val,
    'land_use_groups': inf_groups,
}
land_use_kwargs.update(land_use_update)

# point the shared test arguments at the infrastructure groups
test_update = {'groups': inf_groups, 'presence': None}
test_kwargs.update(test_update)

kwargs = dict(
    land_use_kwargs=land_use_kwargs,
    test_kwargs=test_kwargs,
    label='infrastructure',
    assign_undefined=False,
    length=True,
    cover=False,
    inf_operation=inf_operation,
    regional_label=None,
    label_map=None,
    total=True,
)

infrastructure = LanduseConfiguration(**kwargs)
infrastructure.groups_and_presence()
infrastructure.inference_groups()

# per-location configuration keys of the infrastructure layer
inra_config = infrastructure.configuration_keys

infrastructure


In [8]:
# distance to intersection: keep the surveyed locations and one row per
# distinct (location, distance) pair
data_map_name = 'resources/hex-3000m/lake-locations-distance-to_intersection.csv'
intersections = (
    pd.read_csv(data_map_name)
    .loc[lambda frame: frame.location.isin(locations)]
    .drop_duplicates(["location", "distance"])
)

# sum of the distinct distances divided by their count, per location
distance_by_location = intersections.groupby("location")["distance"]
dints = distance_by_location.sum()
nints = distance_by_location.nunique()
ints_d = (dints / nints).to_frame().reset_index()

# intersection length: same treatment, one row per distinct
# (location, length) pair
river_length = (
    pd.read_csv('resources/buffer_output/intersection_length.csv')
    .loc[lambda frame: frame.location.isin(locations)]
    .drop_duplicates(["location", "length"])
)

length_by_location = river_length.groupby("location")["length"]
nlen = length_by_location.nunique()
llen = length_by_location.sum()
lengths_d = (llen / nlen).to_frame().reset_index()

# only the lengths are reshaped to long format (location, variable, value);
# the distances in ints_d are not merged in. Missing values become 0.
len_d = lengths_d.melt(id_vars="location", value_vars="length").fillna(0)

In [9]:

# intersection-length layer: a single group ("length") read from the
# long-format frame len_d, identified by location and variable
id_vals = ["location", "variable"]
agg_val = "value"
inf_groups = ["length"]
inf_presence = []

groups_and_presence = inf_groups + inf_presence

# point the shared land-use arguments at the intersection lengths
# NOTE(review): the same land_use_kwargs / test_kwargs dict objects are reused
# by every layer -- confirm LanduseConfiguration does not keep a reference
land_use_update = {
    'data_map': len_d,
    'id_vals': id_vals,
    'to_aggregate': agg_val,
    'land_use_groups': inf_groups,
}
land_use_kwargs.update(land_use_update)

# point the shared test arguments at the single "length" group
test_update = {'groups': inf_groups, 'presence': None}
test_kwargs.update(test_update)

# unlike the other layers, this one is given label_map
kwargs = dict(
    land_use_kwargs=land_use_kwargs,
    test_kwargs=test_kwargs,
    label="r-length",
    assign_undefined=False,
    length=True,
    cover=False,
    inf_operation=inf_operation,
    regional_label=None,
    label_map=label_map,
    total=True,
)

# from here on the name intersections refers to the configuration object
intersections = LanduseConfiguration(**kwargs)
intersections.groups_and_presence()
intersections.inference_groups()

# per-location configuration keys of the intersection-length layer
int_conf = intersections.configuration_keys

r-length


In [10]:
# add results for land use to land cover
# drop the conf column
fgl = nx_conf.drop("conf", axis=1)

# one column per additional layer, appended in this order; the value for a
# location is element [0][0] of that location's entry in the layer's
# configuration keys
layer_configs = {
    'infrastructure': inra_config,
    'strasse': strasse_config,
    'recreation': rec_config,
    'r-length': int_conf,
}
for layer_label, layer_config in layer_configs.items():
    # bind the current layer as a default so each lambda reads its own table
    fgl[layer_label] = fgl.location.apply(lambda x, cfg=layer_config: cfg.loc[x][0][0])
fgl = fgl.fillna(0)

# define the configuration columns: land cover groups and presence columns
# followed by the layer columns added above
conf_columns = [*lc_groups, *lc_presence, *layer_configs]
fgl['configuration'] = list(zip(*[fgl[x] for x in conf_columns]))
fgl_conf_keys = fgl[["location", "configuration"]].set_index("location")

# update the probability tables
# work on a copy: assigning nx.p_tables directly would alias it, and the
# updates below would then also rewrite the tables held by nx
fgl_ptables = nx.p_tables.copy()
for layer in (infrastructure, recreation, strasse, intersections):
    fgl_ptables.update(layer.p_tables)

In [11]:
# arguments shared by every call to inference_for_one_location; the
# location entry is a placeholder that is overwritten inside the loop
inference_kwargs = dict(
    location=" ",
    lake="",
    conf_label='configuration',
    conf_columns=conf_columns,
    p_tables=fgl_ptables,
    tested=nx.grouped_data,
    prior=None,
    drop_vals=None,
)

# the distinct survey locations recorded for lac-leman
on_leman = nx.grouped_data.lake == "lac-leman"
bsee_l = nx.grouped_data.loc[on_leman, "location"].unique()

# one inference result per location, keyed by location name
res = {}
for alocation in bsee_l:
    inference_kwargs["location"] = alocation
    p = inference_for_one_location(**inference_kwargs)
    res[alocation] = p

# show the code and threshold currently in scope, then the results
print(code, threshhold)
res
    

Gfoam 0.07


{'anarchy-beach': 0.655279488452582,
 'baby-plage-geneva': 0.2905146915249824,
 'baby-plage-ii-geneve': 0.23486529791743418,
 'bain-des-dames': 0.6501828241621083,
 'baye-de-montreux-g': 0.7478335812501131,
 'boiron': 0.6564500234067034,
 'cully-plage': 0.7343319144035644,
 'grand-clos': 0.9548329343708758,
 'la-pecherie': 0.5942529321416143,
 'lacleman_gland_lecoanets': 0.23895209557658775,
 'le-pierrier': 0.6459062121316048,
 'maladaire': 0.705474136911692,
 'oyonne': 0.6499399490718982,
 'parc-des-pierrettes': 0.642468747226554,
 'plage-de-st-sulpice': 0.3114724772227811,
 'preverenges': 0.6283591860128622,
 'preverenges-le-sout': 0.6545251586166427,
 'quai-maria-belgia': 0.5668653213328498,
 'rocky-plage': 0.3131905935493407,
 'tiger-duck-beach': 0.7776161323771168,
 'tolochenaz': 0.6670945098345765,
 'versoix': 0.18715028420355406,
 'vidy-ruines': 0.5312928333258125,
 'villa-barton': 0.47332959011791126}

In [12]:
# land cover uses the default arguments of test_and_merge
lct = nx.test_results.test_and_merge()

# street, recreation and infrastructure layers are run with cover=False,
# in that order
sct, rct, ict = [
    layer.test_results.test_and_merge(cover=False)
    for layer in (strasse, recreation, infrastructure)
]


In [13]:
# sign-off block: where and on which (local) date the notebook was last run
where = "Biel, CH"
today = dt.date.today().strftime("%d/%m/%Y")

my_block = f"""

This script updated {today} in {where}

> \u2764\ufe0f what you do everyday

*analyst at hammerdirt*
"""

# render the block as markdown in the cell output
md(my_block)



This script updated 01/05/2023 in Biel, CH

> ❤️ what you do everyday

*analyst at hammerdirt*


In [14]:
# provenance: print the git repo (-r), the git branch (-b) and the versions
# of the imported packages (--iversions) for this run
%watermark --iversions -b -r

Git repo: https://github.com/hammerdirt-analyst/landuse.git

Git branch: probability

scipy     : 1.10.1
PIL       : 9.5.0
pandas    : 2.0.0
matplotlib: 3.7.1
numpy     : 1.24.2
json      : 2.0.9
IPython   : 8.12.0
seaborn   : 0.12.2

