In [1]:
%load_ext watermark
import datetime as dt
import pandas as pd
import numpy as np
from scipy import stats


import IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display
from myst_nb import glue

# Code lookup table, indexed by the "code" column.
dfCodes = pd.read_csv("resources/codes_with_group_names_2015.csv").set_index("code")

# code -> description
code_d_map = dfCodes["description"]

# code -> material
code_m_map = dfCodes["material"]

# Aliases used in annotations and isinstance checks further down.
pdtype = pd.DataFrame
pstype = pd.Series

def scaleTheColumn(x):
    """Min-max scale ``x`` so that its smallest value is 0 and its largest is 1.

    Parameters
    ----------
    x : array-like exposing ``min``, ``max``, arithmetic and ``astype``
        (for example a pandas Series or a numpy array).

    Returns
    -------
    Same kind of object as ``x`` holding ``(x - min) / (max - min)``.
    A constant input is returned as all zeros.
    """
    xmin = x.min()
    span = x.max() - xmin

    # A constant column has no spread: dividing by the zero span would turn
    # every entry into NaN (0/0), so return the zero offsets instead.
    if span == 0:
        return (x - xmin).astype(float)

    return (x - xmin) / span

def rotateText(x):
    """Return the CSS declarations that draw a header label vertically.

    ``x`` is the label handed in by the Styler index-mapping call; it is not
    used because every label receives the same declarations.
    """
    # "margin" is the CSS property name; the previous "margins" is not a CSS
    # property, so that declaration was dropped by the browser.
    return 'writing-mode: vertical-lr; transform: rotate(-180deg);  padding:10px; margin:0; vertical-align: baseline;'

def cleanSurveyResults(data):
    """Apply the default cleaning steps to the survey records.

    Steps, in order:

    1. add ``loc_date``: a ``(location, date)`` tuple built from the raw
       (not yet parsed) ``date`` values
    2. parse ``date`` with ``pd.to_datetime``
    3. collect the codes whose ``groupname`` is "micro plastics (< 5mm)"
    4. recode 'G207' as 'G208'
    5. drop the records with a code collected in step 3
    6. drop the records where ``water_name_slug`` is 'walensee'

    Parameters
    ----------
    data : DataFrame with the columns ``location``, ``date``, ``groupname``,
        ``code`` and ``water_name_slug``.

    Returns
    -------
    A new DataFrame. ``data`` itself is left untouched.
    """
    # Work on a copy: assigning columns on the argument would silently change
    # the caller's frame (and make a second call see already-modified data).
    data = data.copy()

    # loc_date must be built before the dates are parsed so that the tuple
    # holds the original date value
    data["loc_date"] = list(zip(data["location"], data["date"]))
    data["date"] = pd.to_datetime(data["date"])

    # get rid of microplastics: the codes are collected before the recode below
    mcr = data.loc[data["groupname"] == "micro plastics (< 5mm)", "code"].unique()

    # replace the bad code
    data["code"] = data["code"].replace("G207", "G208")

    # walensee has no landuse values
    keep = ~data["code"].isin(mcr) & (data["water_name_slug"] != "walensee")

    return data[keep]

class SurveyResults:
    """Load survey records from a csv file and expose them as a DataFrame.

    The file is read when the instance is created. ``surveyResults`` applies
    the optional cleaning, the optional ``w_t`` filter and the column
    selection, caches the outcome in ``df_results`` and fills
    ``valid_codes``. ``surveyLocations`` lists the unique locations of the
    cached results.
    """

    file_name = 'resources/checked_sdata_eos_2020_21.csv'
    columns_to_keep=[
        'loc_date',
        'location',
        'river_bassin',
        'water_name_slug',
        'city',
        'w_t',
        'intersects',
        'code',
        'pcs_m',
        'quantity'
    ]

    def __init__(self, data: str = file_name, clean_data: bool = True, columns: list = columns_to_keep, w_t: str = None):
        """Read the csv file and store the options.

        Parameters
        ----------
        data : path handed to ``pd.read_csv``.
        clean_data : when True, ``cleanSurveyResults`` is applied to the data.
        columns : columns kept in the results; a falsy value keeps them all.
        w_t : when not None, only records whose ``w_t`` equals it are kept.
        """
        self.dfx = pd.read_csv(data)
        self.df_results = None
        self.locations = None
        self.valid_codes = None
        self.clean_data = clean_data
        self.columns = columns
        self.w_t = w_t

    def validCodes(self):
        """Store the unique ``code`` values of ``df_results`` in ``valid_codes``.

        Does nothing when no results are loaded or when they have no ``code``
        column.
        """
        results = self.df_results

        # The type check has to come first: asking for .columns on None (no
        # results loaded yet) would raise AttributeError.
        if not isinstance(results, pd.DataFrame) or "code" not in results.columns:
            return

        try:
            valid_codes = results["code"].unique()
        except ValueError:
            print("There was an error retrieving the unique code names, self.df_results.code.unique() failed.")
            raise
        else:
            self.valid_codes = valid_codes

    def surveyResults(self):
        """Return the prepared survey results, building them on the first call.

        Later calls return the cached ``df_results`` unchanged.
        """
        # if this method has been called already
        # return the result
        if self.df_results is not None:
            return self.df_results

        # the default data needs the cleaning step; data that is already
        # clean can be used directly
        if self.clean_data is True:
            fd = cleanSurveyResults(self.dfx)
        else:
            fd = self.dfx

        # filter the data by the variable w_t
        if self.w_t is not None:
            fd = fd[fd.w_t == self.w_t]

        # keep only the required columns
        if self.columns:
            fd = fd[self.columns]

        # assign the survey results to the class attribute
        self.df_results = fd

        # define the list of codes in this df
        self.validCodes()

        return self.df_results

    def surveyLocations(self):
        """Return the unique locations of the loaded results.

        Prints a message and returns None when ``surveyResults`` has not
        produced any results yet.
        """
        if self.locations is not None:
            return self.locations

        if self.df_results is None:
            print("There is no survey data loaded")
            return None

        # the results live in df_results; there is no dfResults attribute
        self.locations = self.df_results.location.unique()
        return self.locations

# CSS rules for the tables displayed in this notebook
header_row = dict(selector='th:nth-child(1)', props='background-color: #FFF;')
even_rows = dict(selector='tr:nth-child(even)', props='background-color: rgba(139, 69, 19, 0.08);')
odd_rows = dict(selector='tr:nth-child(odd)', props='background: #FFF;')
table_font = dict(selector='tr', props='font-size: 12px;')

# the order of the rules is the order handed to the Styler
table_css_styles = [even_rows, odd_rows, table_font, header_row]

# Load the survey data with the default file and cleaning options.
fdx = SurveyResults()
df = fdx.surveyResults()

# Locations and municipalities excluded from the analysis
# (noted by the author as the Walensee area, river samples included).
location_no_luse = [
    "linth_route9brucke",
    "seez_spennwiesenbrucke",
    "limmat_dietikon_keiserp",
    "seez"
]
city_no_luse = ["Walenstadt", "Weesen", "Glarus Nord", "Quarten"]

df = df[~df.city.isin(city_no_luse) & ~df.location.isin(location_no_luse)]

# Same criteria as the project results: codes with at least one record
# where quantity > 20.
codes = df[df.quantity > 20].code.unique()

# The river intersection data, one record per location and intersection.
ind = pd.read_csv('resources/hex-3000m-cleaned/river-intersect-length.csv')

# Attach the intersection records to the survey records.
ints_and_data = df.merge(ind, on="location", how="outer")

# The locations of interest.
locations = df.location.unique()

# New names for the map-layer columns and the feature column.
n_c_names = {
    "KLASSE": 'class',
    "NAMN": 'name',
    "BREITE": 'size',
    "water_name_slug":"feature",
}

# Keep the codes and locations of interest. The "intersects" column comes
# from old map data and is dropped; missing values are replaced by 0.
data = (
    ints_and_data[ints_and_data.code.isin(codes) & ints_and_data.location.isin(locations)]
    .drop(columns="intersects")
    .rename(columns=n_c_names)
    .fillna(0)
)

def collectCorrelation(data, codes, columns):
    """Run Spearman's rank correlation of ``pcs_m`` against each column, per code.

    Parameters
    ----------
    data : DataFrame with a ``code`` column, a ``pcs_m`` column and every
        column named in ``columns``.
    codes : iterable of code values; the records of each code are tested
        separately.
    columns : iterable of column names tested against ``pcs_m``.

    Returns
    -------
    list of dict, one per (code, column) pair in iteration order, with the
    keys "code", "variable", "rho" and "p".
    """
    collected = []
    for code in codes:
        subset = data.loc[data["code"] == code]
        densities = subset["pcs_m"].to_numpy()
        for name in columns:
            rho, p_value = stats.spearmanr(densities, subset[name].to_numpy())
            collected.append(dict(code=code, variable=name, rho=rho, p=p_value))
    return collected

def resultsDf(rhovals: pd.DataFrame = None, pvals: pd.DataFrame = None) -> list:
    """Zero the coefficients that are not significant and return them row by row.

    Rows are matched by position, columns by label. Every entry of
    ``rhovals`` whose p-value is greater than 0.05 is set to 0.

    Side effect
    -----------
    ``rhovals`` is modified in place. The previous implementation did this
    implicitly, by writing into the row returned by ``rhovals.iloc[i]``; that
    only reaches the frame when pandas hands back a view, and never under
    copy-on-write. The cells of this notebook pass the same ``rhovals`` to
    ``countTheNumberOfCorrelationsPerBuffer`` afterwards, so the in-place
    update is kept but made explicit.

    Parameters
    ----------
    rhovals : DataFrame of correlation coefficients.
    pvals : DataFrame of the corresponding p-values, rows in the same order.

    Returns
    -------
    list of Series, one per row of ``pvals``; each is an independent copy of
    the matching (masked) row of ``rhovals``. ``pd.DataFrame(result)``
    rebuilds the table.
    """
    results_df = []
    for i in range(len(pvals.index)):
        arow_of_ps = pvals.iloc[i]
        failed_labels = arow_of_ps[arow_of_ps > 0.05].index

        if len(failed_labels) > 0:
            # write through the frame itself instead of through a row slice
            failed_positions = [rhovals.columns.get_loc(label) for label in failed_labels]
            rhovals.iloc[i, failed_positions] = 0

        results_df.append(rhovals.iloc[i].copy())

    return results_df

def styleBufferResults(buffer_results):
    """Return a Styler that displays the correlation table.

    The values are shown with two decimals, the notebook table CSS is
    applied, the cells are shaded with the "coolwarm" colour map over the
    range of the whole table and the column labels are styled by
    ``rotateText``.

    The name of the column index of ``buffer_results`` is removed in place.
    """
    buffer_results.columns.name = None

    # one colour scale for the whole table
    lowest = buffer_results.min().min()
    highest = buffer_results.max().max()

    styled = buffer_results.style.format(precision=2)
    styled = styled.set_table_styles(table_css_styles)
    styled = styled.background_gradient(axis=None, vmin=lowest, vmax=highest, cmap="coolwarm")

    return styled.applymap_index(rotateText, axis=1)

def countTheNumberOfCorrelationsPerBuffer(pvals: pd.DataFrame = None, rhovals: pd.DataFrame = None) -> (pd.DataFrame, pd.Series):
    """Count the significant correlations per tested variable.

    A result is significant when p <= 0.05. Positive and negative results
    are counted among the significant ones only, so the counts do not depend
    on ``rhovals`` having been zeroed beforehand and
    positive + negative never exceeds correlated.

    Parameters
    ----------
    pvals : DataFrame of p-values, one column per tested variable.
    rhovals : DataFrame of coefficients with the same rows and columns.

    Returns
    -------
    ncorrelated : DataFrame with the rows "correlated", "positive" and
        "negative", one column per tested variable and a "total" column.
    totals : the "total" column of ``ncorrelated``.
    """
    significant = pvals <= 0.05

    # the number of times p <= 0.05
    number_p_less_than = significant.sum()
    number_p_less_than.name = "correlated"

    # the number of significant positive correlations
    number_pos = ((rhovals > 0) & significant).sum()
    number_pos.name = "positive"

    # the number of significant negative correlations
    number_neg = ((rhovals < 0) & significant).sum()
    number_neg.name = "negative"

    ncorrelated = pd.DataFrame([number_p_less_than, number_pos, number_neg])
    ncorrelated["total"] = ncorrelated.sum(axis=1)
    totals = ncorrelated.total

    return ncorrelated, totals

(distancetoriver)=
# River discharge and lake intersections

In the initial report and in the project-results the influence of river inputs was quantified by the number of river intersects within 1500 m of a survey location. With this method 13 possible correlations were identified, 11 positive and two negative. This method does not take into account the distance to the intersection or the length of the river section within the 1500 m buffer, nor does it consider the size of the inputs.

Here we consider the distance, the length, the size and the class of each river within 1 500 m of the survey location.  

```{figure} resources/images/stream_length_buffer_land_use.jpeg
---
name: dist_to_int
---
` `
```
{numref}`figure %s: <dist_to_int>` Measuring the distance to the intersection and length of the intersection in the buffer. Location: grand-clos, St. Gingolph - Lac Léman.

## Extracting the values from the map layer

The map layers that are publicly available have changed since the land-use attributes were originally considered for the project national survey. At that time we did not consider the length or distance. The size and the class of each river was not indicated on the previous map layers either. All that has changed:

1. There are fewer rivers and streams in the new map layers
2. Each river (section) is labeled with the size, class and name, and is designated as man-made or natural.

To extract the required data for the analysis for each location and river the following steps were followed:

1. Identify locations of interest
2. Construct a buffer around each point
3. Mark the intersection of the river with the buffer and the lake
4. Calculate the length of the river section that traverses the buffer
5. Calculate the straight line distance from the survey location to the point where the river leaves the buffer and enters the lake

Most locations have more than one intersection, which means that the survey result for a code is considered under all the possible conditions for each location. The results from St. Gingolph illustrate this:

In [2]:
# One survey (location and date) and one code: one row per intersection record.
data.loc[
    (data.location == 'grand-clos')
    & (data.code == "Gfrags")
    & (data.loc_date == ("grand-clos", "2020-05-07"))
].head().style.set_table_styles(table_css_styles)

Unnamed: 0,loc_date,location,river_bassin,feature,city,w_t,code,pcs_m,quantity,name,size,class,distance,length
32125,"('grand-clos', '2020-05-07')",grand-clos,rhone,lac-leman,Saint-Gingolph,l,Gfrags,2.56,82.0,Canal Stockalper,18.0,8.0,1114.305677,498.263964
32126,"('grand-clos', '2020-05-07')",grand-clos,rhone,lac-leman,Saint-Gingolph,l,Gfrags,2.56,82.0,Le Rhône,1.0,4.0,1372.531919,475.233563
32127,"('grand-clos', '2020-05-07')",grand-clos,rhone,lac-leman,Saint-Gingolph,l,Gfrags,2.56,82.0,Le Tové,10.0,10.0,632.923894,1021.844171


## Testing the impacts of river inputs

The method of aggregation leaves a few options for testing the impact of river inputs. Recall that there are river and lake side samples, the sampling conditions are not homogenous and the results differ both in magnitude of pcs/m and the variety of objects found, see [IQAASL -lakes and rivers](https://hammerdirt-analyst.github.io/IQAASL-End-0f-Sampling-2021/lakes_rivers.html) and compare the median pcs/m in the national results.

1. River and lake samples can be tested together
2. River and lake samples can be tested separately
3. The distance and length parameters can be aggregated
   1. average distance to intersection per location
   2. total river network length per location
   3. Rivers and lake samples tested together

### Lake surveys

As illustrated above, some locations have multiple intersections within the 1500 m buffer and some have none. Lake locations that have no intersections within the buffer are given a distance of 0 and a length of 0. The class and size are both 20, indicating the smallest class and river size possible (example below). The name of the river intersection is given; if there are no intersections within a 1500 m buffer then name=None. Removing the records with name=None eliminates all lake records with no intersection data. For rivers, however, the intersection name is the name of the feature that was sampled.

In [3]:
# A lake survey whose record carries distance 0 and length 0.
data.loc[
    (data.location == 'versoix')
    & (data.code == "Gfrags")
    & (data.loc_date == ("versoix", "2020-10-13"))
].head().style.set_table_styles(table_css_styles)

Unnamed: 0,loc_date,location,river_bassin,feature,city,w_t,code,pcs_m,quantity,name,size,class,distance,length
108961,"('versoix', '2020-10-13')",versoix,rhone,lac-leman,Versoix,l,Gfrags,0.68,35.0,0,20.0,20.0,0.0,0.0


### River surveys

The intersection of one river with another was not considered. Therefore, for river surveys the measured parameter is the length, within the buffer, of the river that was sampled. The class and size are set to 20 and the distance to intersection is 0. River sample locations, for this analysis, are treated the same as lake locations with no corresponding intersections.

In [4]:
# A river survey: the record carries the length of the sampled river.
data.loc[
    (data.location == 'schusspark-strand')
    & (data.code == "Gfrags")
    & (data.loc_date == ("schusspark-strand", "2020-08-08"))
].style.set_table_styles(table_css_styles)

Unnamed: 0,loc_date,location,river_bassin,feature,city,w_t,code,pcs_m,quantity,name,size,class,distance,length
71862,"('schusspark-strand', '2020-08-08')",schusspark-strand,aare,schuss,Biel/Bienne,r,Gfrags,0.06,3.0,La Suze,20.0,20.0,0.0,3248.0


### Description of the survey results

In [5]:
## The survey results
locations = df.location.unique()
samples = df.loc_date.unique()

# Number of distinct samples (loc_date) per water type. nunique() gives 0
# when a water type has no records, where taking value_counts().values[0]
# raised IndexError.
lakes = df.loc[df.w_t == "l", "loc_date"].nunique()
rivers = df.loc[df.w_t == "r", "loc_date"].nunique()

# codes found at least once versus all the codes in the records
codes_identified = df[df.quantity > 0].code.unique()
codes_possible = df.code.unique()
total_id = df.quantity.sum()

data_summary = {
    "n locations": len(locations),
    "n samples": len(samples),
    "n lake samples": lakes,
    "n river samples": rivers,
    "n identified object types": len(codes_identified),
    "n possible object types": len(codes_possible),
    "total number of objects": total_id
}

pd.DataFrame(index = data_summary.keys(), data=data_summary.values(), columns=["total"]).style.set_table_styles(table_css_styles)

Unnamed: 0,total
n locations,128
n samples,349
n lake samples,300
n river samples,49
n identified object types,191
n possible object types,211
total number of objects,46832


### River and lake samples

The survey results for both rivers and lakes are tested for correlation. Each intersection is tested once. Consider the example from St. Gingolph: under this model each result is tested against the distance and length of each intersection for each location; this includes the size and the class.

__River and lake samples not aggregated__

In [6]:
# the columns being tested against pcs_m
columns = ["distance", "length", "size", "class"]

# one record per (code, column) pair with rho and p
corellation_results = collectCorrelation(data, codes, columns)
crp = pd.DataFrame(corellation_results)

# one table per statistic: codes as rows, tested columns as columns
pvals, rhovals = (
    crp.pivot(index="code", columns="variable", values=statistic)
    for statistic in ("p", "rho")
)

# display the results, labelled with the code descriptions
a_df = pd.DataFrame(resultsDf(rhovals, pvals))
a_df.index = [code_d_map.loc[code] for code in a_df.index]
styleBufferResults(a_df)

Unnamed: 0,class,distance,length,size
Food containers single use foamed or plastic,-0.14,0.19,-0.11,-0.11
"Clothing, towels & rags",0.0,0.0,0.09,0.0
Paper packaging,0.0,0.0,0.0,0.0
Fireworks paper tubes and fragments,0.0,0.0,0.0,0.0
"Foil wrappers, aluminum foil",-0.23,0.0,0.0,-0.26
"Metal bottle caps, lids & pull tabs from cans",-0.11,0.1,0.0,-0.14
"Glass drink bottles, pieces",0.0,0.0,0.0,0.0
"Construction material; bricks, pipes, cement",0.0,0.0,0.0,0.0
Glass or ceramic fragments > 2.5 cm,0.0,0.0,0.0,0.0
Drink lids,-0.13,0.2,0.0,-0.1


#### The number of correlations

In [7]:
# Tally per tested column. pvals and rhovals are the objects built in the
# previous cell; rhovals was also passed to resultsDf there.
# NOTE(review): the positive/negative counts appear to rely on resultsDf
# having zeroed the non-significant entries of rhovals - confirm.
ncorrelated, total = countTheNumberOfCorrelationsPerBuffer(pvals, rhovals)
ncorrelated.style.set_table_styles(table_css_styles)

variable,class,distance,length,size,total
correlated,24,21,10,21,76
positive,1,21,2,0,24
negative,23,0,8,21,52


__Notes:__

__inverse correlations__ The __size__ and __class__ rankings start at zero, zero is the biggest. As an example the Aare and Rhône rivers are class 4. The correlations (if there are any) should be interpreted as 1/x:

1. If the correlation is negative: the density is greater as the size gets bigger or the river is more important
2. If the correlation is positive: the density is greater as the size gets smaller or the river is less important

Therefore, objects that are positively correlated with size and class were found more often at the intersects of smaller and less important rivers.

__Lake samples__

The survey results for lakes are tested for correlation, river samples are excluded. Each intersection is tested once. The same as the previous model.

In [8]:
# lake samples only
datax = data[data.w_t == "l"]
columns = ["distance", "length", "size", "class"]

# one record per (code, column) pair with rho and p
corellation_results = collectCorrelation(datax, codes, columns)
crp = pd.DataFrame(corellation_results)

# one table per statistic: codes as rows, tested columns as columns
pvals, rhovals = (
    crp.pivot(index="code", columns="variable", values=statistic)
    for statistic in ("p", "rho")
)

# display the results, labelled with the code descriptions
a_df = pd.DataFrame(resultsDf(rhovals, pvals))
a_df.index = [code_d_map.loc[code] for code in a_df.index]
styleBufferResults(a_df)

Unnamed: 0,class,distance,length,size
Food containers single use foamed or plastic,0.0,0.11,0.0,0.0
"Clothing, towels & rags",0.0,0.0,0.0,0.0
Paper packaging,0.0,0.0,0.0,0.0
Fireworks paper tubes and fragments,0.0,0.0,0.0,0.0
"Foil wrappers, aluminum foil",-0.19,0.0,0.09,-0.23
"Metal bottle caps, lids & pull tabs from cans",0.0,0.0,0.11,-0.11
"Glass drink bottles, pieces",0.12,-0.13,0.11,0.0
"Construction material; bricks, pipes, cement",0.09,0.0,0.0,0.09
Glass or ceramic fragments > 2.5 cm,0.0,0.0,0.0,0.0
Drink lids,0.0,0.13,0.0,0.0


#### The number of correlations

In [9]:
# Tally per tested column for the lake samples; pvals and rhovals come from
# the previous cell.
ncorrelated, total = countTheNumberOfCorrelationsPerBuffer(pvals, rhovals)
ncorrelated.style.set_table_styles(table_css_styles)

variable,class,distance,length,size,total
correlated,15,13,11,11,50
positive,3,12,11,1,27
negative,12,1,0,10,23


### Aggregated intersection and length

The length and distance parameters are aggregated in the following way:

1. length: the sum of the length of all intersections in the buffer
2. distance: the average distance to an intersection

Because we are aggregating, the size and class values for each intersection are not considered. The example for St. Gingolph is below.

In [10]:
# One record per location: the mean distance and the summed length of its
# intersection records.
intersects_agg = ind.groupby(["location"], as_index=False).agg({'distance':'mean', 'length':'sum'})

# Attach the aggregated values to the survey records.
ints_and_data = df.merge(intersects_agg, on="location")
locations = df.location.unique()

# Same selection as for the non-aggregated data; note that `data` is rebound
# here and the cells below use the aggregated version.
data = (
    ints_and_data[ints_and_data.code.isin(codes) & ints_and_data.location.isin(locations)]
    .fillna(0)
    .rename(columns=n_c_names)
    .drop(columns="intersects")
)

data.loc[
    (data.location == 'grand-clos')
    & (data.code == "Gfrags")
    & (data.loc_date == ("grand-clos", "2020-05-07"))
]

Unnamed: 0,loc_date,location,river_bassin,feature,city,w_t,code,pcs_m,quantity,distance,length
17351,"(grand-clos, 2020-05-07)",grand-clos,rhone,lac-leman,Saint-Gingolph,l,Gfrags,2.56,82,1039.920497,1995.341698


__River and lake samples aggregated__

In [11]:
# only the aggregated columns are tested
columns = ["distance", "length"]

# one record per (code, column) pair with rho and p
corellation_results = collectCorrelation(data, codes, columns)
crp = pd.DataFrame(corellation_results)

# one table per statistic: codes as rows, tested columns as columns
pvals, rhovals = (
    crp.pivot(index="code", columns="variable", values=statistic)
    for statistic in ("p", "rho")
)

# display the results, labelled with the code descriptions
a_df = pd.DataFrame(resultsDf(rhovals, pvals))
a_df.index = [code_d_map.loc[code] for code in a_df.index]
styleBufferResults(a_df)

Unnamed: 0,distance,length
Food containers single use foamed or plastic,0.21,0.0
"Clothing, towels & rags",0.0,0.0
Paper packaging,0.0,0.0
Fireworks paper tubes and fragments,0.0,0.0
"Foil wrappers, aluminum foil",0.11,0.0
"Metal bottle caps, lids & pull tabs from cans",0.19,0.0
"Glass drink bottles, pieces",0.0,0.0
"Construction material; bricks, pipes, cement",0.0,0.0
Glass or ceramic fragments > 2.5 cm,0.0,-0.11
Drink lids,0.22,0.0


#### The number of correlations

In [12]:
# Tally per tested column for the aggregated data; pvals and rhovals come
# from the previous cell.
ncorrelated, total = countTheNumberOfCorrelationsPerBuffer(pvals, rhovals)
ncorrelated.style.set_table_styles(table_css_styles)

variable,distance,length,total
correlated,21,4,25
positive,21,3,24
negative,0,1,1


__Lake samples aggregated__

In [13]:
# lake samples only; `columns` is the list defined for the aggregated test above
datax = data[data.w_t == 'l']

# one record per (code, column) pair with rho and p
corellation_results = collectCorrelation(datax, codes, columns)
crp = pd.DataFrame(corellation_results)

# one table per statistic: codes as rows, tested columns as columns
pvals, rhovals = (
    crp.pivot(index="code", columns="variable", values=statistic)
    for statistic in ("p", "rho")
)

# display the results, labelled with the code descriptions
a_df = pd.DataFrame(resultsDf(rhovals, pvals))
a_df.index = [code_d_map.loc[code] for code in a_df.index]
styleBufferResults(a_df)

Unnamed: 0,distance,length
Food containers single use foamed or plastic,0.12,0.14
"Clothing, towels & rags",0.0,0.0
Paper packaging,0.0,0.0
Fireworks paper tubes and fragments,0.0,0.0
"Foil wrappers, aluminum foil",0.0,0.18
"Metal bottle caps, lids & pull tabs from cans",0.13,0.0
"Glass drink bottles, pieces",-0.2,0.0
"Construction material; bricks, pipes, cement",0.0,0.0
Glass or ceramic fragments > 2.5 cm,0.0,-0.09
Drink lids,0.13,0.23


#### The number of correlations

In [14]:
# Tally per tested column for the aggregated lake samples; pvals and rhovals
# come from the previous cell.
ncorrelated, total = countTheNumberOfCorrelationsPerBuffer(pvals, rhovals)
ncorrelated.style.set_table_styles(table_css_styles)

variable,distance,length,total
correlated,11,23,34
positive,10,22,32
negative,1,1,2


In [15]:
# Footer: the date the notebook was run and where.
where = "Biel, CH"
today = dt.date.today().strftime("%d/%m/%Y")

my_block = f"""

This script updated {today} in {where}

> \u2764\ufe0f what you do everyday

*analyst at hammerdirt*
"""

md(my_block)



This script updated 09/05/2023 in Biel, CH

> ❤️ what you do everyday

*analyst at hammerdirt*


In [16]:
%watermark --iversions -b -r

Git repo: https://github.com/hammerdirt-analyst/landuse.git

Git branch: main

PIL    : 9.5.0
pandas : 2.0.0
IPython: 8.12.0
numpy  : 1.24.2
scipy  : 1.10.1

