In [1]:
%load_ext watermark
import pandas as pd
import setvariables as conf_
import reportclass as r_class
from typing import Type, Optional, Callable
from typing import List, Dict, Union, Tuple
from sklearn.preprocessing import MinMaxScaler

In [2]:

def match_topo_attributes_to_surveys(topo_data: pd.DataFrame, survey_data: pd.DataFrame)-> Tuple[pd.DataFrame,List]:
    """
    Match topographic attributes to survey data for specific locations.

    This function takes topographic attribute data and survey data and matches them based on the unique locations (slugs).
    
    Parameters:
        topo_data (pd.DataFrame): A DataFrame containing topographic attribute data.
        survey_data (pd.DataFrame): A DataFrame containing survey data.

    Returns:
        Tuple[pd.DataFrame, List]: A tuple containing two elements:
            - A DataFrame containing topographic attribute data for locations found in both datasets.
            - A list of locations (slugs) from the survey data for which there is no matching topographic data.

    """

    locations = survey_data.slug.unique()
    available = topo_data.index
    # identify the locations that have no topo data
    no_data = [x for x in locations if x not in available]

    # take the available data and names of locations with no data
    locations_with_data = [x for x in locations if x in available]
    
    return topo_data.loc[locations_with_data], no_data

def merge_topodata_and_surveydata(topo, surveys, columns: List[str] = conf_.work_columns)-> pd.DataFrame:
    """
    Merge survey data with topographic data using location information.

    This function merges survey data with topographic data using the 'slug' column in the survey data
    and the index of the topographic data. The merged DataFrame will contain the specified columns from the survey data.

    Parameters:
        topo (pd.DataFrame): A DataFrame containing topographic data with the location index.
        surveys (pd.DataFrame): A DataFrame containing survey data with a 'slug' column for location matching.
        columns (List[str]): A list of column names to include in the merged DataFrame (default is defined in conf.work_columns).

    Returns:
        pd.DataFrame: A merged DataFrame containing the specified survey data columns and topographic data.
    """
    # merges surveys to topo using the slug column in surveys
    # and the index in topo
    return surveys[columns].merge(topo, left_on='slug', right_index=True)

def scale_a_column(df: pd.DataFrame, column_to_scale: str, column_name: str = 'length'):    

    # Calculate the minimum and maximum values in the column
    min_value = df[column_to_scale].min()
    max_value = df[column_to_scale].max()

    # Perform min-max scaling on a temp column
    df['scalex'] = (df[column_to_scale] - min_value) / (max_value - min_value)
    # reassign the value
    df[column_name] = df['scalex']
    # drop the temp
    df.drop('scalex', axis=1, inplace=True)
    return df

def group_topographic_attributes(df: pd.DataFrame, list_of_labels: List = None, locations: List = None, coi: str = 'scale')-> pd.DataFrame:
    """
    Group and aggregate topographic attributes in a DataFrame.

    This function groups and aggregates topographic attributes in the provided DataFrame. You can specify a list of labels
    to group attributes, filter locations, and choose the column of interest for aggregation.

    Parameters:
        df (pd.DataFrame): A DataFrame containing topographic attributes.
        list_of_labels (List, optional): A list of dictionaries with keys as new attribute names and values as properties to group.
        locations (List, optional): A list of locations to filter the data (default is None, no location filtering).
        coi (str, optional): The column of interest for aggregation (default is 'scale').

    Returns:
        pd.DataFrame: A DataFrame with aggregated topographic attributes based on the specified grouping and filtering.
   """
    
    if locations is not None:
        df = df.loc[df.slug.isin(locations)]    
    # list of labels is a list of dictionaries
    if list_of_labels is not None:
        for new_labels in list_of_labels:
            # the attributes, the dictionary values are 
            # properties being grouped
            attributes = list(new_labels.values())
            # the dictionary key is the new name of
            # the attributes in the list
            new_val = list(new_labels.keys())
            df.loc[df['attribute'].isin(attributes[0]), 'attribute'] = new_val[0]
    # sum the occurrences of the same attribute
    r = df.groupby(['slug','attribute'], as_index=False)[coi].sum()

    # pivot and set the index to the locations
    # have the attributes
    r = r.pivot(columns='attribute', index='slug')            
            
    return r.droplevel(0, axis=1).fillna(0)

def statistic_of_critical_value(df, 
                                df_feature_columns, 
                                df_target_column, 
                                sample_id: str = 'loc_date',
                                value_counts: bool = True,
                                average: bool = False):
    """
    Compute statistics of critical values for given data.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the data.
    df_feature_columns (list): A list of columns to be used as feature columns.
    df_target_column (list): A list of columns to be used as target columns.
    sample_id (str, optional): The column representing sample identifiers (default is 'loc_date').
    value_counts (bool, optional): If True, compute value counts as weights (default is True).
    average (bool, optional): If True, compute the median for the feature columns (default is False).

    Returns:
    pd.DataFrame: A DataFrame containing computed statistics based on the specified options.
    """
    d = pd.melt(df, value_vars=df_feature_columns, id_vars=[df_target_column, sample_id])
    
    if value_counts:
        di = d.groupby('variable', as_index=False)['value'].value_counts()
        di['weight'] = di['count']/d[sample_id].nunique()
        di = di.pivot(columns='variable', index='value', values='weight')
    if average:        
        di = d.groupby(['variable', 'value'], as_index=False)[df_target_column].median()
        di = di.pivot(columns='variable', index='value', values=df_target_column)
                
    return di

class LandUse:
    """
    A class for analyzing and transforming land use data.

    This class provides methods to analyze land cover, land use, and transportation data.
    It allows you to group attributes, scale data, and create ordinal rankings based on quantiles.

    Parameters:
        land_cover (pd.DataFrame): DataFrame containing land cover data.
        land_use (pd.DataFrame): DataFrame containing land use data.
        transportation (pd.DataFrame): DataFrame containing transportation data.
        locations (List): List of locations for filtering data.
        street_groups (List, optional): List of street groups (default is from configuration).
        land_use_groups (List, optional): List of land use groups (default is from configuration).

    Attributes:
        quantiles (List): List of quantile values for ordinal ranking.
        labels (List): List of labels corresponding to quantile groups.

    Properties:
        - land_cover: Grouped and aggregated land cover data.
        - land_use: Grouped and aggregated land use data.
        - trans: Grouped and aggregated transportation data.
        - use_of_land: Combined data with the option to scale the columns between 0 and 1.
        - ordinal_land_rank: Ordinal ranking based on quantiles for land use data.

    Example:
        land_use_data = LandUse(land_cover_data, land_use_data, transportation_data, locations)
        land_cover = land_use_data.land_cover
        land_use = land_use_data.land_use
        trans_data = land_use_data.trans(new_labels=[{'new_attr': ['attr1', 'attr2']}])
        land_rankings = land_use_data.ordinal_land_rank
    """
    street_groups = conf_.street_groups
    land_use_groups = conf_.lu_groups

    def __init__(self, land_cover, land_use, transportation, locations, street_groups=street_groups, land_use_groups=land_use_groups):
        self.lc = land_cover
        self.lu = land_use
        self.tr = transportation
        self.locations = locations
        self.lug = land_use_groups
        self.stg = street_groups
        self.quantiles = [0.0, 0.03, 0.25, 0.75, 0.97, 1.0]
        self. labels = ['lowest', 'low', 'middle', 'high', 'highest']
        
    @property
    def land_cover(self, list_of_labels=None):
        return group_topographic_attributes(self.lc, locations=self.locations, list_of_labels=list_of_labels)

    @property
    def land_use(self, new_labels=None):
        if new_labels is not None:
            return group_topographic_attributes(self.lu, locations=self.locations, list_of_labels=new_labels)
        else:
            return group_topographic_attributes(self.lu, locations=self.locations, list_of_labels=self.slug)
    
    @property
    def trans(self,new_labels=None):
        if new_labels is not None:
            return group_topographic_attributes(self.tr, locations=self.locations, list_of_labels=new_labels, coi='length')
        else:
            return group_topographic_attributes(self.tr, locations=self.locations, list_of_labels=self.stg, coi='length')

    @property
    def use_of_land(self):
        a = self.land_cover.merge(self.land_use, left_index=True, right_index=True)
        b = a.merge(self.trans, left_index=True, right_index=True)
        
        return b
    
    @property
    def ordinal_land_rank(self):
        ranked_df = self.use_of_land.copy()
        columns_to_rank = ranked_df.columns
        for column in columns_to_rank:
            label = f'{column}_ordinal_rank'
            ranked_df[label] = pd.cut(ranked_df[column], bins=self.quantiles, labels=self.labels, include_lowest=True)
            ranked_df[column] = ranked_df[label]
            ranked_df.drop(label, inplace=True, axis=1)
        return ranked_df

# Checking the assistant

This page is a reference point for testing the accuracy of the GPT assigned to accompany readers of the federal report. The GPT should reproduce the calculations on this page at any time. This includes values not in the federal report. Stakeholders will need to apply these results to their proper geographic or administrative responsibilities. The hammerdirt GPT assists in this process.

The product is a dataframe that is the combination of columns from the `ReportClass` and columns from the `LandUseClass`. The intention is to allow easy access to the magnitude of toprgaphical features within 1 500 m of the observed density for any object in the data.

```{important}

__April 17, 2023:__ The app that uses the hammerdirtGPT is in demo-form. We have abandoned the intial method of defining the prompt through the api. We are now developing a RAG application. A component of the context for the prompt is the results from users request. With this we combine the references from the federal report and any updated references that can be included.

**Changes to class definitions:** Building a RAG application means that we have to consider both the user visualisation of the report and the consumption of that data for the AI model, data_frame or array for the former, .JSON for the latter. These considerations will have a transformative effect on all the code in this module.

__PREVIOUS__

November 20, 2023: There is a known issue we are working on now. Remind the assistant to follow intsructions. Specifically in the following cases:

1. Always getting a value of zero for the median sample total
   * The GPT has specific instructions on this
2. Tells you the correct columns are not available
   * The GPT has the column names and definitions from this page

The data has a two column index, somtheing the GPT does not always recognize. An issue has been submitted [issue](https://github.com/hammerdirt-analyst/feb_2024/issues/1)
```

```{note}
The assistants role is to provide mathematical and graphical representations of the data in response to the researchers request. This often involves aggregating values at different levels, combining attributes and the like.

This page allows all users to verify that these complex transactions are happening correctly. The GPT may not use the same method to calculate the final result, but the results should be same.
```
## Default data of hammerdirt GPT:

beta version = .01

The default data for the GPT can be reproduced on the command line if the `hammerdirtgpt` package is installed:

```python
# Collecting required data to establish a report
# This includes the language maps for all the common
# abbreviations and columns or index labels.
c_l = r_class.language_maps()

# The survey data in units of pcs/m or pcs/m². The reports
# are aggregated first to the sample_id. Which means that the operations
# are the same wether using pcs/m or pcs/m².
surveys = r_class.collect_survey_data_for_report()

# The support or evnvironmental data. This includes plain text descriptions 
# of the Codes. Details for each survey location and topogrphical data
# extracted from the buffer around each survey location.
codes, beaches, land_cover, land_use, streets, river_intersect_lakes = r_class.collect_env_data_for_report()

# Add columns to survey data. The support data contains information that can be used to
# group objects or survey locations that may not be stored with the survey data. In this
# example an adiminstrative label is attached to each survey_id. The cantonal label:
survey_data = surveys.merge(beaches['canton'], left_on='slug', right_index=True, validate='many_to_one')
# survey_data = survey_data.loc[survey_data.code == 'G27'].copy()
survey_data = survey_data[survey_data.feature_name != 'aare'].copy()

# ! USER DEFINED INPUT
# Temporal and geographic boundaries.
boundaries = dict(feature_type ='l', language='fr', start_date='2015-01-01', end_date='2022-01-01')
# Make the report data and report
top_label, language, w_df, w_di = r_class.report_data(boundaries, survey_data.copy(), beaches, codes)
a_report = r_class.ReportClass(w_df, boundaries, top_label, 'fr', c_l)
w_df_locations = w_df.slug.unique()

# call the land use class on the two different location groups
m_ui = LandUse(land_cover, land_use, streets, w_df_locations)

# for the region of interest
lcui = m_ui.use_of_land.copy()
lc_sti, no_datai = match_topo_attributes_to_surveys(lcui, a_report.w_df)

# the basic work data contains the survey results and the 
# topographical data merged on the <slug> column
work_data_i = merge_topodata_and_surveydata(lc_sti, a_report.w_df)

new_names = {
    'slug':'location',
    'loc_date':'sample_id',
    'pcs_m':'pcs/m',
    'Obstanlage': "orchards",
    'Reben':'vineyards',
    'Siedl':'buildings',
    'Wald':'forest',
    'land_use':'public services'
}
gptdf = work_data_i.rename(columns=new_names)
```

The preceding code produces the following table:

In [3]:
# Collecting required data to establish a report
# This includes the language maps for all the common
# abbreviations and columns or index labels.
c_l = r_class.language_maps()

# The survey data in units of pcs/m or pcs/m². The reports
# are aggregated first to the sample_id. Which means that the operations
# are the same wether using pcs/m or pcs/m².
surveys = r_class.collect_survey_data_for_report()

# The support or evnvironmental data. This includes plain text descriptions 
# of the Codes. Details for each survey location and topogrphical data
# extracted from the buffer around each survey location.
codes, beaches, land_cover, land_use, streets, river_intersect_lakes = r_class.collect_env_data_for_report()

# Add columns to survey data. The support data contains information that can be used to
# group objects or survey locations that may not be stored with the survey data. In this
# example an adiminstrative label is attached to each survey_id. The cantonal label:
survey_data = surveys.merge(beaches['canton'], left_on='slug', right_index=True, validate='many_to_one')
# survey_data = survey_data.loc[survey_data.code == 'G27'].copy()
survey_data = survey_data[survey_data.feature_name != 'aare'].copy()

# ! USER DEFINED INPUT
# Temporal and geographic boundaries.
boundaries = dict(feature_type ='l', language='fr', start_date='2015-01-01', end_date='2022-01-01')
# Make the report data and report
top_label, language, w_df, w_di = r_class.report_data(boundaries, survey_data.copy(), beaches, codes)
a_report = r_class.ReportClass(w_df, boundaries, top_label, 'fr', c_l)
w_df_locations = w_df.slug.unique()

# call the land use class on the two different location groups
m_ui = LandUse(land_cover, land_use, streets, w_df_locations)

# for the region of interest
lcui = m_ui.use_of_land.copy()
lc_sti, no_datai = match_topo_attributes_to_surveys(lcui, a_report.w_df)

# the basic work data contains the survey results and the 
# topographical data merged on the <slug> column
work_data_i = merge_topodata_and_surveydata(lc_sti, a_report.w_df)

# column categories
geo_features = ['feature_type', 'vineyards', 'orchards', 'buildings', 'forest', 'undefined', 'public services', 'streets','parent_boundary']
admin_boundaries = ['city', 'canton', 'feature_name']
sample_variables = ['location', 'sample_id', 'date']
target_variables = ['pcs/m', 'quantity']

new_names = {'slug':'location', 'loc_date':'sample_id', 'pcs_m':'pcs/m', 'Obstanlage': "orchards", 'Reben':'vineyards', 'Siedl':'buildings', 'Wald':'forest', 'land_use':'public services'}
gptdf = work_data_i.rename(columns=new_names)

groupby = ['sample_id','location',  'date', 'feature_name', 'parent_boundary',
       'city', 'canton', 'feature_type',
       'orchards', 'vineyards', 'buildings', 'forest', 'undefined',
       'public services', 'streets', 'code']

gptdfx = gptdf.groupby(groupby, as_index=False).agg({'pcs/m': 'sum', 'quantity':'sum'})
gptdf.head()

AttributeError: 'LandUse' object has no attribute 'slug'

### Hand file to assistant

#### Add language definitions

The language definitions ensure an efficient transmission of intent from the observer to the model. We could leave the translations and definitions up to a translator and thus reduce the weight of the .csv file or API request. Howver this would generate an additional service by the client to get the requested information translated. Providing the definitions according to the standard set in the Federal report is a good baseline. If their is support amongst stakeholders to change the definitions then this can be handled by a pull request or raising an issue on the repo.

```python
gptdf['fr'] = gptdf.code.map(lambda x: codes.loc[x, 'fr'])
gptdf['en'] = gptdf.code.map(lambda x: codes.loc[x, 'en'])
gptdf['de'] = gptdf.code.map(lambda x: codes.loc[x, 'de'])

gptdf.to_csv('data/in_process/lakes.csv', index=False)
```

In [None]:
gptdfx['fr'] = gptdfx.code.map(lambda x: codes.loc[x, 'fr'])
gptdfx['en'] = gptdfx.code.map(lambda x: codes.loc[x, 'en'])
gptdfx['de'] = gptdfx.code.map(lambda x: codes.loc[x, 'de'])

gptdfx.to_csv('data/in_process/lakes.csv', index=False)

In [None]:
gptdfx.head()

In [None]:
gptdfx[gptdfx.code == 'G79']

In [None]:
gptdfx.columns

### Column names and definitions

These column names and definitions are given to the GPT assistant.

1. location: the name of the location used by people doing the survey
2. sample_id: the combination of the location and date, the unique identifier of a sampling event
3. date: the data of the sample
4. feature_name: the name of the park, lake, or river where the sample was collected
5. parent_boundary: a designated survey area, usually a river basin or regional label
6. city: the muniicpality where the sample was taken
7. canton: the canton where the sample was taken
8. pcs/m: the number of objects identified by the column _code_ collected at the sampling event divided by the length of shoreline, river bank or trail that was sampled.
9. quantity: the number of objects identified by the column _code_ collected at the sampling event
10. code: the Marine Litter Watch object code
11. feature_type: identifies the sample location as either a park, lake or river
12. orchard: % of dry land attributed to this land-use within 1'500 m of the survey location
13. vineyards: % of dry land attributed to this land-use within 1'500 m of the survey location
14. buildings: % of dry land attributed to this land-use within 1'500 m of the survey location
15. forest: % of dry land attributed to this land-use within 1'500 m of the survey location
16. undefined: % of dry land with no land-use label
17. public services: % of dry land attributed to hospitals, schools, sports, administration
18. streets: the number of meters of streets within 1 500 m of the survey location. scaled between 0 - 1.
19. fr: french code definitions
20. en: english code definitions
21. de: german code definitions

```{note}
The GPT will go through data exploration at the begining of the chat. These column defintions are given to the GPT and can be requested at any time. The definitions the GPT gives you should be very close to these definitions, it is not tell the GPT to use the provided definition in its instructions. These definitions should come back.
``` 

## Verifying the output

### Test statistics

Asking for each of these individually or telling the assistant to produce them all should yield the following results:

* the median sample total of the data frame
* the total quantity
* the number of lakes
* the number of samples
* the number of cantons
* the number of cities

In [None]:
gp_dt = gptdfx.groupby(['sample_id', *geo_features], as_index=False).agg({'pcs/m':'sum', 'quantity':'sum'})

lakes = gptdfx[gptdfx.feature_type == 'l'].feature_name.nunique()
cities = gptdfx.city.nunique()
quantity = gptdfx.quantity.sum()
samples = gptdfx.sample_id.nunique()
cantons = gptdfx.canton.nunique()
pc_med = gp_dt['pcs/m'].median()

test_1 = dict(lakes=lakes, cities=cities, quantity=quantity, samples=samples, cantons=cantons, median_pcs_m = pc_med)
print(test_1)

### Most common

The most common codes are those codes that are either in the top ten by quantity or present in at lease 50% of the surveys.

In [None]:
most_common, weight = a_report.most_common
most_common

### Aggregating samples

#### Sample total pcs/m

In [None]:
gp_dt['pcs/m'].describe()

#### Single code

cigarette ends

In [None]:
gp_dtcode = gptdfx[gptdfx.code.isin(['G27'])].groupby(['sample_id', *geo_features], as_index=False).agg({'pcs/m':'sum', 'quantity':'sum'})
gp_dtcode['pcs/m'].describe()

#### Combining codes

combining cigarette ends and snack wrappers

In [None]:
gp_dtcodes = gptdfx[gptdfx.code.isin(['G27', 'G30'])].groupby(['sample_id', *geo_features], as_index=False).agg({'pcs/m':'sum', 'quantity':'sum'})
gp_dtcodes['pcs/m'].describe()

### Single feature

the results on Bielersee

In [None]:
gp_dtbsee = gptdfx[gptdfx.feature_name == 'bielersee'].groupby(['sample_id', *geo_features], as_index=False).agg({'pcs/m':'sum', 'quantity':'sum'})
gp_dtbsee['pcs/m'].describe()

### Combined features

Bielersee and Thunersee

In [None]:
gp_dtbt = gptdfx[gptdfx.feature_name.isin(['bielersee', 'thunersee'])].groupby(['sample_id', *geo_features], as_index=False).agg({'pcs/m':'sum', 'quantity':'sum'})
gp_dtbt['pcs/m'].describe()

### Land use

Correlation matrix of the land use variables with each other

In [None]:
corrs = gp_dtbt[geo_features[1:-1]].corr()
corrs

In [None]:
%watermark -a hammerdirt-analyst -co --iversions