In [None]:
import sys, os
cwd=os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, cwd)
import utils.s3_utils as s3
import pandas as pd
import shapefile as shp
import geopandas as gpd
import re
import matplotlib.pyplot as plt
import numpy as np
import folium
import utils.processing as pr
import os 
import rioxarray as rxr
import xarray as xr
from pyproj import Geod
import geopandas as gpd
import config.paths as path
from sklearn.metrics.pairwise import euclidean_distances
pd.set_option('display.max_columns', None)

In [None]:
# Region codes to process; each code selects raster_region_{code}.tif and names the output parquet.
region_list = [0,1,2]
# Model identifier from the project path config; used as a folder name under ../../data/.
modelname = path.model_path

### This section combines predictor data by region

In [None]:
# Build a lookup from landcover raster code to its EUNIS level-2 ecosystem label.
env_db_path = '../../data/raw-data/landcover-EEA/eea_r_3035_100_m_ecosystem-types-terrestrial-c_p_2012_v03_r01/mapping-dict-eea_r_3035_100_m_etm-terrestrial-c_2012_v3-1_r00.tif.vat.csv'
env_column_names = {
    'Value,N,10,0': 'env_code',
    'EUNIS_L2,C,254': 'env_type',
}
env_db = pd.read_csv(env_db_path)
env_db = env_db.rename(columns=env_column_names)
# Pair each code with its label; a later duplicate code overrides an earlier one.
env_map = dict(zip(env_db['env_code'], env_db['env_type']))
env_map

In [None]:
### Newer method - sjoins each weather data in piecemeal so we don't have missing data from outer joining all climate variables
### Loop through country list to merge landcover, elevation, reservoir, weather data (in order)

# NODATA marker that the raster-derived columns are compared against below.
NODATA = -9999
# Margin (in coordinate units) added around the raster extent when clipping weather points,
# so the nearest weather stations just outside the borders are still available.
CLIP_BUFFER = .5
# band_N variable -> column name for the combined region raster.
BAND_COLUMNS = {
    'x':'lon_env',
    'y':'lat_env',
    'band_1':'Apodemus_flavicollis',
    'band_2':'Apodemus_sylvaticus',
    'band_3':'Cervus_elaphus',
    'band_4':'Dama_dama',
    'band_5':'Microtus_subterraneus',
    'band_6':'Myodes_glareolus',
    'band_7':'landcover',
    'band_8':'elev',
    'band_9':'red_deer',
    'band_10':'roe_deer'
}
# Columns where NODATA is replaced by 0 instead of dropping the row (deer + reservoir layers).
ZERO_FILL_COLUMNS = [
    'red_deer', 'roe_deer',
    'Apodemus_flavicollis', 'Apodemus_sylvaticus', 'Cervus_elaphus',
    'Dama_dama', 'Microtus_subterraneus', 'Myodes_glareolus',
]

for region_code in region_list:
    print(f'Beginning to process {region_code} predictors')

    ### Folding in Raster data ###
    print(f'Pulling in raster data for {region_code}')

    fp = f'../../data/{modelname}/raster-combined/eea-.1degclim-deer-reservoir-processed/clustered/raster_region_{region_code}.tif' #local
    ds = rxr.open_rasterio(fp, band_as_variable=True)

    env = ds.to_dataframe().reset_index()
    env = env.rename(BAND_COLUMNS, axis=1)
    env = env.drop(['spatial_ref'],axis=1)
    del ds
    print(f'Shape of {region_code} env raster df before trimming: {env.shape}')

    #remove -9999 (NODATA from raster proc) from landcover
    # .copy() so the column assignments below write to an independent frame, not a slice view
    env = env[env.landcover != NODATA].copy()

    # Raises KeyError on a landcover code missing from env_map (kept deliberately loud).
    env['cat'] = [env_map[int(i)] for i in env['landcover']]

    #remove elevation NODATA values
    env = env[env['elev'] != NODATA].copy()

    #set deer and reservoir nodata variables to 0.
    for col in ZERO_FILL_COLUMNS:
        env.loc[env[col] == NODATA, col] = 0

    #create env_gdf
    env_gdf = gpd.GeoDataFrame(
        env, geometry=gpd.points_from_xy(env['lon_env'], env['lat_env']))

    print(f'Shape of {region_code} env raster df after trimming and creating gdf from df: {env.shape}')
    del env

    #get env range to clip weather+res data, plus some buffer to include nearest weather stations outside of borders
    lat_min = env_gdf['lat_env'].min()
    lon_min = env_gdf['lon_env'].min()
    lat_max = env_gdf['lat_env'].max()
    lon_max = env_gdf['lon_env'].max()

    env_lat_range = [lat_min-CLIP_BUFFER, lat_max+CLIP_BUFFER]
    env_lon_range = [lon_min-CLIP_BUFFER, lon_max+CLIP_BUFFER]
    print(f'weather clipping range from elevation/landcover is: {env_lat_range}, {env_lon_range}')

    ### Use weather data from earlier and merge in
    print('Merging in weather data')
    fp = f'../../data/{modelname}/feature_engineering/0.10deg/'
    # Skip hidden files (e.g. .DS_Store); sorted so the column order is stable between runs.
    feature_df_list = sorted([f for f in os.listdir(fp) if not f.startswith('.')])

    final_pred = env_gdf.copy()
    for fn in feature_df_list:
        filename = fp+fn
        print(f'Merging in {filename}')
        main_df = pd.read_parquet(filename, engine='pyarrow')

        # Aggregate the feature to one mean value per weather grid point.
        # The value column is taken positionally (4th column) - assumes every feature
        # file shares that layout; TODO confirm against the feature-engineering step.
        col_var = main_df.columns[3]
        p = fn[0:3]  # first three characters of the file name prefix the output column
        aggtype = 'grp-mean-'
        df = main_df.groupby(['longitude','latitude']).agg({
            col_var : 'mean'
            }).reset_index()
        df.columns = ['longitude','latitude', p+aggtype+col_var]

        #clip weather data to the buffered raster extent (bounds inclusive)
        in_extent = (
            df['latitude'].between(env_lat_range[0], env_lat_range[1])
            & df['longitude'].between(env_lon_range[0], env_lon_range[1])
        )
        cntry_clim = df[in_extent].copy()

        clim_gdf = gpd.GeoDataFrame(
            cntry_clim,geometry=gpd.points_from_xy(cntry_clim['longitude'], cntry_clim['latitude']))

        # join country climate gdf to env raster variables
        # NOTE(review): exclusive=True skips a weather point whose geometry equals the
        # raster point, so an exactly coincident station is not used - confirm intended.
        final_pred = final_pred.sjoin_nearest(clim_gdf,exclusive=True)
        final_pred = final_pred.drop(['index_right','latitude','longitude'],axis=1)
        # sjoin_nearest returns one row per equally-near neighbour; keep a single row per
        # raster cell so the frame does not grow with every weather file joined.
        final_pred = final_pred[~final_pred.index.duplicated(keep='first')]

        del main_df, df, cntry_clim, clim_gdf

    print('joining nuts 3 cntr_code and nuts_id/name')
    final_pred = pr.nuts_join_regions(final_pred)
    print('done joining nuts3')

    print(f'Shape of {region_code} finalpred df after adding climate data: {final_pred.shape}')

    print('writing parquet file to directory')
    write_filename = f'../../data/{modelname}/processed-predictor-parquets/clustered/{region_code}-predictors.parquet'
    # Make sure the target folder exists so a fresh checkout can run end to end.
    os.makedirs(os.path.dirname(write_filename), exist_ok=True)
    final_pred.to_parquet(write_filename, compression ='snappy')
    print(f'{region_code} predictors complete')
    del final_pred
    print('final_pred df deleted')
    print('***********************************************')
    print(' ')

### Tick Training Data (sample with data) + Pseudoabsence Processing Data

Combine covariates and tick foci data into a training data set for sklearn models.
Combine covariates and random pseudoabsence points into a pseudoabsence training data set for sklearn models.

Uses buffered data for training data and uses unbuffered data to return pseudoabsence by country. Buffered means the NUTS country shapefile was expanded outside of the country's borders to include islands, etc. Unbuffered means we used the NUTS shapefile as is when cutting out covariates. If we use Buffered data for pseudoabsence, it leads to double sampling of regions where buffered region data overlap between bordering countries. 

You can scale the number of pseudoabsence points by a factor given by the **scaler** parameter. 

In [None]:
# Root folder for the per-region training and pseudoabsence CSV outputs written below.
trainingdata_path = f'../../data/{modelname}/training/final/clustered/'
# Region codes iterated by the training / pseudoabsence loop.
region_code_list = [0,1,2]

In [None]:
#requires all data to be present (in predictors data folder)
cov_dict = path.cov_dict


#Total Region area by predictor length
# Row count of each region's predictor parquet, used as a proxy for region area when
# sharing out pseudoabsence points. Iterates region_code_list so the keys match the
# loop that later indexes this dict.
region_area_dict = {}
for region_code in region_code_list:
    read_filename =  f'../../data/{modelname}/processed-predictor-parquets/clustered/{region_code}-predictors.parquet'
    # Only the row count is needed, so load a single column instead of the whole table.
    region_rows = pd.read_parquet(read_filename, columns=['lat_env'])
    region_area_dict[region_code] = len(region_rows)
    del region_rows



## Current method: batch by kmeans clusters


In [None]:
# Total pseudoabsence points shared out across regions in proportion to predictor row counts.
TOTAL_PSEUDOABSENCE_POINTS = 10000
# Fixed seed so the sampled pseudoabsence points are reproducible between runs.
PSEUDOABSENCE_SEED = 42

#grab processed tick data (same file for every region, so it is read once)
path_to_read_file=f'../../data/{modelname}/processed-master-database/'
file_name_microfoci = 'cluster_df.csv'
all_tick_df = pd.read_csv(path_to_read_file+file_name_microfoci)
total_region_area = sum(region_area_dict.values())

for region_code in region_code_list:

    ### Combine tick foci data with covariates by country, create pseudoabsence points. Save both files by country.
    #read in processed buffered country predictor data
    read_filename =  f'../../data/{modelname}/processed-predictor-parquets/clustered/{region_code}-predictors.parquet'
    pred_df = pd.read_parquet(read_filename)
    # Rebuild point geometry from the stored coordinates (replaces the geometry column read from disk).
    pred_df = gpd.GeoDataFrame(
        pred_df, geometry=gpd.points_from_xy(pred_df['lon_env'], pred_df['lat_env']))
    # NUTS columns are dropped here; the tick data supplies its own for the training join.
    pred_df = pred_df.drop(['nuts_id','nuts_name','levl_code','cntr_code'],axis=1)
    print(f'region - {region_code}')
    print(path_to_read_file)

    tick_df = all_tick_df[all_tick_df['region']==region_code]
    # .copy() so the GeoDataFrame owns its data rather than wrapping a slice of tick_df
    tdf = tick_df[['presence', 'longitude','latitude','country_code','nuts_id','nuts_name','region']].copy() #presence and obs_type seem to matchup
    tgdf = gpd.GeoDataFrame(
        tdf, geometry=gpd.points_from_xy(tdf['longitude'], tdf['latitude']))

    #join tick and predictor data
    joined = tgdf.sjoin_nearest(pred_df, how='left')
    # Labels of every predictor cell that was the nearest neighbour of some tick point.
    matched_cells = joined['index_right'].dropna().unique()
    train_df = joined.drop_duplicates(['latitude','longitude'])
    training = train_df.drop(['geometry','index_right'],axis=1).reset_index(drop=True)

    #output training data
    output_region = f'region_{region_code}/'
    training_filename = trainingdata_path + output_region + f'training_data/{region_code}-training-data.csv'
    os.makedirs(os.path.dirname(training_filename), exist_ok=True)
    training.to_csv(training_filename)

    ### pseudoabsence data section
    print(f'Beginning pseudoabsence point df creation for region-{region_code}.')
    #get subset of env data without tick data points
    # The joined frame keeps the tick point geometry, so comparing geometries would only
    # exclude cells whose centre coincides exactly with a tick point. Exclude by the
    # matched predictor index instead. .copy() avoids writing 'presence' onto a slice.
    pseudoabsence = pred_df.loc[~pred_df.index.isin(matched_cells)].copy()

    #sample proportion of pseudoabsence points relative to whole area under study
    num_pseudo_pts = round(TOTAL_PSEUDOABSENCE_POINTS * region_area_dict[region_code] / total_region_area)
    # Never request more rows than exist; sample() without replacement would raise.
    prop_value = min(num_pseudo_pts, len(pseudoabsence))
    print(f'Number of Pseudoabsence for region {region_code}: {num_pseudo_pts}')
    if prop_value < num_pseudo_pts:
        print(f'Only {prop_value} candidate cells available for region {region_code}; sampling all of them.')

    #Sample covariates now using imported proportion
    pseudoabsence['presence'] = 0
    pseudoabsence_sample = pseudoabsence.sample(prop_value, random_state=PSEUDOABSENCE_SEED)

    # Top up with one point for each landcover category the random draw missed, so every
    # category present in the region appears at least once where a candidate cell exists.
    missing_categories = set(pred_df['cat']).difference(set(pseudoabsence_sample['cat']))
    strat_samples = []
    for landcover in sorted(missing_categories, key=str):
        candidates = pseudoabsence[pseudoabsence['cat']==landcover]
        if candidates.empty:
            # Category only occurs in tick-matched cells; nothing to sample from.
            continue
        strat_samples.append(candidates.sample(1, random_state=PSEUDOABSENCE_SEED))
    pseudoabsence_sample = pd.concat([pseudoabsence_sample, *strat_samples])

    ### map nuts and admin localities to pseudoabsence points
    print('Mapping Nuts locations to pseudoabsence points')
    pseudoabsence_sample['region']=region_code
    pseudoabsence_final = pr.nuts_join_regions(pseudoabsence_sample).drop(['levl_code'],axis=1)
    pseudoabsence_final = pseudoabsence_final.rename({
                'tg-grp-mean-days-above-5degc-monthly-ratio':'tg-grp-mean-days-above-5degC-monthly-ratio',
                }, axis=1)  #rename the lowercase col that happened in the pr.nuts_join func)

    # Save the sampled pseudoabsence points
    pseudo_filename = trainingdata_path + output_region +f'pseudoabsence_data/{region_code}-pseudoabsence-data.csv'
    os.makedirs(os.path.dirname(pseudo_filename), exist_ok=True)
    pseudoabsence_final.to_csv(pseudo_filename)
    print(f'{len(pseudoabsence_final)} Pseudoabsence points Saved for region-{region_code} at: {pseudo_filename}')

